From 4a42df781019c9410d9b1b0d044b230ab73c4bc9 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 12 Jun 2026 07:49:47 +0300 Subject: [PATCH 1/4] feat(drbd): add SafeForMkfsRetryPromote kernel-truth probe Authorises a promote+mkfs+demote retry without the dispatcher's auto-primary blessing, but only when the kernel proves it safe: every replica Secondary, local volumes UpToDate, and every connected peer-device either a lock-step UpToDate sibling or an intentional diskless witness. A disconnected or non-UpToDate diskful peer (a potential offline data holder) refuses, as does a foreign Primary (external promoters such as drbd-reactor simply defer the retry). Groundwork for the BUG-028 day0/mkfs race fix. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 76 ++++++++ pkg/drbd/safe_for_mkfs_retry_promote_test.go | 177 +++++++++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 pkg/drbd/safe_for_mkfs_retry_promote_test.go diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index 869d0ed3..d98b3366 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -935,6 +935,82 @@ func (a *Adm) AnyConnectedPeerHasDataForVolume(ctx context.Context, resource str return false } +// SafeForMkfsRetryPromote probes `drbdsetup status --json` and +// reports whether a promote→mkfs→demote retry is provably safe to run +// RIGHT NOW on this node without the dispatcher's auto-primary +// blessing (the BUG-028 latch-free mkfs retry; see the satellite's +// latchFreeMkfsRetryAllowed for the full story of the false +// RD.Spec.Initialized latch that kills the auto-primary election). 
+// +// Returns true ONLY when ALL hold: +// +// - the local role is NOT Primary (we are about to promote; an +// already-Primary local slot means some consumer or a previous +// dance holds the device — let it finish); +// - every local volume is diskful UpToDate (the retry exists to add +// a missing filesystem to a HEALTHY converged replica set, never +// to promote an Inconsistent local copy); +// - NO peer is Primary (an external promoter — drbd-reactor's RWX +// mount loop — may briefly hold the device; the caller simply +// retries on a later reconcile once it has demoted again); +// - every connected peer-device is UpToDate or an intentional +// Diskless witness. UpToDate-while-Connected means the peer is in +// the SAME data generation as the local volume (bit-identical), so +// `primary --force` mints nothing unrelated and the subsequent +// mkfs writes replicate to copies that already equal ours. ANY +// other peer-disk state (Inconsistent, DUnknown of a disconnected +// peer, Negotiating, …) vetoes — a disconnected diskful peer could +// be an offline data holder, and forcing primary against one is +// exactly the Bug 342 unrelated-data wedge. +// +// Conservative on any probe / parse failure: returns false, the retry +// just waits for the next reconcile. 
+func (a *Adm) SafeForMkfsRetryPromote(ctx context.Context, resource string) bool { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return false + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 { + return false + } + + res := status[0] + + if Role(res.Role).IsPrimary() { + return false + } + + if !localIsUpToDate(res.Devices) { + return false + } + + for _, conn := range res.Connections { + if Role(conn.PeerRole).IsPrimary() { + return false + } + + for _, pd := range conn.PeerDevices { + switch DiskState(pd.PeerDiskState) { + case DiskStateUpToDate, DiskStateDiskless: + // Lock-step sibling or intentional witness — safe. + case DiskStateConsistent, DiskStateOutdated, DiskStateAttaching, + DiskStateDetaching, DiskStateFailed, DiskStateNegotiating, + DiskStateInconsistent, DiskStateDUnknown: + return false + default: + // Unknown/empty token — refuse, conservative. + return false + } + } + } + + return true +} + // NeedsRecoveryPromote probes the live kernel via `drbdsetup status // --json` and reports whether THIS node should re-arm the // auto-primary seed to unstick a fresh RD whose initial sync wedged diff --git a/pkg/drbd/safe_for_mkfs_retry_promote_test.go b/pkg/drbd/safe_for_mkfs_retry_promote_test.go new file mode 100644 index 00000000..d596a49a --- /dev/null +++ b/pkg/drbd/safe_for_mkfs_retry_promote_test.go @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drbd_test + +import ( + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/storage" +) + +// Regression pins for the BUG-028 latch-free mkfs-retry promote-safety +// predicate (SafeForMkfsRetryPromote). The predicate authorises a +// promote→mkfs→demote retry WITHOUT the dispatcher's auto-primary +// blessing, so it must be provably conservative: it may only return +// true when every replica is Secondary and every connected peer-device +// is a lock-step UpToDate sibling (or an intentional Diskless witness) +// — i.e. when `primary --force` cannot mint an unrelated UUID against +// anyone and the mkfs writes replicate to bit-identical copies. + +const mkfsRetryStatusKey = "drbdsetup status pvc-b028 --json" + +func admWithMkfsRetryStatus(t *testing.T, json string) *drbd.Adm { + t.Helper() + + fx := storage.NewFakeExec() + fx.Responses[mkfsRetryStatusKey] = storage.FakeResponse{Stdout: []byte(json)} + + return drbd.NewAdm(fx) +} + +// TestSafeForMkfsRetryPromote_AllSecondaryLockStepUpToDate: the exact +// BUG-028 terminal state between two drbd-reactor promote cycles — +// local Secondary UpToDate, diskful peer Secondary UpToDate, diskless +// tiebreaker — must authorise the retry. 
+func TestSafeForMkfsRetryPromote_AllSecondaryLockStepUpToDate(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"},{"volume":1,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"},{"volume":1,"peer-disk-state":"UpToDate"}] + },{ + "peer-node-id":2,"name":"n3","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"Diskless"},{"volume":1,"peer-disk-state":"Diskless"}] + }] + }]`) + + if !adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("all-Secondary lock-step UpToDate set (with diskless witness) must authorise the latch-free mkfs retry") + } +} + +// TestSafeForMkfsRetryPromote_ForeignPrimaryDefers: drbd-reactor (or +// any external promoter) currently holds the device on a peer → +// refuse; the caller retries on a later reconcile once it demoted. +func TestSafeForMkfsRetryPromote_ForeignPrimaryDefers(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Primary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + }] + }]`) + + if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("a foreign Primary peer must defer the latch-free mkfs retry") + } +} + +// TestSafeForMkfsRetryPromote_LocalPrimaryRefuses: the local slot is +// already Primary (a consumer or a previous dance holds the device) → +// refuse. 
+func TestSafeForMkfsRetryPromote_LocalPrimaryRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Primary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + }] + }]`) + + if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("a local Primary role must refuse the latch-free mkfs retry") + } +} + +// TestSafeForMkfsRetryPromote_DisconnectedPeerRefuses: a peer whose +// disk state is DUnknown (connection down) could be an OFFLINE DATA +// HOLDER — promoting against it is the Bug 342 unrelated-data wedge, +// and mkfs could overwrite real data once it reconnects. Refuse. +func TestSafeForMkfsRetryPromote_DisconnectedPeerRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connecting", + "peer-role":"Unknown", + "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}] + }] + }]`) + + if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("a disconnected (DUnknown) peer must refuse the latch-free mkfs retry — it could be an offline data holder") + } +} + +// TestSafeForMkfsRetryPromote_InconsistentPeerRefuses: a peer still +// Inconsistent is not in lock-step with the local copy; the retry must +// wait (or let the Bug 366 recovery-promote own that state). 
+func TestSafeForMkfsRetryPromote_InconsistentPeerRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}] + }] + }]`) + + if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("an Inconsistent peer must refuse the latch-free mkfs retry") + } +} + +// TestSafeForMkfsRetryPromote_LocalNotUpToDateRefuses: the retry adds +// a missing filesystem to a HEALTHY converged replica — it must never +// promote an Inconsistent local copy. +func TestSafeForMkfsRetryPromote_LocalNotUpToDateRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"Inconsistent"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + }] + }]`) + + if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("a non-UpToDate local volume must refuse the latch-free mkfs retry") + } +} + +// TestSafeForMkfsRetryPromote_ProbeFailureRefuses: any probe / parse +// failure must be conservative (false) — the retry just waits for the +// next reconcile. 
+func TestSafeForMkfsRetryPromote_ProbeFailureRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `not-json`) + + if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") { + t.Fatal("a malformed status probe must refuse the latch-free mkfs retry") + } +} From 3e6ba06f63f44d8318375aff16f357376c645b2b Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 12 Jun 2026 07:50:06 +0300 Subject: [PATCH 2/4] fix(satellite): never lose the day0 first-activation mkfs (BUG-028) A fresh RWX RD (FileSystem/Type on the RD) intermittently came up with no filesystem, forever. Two coupled failures: 1. Day0 race: skip-initial-sync brings both diskful replicas Connected+UpToDate at the shared day0 GI before the elected mkfs winner reaches finishDRBDApply. The Bug 342 force-promote kernel veto cannot tell an empty day0 sibling from a real data peer, so the one-and-only first-activation mkfs was silently skipped. 2. False-latch terminal state: an external promoter (drbd-reactor) then bumps the current-UUID past day0 without writing data, the controller latches RD.Spec.Initialized, the dispatcher drops the auto-primary election, and the Bug-311 mkfs retry (gated solely on autoPrimaryReplica) goes permanently dead. Fix, both sides, evidence-gated: - day0EmptyMkfsBypass: the veto may be bypassed only when the controller-persisted Spec.SkipInitialSync proves a never- initialized generation, no proven data peer exists, the local metadata current-UUID still equals the deterministic day0 GI (DRBD only lets replicas sit Connected+UpToDate in the same data generation, so this proves every connected peer is a day0-empty sibling), and no volume carries a filesystem signature. - shouldRetryAutoMkfs: the Bug-311 retry now also fires without the auto-primary election, via the deterministic lowest-diskful- node-id winner + SafeForMkfsRetryPromote kernel proof + blkid fs-absence probe, throttled, deferring while a foreign Primary holds the device. 
Any missing evidence refuses both paths, preserving the Bug 342/356 relocate and respawn-StandAlone protections and the v0.1.11 day0 skip-initial-sync contract. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/satellite/reconciler.go | 269 ++++++++++++++- pkg/satellite/reconciler_bug028_test.go | 437 ++++++++++++++++++++++++ 2 files changed, 696 insertions(+), 10 deletions(-) create mode 100644 pkg/satellite/reconciler_bug028_test.go diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index e6b22c85..0d73d16d 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2189,7 +2189,7 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource, return err } - return r.finishDRBDApply(ctx, dr, diskless, effectiveFirstActivation, resized, cloned) + return r.finishDRBDApply(ctx, dr, diskless, effectiveFirstActivation, resized, cloned, devices) } // healAndDispatchFsm runs the Bug 360 my-node-id self-heal and then @@ -2293,7 +2293,7 @@ func (r *Reconciler) reconcileKernelMyNodeID(ctx context.Context, dr *intent.Des // adjust, and drbd-utils' compare_volume schedules attach_cmd // automatically when kern->disk=="none" but conf->disk points at a // real path. Matches upstream LINSTOR's DrbdLayer pipeline. -func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredResource, diskless, firstActivation, resized, cloned bool) error { +func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredResource, diskless, firstActivation, resized, cloned bool, devices map[int32]string) error { // Pickup-time resize: the storage layer was just grown, drbdadm // resize tells the kernel to extend the replicated device to // match. 
Adjust on its own won't do this — only resize re-reads @@ -2342,13 +2342,41 @@ func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredReso _ = cloned if autoPromote && !r.shouldForcePromote(ctx, dr) { - // Bug 342 force-promote gate fired: a data-bearing peer exists, - // so SKIP `drbdadm primary --force`. The fresh replica stays - // Inconsistent and SyncTargets from the peer (full resync, - // data-safe). Returning here also skips the mkfs-retry below — - // correct, since the replica adopts the peer's filesystem via - // the resync rather than formatting locally. - return nil + // Bug 342 force-promote gate fired: the kernel probe saw a + // connected peer-disk in UpToDate/Consistent/Outdated. + // + // BUG-028: that probe cannot tell an EMPTY day0 skip-initial-sync + // sibling (both fresh diskful replicas reach Connected+UpToDate at + // the shared day0 GI within ~2s, BEFORE the elected winner gets + // here) from a real data-bearing peer. Blindly returning here + // silently skipped the first-activation mkfs, latched + // firstActivation=false, and an external promoter (drbd-reactor + // RWX path) then bumped the current-UUID past day0 without + // writing data → the controller false-latched RD.Spec.Initialized + // → the dispatcher dropped auto-primary → the Bug-311 retry was + // permanently dead → "Bad magic number in super-block" forever. + // + // day0EmptyMkfsBypass re-checks with day0-aware evidence: ONLY + // when the controller-persisted SkipInitialSync says this is a + // genuinely-fresh generation, no peer is a proven data holder, + // every local volume still sits at the deterministic day0 GI + // (kernel-truth: a Connected+UpToDate peer necessarily shares the + // local current-UUID, so day0 here proves every connected peer is + // a never-written day0 sibling too), and no volume carries a + // filesystem signature, do we fall through to the promote+mkfs. 
+ if !r.day0EmptyMkfsBypass(ctx, dr, devices) { + // Genuine data-bearing peer (or any evidence is missing — + // conservative): SKIP `drbdadm primary --force`. The fresh + // replica stays Inconsistent and SyncTargets from the peer + // (full resync, data-safe). Returning here also skips the + // mkfs-retry below — correct, since the replica adopts the + // peer's filesystem via the resync rather than formatting + // locally. + return nil + } + + log.FromContext(ctx).Info("BUG-028: force-promote veto bypassed — every connected peer is a day0-empty sibling, proceeding to first-activation mkfs", + "resource", dr.GetName()) } // Reaching UpToDate no longer depends on this promote. The elected @@ -2396,7 +2424,19 @@ func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredReso // already populated from a previous attempt), runAutoMkfs writes // the marker and this branch becomes a no-op for the rest of the // resource's life. - if !autoPromote && autoPrimaryReplica && r.needsAutoMkfsRetry(dr) { + // + // BUG-028: the retry can no longer depend SOLELY on + // autoPrimaryReplica. When the false RD.Spec.Initialized latch fires + // (an external promoter bumped the current-UUID past day0 without + // writing data), the dispatcher stops stamping `auto-primary` and the + // autoPrimaryReplica-only gate left this retry permanently dead on a + // volume that NEVER got a filesystem. latchFreeMkfsRetryAllowed + // re-enables the retry from first principles instead: deterministic + // lowest-diskful-node-id winner, kernel state safe for a promote (all + // replicas Secondary + lock-step UpToDate — retried later when a + // foreign Primary such as drbd-reactor briefly holds the device), and + // an actual filesystem-absence probe on every local volume. 
+ if r.shouldRetryAutoMkfs(ctx, dr, autoPromote, autoPrimaryReplica, diskless, devices) { err := r.runAutoPromote(ctx, dr) if err != nil { return err @@ -2660,6 +2700,215 @@ func (r *Reconciler) needsAutoMkfsRetry(dr *intent.DesiredResource) bool { return os.IsNotExist(err) } +// shouldRetryAutoMkfs is the Bug-311 mkfs-retry predicate (see the +// finishDRBDApply call-site comment for the full history): re-enter the +// promote→mkfs→demote dance on a steady-state reconcile when the +// first-activation mkfs never finished. The replica must be diskful and +// past its first activation, the marker/Condition must be absent +// (needsAutoMkfsRetry), and the node must be authorised either by the +// dispatcher's auto-primary election or — BUG-028 — by the latch-free +// evidence chain (latchFreeMkfsRetryAllowed) when the false +// RD.Spec.Initialized latch killed that election. +func (r *Reconciler) shouldRetryAutoMkfs(ctx context.Context, dr *intent.DesiredResource, autoPromote, autoPrimaryReplica, diskless bool, devices map[int32]string) bool { + if autoPromote || diskless || !r.needsAutoMkfsRetry(dr) { + return false + } + + return autoPrimaryReplica || r.latchFreeMkfsRetryAllowed(ctx, dr, devices) +} + +// day0EmptyMkfsBypass is the BUG-028 narrow escape hatch from the Bug +// 342 force-promote veto. The veto's kernel probe (peer-disk UpToDate/ +// Consistent/Outdated) is the right conservative default, but it cannot +// distinguish a real data-bearing peer from an EMPTY day0 +// skip-initial-sync sibling: both fresh diskful replicas of a +// skip-seeded RD reach Connected+UpToDate at the SHARED deterministic +// day0 current-UUID within seconds — often before the elected +// mkfs winner reaches finishDRBDApply. Honouring the veto there +// silently dropped the one-and-only first-activation mkfs. 
+// +// Returns true ONLY when every signal proves "all connected peers are +// day0-empty siblings of the same never-initialized generation" (belt +// and braces, all four must hold): +// +// 1. The RD asks for a filesystem and the satellite can run mkfs at +// all (needsMkfs + Exec wired) — otherwise the bypass is moot. +// 2. The controller-persisted Spec.SkipInitialSync is explicitly true: +// the OFFLINE-SAFE proof this replica was born into a genuinely- +// fresh, never-initialized RD generation (relocate / migrate / +// extra-replica destinations are stamped false and never bypass). +// 3. The dispatcher's CRD view reports no proven data-bearing diskful +// peer (PeerHasData=false; day0 siblings are already excluded by +// isDay0SeededVolume, a real survivor is not). +// 4. Kernel/metadata truth per volume: the LOCAL current-UUID still +// equals the deterministic day0 GI. DRBD only lets two replicas +// sit Connected+UpToDate when they are in the same data +// generation, so local==day0 proves every connected UpToDate peer +// — exactly the ones that fired the veto — is at day0 too, i.e. a +// never-written sibling. A real data holder mints a runtime UUID +// that cannot collide with day0 (2^-64), so this discriminator is +// exact. AND the volume's backing device carries no filesystem +// signature (blkid probe; the DRBD device itself is unopenable +// while Secondary, and with internal metadata the backing device +// exposes the same data bytes at offset 0). +// +// Any probe failure, missing device path, or unknown GI refuses the +// bypass — the veto then stands and behaviour is exactly pre-BUG-028 +// (skip promote, full-resync path). That keeps the relocate / +// respawn-StandAlone protections of Bug 342/356 intact: this function +// can only ever ADD an mkfs on a provably day0-empty generation, never +// remove a veto protecting real data. 
+func (r *Reconciler) day0EmptyMkfsBypass(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool { + if !needsMkfs(dr) || r.cfg.Exec == nil { + return false + } + + if skip := dr.GetSkipInitialSync(); skip == nil || !*skip { + return false + } + + if dr.GetPeerHasData() { + return false + } + + return r.allVolumesDay0Empty(ctx, dr, devices) +} + +// allVolumesDay0Empty reports whether EVERY desired volume of dr still +// sits at the deterministic day0 current-UUID in its on-disk DRBD +// metadata AND carries no filesystem signature on its backing device. +// Conservative: any missing device path, drbdmeta/blkid probe failure, +// or non-day0 GI returns false. See day0EmptyMkfsBypass for why this +// is the exact "all connected peers are day0-empty siblings" proof. +func (r *Reconciler) allVolumesDay0Empty(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool { + for _, vol := range dr.GetVolumes() { + device := devices[vol.GetVolumeNumber()] + if device == "" { + return false + } + + gi, err := r.cfg.Adm.CurrentGI(ctx, dr.GetName(), vol.GetVolumeNumber(), device) + if err != nil || gi == "" { + return false + } + + if !strings.EqualFold(gi, day0GiFor(dr.GetName(), vol.GetVolumeNumber())) { + return false + } + + if r.deviceHasFilesystem(ctx, device) { + return false + } + } + + return true +} + +// latchFreeMkfsRetryAllowed is the BUG-028 replacement evidence for the +// Bug-311 mkfs retry when the dispatcher no longer stamps +// `auto-primary`. The dispatcher's election is gated on +// !RD.Spec.Initialized, and that latch can fire FALSELY on a volume +// that never received its filesystem: an external promoter +// (drbd-reactor's RWX promote→mount-fail→demote loop) bumps the DRBD +// current-UUID past day0 WITHOUT writing data, the controller reads +// "UpToDate diskful with CurrentGI != day0" as proven data, latches +// Initialized=true, and the auto-primary-only retry gate goes +// permanently dead. 
This helper re-derives the retry permission from +// first principles instead of the (unprovable-here) latch: +// +// - This node is the DETERMINISTIC retry winner: lowest diskful +// node-id among itself and its configured peers — the same +// election rule the dispatcher uses — so at most ONE node ever +// re-enters the promote→mkfs→demote dance. +// - Kernel state is promote-safe (Adm.SafeForMkfsRetryPromote): the +// local replica is Secondary with every volume UpToDate, no +// replica anywhere is Primary (a foreign Primary — drbd-reactor +// mid-cycle — simply defers the retry to a later reconcile, when +// it has demoted again), and every connected peer-device is +// UpToDate or an intentional Diskless witness. An UNKNOWN / +// disconnected diskful peer vetoes: it could be an offline data +// holder, and `primary --force` against it is the Bug 342 wedge. +// Conversely all-UpToDate-while-Connected proves every replica is +// in the SAME data generation as the local one, so the promote +// mints nothing unrelated and the mkfs writes replicate to peers +// that are by construction bit-identical to the local volume. +// - Every local volume's backing device has NO filesystem signature +// (blkid): real data in the mkfs context means a filesystem (the +// RD requests FileSystem/Type; consumers only ever write through +// it), so fs-absence both proves the retry is still needed and +// that there is nothing to destroy. The post-promote blkid probe +// inside runAutoMkfs remains the per-volume double-mkfs safety +// net on the replicated device itself. +// - Throttled through recoveryPromoteDue (shared with the Bug 366 +// recovery-promote): the promote→demote dance churns kernel state +// and may race the external promoter's own cycle; one nudge per +// throttle window is enough and keeps the reconcile loop cold. 
+// +// Callers must have already checked needsAutoMkfsRetry (marker and +// Condition absent) so the probes below only ever run on a resource +// that is genuinely missing its filesystem marker — never in steady +// state. +func (r *Reconciler) latchFreeMkfsRetryAllowed(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool { + if !r.isLowestDiskfulNodeID(dr) { + return false + } + + if !r.cfg.Adm.SafeForMkfsRetryPromote(ctx, dr.GetName()) { + return false + } + + for _, vol := range dr.GetVolumes() { + device := devices[vol.GetVolumeNumber()] + if device == "" { + return false + } + + if r.deviceHasFilesystem(ctx, device) { + return false + } + } + + // Consume the throttle slot LAST so a pass vetoed by a foreign + // Primary / probe failure does not burn the window. + if !r.recoveryPromoteDue(dr.GetName()) { + return false + } + + log.FromContext(ctx).Info("BUG-028: latch-free mkfs retry — no auto-primary election but filesystem is provably absent, re-entering promote+mkfs+demote", + "resource", dr.GetName()) + + return true +} + +// isLowestDiskfulNodeID replicates the dispatcher's mkfs-winner +// election on the satellite side from the wire DrbdOptions: true when +// the LOCAL node-id is resolved and is strictly the lowest among the +// local node and every configured non-diskless peer. Missing / +// unparsable local id refuses (conservative — an unresolved identity +// must never promote); a peer with a missing id is treated as diskful +// id-unknown and also refuses, since the election would be ambiguous. 
+func (r *Reconciler) isLowestDiskfulNodeID(dr *intent.DesiredResource) bool {
+	opts := dr.GetDrbdOptions()
+
+	selfID, err := strconv.Atoi(opts["node-id"])
+	if err != nil {
+		return false
+	}
+
+	for _, peer := range dr.GetPeerNames() {
+		if opts["peer."+peer+".diskless"] == drbdBoolPropTrue {
+			continue
+		}
+
+		peerID, peerErr := strconv.Atoi(opts["peer."+peer+".node-id"])
+		if peerErr != nil || peerID <= selfID { // a tie is as ambiguous as a lower id: refuse, "strictly lowest" only
+			return false
+		}
+	}
+
+	return true
+}
+
 // isDisklessToDiskfulFlip probes whether the local kernel slot is
 // currently `disk:Diskless client:yes` (intentional diskless) on a
 // Resource whose Spec has flipped to diskful (`linstor r td
diff --git a/pkg/satellite/reconciler_bug028_test.go b/pkg/satellite/reconciler_bug028_test.go
new file mode 100644
index 00000000..2d0f91e1
--- /dev/null
+++ b/pkg/satellite/reconciler_bug028_test.go
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+Copyright 2026 Cozystack contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package satellite_test
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/cozystack/blockstor/pkg/drbd"
+	"github.com/cozystack/blockstor/pkg/satellite"
+	intent "github.com/cozystack/blockstor/pkg/satellite/intent"
+	"github.com/cozystack/blockstor/pkg/storage"
+	"github.com/cozystack/blockstor/pkg/storage/lvm"
+)
+
+// BUG-028 regression pins — the day0 mkfs race and its false-latch
+// terminal state.
+// +// Failure chain on a fresh RWX RD (FileSystem/Type=ext4): +// +// 1. Day0 skip-initial-sync brings BOTH diskful replicas +// Connected+UpToDate at the shared deterministic day0 GI in ~2s. +// 2. The mkfs-election winner reaches finishDRBDApply AFTER that; +// the Bug 342 force-promote kernel veto (AnyConnectedPeerHasData) +// sees peer-disk:UpToDate, cannot tell the EMPTY day0 sibling from +// real data, and silently skips the one-and-only first-activation +// mkfs. firstActivation latches false. +// 3. An external promoter (drbd-reactor RWX path) promote/demotes the +// empty volume in 20s cycles; each promote bumps the current-UUID +// past day0 WITHOUT writing data. +// 4. The controller reads "UpToDate diskful, CurrentGI != day0" as +// proven data → RD.Spec.Initialized latches true (FALSELY). +// 5. The dispatcher gates the auto-primary election on !rdInitialized +// → no replica carries auto-primary → the Bug-311 mkfs retry +// (gated on autoPrimaryReplica) is permanently dead. Terminal: +// promote → fsck "Bad magic number" → demote, forever. +// +// The fix is two-sided and these tests pin both sides plus the +// data-safety counter-cases: +// +// - day0EmptyMkfsBypass: the veto may be bypassed ONLY when every +// signal proves the whole connected set is day0-empty siblings +// (Spec.SkipInitialSync=true, PeerHasData=false, local metadata +// current-UUID == day0, no fs signature) → mkfs happens (step 2 +// fixed, steps 3-5 never start). +// - latchFreeMkfsRetryAllowed: the Bug-311 retry no longer depends +// solely on the dispatcher's auto-primary election (killed by the +// false latch); the deterministic lowest-diskful-node-id winner +// re-enters promote→mkfs→demote when the kernel set is provably +// promote-safe and the filesystem is provably absent. + +// statusBothUpToDateSecondary is the kernel view of the BUG-028 race / +// terminal state: local Secondary UpToDate, one diskful peer Secondary +// UpToDate, one diskless tiebreaker. 
+func statusBothUpToDateSecondary(rd string) string { + return `[{ + "name":"` + rd + `","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + }] + }]` +} + +// expectThinBacking cans the lvs probes so applyStorage resolves the +// (already-carved) thin LV and populates the devices map with its +// backing path — which the BUG-028 probes (drbdmeta get-gi current-UUID +// read, blkid fs-signature probe) target. +func expectThinBacking(fx *storage.FakeExec, rd string) string { + lv := rd + "_00000" + device := "/dev/vg/" + lv + + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/"+lv, + storage.FakeResponse{Stdout: []byte(lv + "\n")}) + fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/"+lv, + storage.FakeResponse{Stdout: []byte(device + "|1048576\n")}) + + return device +} + +func expectGetGI(fx *storage.FakeExec, rd, device, currentGI string) { + fx.Expect("drbdmeta --force "+rd+"/0 v09 "+device+" internal get-gi --node-id 0", + storage.FakeResponse{Stdout: []byte(currentGI + ":0000000000000000:0:0:1:1:0:0:0:0\n")}) +} + +func newThinReconciler(fx *storage.FakeExec, dir string) *satellite.Reconciler { + thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) + + return satellite.NewReconciler(satellite.ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": thin}, + Adm: drbd.NewAdm(fx), + Exec: fx, + StateDir: dir, + NodeName: "n1", + }) +} + +// bug028WinnerDR is the elected mkfs winner's wire payload at first +// activation: auto-primary stamped, SkipInitialSync=true, RD-level +// FileSystem/Type, one diskful peer. 
+func bug028WinnerDR(rd, minor string) []*intent.DesiredResource { + return []*intent.DesiredResource{ + { + Name: rd, + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Props: map[string]string{ + "FileSystem/Type": "ext4", + }, + Peers: []intent.DesiredPeer{{Name: "n2"}}, + SkipInitialSync: skipInitTrue(), + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": minor, + "peer.n2.port": "7000", "peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2", + "auto-primary": "true", + }, + }, + } +} + +func assertPromoteMkfsDemoteOrder(t *testing.T, cmds []string, rd, drbdDev string) { + t.Helper() + + posPrim, posMkfs, posSec := -1, -1, -1 + + for i, line := range cmds { + switch { + case posPrim < 0 && strings.Contains(line, "drbdadm primary --force "+rd): + posPrim = i + case posMkfs < 0 && strings.Contains(line, "mkfs.ext4 "+drbdDev): + posMkfs = i + case posSec < 0 && strings.Contains(line, "drbdadm secondary "+rd): + posSec = i + } + } + + if posPrim < 0 || posMkfs <= posPrim || posSec <= posMkfs { + t.Errorf("want primary --force < mkfs < secondary; got prim=%d mkfs=%d sec=%d in %v", + posPrim, posMkfs, posSec, cmds) + } +} + +func assertNoPromoteNoMkfs(t *testing.T, cmds []string) { + t.Helper() + + for _, line := range cmds { + if strings.Contains(line, "primary --force") { + t.Errorf("must NOT force-promote: %s", line) + } + + if strings.HasPrefix(line, "mkfs.") || strings.Contains(line, " mkfs.") { + t.Errorf("must NOT mkfs: %s", line) + } + } +} + +// TestApplyBug028Day0RaceVetoBypassedMkfsRuns pins the race fix: the +// day0 siblings connect Connected+UpToDate BEFORE the winner's +// first-activation pass, the Bug 342 kernel veto fires — and the +// day0-empty bypass (Spec.SkipInitialSync=true, PeerHasData=false, +// local current-UUID == day0, no fs signature) lets the promote+mkfs +// proceed anyway. 
Pre-fix the mkfs was silently skipped here, forever. +func TestApplyBug028Day0RaceVetoBypassedMkfsRuns(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + device := expectThinBacking(fx, "pvc-b028") + // Kernel truth at the winner's finishDRBDApply: the day0 sibling is + // already Connected+UpToDate → AnyConnectedPeerHasData vetoes. + fx.Expect("drbdsetup status pvc-b028 --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028"))}) + // Local metadata still sits at the deterministic day0 current-UUID — + // the exact proof that every Connected+UpToDate peer is a + // never-written day0 sibling of the same generation. + expectGetGI(fx, "pvc-b028", device, satellite.Day0GiForTest("pvc-b028", 0)) + // blkid probes (backing pre-promote, /dev/drbd post-promote) return + // the FakeExec default (empty, no TYPE= line) → no fs anywhere. + + rec := newThinReconciler(fx, dir) + + _, err := rec.Apply(t.Context(), bug028WinnerDR("pvc-b028", "6500")) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + cmds := fx.CommandLines() + assertPromoteMkfsDemoteOrder(t, cmds, "pvc-b028", "/dev/drbd6500") + + if _, statErr := os.Stat(filepath.Join(dir, "pvc-b028.mkfs.done")); statErr != nil { + t.Errorf(".mkfs.done marker must be written after the bypassed-veto mkfs; got stat err %v", statErr) + } +} + +// TestApplyBug028VetoHoldsOnNonDay0PeerGI is the data-safety +// counter-case: the kernel veto fires and the local current-UUID is +// NOT day0 (a real data generation — relocate survivor / post-write +// state). The bypass must refuse: no promote, no mkfs; the replica +// stays on the full-resync path. NEVER mkfs over real data. 
+func TestApplyBug028VetoHoldsOnNonDay0PeerGI(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + device := expectThinBacking(fx, "pvc-b028d") + fx.Expect("drbdsetup status pvc-b028d --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028d"))}) + // A runtime current-UUID (cannot equal the deterministic day0). + expectGetGI(fx, "pvc-b028d", device, "2BCB1C8F00B058AE") + + rec := newThinReconciler(fx, dir) + + _, err := rec.Apply(t.Context(), bug028WinnerDR("pvc-b028d", "6510")) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + assertNoPromoteNoMkfs(t, fx.CommandLines()) + + if _, statErr := os.Stat(filepath.Join(dir, "pvc-b028d.mkfs.done")); statErr == nil { + t.Error(".mkfs.done must NOT be written when the veto holds") + } +} + +// TestApplyBug028VetoHoldsOnFsSignature: belt-and-braces counter-case — +// even at day0 GI, a filesystem signature on the backing device refuses +// the bypass (there are bytes a mkfs would destroy). +func TestApplyBug028VetoHoldsOnFsSignature(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + device := expectThinBacking(fx, "pvc-b028f") + fx.Expect("drbdsetup status pvc-b028f --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028f"))}) + expectGetGI(fx, "pvc-b028f", device, satellite.Day0GiForTest("pvc-b028f", 0)) + fx.Expect("blkid -o export "+device, + storage.FakeResponse{Stdout: []byte("DEVNAME=" + device + "\nTYPE=ext4\nUSAGE=filesystem\n")}) + + rec := newThinReconciler(fx, dir) + + _, err := rec.Apply(t.Context(), bug028WinnerDR("pvc-b028f", "6520")) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + assertNoPromoteNoMkfs(t, fx.CommandLines()) +} + +// TestApplyBug028VetoHoldsOnPeerHasData: the dispatcher's CRD view +// reports a PROVEN data-bearing diskful peer (non-day0 GI observed on +// the peer's Status) → bypass refused before any kernel probe runs. 
+func TestApplyBug028VetoHoldsOnPeerHasData(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + expectThinBacking(fx, "pvc-b028p") + + rec := newThinReconciler(fx, dir) + + dr := bug028WinnerDR("pvc-b028p", "6530") + dr[0].PeerHasData = true + + _, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + assertNoPromoteNoMkfs(t, fx.CommandLines()) +} + +// bug028FalseLatchDR is the wire payload of the BUG-028 TERMINAL state: +// the false RD.Spec.Initialized latch fired, so the dispatcher no +// longer stamps `auto-primary`; metadata exists (MetadataCreated=true → +// firstActivation=false); the `.mkfs.done` marker never landed; the RD +// still asks for ext4. +func bug028FalseLatchDR(rd, minor string) []*intent.DesiredResource { + return []*intent.DesiredResource{ + { + Name: rd, + NodeName: "n1", + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Props: map[string]string{ + "FileSystem/Type": "ext4", + }, + Peers: []intent.DesiredPeer{{Name: "n2"}}, + SkipInitialSync: skipInitTrue(), + MetadataCreated: true, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": minor, + "peer.n2.port": "7000", "peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2", + // NO auto-primary: the false Initialized latch killed the + // dispatcher's election. + }, + }, + } +} + +// TestApplyBug028FalseLatchRetryFiresWithoutAutoPrimary pins the +// latch-independence fix: even with NO auto-primary election, the +// deterministic lowest-diskful-node-id winner re-enters +// promote→mkfs→demote when the kernel set is all-Secondary lock-step +// UpToDate and no volume carries a filesystem. Pre-fix this state was +// terminal (retry gated solely on autoPrimaryReplica). 
+func TestApplyBug028FalseLatchRetryFiresWithoutAutoPrimary(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + expectThinBacking(fx, "pvc-b028r") + fx.Expect("drbdsetup status pvc-b028r --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028r"))}) + // blkid on the backing device and on /dev/drbd6600: FakeExec default + // (no TYPE=) → filesystem provably absent. + + rec := newThinReconciler(fx, dir) + + _, err := rec.Apply(t.Context(), bug028FalseLatchDR("pvc-b028r", "6600")) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + cmds := fx.CommandLines() + assertPromoteMkfsDemoteOrder(t, cmds, "pvc-b028r", "/dev/drbd6600") + + if _, statErr := os.Stat(filepath.Join(dir, "pvc-b028r.mkfs.done")); statErr != nil { + t.Errorf(".mkfs.done marker must be written after the latch-free retry; got stat err %v", statErr) + } +} + +// TestApplyBug028FalseLatchRetryDefersWhileForeignPrimary pins the +// external-promoter coexistence contract: while drbd-reactor holds the +// device Primary on a peer, the retry must NOT fight it — and must fire +// on a later pass once every replica is Secondary again. 
+func TestApplyBug028FalseLatchRetryDefersWhileForeignPrimary(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + expectThinBacking(fx, "pvc-b028w") + fx.Expect("drbdsetup status pvc-b028w --json", + storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-b028w","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Primary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + }] + }]`)}) + + rec := newThinReconciler(fx, dir) + + dr := bug028FalseLatchDR("pvc-b028w", "6610") + + _, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply (foreign Primary): %v", err) + } + + assertNoPromoteNoMkfs(t, fx.CommandLines()) + + // The reactor demoted (mount failed again) → all Secondary → the + // next reconcile pass picks the retry up. + fx.Reset() + expectThinBacking(fx, "pvc-b028w") + fx.Expect("drbdsetup status pvc-b028w --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028w"))}) + + _, err = rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply (all Secondary): %v", err) + } + + assertPromoteMkfsDemoteOrder(t, fx.CommandLines(), "pvc-b028w", "/dev/drbd6610") +} + +// TestApplyBug028FalseLatchRetryOnlyOnElectionWinner: the latch-free +// retry replicates the dispatcher's lowest-diskful-node-id election so +// AT MOST ONE node re-enters the promote dance. A node whose diskful +// peer holds a lower id must stay quiet. 
+func TestApplyBug028FalseLatchRetryOnlyOnElectionWinner(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + expectThinBacking(fx, "pvc-b028l") + fx.Expect("drbdsetup status pvc-b028l --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028l"))}) + + rec := newThinReconciler(fx, dir) + + dr := bug028FalseLatchDR("pvc-b028l", "6620") + dr[0].DrbdOptions["node-id"] = "1" + dr[0].DrbdOptions["peer.n2.node-id"] = "0" + + _, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + assertNoPromoteNoMkfs(t, fx.CommandLines()) +} + +// TestApplyBug028FalseLatchRetryRefusedWhenFsPresent: data-safety +// counter-case for the retry side — a filesystem signature on the +// backing device means there is nothing to retry (and bytes a promote +// dance could disturb). No promote, no mkfs. +func TestApplyBug028FalseLatchRetryRefusedWhenFsPresent(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + device := expectThinBacking(fx, "pvc-b028s") + fx.Expect("drbdsetup status pvc-b028s --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028s"))}) + fx.Expect("blkid -o export "+device, + storage.FakeResponse{Stdout: []byte("DEVNAME=" + device + "\nTYPE=ext4\nUSAGE=filesystem\n")}) + + rec := newThinReconciler(fx, dir) + + _, err := rec.Apply(t.Context(), bug028FalseLatchDR("pvc-b028s", "6630")) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + assertNoPromoteNoMkfs(t, fx.CommandLines()) +} From 4873e811da1103facf6c8bd6f20c1b5160a0e56c Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 12 Jun 2026 09:02:46 +0300 Subject: [PATCH 3/4] fix(satellite): unblock BUG-028 probes wedged by drbd meta signature and CRD lag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand forensics on the loop-backed ganesha scenario showed both new BUG-028 paths permanently refusing on real hardware: 1. 
blkid on the BACKING device of an internal-metadata DRBD volume always reports TYPE=drbd (libblkid recognises the meta superblock at the device tail), so the naive any-TYPE fs-absence probe read every never-formatted volume as populated. The new backingHasUserFilesystem helper treats only a non-drbd signature as user data; the post-promote probe on the DRBD device (which hides the metadata) remains the authoritative double-mkfs guard. 2. The dispatcher's PeerHasData conservatively counts an UpToDate day0 sibling whose CurrentGI backfill has not landed yet as data-bearing; refusing the bypass on it permanently cost the one-shot first-activation mkfs. The bypass now uses kernel coverage (Day0SiblingSetConnected: no Primary anywhere, local UpToDate, every connected peer-device UpToDate/Diskless, an un-handshaken peer tolerated only when it is a configured diskless witness) which, combined with the local-GI==day0 proof, strictly supersedes the CRD signal: every state PeerHasData correctly protects is still refused. Also, the latch-free retry now fires when AT LEAST ONE volume is missing its filesystem, instead of requiring every volume to be missing one (volumes that already carry one are adopted untouched by the per-volume blkid probe in the mkfs runner — the Bug 311 partial-mkfs shape). 
Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdadm.go | 77 ++++++++++++ pkg/drbd/safe_for_mkfs_retry_promote_test.go | 88 +++++++++++++ pkg/satellite/reconciler.go | 123 ++++++++++++++++--- pkg/satellite/reconciler_bug028_test.go | 94 ++++++++++++-- 4 files changed, 353 insertions(+), 29 deletions(-) diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index d98b3366..bdcfab03 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -1011,6 +1011,83 @@ func (a *Adm) SafeForMkfsRetryPromote(ctx context.Context, resource string) bool return true } +// Day0SiblingSetConnected probes `drbdsetup status --json` and +// reports whether the ENTIRE configured replica set is currently +// visible to the kernel as a promote-safe day0 candidate set (the +// BUG-028 first-activation mkfs bypass; the GI-level day0 proof is the +// satellite's, this is only the connectivity/coverage half): +// +// - the local role is NOT Primary and every local volume is diskful +// UpToDate (the elected winner seeded UpToDate via set-gi); +// - NO peer is Primary (an external promoter mid-grab defers the +// bypass to the latch-free retry, which handles foreign Primaries); +// - every connected peer-device is UpToDate or Diskless; +// - a peer-device whose state is still unknown (DUnknown — the +// connection has not handshaken) is tolerated ONLY when the peer is +// named in disklessPeers (an intentional diskless witness carries +// no data by construction). An un-handshaken DISKFUL peer refuses: +// it could be an offline data holder, and both `primary --force` +// and mkfs against it are the Bug 342 unrelated-data / data-loss +// wedge. +// +// Why this exists: the dispatcher's CRD-level PeerHasData treats an +// UpToDate sibling whose CurrentGI has not been OBSERVED yet (the +// get-gi backfill is best-effort) as data-bearing. On a fresh day0 +// race that conservatism is FALSE and would permanently cost the +// one-shot first-activation mkfs. 
The kernel coverage here, combined +// with the satellite's local-GI==day0 proof (a Connected+UpToDate peer +// necessarily shares the local data generation), strictly supersedes +// the CRD signal: every case PeerHasData correctly protects is also +// refused here (a real connected data peer forces local GI != day0; a +// disconnected diskful peer is DUnknown). +// +// Conservative on any probe / parse failure: returns false. +func (a *Adm) Day0SiblingSetConnected(ctx context.Context, resource string, disklessPeers map[string]bool) bool { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return false + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 { + return false + } + + res := status[0] + + if Role(res.Role).IsPrimary() || !localIsUpToDate(res.Devices) { + return false + } + + for _, conn := range res.Connections { + if Role(conn.PeerRole).IsPrimary() { + return false + } + + for _, pd := range conn.PeerDevices { + switch DiskState(pd.PeerDiskState) { + case DiskStateUpToDate, DiskStateDiskless: + // Lock-step sibling or intentional witness — safe. + case DiskStateDUnknown: + if !disklessPeers[conn.PeerName] { + return false + } + case DiskStateConsistent, DiskStateOutdated, DiskStateAttaching, + DiskStateDetaching, DiskStateFailed, DiskStateNegotiating, + DiskStateInconsistent: + return false + default: + // Unknown/empty token — refuse, conservative. 
+ return false + } + } + } + + return true +} + // NeedsRecoveryPromote probes the live kernel via `drbdsetup status // --json` and reports whether THIS node should re-arm the // auto-primary seed to unstick a fresh RD whose initial sync wedged diff --git a/pkg/drbd/safe_for_mkfs_retry_promote_test.go b/pkg/drbd/safe_for_mkfs_retry_promote_test.go index d596a49a..292e5c6b 100644 --- a/pkg/drbd/safe_for_mkfs_retry_promote_test.go +++ b/pkg/drbd/safe_for_mkfs_retry_promote_test.go @@ -175,3 +175,91 @@ func TestSafeForMkfsRetryPromote_ProbeFailureRefuses(t *testing.T) { t.Fatal("a malformed status probe must refuse the latch-free mkfs retry") } } + +// Day0SiblingSetConnected pins (BUG-028 bypass coverage probe). Same +// conservatism contract as SafeForMkfsRetryPromote, with ONE deliberate +// relaxation: a not-yet-handshaken (DUnknown) peer is tolerated when it +// is a configured diskless witness — it carries no data by construction +// and must not cost the one-shot first-activation mkfs. + +// TestDay0SiblingSetConnected_DisklessWitnessStillConnecting: the day0 +// race shape — diskful sibling Connected+UpToDate, tiebreaker witness +// still handshaking → covered. 
+func TestDay0SiblingSetConnected_DisklessWitnessStillConnecting(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + },{ + "peer-node-id":2,"name":"n3","connection-state":"Connecting", + "peer-role":"Unknown", + "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}] + }] + }]`) + + if !adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{"n3": true}) { + t.Fatal("a still-connecting DISKLESS witness must not block the day0 bypass coverage") + } +} + +// TestDay0SiblingSetConnected_DiskfulPeerStillConnecting: the same +// DUnknown peer WITHOUT the diskless marking is a potential offline +// data holder → refuse. +func TestDay0SiblingSetConnected_DiskfulPeerStillConnecting(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + },{ + "peer-node-id":2,"name":"n3","connection-state":"Connecting", + "peer-role":"Unknown", + "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}] + }] + }]`) + + if adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{}) { + t.Fatal("a not-yet-handshaken DISKFUL peer must refuse the day0 bypass coverage") + } +} + +// TestDay0SiblingSetConnected_ForeignPrimaryRefuses: an external +// promoter already holds the device → defer to the latch-free retry. 
+func TestDay0SiblingSetConnected_ForeignPrimaryRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Primary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + }] + }]`) + + if adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{}) { + t.Fatal("a foreign Primary must refuse the day0 bypass coverage") + } +} + +// TestDay0SiblingSetConnected_InconsistentPeerRefuses: an Inconsistent +// peer-device is not a lock-step day0 sibling → refuse. +func TestDay0SiblingSetConnected_InconsistentPeerRefuses(t *testing.T) { + adm := admWithMkfsRetryStatus(t, `[{ + "name":"pvc-b028","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}] + }] + }]`) + + if adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{}) { + t.Fatal("an Inconsistent peer must refuse the day0 bypass coverage") + } +} diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index 0d73d16d..c7939ff3 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2737,9 +2737,18 @@ func (r *Reconciler) shouldRetryAutoMkfs(ctx context.Context, dr *intent.Desired // the OFFLINE-SAFE proof this replica was born into a genuinely- // fresh, never-initialized RD generation (relocate / migrate / // extra-replica destinations are stamped false and never bypass). -// 3. The dispatcher's CRD view reports no proven data-bearing diskful -// peer (PeerHasData=false; day0 siblings are already excluded by -// isDay0SeededVolume, a real survivor is not). +// 3. 
Kernel coverage of the whole configured replica set +// (Adm.Day0SiblingSetConnected): no Primary anywhere, local +// UpToDate, every connected peer-device UpToDate/Diskless, and any +// not-yet-handshaken peer is an intentional diskless witness. An +// un-handshaken DISKFUL peer (potential offline data holder) +// refuses. NOTE this deliberately supersedes the dispatcher's +// CRD-level PeerHasData: that flag treats an UpToDate day0 sibling +// whose CurrentGI backfill has not landed yet as data-bearing +// (correct for the re-computed seed gates, but a FALSE positive +// here would permanently cost the one-shot first-activation mkfs). +// Every state PeerHasData correctly protects is still refused by +// this kernel check plus the GI proof below. // 4. Kernel/metadata truth per volume: the LOCAL current-UUID still // equals the deterministic day0 GI. DRBD only lets two replicas // sit Connected+UpToDate when they are in the same data @@ -2767,13 +2776,33 @@ func (r *Reconciler) day0EmptyMkfsBypass(ctx context.Context, dr *intent.Desired return false } - if dr.GetPeerHasData() { + if !r.cfg.Adm.Day0SiblingSetConnected(ctx, dr.GetName(), disklessPeerSet(dr)) { return false } return r.allVolumesDay0Empty(ctx, dr, devices) } +// disklessPeerSet collects the configured peers the dispatcher marked +// as intentional diskless witnesses (`peer..diskless=true` in +// the wire DrbdOptions). Consumed by the day0 bypass coverage probe to +// tolerate a witness whose connection has not handshaken yet — it +// carries no data by construction, so it cannot be a data holder the +// bypass must wait for. 
+func disklessPeerSet(dr *intent.DesiredResource) map[string]bool { + opts := dr.GetDrbdOptions() + + out := make(map[string]bool) + + for _, peer := range dr.GetPeerNames() { + if opts["peer."+peer+".diskless"] == drbdBoolPropTrue { + out[peer] = true + } + } + + return out +} + // allVolumesDay0Empty reports whether EVERY desired volume of dr still // sits at the deterministic day0 current-UUID in its on-disk DRBD // metadata AND carries no filesystem signature on its backing device. @@ -2796,7 +2825,12 @@ func (r *Reconciler) allVolumesDay0Empty(ctx context.Context, dr *intent.Desired return false } - if r.deviceHasFilesystem(ctx, device) { + // backingHasUserFilesystem, NOT deviceHasFilesystem: the + // backing device of an internal-metadata DRBD volume always + // shows blkid TYPE=drbd (the meta superblock at its tail) — + // only a non-drbd signature is user data (stand forensics + // pinned this; the naive probe wedged the bypass forever). + if r.backingHasUserFilesystem(ctx, device) { return false } } @@ -2832,13 +2866,18 @@ func (r *Reconciler) allVolumesDay0Empty(ctx context.Context, dr *intent.Desired // in the SAME data generation as the local one, so the promote // mints nothing unrelated and the mkfs writes replicate to peers // that are by construction bit-identical to the local volume. -// - Every local volume's backing device has NO filesystem signature -// (blkid): real data in the mkfs context means a filesystem (the -// RD requests FileSystem/Type; consumers only ever write through -// it), so fs-absence both proves the retry is still needed and -// that there is nothing to destroy. The post-promote blkid probe -// inside runAutoMkfs remains the per-volume double-mkfs safety -// net on the replicated device itself. 
+// - At least one local volume's backing device has NO USER +// filesystem signature (backingHasUserFilesystem — blkid that +// ignores TYPE=drbd, the internal-metadata superblock libblkid +// always sees at the backing device's tail; the naive probe +// wedged this gate forever on real hardware). Real data in the +// mkfs context means a filesystem (the RD requests +// FileSystem/Type; consumers only ever write through it), so a +// volume missing one both proves the retry is still needed and +// that there is nothing to destroy there; volumes that DO carry a +// filesystem are adopted untouched by runAutoMkfs's post-promote +// blkid probe on the replicated device (the Bug 311 partial-mkfs +// shape), which remains the per-volume double-mkfs safety net. // - Throttled through recoveryPromoteDue (shared with the Bug 366 // recovery-promote): the promote→demote dance churns kernel state // and may race the external promoter's own cycle; one nudge per @@ -2857,24 +2896,43 @@ func (r *Reconciler) latchFreeMkfsRetryAllowed(ctx context.Context, dr *intent.D return false } + // At least one desired volume must still be missing a USER + // filesystem (backing probe; TYPE=drbd is the internal-metadata + // signature, not user data) — otherwise there is nothing to retry. + // Volumes that already carry a filesystem are fine: runAutoMkfs's + // per-volume blkid probe on the promoted DRBD device adopts them + // untouched (the Bug 311 partial-mkfs shape). + anyVolumeNeedsMkfs := false + for _, vol := range dr.GetVolumes() { device := devices[vol.GetVolumeNumber()] if device == "" { - return false + continue } - if r.deviceHasFilesystem(ctx, device) { - return false + if !r.backingHasUserFilesystem(ctx, device) { + anyVolumeNeedsMkfs = true + + break } } + if !anyVolumeNeedsMkfs { + return false + } + // Consume the throttle slot LAST so a pass vetoed by a foreign - // Primary / probe failure does not burn the window. 
+ // Primary / probe failure does not burn the window — the retry + // must land in the very next clear gap of the external promoter's + // cycle. Probe overhead while a candidate stays unconverged is + // bounded by the controller's steady-state requeue cadence, and + // this branch only runs at all on the rare initialized-RD-without- + // filesystem shape (auto-primary replicas short-circuit before it). if !r.recoveryPromoteDue(dr.GetName()) { return false } - log.FromContext(ctx).Info("BUG-028: latch-free mkfs retry — no auto-primary election but filesystem is provably absent, re-entering promote+mkfs+demote", + log.FromContext(ctx).Info("BUG-028: latch-free mkfs retry — no auto-primary election but a volume provably lacks its filesystem, re-entering promote+mkfs+demote", "resource", dr.GetName()) return true @@ -3832,6 +3890,37 @@ func (r *Reconciler) deviceHasFilesystem(ctx context.Context, device string) boo return false } +// backingHasUserFilesystem is the BACKING-device variant of +// deviceHasFilesystem for the BUG-028 pre-promote probes. The backing +// LV/zvol/loop of a DRBD-stacked volume with INTERNAL metadata always +// carries the DRBD meta-data superblock at its tail, and libblkid +// recognises it: `blkid -o export /dev/loopN` on a never-formatted +// volume reports `TYPE=drbd`. Counting that as "a filesystem is +// present" permanently wedged both BUG-028 probes on real hardware +// (stand forensics: every fresh ganesha volume showed TYPE=drbd, the +// bypass and the latch-free retry never fired, while the post-promote +// probe on /dev/drbdN — which hides the metadata — correctly saw +// nothing). Only a non-drbd TYPE counts as user data here; anything +// else (no signature, probe failure, the drbd meta signature itself) +// reads as "no filesystem". The post-promote blkid probe inside +// runAutoMkfs on the DRBD device remains the authoritative +// double-mkfs guard. 
+func (r *Reconciler) backingHasUserFilesystem(ctx context.Context, device string) bool { + out, err := r.cfg.Exec.Run(ctx, "blkid", "-o", "export", device) + if err != nil { + return false + } + + for line := range strings.SplitSeq(string(out), "\n") { + value, ok := strings.CutPrefix(strings.TrimSpace(line), "TYPE=") + if ok && value != "drbd" { + return true + } + } + + return false +} + // runApplyDRBDVerb is the per-reconcile dispatch for the bring-up // chain. First activation falls through to the SkipDisk-aware // `drbdadm adjust` (or `adjust --skip-disk`): the .res + freshly- diff --git a/pkg/satellite/reconciler_bug028_test.go b/pkg/satellite/reconciler_bug028_test.go index 2d0f91e1..927073b2 100644 --- a/pkg/satellite/reconciler_bug028_test.go +++ b/pkg/satellite/reconciler_bug028_test.go @@ -103,6 +103,18 @@ func expectGetGI(fx *storage.FakeExec, rd, device, currentGI string) { storage.FakeResponse{Stdout: []byte(currentGI + ":0000000000000000:0:0:1:1:0:0:0:0\n")}) } +// expectDrbdMetaSignature cans the blkid answer real hardware gives +// for the BACKING device of an internal-metadata DRBD volume: libblkid +// recognises the DRBD meta superblock at the device tail and reports +// TYPE=drbd even on a never-formatted volume (stand forensics, +// bug028-fix-validation-20260612-054452/iter2). The BUG-028 probes +// MUST read this as "no user filesystem" — the naive any-TYPE= probe +// wedged both the bypass and the latch-free retry forever. 
+func expectDrbdMetaSignature(fx *storage.FakeExec, device string) { + fx.Expect("blkid -o export "+device, + storage.FakeResponse{Stdout: []byte("DEVNAME=" + device + "\nUUID=6715da3a6dd3182a\nTYPE=drbd\n")}) +} + func newThinReconciler(fx *storage.FakeExec, dir string) *satellite.Reconciler { thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) @@ -194,8 +206,11 @@ func TestApplyBug028Day0RaceVetoBypassedMkfsRuns(t *testing.T) { // the exact proof that every Connected+UpToDate peer is a // never-written day0 sibling of the same generation. expectGetGI(fx, "pvc-b028", device, satellite.Day0GiForTest("pvc-b028", 0)) - // blkid probes (backing pre-promote, /dev/drbd post-promote) return - // the FakeExec default (empty, no TYPE= line) → no fs anywhere. + // Backing-device blkid answers TYPE=drbd (the real-hardware shape: + // libblkid sees the internal DRBD metadata superblock) — the bypass + // must read that as "no user filesystem". The post-promote probe on + // /dev/drbd6500 returns the FakeExec default (no signature). + expectDrbdMetaSignature(fx, device) rec := newThinReconciler(fx, dir) @@ -263,18 +278,41 @@ func TestApplyBug028VetoHoldsOnFsSignature(t *testing.T) { assertNoPromoteNoMkfs(t, fx.CommandLines()) } -// TestApplyBug028VetoHoldsOnPeerHasData: the dispatcher's CRD view -// reports a PROVEN data-bearing diskful peer (non-day0 GI observed on -// the peer's Status) → bypass refused before any kernel probe runs. -func TestApplyBug028VetoHoldsOnPeerHasData(t *testing.T) { +// TestApplyBug028VetoHoldsOnUnconnectedDiskfulPeer: data-safety +// counter-case for the kernel-coverage gate — a configured DISKFUL +// peer whose connection has not handshaken (peer-disk DUnknown) could +// be an offline data holder, so the bypass must refuse even when the +// local volume is day0-empty. (The OTHER connected peer being UpToDate +// is what fired the veto.) 
+func TestApplyBug028VetoHoldsOnUnconnectedDiskfulPeer(t *testing.T) { dir := t.TempDir() fx := storage.NewFakeExec() - expectThinBacking(fx, "pvc-b028p") + device := expectThinBacking(fx, "pvc-b028u") + fx.Expect("drbdsetup status pvc-b028u --json", + storage.FakeResponse{Stdout: []byte(`[{ + "name":"pvc-b028u","node-id":0,"role":"Secondary", + "devices":[{"volume":0,"disk-state":"UpToDate"}], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected", + "peer-role":"Secondary", + "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}] + },{ + "peer-node-id":2,"name":"n3","connection-state":"Connecting", + "peer-role":"Unknown", + "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}] + }] + }]`)}) + expectGetGI(fx, "pvc-b028u", device, satellite.Day0GiForTest("pvc-b028u", 0)) + expectDrbdMetaSignature(fx, device) rec := newThinReconciler(fx, dir) - dr := bug028WinnerDR("pvc-b028p", "6530") - dr[0].PeerHasData = true + dr := bug028WinnerDR("pvc-b028u", "6530") + dr[0].Peers = []intent.DesiredPeer{{Name: "n2"}, {Name: "n3"}} + dr[0].DrbdOptions["peer.n3.port"] = "7000" + dr[0].DrbdOptions["peer.n3.node-id"] = "2" + dr[0].DrbdOptions["peer.n3.address"] = "10.0.0.3" + // n3 is DISKFUL (no peer.n3.diskless) — its DUnknown must refuse. _, err := rec.Apply(t.Context(), dr) if err != nil { @@ -284,6 +322,35 @@ func TestApplyBug028VetoHoldsOnPeerHasData(t *testing.T) { assertNoPromoteNoMkfs(t, fx.CommandLines()) } +// TestApplyBug028BypassFiresDespitePeerHasDataLag pins the CRD-lag +// acceptance: the dispatcher conservatively reports PeerHasData=true +// for an UpToDate day0 sibling whose CurrentGI backfill has not been +// observed yet. That signal is correct for the re-computed seed gates +// but must NOT cost the one-shot first-activation mkfs when kernel +// truth (full coverage + local day0 GI + no user fs) proves the whole +// connected set is day0-empty. 
+func TestApplyBug028BypassFiresDespitePeerHasDataLag(t *testing.T) { + dir := t.TempDir() + fx := storage.NewFakeExec() + device := expectThinBacking(fx, "pvc-b028g") + fx.Expect("drbdsetup status pvc-b028g --json", + storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028g"))}) + expectGetGI(fx, "pvc-b028g", device, satellite.Day0GiForTest("pvc-b028g", 0)) + expectDrbdMetaSignature(fx, device) + + rec := newThinReconciler(fx, dir) + + dr := bug028WinnerDR("pvc-b028g", "6540") + dr[0].PeerHasData = true // CRD lag: day0 sibling, CurrentGI unobserved + + _, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + + assertPromoteMkfsDemoteOrder(t, fx.CommandLines(), "pvc-b028g", "/dev/drbd6540") +} + // bug028FalseLatchDR is the wire payload of the BUG-028 TERMINAL state: // the false RD.Spec.Initialized latch fired, so the dispatcher no // longer stamps `auto-primary`; metadata exists (MetadataCreated=true → @@ -322,11 +389,14 @@ func bug028FalseLatchDR(rd, minor string) []*intent.DesiredResource { func TestApplyBug028FalseLatchRetryFiresWithoutAutoPrimary(t *testing.T) { dir := t.TempDir() fx := storage.NewFakeExec() - expectThinBacking(fx, "pvc-b028r") + device := expectThinBacking(fx, "pvc-b028r") fx.Expect("drbdsetup status pvc-b028r --json", storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028r"))}) - // blkid on the backing device and on /dev/drbd6600: FakeExec default - // (no TYPE=) → filesystem provably absent. + // Backing blkid answers TYPE=drbd (the real-hardware shape — see + // expectDrbdMetaSignature): the retry must read it as "no user + // filesystem". /dev/drbd6600's post-promote probe stays at the + // FakeExec default (no signature) → mkfs runs. 
+ expectDrbdMetaSignature(fx, device) rec := newThinReconciler(fx, dir) From a5ed96ec815840a3ae6732b1080189acebd5eb50 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Fri, 12 Jun 2026 09:09:37 +0300 Subject: [PATCH 4/4] docs(satellite): document the inherited day0-GI never-degraded-write ambiguity The bypass cannot distinguish a respawned replica joining a data-bearing-but-still-day0 survivor (a volume whose entire write history happened fully connected never mints a new current-UUID) from a fresh day0 sibling. This is the same ambiguity the day0 seed path documents and the same shape the pre-existing Bug-311 retry already had; production auto-mkfs topologies (RWX with a diskless witness) are immune because the first consumer promote mints a UUID and latches the RD. Documented as accepted residual risk. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/satellite/reconciler.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index c7939ff3..87c8a15a 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2767,6 +2767,25 @@ func (r *Reconciler) shouldRetryAutoMkfs(ctx context.Context, dr *intent.Desired // respawn-StandAlone protections of Bug 342/356 intact: this function // can only ever ADD an mkfs on a provably day0-empty generation, never // remove a veto protecting real data. +// +// KNOWN RESIDUAL AMBIGUITY (pre-existing, inherited, not widened): a +// volume whose entire write history happened while ALL peers were +// connected never advances its current-UUID past day0 (DRBD only +// mints on promote/write with an absent/weak peer). 
A respawned +// replica joining such a data-bearing-but-day0 survivor is +// indistinguishable from a fresh day0 sibling by GI bookkeeping — +// this is the SAME ambiguity resolveVolumeSeed documents for the +// day0 seed path and the same shape the pre-existing Bug-311 retry +// (auto-primary + absent marker, NO kernel veto at all) already had. +// In every production auto-mkfs topology (RWX ganesha) a diskless +// tiebreaker is part of the set, so the first consumer promote mints +// a new UUID (weak_nodes != 0, observed on the stand), the RD +// latches Initialized, the respawned replica is stamped +// SkipInitialSync=false, and condition 2 above refuses the bypass. +// Only a hand-built no-witness FileSystem/Type RD whose data never +// saw a degraded write retains the ambiguity — accepted and +// documented rather than "solved" with a heuristic that would +// reintroduce the BUG-028 wedge. func (r *Reconciler) day0EmptyMkfsBypass(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool { if !needsMkfs(dr) || r.cfg.Exec == nil { return false