From 4a42df781019c9410d9b1b0d044b230ab73c4bc9 Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Fri, 12 Jun 2026 07:49:47 +0300
Subject: [PATCH 1/4] feat(drbd): add SafeForMkfsRetryPromote kernel-truth
 probe

Authorises a promote+mkfs+demote retry without the dispatcher's
auto-primary blessing, but only when the kernel proves it safe:
every replica Secondary, local volumes UpToDate, and every connected
peer-device either a lock-step UpToDate sibling or an intentional
diskless witness. A disconnected or non-UpToDate diskful peer (a
potential offline data holder) makes the probe refuse, as does a
foreign Primary (the retry is simply deferred while an external
promoter such as drbd-reactor holds the device).

Groundwork for the BUG-028 day0/mkfs race fix.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 pkg/drbd/drbdadm.go                          |  76 ++++++++
 pkg/drbd/safe_for_mkfs_retry_promote_test.go | 177 +++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100644 pkg/drbd/safe_for_mkfs_retry_promote_test.go
diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go
index 869d0ed3..d98b3366 100644
--- a/pkg/drbd/drbdadm.go
+++ b/pkg/drbd/drbdadm.go
@@ -935,6 +935,82 @@ func (a *Adm) AnyConnectedPeerHasDataForVolume(ctx context.Context, resource str
 	return false
 }
 
+// SafeForMkfsRetryPromote probes `drbdsetup status <res> --json` and
+// reports whether a promote→mkfs→demote retry is provably safe to run
+// RIGHT NOW on this node without the dispatcher's auto-primary
+// blessing (the BUG-028 latch-free mkfs retry; see the satellite's
+// latchFreeMkfsRetryAllowed for the full story of the false
+// RD.Spec.Initialized latch that kills the auto-primary election).
+//
+// Returns true ONLY when ALL hold:
+//
+//   - the local role is NOT Primary (we are about to promote; an
+//     already-Primary local slot means some consumer or a previous
+//     dance holds the device — let it finish);
+//   - every local volume is diskful UpToDate (the retry exists to add
+//     a missing filesystem to a HEALTHY converged replica set, never
+//     to promote an Inconsistent local copy);
+//   - NO peer is Primary (an external promoter — drbd-reactor's RWX
+//     mount loop — may briefly hold the device; the caller simply
+//     retries on a later reconcile once it has demoted again);
+//   - every peer-device is UpToDate or Diskless (assumed to be an
+//     intentional witness). UpToDate-while-Connected: the peer is in
+//     the SAME data generation as the local volume (bit-identical), so
+//     `primary --force` mints nothing unrelated and the subsequent
+//     mkfs writes replicate to copies that already equal ours. ANY
+//     other peer-disk state (Inconsistent, DUnknown of a disconnected
+//     peer, Negotiating, …) vetoes — a disconnected diskful peer could
+//     be an offline data holder, and forcing primary against one is
+//     exactly the Bug 342 unrelated-data wedge.
+//
+// Conservative on any probe / parse failure or an empty status list:
+// returns false, so the retry just waits for the next reconcile.
+func (a *Adm) SafeForMkfsRetryPromote(ctx context.Context, resource string) bool {
+	out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json")
+	if err != nil {
+		return false
+	}
+
+	var status drbdsetupStatusRoot
+
+	err = json.Unmarshal(out, &status)
+	if err != nil || len(status) == 0 {
+		return false
+	}
+
+	res := status[0]
+
+	if Role(res.Role).IsPrimary() {
+		return false
+	}
+
+	if !localIsUpToDate(res.Devices) {
+		return false
+	}
+
+	for _, conn := range res.Connections {
+		if Role(conn.PeerRole).IsPrimary() {
+			return false
+		}
+
+		for _, pd := range conn.PeerDevices {
+			switch DiskState(pd.PeerDiskState) {
+			case DiskStateUpToDate, DiskStateDiskless:
+				// Lock-step sibling or intentional witness — safe.
+			case DiskStateConsistent, DiskStateOutdated, DiskStateAttaching,
+				DiskStateDetaching, DiskStateFailed, DiskStateNegotiating,
+				DiskStateInconsistent, DiskStateDUnknown:
+				return false
+			default:
+				// Unknown/empty token — refuse, conservative.
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
 // NeedsRecoveryPromote probes the live kernel via `drbdsetup status
 // <res> --json` and reports whether THIS node should re-arm the
 // auto-primary seed to unstick a fresh RD whose initial sync wedged
diff --git a/pkg/drbd/safe_for_mkfs_retry_promote_test.go b/pkg/drbd/safe_for_mkfs_retry_promote_test.go
new file mode 100644
index 00000000..d596a49a
--- /dev/null
+++ b/pkg/drbd/safe_for_mkfs_retry_promote_test.go
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+Copyright 2026 Cozystack contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package drbd_test
+
+import (
+	"testing"
+
+	"github.com/cozystack/blockstor/pkg/drbd"
+	"github.com/cozystack/blockstor/pkg/storage"
+)
+
+// Regression pins for the BUG-028 latch-free mkfs-retry promote-safety
+// predicate (SafeForMkfsRetryPromote). The predicate authorises a
+// promote→mkfs→demote retry WITHOUT the dispatcher's auto-primary
+// blessing, so it must be provably conservative: it may only return
+// true when every replica is Secondary and every connected peer-device
+// is a lock-step UpToDate sibling (or an intentional Diskless witness)
+// — i.e. when `primary --force` cannot mint an unrelated UUID against
+// anyone and the mkfs writes replicate to bit-identical copies.
+
+const mkfsRetryStatusKey = "drbdsetup status pvc-b028 --json"
+
+func admWithMkfsRetryStatus(t *testing.T, json string) *drbd.Adm {
+	t.Helper()
+
+	fx := storage.NewFakeExec()
+	fx.Responses[mkfsRetryStatusKey] = storage.FakeResponse{Stdout: []byte(json)}
+
+	return drbd.NewAdm(fx)
+}
+
+// TestSafeForMkfsRetryPromote_AllSecondaryLockStepUpToDate: the exact
+// BUG-028 terminal state between two drbd-reactor promote cycles —
+// local Secondary UpToDate, diskful peer Secondary UpToDate, diskless
+// tiebreaker — must authorise the retry.
+func TestSafeForMkfsRetryPromote_AllSecondaryLockStepUpToDate(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"},{"volume":1,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"},{"volume":1,"peer-disk-state":"UpToDate"}]
+	  },{
+	    "peer-node-id":2,"name":"n3","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"Diskless"},{"volume":1,"peer-disk-state":"Diskless"}]
+	  }]
+	}]`)
+
+	if !adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("all-Secondary lock-step UpToDate set (with diskless witness) must authorise the latch-free mkfs retry")
+	}
+}
+
+// TestSafeForMkfsRetryPromote_ForeignPrimaryDefers: drbd-reactor (or
+// any external promoter) currently holds the device on a peer →
+// refuse; the caller retries on a later reconcile once it demoted.
+func TestSafeForMkfsRetryPromote_ForeignPrimaryDefers(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Primary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  }]
+	}]`)
+
+	if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("a foreign Primary peer must defer the latch-free mkfs retry")
+	}
+}
+
+// TestSafeForMkfsRetryPromote_LocalPrimaryRefuses: the local slot is
+// already Primary (a consumer or a previous dance holds the device) →
+// refuse.
+func TestSafeForMkfsRetryPromote_LocalPrimaryRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Primary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  }]
+	}]`)
+
+	if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("a local Primary role must refuse the latch-free mkfs retry")
+	}
+}
+
+// TestSafeForMkfsRetryPromote_DisconnectedPeerRefuses: a peer whose
+// disk state is DUnknown (connection down) could be an OFFLINE DATA
+// HOLDER — promoting against it is the Bug 342 unrelated-data wedge,
+// and mkfs could overwrite real data once it reconnects. Refuse.
+func TestSafeForMkfsRetryPromote_DisconnectedPeerRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connecting",
+	    "peer-role":"Unknown",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}]
+	  }]
+	}]`)
+
+	if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("a disconnected (DUnknown) peer must refuse the latch-free mkfs retry — it could be an offline data holder")
+	}
+}
+
+// TestSafeForMkfsRetryPromote_InconsistentPeerRefuses: a peer still
+// Inconsistent is not in lock-step with the local copy; the retry must
+// wait (or let the Bug 366 recovery-promote own that state).
+func TestSafeForMkfsRetryPromote_InconsistentPeerRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}]
+	  }]
+	}]`)
+
+	if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("an Inconsistent peer must refuse the latch-free mkfs retry")
+	}
+}
+
+// TestSafeForMkfsRetryPromote_LocalNotUpToDateRefuses: the retry adds
+// a missing filesystem to a HEALTHY converged replica — it must never
+// promote an Inconsistent local copy.
+func TestSafeForMkfsRetryPromote_LocalNotUpToDateRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"Inconsistent"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  }]
+	}]`)
+
+	if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("a non-UpToDate local volume must refuse the latch-free mkfs retry")
+	}
+}
+
+// TestSafeForMkfsRetryPromote_ProbeFailureRefuses: an unparsable status
+// payload must be conservative (false) — the retry just waits for the
+// next reconcile. (Only the parse-failure leg is exercised here.)
+func TestSafeForMkfsRetryPromote_ProbeFailureRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `not-json`)
+
+	if adm.SafeForMkfsRetryPromote(t.Context(), "pvc-b028") {
+		t.Fatal("a malformed status probe must refuse the latch-free mkfs retry")
+	}
+}

From 3e6ba06f63f44d8318375aff16f357376c645b2b Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Fri, 12 Jun 2026 07:50:06 +0300
Subject: [PATCH 2/4] fix(satellite): never lose the day0 first-activation mkfs
 (BUG-028)

A fresh RWX RD (FileSystem/Type on the RD) intermittently came up
with no filesystem, forever. Two coupled failures:

1. Day0 race: skip-initial-sync brings both diskful replicas
   Connected+UpToDate at the shared day0 GI before the elected mkfs
   winner reaches finishDRBDApply. The Bug 342 force-promote kernel
   veto cannot tell an empty day0 sibling from a real data peer, so
   the one-and-only first-activation mkfs was silently skipped.
2. False-latch terminal state: an external promoter (drbd-reactor)
   then bumps the current-UUID past day0 without writing data, the
   controller latches RD.Spec.Initialized, the dispatcher drops the
   auto-primary election, and the Bug-311 mkfs retry (gated solely
   on autoPrimaryReplica) goes permanently dead.

Fix, both sides, evidence-gated:

- day0EmptyMkfsBypass: the veto may be bypassed only when the
  controller-persisted Spec.SkipInitialSync proves a never-
  initialized generation, no proven data peer exists, the local
  metadata current-UUID still equals the deterministic day0 GI
  (DRBD only lets replicas sit Connected+UpToDate in the same data
  generation, so this proves every connected peer is a day0-empty
  sibling), and no volume carries a filesystem signature.
- shouldRetryAutoMkfs: the Bug-311 retry now also fires without the
  auto-primary election, via the deterministic lowest-diskful-
  node-id winner + SafeForMkfsRetryPromote kernel proof + blkid
  fs-absence probe, throttled, deferring while a foreign Primary
  holds the device.

Any missing evidence refuses both paths, preserving the Bug 342/356
relocate and respawn-StandAlone protections and the v0.1.11 day0
skip-initial-sync contract.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 pkg/satellite/reconciler.go             | 269 ++++++++++++++-
 pkg/satellite/reconciler_bug028_test.go | 437 ++++++++++++++++++++++++
 2 files changed, 696 insertions(+), 10 deletions(-)
 create mode 100644 pkg/satellite/reconciler_bug028_test.go

diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go
index e6b22c85..0d73d16d 100644
--- a/pkg/satellite/reconciler.go
+++ b/pkg/satellite/reconciler.go
@@ -2189,7 +2189,7 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource,
 		return err
 	}
 
-	return r.finishDRBDApply(ctx, dr, diskless, effectiveFirstActivation, resized, cloned)
+	return r.finishDRBDApply(ctx, dr, diskless, effectiveFirstActivation, resized, cloned, devices)
 }
 
 // healAndDispatchFsm runs the Bug 360 my-node-id self-heal and then
@@ -2293,7 +2293,7 @@ func (r *Reconciler) reconcileKernelMyNodeID(ctx context.Context, dr *intent.Des
 // adjust, and drbd-utils' compare_volume schedules attach_cmd
 // automatically when kern->disk=="none" but conf->disk points at a
 // real path. Matches upstream LINSTOR's DrbdLayer pipeline.
-func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredResource, diskless, firstActivation, resized, cloned bool) error {
+func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredResource, diskless, firstActivation, resized, cloned bool, devices map[int32]string) error {
 	// Pickup-time resize: the storage layer was just grown, drbdadm
 	// resize tells the kernel to extend the replicated device to
 	// match. Adjust on its own won't do this — only resize re-reads
@@ -2342,13 +2342,41 @@ func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredReso
 	_ = cloned
 
 	if autoPromote && !r.shouldForcePromote(ctx, dr) {
-		// Bug 342 force-promote gate fired: a data-bearing peer exists,
-		// so SKIP `drbdadm primary --force`. The fresh replica stays
-		// Inconsistent and SyncTargets from the peer (full resync,
-		// data-safe). Returning here also skips the mkfs-retry below —
-		// correct, since the replica adopts the peer's filesystem via
-		// the resync rather than formatting locally.
-		return nil
+		// Bug 342 force-promote gate fired: the kernel probe saw a
+		// connected peer-disk in UpToDate/Consistent/Outdated.
+		//
+		// BUG-028: that probe cannot tell an EMPTY day0 skip-initial-sync
+		// sibling (both fresh diskful replicas reach Connected+UpToDate at
+		// the shared day0 GI within ~2s, BEFORE the elected winner gets
+		// here) from a real data-bearing peer. Blindly returning here
+		// silently skipped the first-activation mkfs, latched
+		// firstActivation=false, and an external promoter (drbd-reactor
+		// RWX path) then bumped the current-UUID past day0 without
+		// writing data → the controller false-latched RD.Spec.Initialized
+		// → the dispatcher dropped auto-primary → the Bug-311 retry was
+		// permanently dead → "Bad magic number in super-block" forever.
+		//
+		// day0EmptyMkfsBypass re-checks with day0-aware evidence: ONLY
+		// when the controller-persisted SkipInitialSync says this is a
+		// genuinely-fresh generation, no peer is a proven data holder,
+		// every local volume still sits at the deterministic day0 GI
+		// (kernel-truth: a Connected+UpToDate peer necessarily shares the
+		// local current-UUID, so day0 here proves every connected peer is
+		// a never-written day0 sibling too), and no volume carries a
+		// filesystem signature, do we fall through to the promote+mkfs.
+		if !r.day0EmptyMkfsBypass(ctx, dr, devices) {
+			// Genuine data-bearing peer (or any evidence is missing —
+			// conservative): SKIP `drbdadm primary --force`. The fresh
+			// replica stays Inconsistent and SyncTargets from the peer
+			// (full resync, data-safe). Returning here also skips the
+			// mkfs-retry below — correct, since the replica adopts the
+			// peer's filesystem via the resync rather than formatting
+			// locally.
+			return nil
+		}
+
+		log.FromContext(ctx).Info("BUG-028: force-promote veto bypassed — every connected peer is a day0-empty sibling, proceeding to first-activation mkfs",
+			"resource", dr.GetName())
 	}
 
 	// Reaching UpToDate no longer depends on this promote. The elected
@@ -2396,7 +2424,19 @@ func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredReso
 	// already populated from a previous attempt), runAutoMkfs writes
 	// the marker and this branch becomes a no-op for the rest of the
 	// resource's life.
-	if !autoPromote && autoPrimaryReplica && r.needsAutoMkfsRetry(dr) {
+	//
+	// BUG-028: the retry can no longer depend SOLELY on
+	// autoPrimaryReplica. When the false RD.Spec.Initialized latch fires
+	// (an external promoter bumped the current-UUID past day0 without
+	// writing data), the dispatcher stops stamping `auto-primary` and the
+	// autoPrimaryReplica-only gate left this retry permanently dead on a
+	// volume that NEVER got a filesystem. latchFreeMkfsRetryAllowed
+	// re-enables the retry from first principles instead: deterministic
+	// lowest-diskful-node-id winner, kernel state safe for a promote (all
+	// replicas Secondary + lock-step UpToDate — retried later when a
+	// foreign Primary such as drbd-reactor briefly holds the device), and
+	// an actual filesystem-absence probe on every local volume.
+	if r.shouldRetryAutoMkfs(ctx, dr, autoPromote, autoPrimaryReplica, diskless, devices) {
 		err := r.runAutoPromote(ctx, dr)
 		if err != nil {
 			return err
@@ -2660,6 +2700,215 @@ func (r *Reconciler) needsAutoMkfsRetry(dr *intent.DesiredResource) bool {
 	return os.IsNotExist(err)
 }
 
+// shouldRetryAutoMkfs is the Bug-311 mkfs-retry predicate (see the
+// finishDRBDApply call-site comment for the full history): re-enter the
+// promote→mkfs→demote dance on a steady-state reconcile when the
+// first-activation mkfs never finished. The replica must be diskful and
+// past its first activation, the marker/Condition must be absent
+// (needsAutoMkfsRetry), and the node must be authorised either by the
+// dispatcher's auto-primary election or — BUG-028 — by the latch-free
+// evidence chain (latchFreeMkfsRetryAllowed) when the false
+// RD.Spec.Initialized latch killed that election.
+func (r *Reconciler) shouldRetryAutoMkfs(ctx context.Context, dr *intent.DesiredResource, autoPromote, autoPrimaryReplica, diskless bool, devices map[int32]string) bool {
+	if autoPromote || diskless || !r.needsAutoMkfsRetry(dr) {
+		return false
+	}
+
+	return autoPrimaryReplica || r.latchFreeMkfsRetryAllowed(ctx, dr, devices)
+}
+
+// day0EmptyMkfsBypass is the BUG-028 narrow escape hatch from the Bug
+// 342 force-promote veto. The veto's kernel probe (peer-disk UpToDate/
+// Consistent/Outdated) is the right conservative default, but it cannot
+// distinguish a real data-bearing peer from an EMPTY day0
+// skip-initial-sync sibling: both fresh diskful replicas of a
+// skip-seeded RD reach Connected+UpToDate at the SHARED deterministic
+// day0 current-UUID within seconds — often before the elected
+// mkfs winner reaches finishDRBDApply. Honouring the veto there
+// silently dropped the one-and-only first-activation mkfs.
+//
+// Returns true ONLY when every signal proves "all connected peers are
+// day0-empty siblings of the same never-initialized generation" (belt
+// and braces, all four must hold):
+//
+//  1. The RD asks for a filesystem and the satellite can run mkfs at
+//     all (needsMkfs + Exec wired) — otherwise the bypass is moot.
+//  2. The controller-persisted Spec.SkipInitialSync is explicitly true:
+//     the OFFLINE-SAFE proof this replica was born into a genuinely-
+//     fresh, never-initialized RD generation (relocate / migrate /
+//     extra-replica destinations are stamped false and never bypass).
+//  3. The dispatcher's CRD view reports no proven data-bearing diskful
+//     peer (PeerHasData=false; day0 siblings are already excluded by
+//     isDay0SeededVolume, a real survivor is not).
+//  4. Kernel/metadata truth per volume: the LOCAL current-UUID still
+//     equals the deterministic day0 GI. DRBD only lets two replicas
+//     sit Connected+UpToDate when they are in the same data
+//     generation, so local==day0 proves every connected UpToDate peer
+//     — exactly the ones that fired the veto — is at day0 too, i.e. a
+//     never-written sibling. A real data holder mints a runtime UUID
+//     that equals day0 only with probability ~2^-64, so this test is
+//     reliable. AND the volume's backing device carries no filesystem
+//     signature (blkid probe; the DRBD device itself is unopenable
+//     while Secondary, and with internal metadata the backing device
+//     exposes the same data bytes at offset 0).
+//
+// Any probe failure, missing device path, or unknown GI refuses the
+// bypass — the veto then stands and behaviour is exactly pre-BUG-028
+// (skip promote, full-resync path). That keeps the relocate /
+// respawn-StandAlone protections of Bug 342/356 intact: this function
+// can only ever ADD an mkfs on a provably day0-empty generation, never
+// remove a veto protecting real data.
+func (r *Reconciler) day0EmptyMkfsBypass(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool {
+	if !needsMkfs(dr) || r.cfg.Exec == nil {
+		return false
+	}
+
+	if skip := dr.GetSkipInitialSync(); skip == nil || !*skip {
+		return false
+	}
+
+	if dr.GetPeerHasData() {
+		return false
+	}
+
+	return r.allVolumesDay0Empty(ctx, dr, devices)
+}
+
+// allVolumesDay0Empty reports whether EVERY desired volume of dr still
+// sits at the deterministic day0 current-UUID in its on-disk DRBD
+// metadata AND carries no filesystem signature on its backing device.
+// Conservative: any missing device path, drbdmeta/blkid probe failure,
+// or non-day0 GI returns false. See day0EmptyMkfsBypass for why this
+// is the exact "all connected peers are day0-empty siblings" proof.
+func (r *Reconciler) allVolumesDay0Empty(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool {
+	for _, vol := range dr.GetVolumes() {
+		device := devices[vol.GetVolumeNumber()]
+		if device == "" {
+			return false
+		}
+
+		gi, err := r.cfg.Adm.CurrentGI(ctx, dr.GetName(), vol.GetVolumeNumber(), device)
+		if err != nil || gi == "" {
+			return false
+		}
+
+		if !strings.EqualFold(gi, day0GiFor(dr.GetName(), vol.GetVolumeNumber())) {
+			return false
+		}
+
+		if r.deviceHasFilesystem(ctx, device) {
+			return false
+		}
+	}
+
+	return true
+}
+
+// latchFreeMkfsRetryAllowed is the BUG-028 replacement evidence for the
+// Bug-311 mkfs retry when the dispatcher no longer stamps
+// `auto-primary`. The dispatcher's election is gated on
+// !RD.Spec.Initialized, and that latch can fire FALSELY on a volume
+// that never received its filesystem: an external promoter
+// (drbd-reactor's RWX promote→mount-fail→demote loop) bumps the DRBD
+// current-UUID past day0 WITHOUT writing data, the controller reads
+// "UpToDate diskful with CurrentGI != day0" as proven data, latches
+// Initialized=true, and the auto-primary-only retry gate goes
+// permanently dead. This helper re-derives the retry permission from
+// first principles instead of the (unprovable-here) latch:
+//
+//   - This node is the DETERMINISTIC retry winner: lowest diskful
+//     node-id among itself and its configured peers — the same
+//     election rule the dispatcher uses — so at most ONE node ever
+//     re-enters the promote→mkfs→demote dance.
+//   - Kernel state is promote-safe (Adm.SafeForMkfsRetryPromote): the
+//     local replica is Secondary with every volume UpToDate, no
+//     replica anywhere is Primary (a foreign Primary — drbd-reactor
+//     mid-cycle — simply defers the retry to a later reconcile, when
+//     it has demoted again), and every connected peer-device is
+//     UpToDate or an intentional Diskless witness. An UNKNOWN /
+//     disconnected diskful peer vetoes: it could be an offline data
+//     holder, and `primary --force` against it is the Bug 342 wedge.
+//     Conversely all-UpToDate-while-Connected proves every replica is
+//     in the SAME data generation as the local one, so the promote
+//     mints nothing unrelated and the mkfs writes replicate to peers
+//     that are by construction bit-identical to the local volume.
+//   - Every local volume's backing device has NO filesystem signature
+//     (blkid): real data in the mkfs context means a filesystem (the
+//     RD requests FileSystem/Type; consumers only ever write through
+//     it), so fs-absence both proves the retry is still needed and
+//     that there is nothing to destroy. The post-promote blkid probe
+//     inside runAutoMkfs remains the per-volume double-mkfs safety
+//     net on the replicated device itself.
+//   - Throttled through recoveryPromoteDue (shared with the Bug 366
+//     recovery-promote): the promote→demote dance churns kernel state
+//     and may race the external promoter's own cycle; one nudge per
+//     throttle window is enough and keeps the reconcile loop cold.
+//
+// Callers must have already checked needsAutoMkfsRetry (marker and
+// Condition absent) so the probes below only ever run on a resource
+// that is genuinely missing its filesystem marker — never in steady
+// state.
+func (r *Reconciler) latchFreeMkfsRetryAllowed(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool {
+	if !r.isLowestDiskfulNodeID(dr) {
+		return false
+	}
+
+	if !r.cfg.Adm.SafeForMkfsRetryPromote(ctx, dr.GetName()) {
+		return false
+	}
+
+	for _, vol := range dr.GetVolumes() {
+		device := devices[vol.GetVolumeNumber()]
+		if device == "" {
+			return false
+		}
+
+		if r.deviceHasFilesystem(ctx, device) {
+			return false
+		}
+	}
+
+	// Consume the throttle slot LAST so a pass vetoed by a foreign
+	// Primary / probe failure does not burn the window.
+	if !r.recoveryPromoteDue(dr.GetName()) {
+		return false
+	}
+
+	log.FromContext(ctx).Info("BUG-028: latch-free mkfs retry — no auto-primary election but filesystem is provably absent, re-entering promote+mkfs+demote",
+		"resource", dr.GetName())
+
+	return true
+}
+
+// isLowestDiskfulNodeID replicates the dispatcher's mkfs-winner
+// election on the satellite side from the wire DrbdOptions: true when
+// the LOCAL node-id is resolved and no configured non-diskless peer
+// has a LOWER node-id (an equal peer id does not veto). Missing /
+// unparsable local id refuses (conservative — an unresolved identity
+// must never promote); a peer with a missing id is treated as diskful
+// id-unknown and also refuses, since the election would be ambiguous.
+func (r *Reconciler) isLowestDiskfulNodeID(dr *intent.DesiredResource) bool {
+	opts := dr.GetDrbdOptions()
+
+	selfID, err := strconv.Atoi(opts["node-id"])
+	if err != nil {
+		return false
+	}
+
+	for _, peer := range dr.GetPeerNames() {
+		if opts["peer."+peer+".diskless"] == drbdBoolPropTrue {
+			continue
+		}
+
+		peerID, peerErr := strconv.Atoi(opts["peer."+peer+".node-id"])
+		if peerErr != nil || peerID < selfID {
+			return false
+		}
+	}
+
+	return true
+}
+
 // isDisklessToDiskfulFlip probes whether the local kernel slot is
 // currently `disk:Diskless client:yes` (intentional diskless) on a
 // Resource whose Spec has flipped to diskful (`linstor r td
diff --git a/pkg/satellite/reconciler_bug028_test.go b/pkg/satellite/reconciler_bug028_test.go
new file mode 100644
index 00000000..2d0f91e1
--- /dev/null
+++ b/pkg/satellite/reconciler_bug028_test.go
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+Copyright 2026 Cozystack contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package satellite_test
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/cozystack/blockstor/pkg/drbd"
+	"github.com/cozystack/blockstor/pkg/satellite"
+	intent "github.com/cozystack/blockstor/pkg/satellite/intent"
+	"github.com/cozystack/blockstor/pkg/storage"
+	"github.com/cozystack/blockstor/pkg/storage/lvm"
+)
+
+// BUG-028 regression pins — the day0 mkfs race and its false-latch
+// terminal state.
+//
+// Failure chain on a fresh RWX RD (FileSystem/Type=ext4):
+//
+//  1. Day0 skip-initial-sync brings BOTH diskful replicas
+//     Connected+UpToDate at the shared deterministic day0 GI in ~2s.
+//  2. The mkfs-election winner reaches finishDRBDApply AFTER that;
+//     the Bug 342 force-promote kernel veto (AnyConnectedPeerHasData)
+//     sees peer-disk:UpToDate, cannot tell the EMPTY day0 sibling from
+//     real data, and silently skips the one-and-only first-activation
+//     mkfs. firstActivation latches false.
+//  3. An external promoter (drbd-reactor RWX path) promote/demotes the
+//     empty volume in 20s cycles; each promote bumps the current-UUID
+//     past day0 WITHOUT writing data.
+//  4. The controller reads "UpToDate diskful, CurrentGI != day0" as
+//     proven data → RD.Spec.Initialized latches true (FALSELY).
+//  5. The dispatcher gates the auto-primary election on !rdInitialized
+//     → no replica carries auto-primary → the Bug-311 mkfs retry
+//     (gated on autoPrimaryReplica) is permanently dead. Terminal:
+//     promote → fsck "Bad magic number" → demote, forever.
+//
+// The fix is two-sided and these tests pin both sides plus the
+// data-safety counter-cases:
+//
+//   - day0EmptyMkfsBypass: the veto may be bypassed ONLY when every
+//     signal proves the whole connected set is day0-empty siblings
+//     (Spec.SkipInitialSync=true, PeerHasData=false, local metadata
+//     current-UUID == day0, no fs signature) → mkfs happens (step 2
+//     fixed, steps 3-5 never start).
+//   - latchFreeMkfsRetryAllowed: the Bug-311 retry no longer depends
+//     solely on the dispatcher's auto-primary election (killed by the
+//     false latch); the deterministic lowest-diskful-node-id winner
+//     re-enters promote→mkfs→demote when the kernel set is provably
+//     promote-safe and the filesystem is provably absent.
+
+// statusBothUpToDateSecondary is the kernel view of the BUG-028 race /
+// terminal state: local Secondary UpToDate plus a single Connected
+// diskful peer (n2) that is also Secondary UpToDate.
+func statusBothUpToDateSecondary(rd string) string {
+	return `[{
+	  "name":"` + rd + `","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  }]
+	}]`
+}
+
+// expectThinBacking cans the lvs probes so applyStorage resolves the
+// (already-carved) thin LV and populates the devices map with its
+// backing path — which the BUG-028 probes (drbdmeta get-gi current-UUID
+// read, blkid fs-signature probe) target.
+func expectThinBacking(fx *storage.FakeExec, rd string) string {
+	lv := rd + "_00000"
+	device := "/dev/vg/" + lv
+
+	fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/"+lv,
+		storage.FakeResponse{Stdout: []byte(lv + "\n")})
+	fx.Expect("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/"+lv,
+		storage.FakeResponse{Stdout: []byte(device + "|1048576\n")})
+
+	return device
+}
+
+func expectGetGI(fx *storage.FakeExec, rd, device, currentGI string) {
+	fx.Expect("drbdmeta --force "+rd+"/0 v09 "+device+" internal get-gi --node-id 0",
+		storage.FakeResponse{Stdout: []byte(currentGI + ":0000000000000000:0:0:1:1:0:0:0:0\n")})
+}
+
+func newThinReconciler(fx *storage.FakeExec, dir string) *satellite.Reconciler {
+	thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx)
+
+	return satellite.NewReconciler(satellite.ReconcilerConfig{
+		Providers: map[string]storage.Provider{"thin1": thin},
+		Adm:       drbd.NewAdm(fx),
+		Exec:      fx,
+		StateDir:  dir,
+		NodeName:  "n1",
+	})
+}
+
+// bug028WinnerDR is the elected mkfs winner's wire payload at first
+// activation: auto-primary stamped, SkipInitialSync=true, RD-level
+// FileSystem/Type, one diskful peer.
+func bug028WinnerDR(rd, minor string) []*intent.DesiredResource {
+	return []*intent.DesiredResource{
+		{
+			Name:     rd,
+			NodeName: "n1",
+			Volumes: []*intent.DesiredVolume{
+				{VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"},
+			},
+			Props: map[string]string{
+				"FileSystem/Type": "ext4",
+			},
+			Peers:           []intent.DesiredPeer{{Name: "n2"}},
+			SkipInitialSync: skipInitTrue(),
+			DrbdOptions: map[string]string{
+				"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": minor,
+				"peer.n2.port": "7000", "peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2",
+				"auto-primary": "true",
+			},
+		},
+	}
+}
+
+func assertPromoteMkfsDemoteOrder(t *testing.T, cmds []string, rd, drbdDev string) {
+	t.Helper()
+
+	posPrim, posMkfs, posSec := -1, -1, -1
+
+	for i, line := range cmds {
+		switch {
+		case posPrim < 0 && strings.Contains(line, "drbdadm primary --force "+rd):
+			posPrim = i
+		case posMkfs < 0 && strings.Contains(line, "mkfs.ext4 "+drbdDev):
+			posMkfs = i
+		case posSec < 0 && strings.Contains(line, "drbdadm secondary "+rd):
+			posSec = i
+		}
+	}
+
+	if posPrim < 0 || posMkfs <= posPrim || posSec <= posMkfs {
+		t.Errorf("want primary --force < mkfs < secondary; got prim=%d mkfs=%d sec=%d in %v",
+			posPrim, posMkfs, posSec, cmds)
+	}
+}
+
+func assertNoPromoteNoMkfs(t *testing.T, cmds []string) {
+	t.Helper()
+
+	for _, line := range cmds {
+		if strings.Contains(line, "primary --force") {
+			t.Errorf("must NOT force-promote: %s", line)
+		}
+
+		if strings.HasPrefix(line, "mkfs.") || strings.Contains(line, " mkfs.") {
+			t.Errorf("must NOT mkfs: %s", line)
+		}
+	}
+}
+
+// TestApplyBug028Day0RaceVetoBypassedMkfsRuns pins the race fix: the
+// day0 siblings connect Connected+UpToDate BEFORE the winner's
+// first-activation pass, the Bug 342 kernel veto fires — and the
+// day0-empty bypass (Spec.SkipInitialSync=true, PeerHasData=false,
+// local current-UUID == day0, no fs signature) lets the promote+mkfs
+// proceed anyway. Pre-fix the mkfs was silently skipped here, forever.
+func TestApplyBug028Day0RaceVetoBypassedMkfsRuns(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	device := expectThinBacking(fx, "pvc-b028")
+	// Kernel truth at the winner's finishDRBDApply: the day0 sibling is
+	// already Connected+UpToDate → AnyConnectedPeerHasData vetoes.
+	fx.Expect("drbdsetup status pvc-b028 --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028"))})
+	// Local metadata still sits at the deterministic day0 current-UUID —
+	// the exact proof that every Connected+UpToDate peer is a
+	// never-written day0 sibling of the same generation.
+	expectGetGI(fx, "pvc-b028", device, satellite.Day0GiForTest("pvc-b028", 0))
+	// blkid probes (backing pre-promote, /dev/drbd post-promote) return
+	// the FakeExec default (empty, no TYPE= line) → no fs anywhere.
+
+	rec := newThinReconciler(fx, dir)
+
+	_, err := rec.Apply(t.Context(), bug028WinnerDR("pvc-b028", "6500"))
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	cmds := fx.CommandLines()
+	assertPromoteMkfsDemoteOrder(t, cmds, "pvc-b028", "/dev/drbd6500")
+
+	if _, statErr := os.Stat(filepath.Join(dir, "pvc-b028.mkfs.done")); statErr != nil {
+		t.Errorf(".mkfs.done marker must be written after the bypassed-veto mkfs; got stat err %v", statErr)
+	}
+}
+
+// TestApplyBug028VetoHoldsOnNonDay0PeerGI is the data-safety
+// counter-case: the kernel veto fires and the local current-UUID is
+// NOT day0 (a real data generation — relocate survivor / post-write
+// state). The bypass must refuse: no promote, no mkfs; the replica
+// stays on the full-resync path. NEVER mkfs over real data.
+func TestApplyBug028VetoHoldsOnNonDay0PeerGI(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	device := expectThinBacking(fx, "pvc-b028d")
+	fx.Expect("drbdsetup status pvc-b028d --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028d"))})
+	// A runtime current-UUID (cannot equal the deterministic day0).
+	expectGetGI(fx, "pvc-b028d", device, "2BCB1C8F00B058AE")
+
+	rec := newThinReconciler(fx, dir)
+
+	_, err := rec.Apply(t.Context(), bug028WinnerDR("pvc-b028d", "6510"))
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	assertNoPromoteNoMkfs(t, fx.CommandLines())
+
+	if _, statErr := os.Stat(filepath.Join(dir, "pvc-b028d.mkfs.done")); statErr == nil {
+		t.Error(".mkfs.done must NOT be written when the veto holds")
+	}
+}
+
+// TestApplyBug028VetoHoldsOnFsSignature: belt-and-braces counter-case —
+// even at day0 GI, a filesystem signature on the backing device refuses
+// the bypass (there are bytes a mkfs would destroy).
+func TestApplyBug028VetoHoldsOnFsSignature(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	device := expectThinBacking(fx, "pvc-b028f")
+	fx.Expect("drbdsetup status pvc-b028f --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028f"))})
+	expectGetGI(fx, "pvc-b028f", device, satellite.Day0GiForTest("pvc-b028f", 0))
+	fx.Expect("blkid -o export "+device,
+		storage.FakeResponse{Stdout: []byte("DEVNAME=" + device + "\nTYPE=ext4\nUSAGE=filesystem\n")})
+
+	rec := newThinReconciler(fx, dir)
+
+	_, err := rec.Apply(t.Context(), bug028WinnerDR("pvc-b028f", "6520"))
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	assertNoPromoteNoMkfs(t, fx.CommandLines())
+}
+
+// TestApplyBug028VetoHoldsOnPeerHasData: the dispatcher's CRD view
+// reports a PROVEN data-bearing diskful peer (non-day0 GI observed on
+// the peer's Status) → bypass refused before any kernel probe runs.
+func TestApplyBug028VetoHoldsOnPeerHasData(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	expectThinBacking(fx, "pvc-b028p")
+
+	rec := newThinReconciler(fx, dir)
+
+	dr := bug028WinnerDR("pvc-b028p", "6530")
+	dr[0].PeerHasData = true
+
+	_, err := rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	assertNoPromoteNoMkfs(t, fx.CommandLines())
+}
+
+// bug028FalseLatchDR is the wire payload of the BUG-028 TERMINAL state:
+// the false RD.Spec.Initialized latch fired, so the dispatcher no
+// longer stamps `auto-primary`; metadata exists (MetadataCreated=true →
+// firstActivation=false); the `.mkfs.done` marker never landed; the RD
+// still asks for ext4.
+func bug028FalseLatchDR(rd, minor string) []*intent.DesiredResource {
+	return []*intent.DesiredResource{
+		{
+			Name:     rd,
+			NodeName: "n1",
+			Volumes: []*intent.DesiredVolume{
+				{VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"},
+			},
+			Props: map[string]string{
+				"FileSystem/Type": "ext4",
+			},
+			Peers:           []intent.DesiredPeer{{Name: "n2"}},
+			SkipInitialSync: skipInitTrue(),
+			MetadataCreated: true,
+			DrbdOptions: map[string]string{
+				"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": minor,
+				"peer.n2.port": "7000", "peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2",
+				// NO auto-primary: the false Initialized latch killed the
+				// dispatcher's election.
+			},
+		},
+	}
+}
+
+// TestApplyBug028FalseLatchRetryFiresWithoutAutoPrimary pins the
+// latch-independence fix: even with NO auto-primary election, the
+// deterministic lowest-diskful-node-id winner re-enters
+// promote→mkfs→demote when the kernel set is all-Secondary lock-step
+// UpToDate and no volume carries a filesystem. Pre-fix this state was
+// terminal (retry gated solely on autoPrimaryReplica).
+func TestApplyBug028FalseLatchRetryFiresWithoutAutoPrimary(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	expectThinBacking(fx, "pvc-b028r")
+	fx.Expect("drbdsetup status pvc-b028r --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028r"))})
+	// blkid on the backing device and on /dev/drbd6600: FakeExec default
+	// (no TYPE=) → filesystem provably absent.
+
+	rec := newThinReconciler(fx, dir)
+
+	_, err := rec.Apply(t.Context(), bug028FalseLatchDR("pvc-b028r", "6600"))
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	cmds := fx.CommandLines()
+	assertPromoteMkfsDemoteOrder(t, cmds, "pvc-b028r", "/dev/drbd6600")
+
+	if _, statErr := os.Stat(filepath.Join(dir, "pvc-b028r.mkfs.done")); statErr != nil {
+		t.Errorf(".mkfs.done marker must be written after the latch-free retry; got stat err %v", statErr)
+	}
+}
+
+// TestApplyBug028FalseLatchRetryDefersWhileForeignPrimary pins the
+// external-promoter coexistence contract: while drbd-reactor holds the
+// device Primary on a peer, the retry must NOT fight it — and must fire
+// on a later pass once every replica is Secondary again.
+func TestApplyBug028FalseLatchRetryDefersWhileForeignPrimary(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	expectThinBacking(fx, "pvc-b028w")
+	fx.Expect("drbdsetup status pvc-b028w --json",
+		storage.FakeResponse{Stdout: []byte(`[{
+		  "name":"pvc-b028w","node-id":0,"role":"Secondary",
+		  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+		  "connections":[{
+		    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+		    "peer-role":"Primary",
+		    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+		  }]
+		}]`)})
+
+	rec := newThinReconciler(fx, dir)
+
+	dr := bug028FalseLatchDR("pvc-b028w", "6610")
+
+	_, err := rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply (foreign Primary): %v", err)
+	}
+
+	assertNoPromoteNoMkfs(t, fx.CommandLines())
+
+	// The reactor demoted (mount failed again) → all Secondary → the
+	// next reconcile pass picks the retry up.
+	fx.Reset()
+	expectThinBacking(fx, "pvc-b028w")
+	fx.Expect("drbdsetup status pvc-b028w --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028w"))})
+
+	_, err = rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply (all Secondary): %v", err)
+	}
+
+	assertPromoteMkfsDemoteOrder(t, fx.CommandLines(), "pvc-b028w", "/dev/drbd6610")
+}
+
+// TestApplyBug028FalseLatchRetryOnlyOnElectionWinner: the latch-free
+// retry replicates the dispatcher's lowest-diskful-node-id election so
+// AT MOST ONE node re-enters the promote dance. A node whose diskful
+// peer holds a lower id must stay quiet.
+func TestApplyBug028FalseLatchRetryOnlyOnElectionWinner(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	expectThinBacking(fx, "pvc-b028l")
+	fx.Expect("drbdsetup status pvc-b028l --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028l"))})
+
+	rec := newThinReconciler(fx, dir)
+
+	dr := bug028FalseLatchDR("pvc-b028l", "6620")
+	dr[0].DrbdOptions["node-id"] = "1"
+	dr[0].DrbdOptions["peer.n2.node-id"] = "0"
+
+	_, err := rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	assertNoPromoteNoMkfs(t, fx.CommandLines())
+}
+
+// TestApplyBug028FalseLatchRetryRefusedWhenFsPresent: data-safety
+// counter-case for the retry side — a filesystem signature on the
+// backing device means there is nothing to retry (and bytes a promote
+// dance could disturb). No promote, no mkfs.
+func TestApplyBug028FalseLatchRetryRefusedWhenFsPresent(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	device := expectThinBacking(fx, "pvc-b028s")
+	fx.Expect("drbdsetup status pvc-b028s --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028s"))})
+	fx.Expect("blkid -o export "+device,
+		storage.FakeResponse{Stdout: []byte("DEVNAME=" + device + "\nTYPE=ext4\nUSAGE=filesystem\n")})
+
+	rec := newThinReconciler(fx, dir)
+
+	_, err := rec.Apply(t.Context(), bug028FalseLatchDR("pvc-b028s", "6630"))
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	assertNoPromoteNoMkfs(t, fx.CommandLines())
+}

From 4873e811da1103facf6c8bd6f20c1b5160a0e56c Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Fri, 12 Jun 2026 09:02:46 +0300
Subject: [PATCH 3/4] fix(satellite): unblock BUG-028 probes wedged by drbd
 meta signature and CRD lag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stand forensics on the loop-backed ganesha scenario showed both new
BUG-028 paths permanently refusing on real hardware:

1. blkid on the BACKING device of an internal-metadata DRBD volume
   always reports TYPE=drbd (libblkid recognises the meta superblock
   at the device tail), so the naive any-TYPE fs-absence probe read
   every never-formatted volume as populated. New
   backingHasUserFilesystem treats only a non-drbd signature as user
   data; the post-promote probe on the DRBD device (which hides the
   metadata) remains the authoritative double-mkfs guard.
2. The dispatcher's PeerHasData conservatively counts an UpToDate
   day0 sibling whose CurrentGI backfill has not landed yet as
   data-bearing; refusing the bypass on it permanently cost the
   one-shot first-activation mkfs. The bypass now uses kernel
   coverage (Day0SiblingSetConnected: no Primary anywhere, local
   UpToDate, every connected peer-device UpToDate/Diskless, an
   un-handshaken peer tolerated only when it is a configured
   diskless witness) which, combined with the local-GI==day0 proof,
   strictly supersedes the CRD signal: every state PeerHasData
   correctly protects is still refused.

Also, the latch-free retry now fires as soon as AT LEAST ONE volume is
missing its filesystem (volumes that already carry one are adopted
untouched by the per-volume blkid probe in the mkfs runner — the Bug 311
partial-mkfs shape).

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 pkg/drbd/drbdadm.go                          |  77 ++++++++++++
 pkg/drbd/safe_for_mkfs_retry_promote_test.go |  88 +++++++++++++
 pkg/satellite/reconciler.go                  | 123 ++++++++++++++++---
 pkg/satellite/reconciler_bug028_test.go      |  94 ++++++++++++--
 4 files changed, 353 insertions(+), 29 deletions(-)

diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go
index d98b3366..bdcfab03 100644
--- a/pkg/drbd/drbdadm.go
+++ b/pkg/drbd/drbdadm.go
@@ -1011,6 +1011,83 @@ func (a *Adm) SafeForMkfsRetryPromote(ctx context.Context, resource string) bool
 	return true
 }
 
+// Day0SiblingSetConnected probes `drbdsetup status <res> --json` and
+// reports whether every connection the kernel lists for the resource
+// belongs to a promote-safe day0 candidate set (the BUG-028
+// first-activation mkfs bypass; the GI-level day0 proof is the
+// satellite's, this is only the connectivity/coverage half):
+//
+//   - the local role is NOT Primary and every local volume is diskful
+//     UpToDate (the elected winner seeded UpToDate via set-gi);
+//   - NO peer is Primary (an external promoter mid-grab defers the
+//     bypass to the latch-free retry, which handles foreign Primaries);
+//   - every connected peer-device is UpToDate or Diskless;
+//   - a peer-device whose state is still unknown (DUnknown — the
+//     connection has not handshaken) is tolerated ONLY when the peer is
+//     named in disklessPeers (an intentional diskless witness carries
+//     no data by construction). An un-handshaken DISKFUL peer refuses:
+//     it could be an offline data holder, and both `primary --force`
+//     and mkfs against it are the Bug 342 unrelated-data / data-loss
+//     wedge.
+//
+// Why this exists: the dispatcher's CRD-level PeerHasData treats an
+// UpToDate sibling whose CurrentGI has not been OBSERVED yet (the
+// get-gi backfill is best-effort) as data-bearing. On a fresh day0
+// race that conservatism is FALSE and would permanently cost the
+// one-shot first-activation mkfs. The kernel coverage here, combined
+// with the satellite's local-GI==day0 proof (a Connected+UpToDate peer
+// necessarily shares the local data generation), strictly supersedes
+// the CRD signal: every case PeerHasData correctly protects is also
+// refused here (a real connected data peer forces local GI != day0; a
+// disconnected diskful peer is DUnknown).
+//
+// Conservative on any probe / parse failure: returns false.
+func (a *Adm) Day0SiblingSetConnected(ctx context.Context, resource string, disklessPeers map[string]bool) bool {
+	out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json")
+	if err != nil {
+		return false
+	}
+
+	var status drbdsetupStatusRoot
+
+	err = json.Unmarshal(out, &status)
+	if err != nil || len(status) == 0 {
+		return false
+	}
+
+	res := status[0]
+
+	if Role(res.Role).IsPrimary() || !localIsUpToDate(res.Devices) {
+		return false
+	}
+
+	for _, conn := range res.Connections {
+		if Role(conn.PeerRole).IsPrimary() {
+			return false
+		}
+
+		for _, pd := range conn.PeerDevices {
+			switch DiskState(pd.PeerDiskState) {
+			case DiskStateUpToDate, DiskStateDiskless:
+				// Lock-step sibling or intentional witness — safe.
+			case DiskStateDUnknown:
+				if !disklessPeers[conn.PeerName] {
+					return false
+				}
+			case DiskStateConsistent, DiskStateOutdated, DiskStateAttaching,
+				DiskStateDetaching, DiskStateFailed, DiskStateNegotiating,
+				DiskStateInconsistent:
+				return false
+			default:
+				// Unknown/empty token — refuse, conservative.
+				return false
+			}
+		}
+	}
+
+	return true
+}
+
 // NeedsRecoveryPromote probes the live kernel via `drbdsetup status
 // <res> --json` and reports whether THIS node should re-arm the
 // auto-primary seed to unstick a fresh RD whose initial sync wedged
diff --git a/pkg/drbd/safe_for_mkfs_retry_promote_test.go b/pkg/drbd/safe_for_mkfs_retry_promote_test.go
index d596a49a..292e5c6b 100644
--- a/pkg/drbd/safe_for_mkfs_retry_promote_test.go
+++ b/pkg/drbd/safe_for_mkfs_retry_promote_test.go
@@ -175,3 +175,91 @@ func TestSafeForMkfsRetryPromote_ProbeFailureRefuses(t *testing.T) {
 		t.Fatal("a malformed status probe must refuse the latch-free mkfs retry")
 	}
 }
+
+// Day0SiblingSetConnected pins (BUG-028 bypass coverage probe). Same
+// conservatism contract as SafeForMkfsRetryPromote, with ONE deliberate
+// relaxation: a not-yet-handshaken (DUnknown) peer is tolerated when it
+// is a configured diskless witness — it carries no data by construction
+// and must not cost the one-shot first-activation mkfs.
+
+// TestDay0SiblingSetConnected_DisklessWitnessStillConnecting: the day0
+// race shape — diskful sibling Connected+UpToDate, tiebreaker witness
+// still handshaking → covered.
+func TestDay0SiblingSetConnected_DisklessWitnessStillConnecting(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  },{
+	    "peer-node-id":2,"name":"n3","connection-state":"Connecting",
+	    "peer-role":"Unknown",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}]
+	  }]
+	}]`)
+
+	if !adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{"n3": true}) {
+		t.Fatal("a still-connecting DISKLESS witness must not block the day0 bypass coverage")
+	}
+}
+
+// TestDay0SiblingSetConnected_DiskfulPeerStillConnecting: the same
+// DUnknown peer WITHOUT the diskless marking is a potential offline
+// data holder → refuse.
+func TestDay0SiblingSetConnected_DiskfulPeerStillConnecting(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  },{
+	    "peer-node-id":2,"name":"n3","connection-state":"Connecting",
+	    "peer-role":"Unknown",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}]
+	  }]
+	}]`)
+
+	if adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{}) {
+		t.Fatal("a not-yet-handshaken DISKFUL peer must refuse the day0 bypass coverage")
+	}
+}
+
+// TestDay0SiblingSetConnected_ForeignPrimaryRefuses: an external
+// promoter already holds the device → defer to the latch-free retry.
+func TestDay0SiblingSetConnected_ForeignPrimaryRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Primary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+	  }]
+	}]`)
+
+	if adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{}) {
+		t.Fatal("a foreign Primary must refuse the day0 bypass coverage")
+	}
+}
+
+// TestDay0SiblingSetConnected_InconsistentPeerRefuses: an Inconsistent
+// peer-device is not a lock-step day0 sibling → refuse.
+func TestDay0SiblingSetConnected_InconsistentPeerRefuses(t *testing.T) {
+	adm := admWithMkfsRetryStatus(t, `[{
+	  "name":"pvc-b028","node-id":0,"role":"Secondary",
+	  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+	    "peer-role":"Secondary",
+	    "peer_devices":[{"volume":0,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}]
+	  }]
+	}]`)
+
+	if adm.Day0SiblingSetConnected(t.Context(), "pvc-b028", map[string]bool{}) {
+		t.Fatal("an Inconsistent peer must refuse the day0 bypass coverage")
+	}
+}
diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go
index 0d73d16d..c7939ff3 100644
--- a/pkg/satellite/reconciler.go
+++ b/pkg/satellite/reconciler.go
@@ -2737,9 +2737,18 @@ func (r *Reconciler) shouldRetryAutoMkfs(ctx context.Context, dr *intent.Desired
 //     the OFFLINE-SAFE proof this replica was born into a genuinely-
 //     fresh, never-initialized RD generation (relocate / migrate /
 //     extra-replica destinations are stamped false and never bypass).
-//  3. The dispatcher's CRD view reports no proven data-bearing diskful
-//     peer (PeerHasData=false; day0 siblings are already excluded by
-//     isDay0SeededVolume, a real survivor is not).
+//  3. Kernel coverage of the whole configured replica set
+//     (Adm.Day0SiblingSetConnected): no Primary anywhere, local
+//     UpToDate, every connected peer-device UpToDate/Diskless, and any
+//     not-yet-handshaken peer is an intentional diskless witness. An
+//     un-handshaken DISKFUL peer (potential offline data holder)
+//     refuses. NOTE this deliberately supersedes the dispatcher's
+//     CRD-level PeerHasData: that flag treats an UpToDate day0 sibling
+//     whose CurrentGI backfill has not landed yet as data-bearing
+//     (correct for the re-computed seed gates, but a FALSE positive
+//     here would permanently cost the one-shot first-activation mkfs).
+//     Every state PeerHasData correctly protects is still refused by
+//     this kernel check plus the GI proof below.
 //  4. Kernel/metadata truth per volume: the LOCAL current-UUID still
 //     equals the deterministic day0 GI. DRBD only lets two replicas
 //     sit Connected+UpToDate when they are in the same data
@@ -2767,13 +2776,33 @@ func (r *Reconciler) day0EmptyMkfsBypass(ctx context.Context, dr *intent.Desired
 		return false
 	}
 
-	if dr.GetPeerHasData() {
+	if !r.cfg.Adm.Day0SiblingSetConnected(ctx, dr.GetName(), disklessPeerSet(dr)) {
 		return false
 	}
 
 	return r.allVolumesDay0Empty(ctx, dr, devices)
 }
 
+// disklessPeerSet collects the configured peers the dispatcher marked
+// as intentional diskless witnesses (`peer.<name>.diskless=true` in
+// the wire DrbdOptions). Consumed by the day0 bypass coverage probe to
+// tolerate a witness whose connection has not handshaken yet — it
+// carries no data by construction, so it cannot be a data holder the
+// bypass must wait for.
+func disklessPeerSet(dr *intent.DesiredResource) map[string]bool {
+	opts := dr.GetDrbdOptions()
+
+	out := make(map[string]bool)
+
+	for _, peer := range dr.GetPeerNames() {
+		if opts["peer."+peer+".diskless"] == drbdBoolPropTrue {
+			out[peer] = true
+		}
+	}
+
+	return out
+}
+
 // allVolumesDay0Empty reports whether EVERY desired volume of dr still
 // sits at the deterministic day0 current-UUID in its on-disk DRBD
 // metadata AND carries no filesystem signature on its backing device.
@@ -2796,7 +2825,12 @@ func (r *Reconciler) allVolumesDay0Empty(ctx context.Context, dr *intent.Desired
 			return false
 		}
 
-		if r.deviceHasFilesystem(ctx, device) {
+		// backingHasUserFilesystem, NOT deviceHasFilesystem: the
+		// backing device of an internal-metadata DRBD volume always
+		// shows blkid TYPE=drbd (the meta superblock at its tail) —
+		// only a non-drbd signature is user data (stand forensics
+		// pinned this; the naive probe wedged the bypass forever).
+		if r.backingHasUserFilesystem(ctx, device) {
 			return false
 		}
 	}
@@ -2832,13 +2866,18 @@ func (r *Reconciler) allVolumesDay0Empty(ctx context.Context, dr *intent.Desired
 //     in the SAME data generation as the local one, so the promote
 //     mints nothing unrelated and the mkfs writes replicate to peers
 //     that are by construction bit-identical to the local volume.
-//   - Every local volume's backing device has NO filesystem signature
-//     (blkid): real data in the mkfs context means a filesystem (the
-//     RD requests FileSystem/Type; consumers only ever write through
-//     it), so fs-absence both proves the retry is still needed and
-//     that there is nothing to destroy. The post-promote blkid probe
-//     inside runAutoMkfs remains the per-volume double-mkfs safety
-//     net on the replicated device itself.
+//   - At least one local volume's backing device has NO USER
+//     filesystem signature (backingHasUserFilesystem — blkid that
+//     ignores TYPE=drbd, the internal-metadata superblock libblkid
+//     always sees at the backing device's tail; the naive probe
+//     wedged this gate forever on real hardware). Real data in the
+//     mkfs context means a filesystem (the RD requests
+//     FileSystem/Type; consumers only ever write through it), so a
+//     volume missing one both proves the retry is still needed and
+//     that there is nothing to destroy there; volumes that DO carry a
+//     filesystem are adopted untouched by runAutoMkfs's post-promote
+//     blkid probe on the replicated device (the Bug 311 partial-mkfs
+//     shape), which remains the per-volume double-mkfs safety net.
 //   - Throttled through recoveryPromoteDue (shared with the Bug 366
 //     recovery-promote): the promote→demote dance churns kernel state
 //     and may race the external promoter's own cycle; one nudge per
@@ -2857,24 +2896,43 @@ func (r *Reconciler) latchFreeMkfsRetryAllowed(ctx context.Context, dr *intent.D
 		return false
 	}
 
+	// At least one desired volume must still be missing a USER
+	// filesystem (backing probe; TYPE=drbd is the internal-metadata
+	// signature, not user data) — otherwise there is nothing to retry.
+	// Volumes that already carry a filesystem are fine: runAutoMkfs's
+	// per-volume blkid probe on the promoted DRBD device adopts them
+	// untouched (the Bug 311 partial-mkfs shape).
+	anyVolumeNeedsMkfs := false
+
 	for _, vol := range dr.GetVolumes() {
 		device := devices[vol.GetVolumeNumber()]
 		if device == "" {
-			return false
+			continue
 		}
 
-		if r.deviceHasFilesystem(ctx, device) {
-			return false
+		if !r.backingHasUserFilesystem(ctx, device) {
+			anyVolumeNeedsMkfs = true
+
+			break
 		}
 	}
 
+	if !anyVolumeNeedsMkfs {
+		return false
+	}
+
 	// Consume the throttle slot LAST so a pass vetoed by a foreign
-	// Primary / probe failure does not burn the window.
+	// Primary / probe failure does not burn the window — the retry
+	// must land in the very next clear gap of the external promoter's
+	// cycle. Probe overhead while a candidate stays unconverged is
+	// bounded by the controller's steady-state requeue cadence, and
+	// this branch only runs at all on the rare initialized-RD-without-
+	// filesystem shape (auto-primary replicas short-circuit before it).
 	if !r.recoveryPromoteDue(dr.GetName()) {
 		return false
 	}
 
-	log.FromContext(ctx).Info("BUG-028: latch-free mkfs retry — no auto-primary election but filesystem is provably absent, re-entering promote+mkfs+demote",
+	log.FromContext(ctx).Info("BUG-028: latch-free mkfs retry — no auto-primary election but a volume provably lacks its filesystem, re-entering promote+mkfs+demote",
 		"resource", dr.GetName())
 
 	return true
@@ -3832,6 +3890,37 @@ func (r *Reconciler) deviceHasFilesystem(ctx context.Context, device string) boo
 	return false
 }
 
+// backingHasUserFilesystem is the BACKING-device variant of
+// deviceHasFilesystem for the BUG-028 pre-promote probes. A backing
+// LV/zvol/loop with INTERNAL DRBD metadata always carries the meta
+// superblock at its tail, and libblkid recognises it: `blkid -o
+// export /dev/loopN` reports `TYPE=drbd` on a never-formatted volume.
+// Counting that as "a filesystem is present" permanently wedged both
+// BUG-028 probes on real hardware (stand forensics: every fresh
+// ganesha volume showed TYPE=drbd, the bypass and the latch-free
+// retry never fired, while the post-promote probe on /dev/drbdN —
+// which hides the metadata — correctly saw nothing). Only a non-drbd
+// TYPE counts as user data; anything else (no signature, probe
+// failure, the drbd signature itself) reads as "no filesystem".
+// NOTE(review): verify a FORMATTED backing device still reports its
+// filesystem TYPE and not just TYPE=drbd. The post-promote blkid
+// probe inside runAutoMkfs remains the authoritative double-mkfs guard.
+func (r *Reconciler) backingHasUserFilesystem(ctx context.Context, device string) bool {
+	out, err := r.cfg.Exec.Run(ctx, "blkid", "-o", "export", device)
+	if err != nil {
+		return false
+	}
+
+	for line := range strings.SplitSeq(string(out), "\n") {
+		value, ok := strings.CutPrefix(strings.TrimSpace(line), "TYPE=")
+		if ok && value != "drbd" {
+			return true
+		}
+	}
+
+	return false
+}
+
 // runApplyDRBDVerb is the per-reconcile dispatch for the bring-up
 // chain. First activation falls through to the SkipDisk-aware
 // `drbdadm adjust` (or `adjust --skip-disk`): the .res + freshly-
diff --git a/pkg/satellite/reconciler_bug028_test.go b/pkg/satellite/reconciler_bug028_test.go
index 2d0f91e1..927073b2 100644
--- a/pkg/satellite/reconciler_bug028_test.go
+++ b/pkg/satellite/reconciler_bug028_test.go
@@ -103,6 +103,18 @@ func expectGetGI(fx *storage.FakeExec, rd, device, currentGI string) {
 		storage.FakeResponse{Stdout: []byte(currentGI + ":0000000000000000:0:0:1:1:0:0:0:0\n")})
 }
 
+// expectDrbdMetaSignature registers the canned blkid reply that real
+// hardware produces for the BACKING device of an internal-metadata
+// DRBD volume: libblkid spots the DRBD meta superblock at the device
+// tail and prints TYPE=drbd even when the volume was never formatted
+// (stand forensics, bug028-fix-validation-20260612-054452/iter2).
+// The BUG-028 probes MUST treat this as "no user filesystem" — the
+// naive any-TYPE= probe wedged the bypass and the retry forever.
+func expectDrbdMetaSignature(fx *storage.FakeExec, device string) {
+	stdout := "DEVNAME=" + device + "\nUUID=6715da3a6dd3182a\nTYPE=drbd\n"
+	fx.Expect("blkid -o export "+device, storage.FakeResponse{Stdout: []byte(stdout)})
+}
+
 func newThinReconciler(fx *storage.FakeExec, dir string) *satellite.Reconciler {
 	thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx)
 
@@ -194,8 +206,11 @@ func TestApplyBug028Day0RaceVetoBypassedMkfsRuns(t *testing.T) {
 	// the exact proof that every Connected+UpToDate peer is a
 	// never-written day0 sibling of the same generation.
 	expectGetGI(fx, "pvc-b028", device, satellite.Day0GiForTest("pvc-b028", 0))
-	// blkid probes (backing pre-promote, /dev/drbd post-promote) return
-	// the FakeExec default (empty, no TYPE= line) → no fs anywhere.
+	// Backing-device blkid answers TYPE=drbd (the real-hardware shape:
+	// libblkid sees the internal DRBD metadata superblock) — the bypass
+	// must read that as "no user filesystem". The post-promote probe on
+	// /dev/drbd6500 returns the FakeExec default (no signature).
+	expectDrbdMetaSignature(fx, device)
 
 	rec := newThinReconciler(fx, dir)
 
@@ -263,18 +278,41 @@ func TestApplyBug028VetoHoldsOnFsSignature(t *testing.T) {
 	assertNoPromoteNoMkfs(t, fx.CommandLines())
 }
 
-// TestApplyBug028VetoHoldsOnPeerHasData: the dispatcher's CRD view
-// reports a PROVEN data-bearing diskful peer (non-day0 GI observed on
-// the peer's Status) → bypass refused before any kernel probe runs.
-func TestApplyBug028VetoHoldsOnPeerHasData(t *testing.T) {
+// TestApplyBug028VetoHoldsOnUnconnectedDiskfulPeer: data-safety
+// counter-case for the kernel-coverage gate — a configured DISKFUL
+// peer whose connection has not handshaken (peer-disk DUnknown) could
+// be an offline data holder, so the bypass must refuse even when the
+// local volume is day0-empty. (The veto itself was fired by the OTHER
+// peer, n2, which is Connected and UpToDate.)
+func TestApplyBug028VetoHoldsOnUnconnectedDiskfulPeer(t *testing.T) {
 	dir := t.TempDir()
 	fx := storage.NewFakeExec()
-	expectThinBacking(fx, "pvc-b028p")
+	device := expectThinBacking(fx, "pvc-b028u")
+	fx.Expect("drbdsetup status pvc-b028u --json",
+		storage.FakeResponse{Stdout: []byte(`[{
+		  "name":"pvc-b028u","node-id":0,"role":"Secondary",
+		  "devices":[{"volume":0,"disk-state":"UpToDate"}],
+		  "connections":[{
+		    "peer-node-id":1,"name":"n2","connection-state":"Connected",
+		    "peer-role":"Secondary",
+		    "peer_devices":[{"volume":0,"peer-disk-state":"UpToDate"}]
+		  },{
+		    "peer-node-id":2,"name":"n3","connection-state":"Connecting",
+		    "peer-role":"Unknown",
+		    "peer_devices":[{"volume":0,"peer-disk-state":"DUnknown"}]
+		  }]
+		}]`)})
+	expectGetGI(fx, "pvc-b028u", device, satellite.Day0GiForTest("pvc-b028u", 0))
+	expectDrbdMetaSignature(fx, device)
 
 	rec := newThinReconciler(fx, dir)
 
-	dr := bug028WinnerDR("pvc-b028p", "6530")
-	dr[0].PeerHasData = true
+	dr := bug028WinnerDR("pvc-b028u", "6530")
+	dr[0].Peers = []intent.DesiredPeer{{Name: "n2"}, {Name: "n3"}}
+	dr[0].DrbdOptions["peer.n3.port"] = "7000"
+	dr[0].DrbdOptions["peer.n3.node-id"] = "2"
+	dr[0].DrbdOptions["peer.n3.address"] = "10.0.0.3"
+	// n3 is DISKFUL (no peer.n3.diskless) — its DUnknown must refuse.
 
 	_, err := rec.Apply(t.Context(), dr)
 	if err != nil {
@@ -284,6 +322,35 @@ func TestApplyBug028VetoHoldsOnPeerHasData(t *testing.T) {
 	assertNoPromoteNoMkfs(t, fx.CommandLines())
 }
 
+// TestApplyBug028BypassFiresDespitePeerHasDataLag pins the CRD-lag
+// acceptance: the dispatcher conservatively reports PeerHasData=true
+// for an UpToDate day0 sibling whose CurrentGI backfill has not been
+// observed yet. That signal is correct for the re-computed seed gates
+// but must NOT cost the one-shot first-activation mkfs when kernel
+// truth (full coverage + local day0 GI + no user fs) proves the whole
+// connected set is day0-empty.
+func TestApplyBug028BypassFiresDespitePeerHasDataLag(t *testing.T) {
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+	device := expectThinBacking(fx, "pvc-b028g")
+	fx.Expect("drbdsetup status pvc-b028g --json",
+		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028g"))})
+	expectGetGI(fx, "pvc-b028g", device, satellite.Day0GiForTest("pvc-b028g", 0))
+	expectDrbdMetaSignature(fx, device) // backing blkid is canned as TYPE=drbd only
+
+	rec := newThinReconciler(fx, dir)
+
+	dr := bug028WinnerDR("pvc-b028g", "6540")
+	dr[0].PeerHasData = true // CRD lag: day0 sibling, CurrentGI unobserved
+
+	_, err := rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+
+	assertPromoteMkfsDemoteOrder(t, fx.CommandLines(), "pvc-b028g", "/dev/drbd6540") // the bypass must have fired
+}
+
 // bug028FalseLatchDR is the wire payload of the BUG-028 TERMINAL state:
 // the false RD.Spec.Initialized latch fired, so the dispatcher no
 // longer stamps `auto-primary`; metadata exists (MetadataCreated=true →
@@ -322,11 +389,14 @@ func bug028FalseLatchDR(rd, minor string) []*intent.DesiredResource {
 func TestApplyBug028FalseLatchRetryFiresWithoutAutoPrimary(t *testing.T) {
 	dir := t.TempDir()
 	fx := storage.NewFakeExec()
-	expectThinBacking(fx, "pvc-b028r")
+	device := expectThinBacking(fx, "pvc-b028r")
 	fx.Expect("drbdsetup status pvc-b028r --json",
 		storage.FakeResponse{Stdout: []byte(statusBothUpToDateSecondary("pvc-b028r"))})
-	// blkid on the backing device and on /dev/drbd6600: FakeExec default
-	// (no TYPE=) → filesystem provably absent.
+	// Backing blkid answers TYPE=drbd (the real-hardware shape — see
+	// expectDrbdMetaSignature): the retry must read it as "no user
+	// filesystem". /dev/drbd6600's post-promote probe stays at the
+	// FakeExec default (no signature) → mkfs runs.
+	expectDrbdMetaSignature(fx, device)
 
 	rec := newThinReconciler(fx, dir)
 

From a5ed96ec815840a3ae6732b1080189acebd5eb50 Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Fri, 12 Jun 2026 09:09:37 +0300
Subject: [PATCH 4/4] docs(satellite): document the inherited day0-GI
 never-degraded-write ambiguity

The bypass cannot distinguish a respawned replica joining a
data-bearing-but-still-day0 survivor (a volume whose entire write
history happened fully connected never mints a new current-UUID)
from a fresh day0 sibling. This is the same ambiguity the day0 seed
path documents and the same shape the pre-existing Bug-311 retry
already had; production auto-mkfs topologies (RWX with a diskless
witness) are immune because the first consumer promote mints a UUID
and latches the RD. Documented as accepted residual risk.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 pkg/satellite/reconciler.go | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go
index c7939ff3..87c8a15a 100644
--- a/pkg/satellite/reconciler.go
+++ b/pkg/satellite/reconciler.go
@@ -2767,6 +2767,25 @@ func (r *Reconciler) shouldRetryAutoMkfs(ctx context.Context, dr *intent.Desired
 // respawn-StandAlone protections of Bug 342/356 intact: this function
 // can only ever ADD an mkfs on a provably day0-empty generation, never
 // remove a veto protecting real data.
+//
+// KNOWN RESIDUAL AMBIGUITY (pre-existing, inherited, not widened): a
+// volume whose entire write history happened while ALL peers were
+// connected never advances its current-UUID past day0 (DRBD only
+// mints a new one on promote/write with an absent/weak peer). A
+// respawned replica joining such a data-bearing-but-day0 survivor is
+// indistinguishable from a fresh day0 sibling by GI bookkeeping —
+// this is the SAME ambiguity resolveVolumeSeed documents for the
+// day0 seed path and the same shape the pre-existing Bug-311 retry
+// (auto-primary + absent marker, NO kernel veto at all) already had.
+// In every production auto-mkfs topology (RWX ganesha) a diskless
+// tiebreaker is part of the set, so the first consumer promote mints
+// a new UUID (weak_nodes != 0, observed on the stand), the RD
+// latches Initialized, the respawned replica is stamped
+// SkipInitialSync=false, and condition 2 above refuses the bypass.
+// Only a hand-built no-witness FileSystem/Type RD whose data never
+// saw a degraded write retains the ambiguity — accepted and
+// documented rather than "solved" with a heuristic that would
+// reintroduce the BUG-028 wedge.
 func (r *Reconciler) day0EmptyMkfsBypass(ctx context.Context, dr *intent.DesiredResource, devices map[int32]string) bool {
 	if !needsMkfs(dr) || r.cfg.Exec == nil {
 		return false