From e634d81fabb8bf9278eff65bb80e35dbdf6bc6e9 Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 25 May 2026 17:17:40 +0800 Subject: [PATCH 1/2] fix(hibernation): clear hibernate annotation before wake fast-path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reconcileWake's cloned-and-running fast-path returned Active without clearing vm.cocoonstack.io/hibernate. A pod already awake but carrying hibernate=true residue kept the annotation set; a subsequent Desire=Hibernate then no-ops PatchHibernateState (annotation already matches), so the CR could flip to Hibernated against a stale tag without vk-cocoon taking a fresh snapshot — a later wake clones stale/divergent state. Move the clear above the fast-path so it runs unconditionally on any wake reconcile. PatchHibernateState already no-ops when the annotation matches, so the common (already-false) case adds no extra write. Fixes #2. --- hibernation/reconciler_test.go | 57 ++++++++++++++++++++++++++++++++++ hibernation/wake.go | 17 ++++++---- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/hibernation/reconciler_test.go b/hibernation/reconciler_test.go index 5635fbc..2a6b57b 100644 --- a/hibernation/reconciler_test.go +++ b/hibernation/reconciler_test.go @@ -463,6 +463,63 @@ func TestReconcileWakeRecoversFromFailed(t *testing.T) { } } +// TestReconcileWakeClearsHibernateResidueOnFastPath covers the case where a pod +// is already cloned+running but still carries hibernate=true. The wake fast-path +// must clear the annotation before marking the CR Active; otherwise a later +// Desire=Hibernate no-ops PatchHibernateState and the CR can flip to Hibernated +// against a stale tag without a fresh snapshot. +func TestReconcileWakeClearsHibernateResidueOnFastPath(t *testing.T) { + hib := &cocoonv1.CocoonHibernation{ + ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Finalizers: []string{finalizerName}}, + Spec: cocoonv1.CocoonHibernationSpec{ + Desire: cocoonv1.HibernationDesireWake, + PodRef: cocoonv1.HibernationPodRef{Name: "demo-0"}, + }, + Status: cocoonv1.CocoonHibernationStatus{Phase: cocoonv1.CocoonHibernationPhaseWaking}, + } + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "demo-0", Namespace: "ns"}, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{{ + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}, + }, + } + (&meta.VMSpec{VMName: "vk-ns-demo-0", Managed: true}).Apply(pod) + (&meta.VMRuntime{VMID: "vmid-live"}).Apply(pod) // cloned + running → fast-path + meta.HibernateState(true).Apply(pod) // residue that must be cleared + + scheme := testScheme(t) + cli := ctrlfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(hib, pod). + WithStatusSubresource(&cocoonv1.CocoonHibernation{}). + Build() + r := &Reconciler{Client: cli, Scheme: scheme, Epoch: &fakeRegistry{}} + + if _, err := r.Reconcile(t.Context(), ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: "ns", Name: "hib"}, + }); err != nil { + t.Fatalf("Reconcile: %v", err) + } + + var outHib cocoonv1.CocoonHibernation + if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "hib"}, &outHib); err != nil { + t.Fatalf("get hib: %v", err) + } + if outHib.Status.Phase != cocoonv1.CocoonHibernationPhaseActive { + t.Errorf("phase = %q, want Active (fast-path)", outHib.Status.Phase) + } + + var outPod corev1.Pod + if err := cli.Get(t.Context(), types.NamespacedName{Namespace: "ns", Name: "demo-0"}, &outPod); err != nil { + t.Fatalf("get pod: %v", err) + } + if meta.ReadHibernateState(&outPod) { + t.Error("hibernate annotation must be cleared on the wake fast-path; still true") + } +} + func TestHibernateDeadlineExceeded(t *testing.T) { staleReady := metav1.Condition{ Type: commonk8s.ConditionTypeReady, diff --git a/hibernation/wake.go b/hibernation/wake.go index 30b4f11..5db67b4 100644 --- a/hibernation/wake.go +++ b/hibernation/wake.go @@ -17,6 +17,17 @@ func (r *Reconciler) reconcileWake(ctx context.Context, hib *cocoonv1.CocoonHibe logger := log.WithFunc("hibernation.Reconciler.reconcileWake") r.announceRetryFromFailed(hib, cocoonv1.HibernationDesireWake) + // Clear hibernate=true before the cloned-and-running fast-path. A pod that + // is already awake but still carries hibernate residue would otherwise take + // the fast-path with the annotation left set; a later Desire=Hibernate then + // no-ops PatchHibernateState, letting the CR flip to Hibernated against a + // stale tag without vk-cocoon ever taking a fresh snapshot. + if meta.ReadHibernateState(pod) { + if err := commonk8s.PatchHibernateState(ctx, r.Client, pod, false); err != nil { + return ctrl.Result{}, fmt.Errorf("clear hibernate annotation: %w", err) + } + } + if vmClonedAndRunning(pod) { // Drop snapshot tag (non-fatal; stale tag gets overwritten on next hibernate). if err := r.Epoch.DeleteManifest(ctx, vmName, meta.HibernateSnapshotTag); err != nil { @@ -29,12 +40,6 @@ func (r *Reconciler) reconcileWake(ctx context.Context, hib *cocoonv1.CocoonHibe return ctrl.Result{}, r.setPhase(ctx, hib, cocoonv1.CocoonHibernationPhaseActive, vmName) } - if meta.ReadHibernateState(pod) { - if err := commonk8s.PatchHibernateState(ctx, r.Client, pod, false); err != nil { - return ctrl.Result{}, fmt.Errorf("clear hibernate annotation: %w", err) - } - } - if phaseDeadlineExceeded(hib, cocoonv1.CocoonHibernationPhaseWaking, wakeTimeout) { if r.firstTransitionAt(hib) { observePhaseExit(hib, "timeout") From 4b770563cff50fac059c188871f7b4040a49ff0b Mon Sep 17 00:00:00 2001 From: CMGS Date: Mon, 25 May 2026 17:20:31 +0800 Subject: [PATCH 2/2] chore(hibernation): drop comments from wake fix --- hibernation/reconciler_test.go | 9 ++------- hibernation/wake.go | 5 ----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/hibernation/reconciler_test.go b/hibernation/reconciler_test.go index 2a6b57b..5569705 100644 --- a/hibernation/reconciler_test.go +++ b/hibernation/reconciler_test.go @@ -463,11 +463,6 @@ func TestReconcileWakeRecoversFromFailed(t *testing.T) { } } -// TestReconcileWakeClearsHibernateResidueOnFastPath covers the case where a pod -// is already cloned+running but still carries hibernate=true. The wake fast-path -// must clear the annotation before marking the CR Active; otherwise a later -// Desire=Hibernate no-ops PatchHibernateState and the CR can flip to Hibernated -// against a stale tag without a fresh snapshot. func TestReconcileWakeClearsHibernateResidueOnFastPath(t *testing.T) { hib := &cocoonv1.CocoonHibernation{ ObjectMeta: metav1.ObjectMeta{Name: "hib", Namespace: "ns", Finalizers: []string{finalizerName}}, @@ -486,8 +481,8 @@ func TestReconcileWakeClearsHibernateResidueOnFastPath(t *testing.T) { }, } (&meta.VMSpec{VMName: "vk-ns-demo-0", Managed: true}).Apply(pod) - (&meta.VMRuntime{VMID: "vmid-live"}).Apply(pod) // cloned + running → fast-path - meta.HibernateState(true).Apply(pod) // residue that must be cleared + (&meta.VMRuntime{VMID: "vmid-live"}).Apply(pod) + meta.HibernateState(true).Apply(pod) scheme := testScheme(t) cli := ctrlfake.NewClientBuilder(). diff --git a/hibernation/wake.go b/hibernation/wake.go index 5db67b4..4df03a6 100644 --- a/hibernation/wake.go +++ b/hibernation/wake.go @@ -17,11 +17,6 @@ func (r *Reconciler) reconcileWake(ctx context.Context, hib *cocoonv1.CocoonHibe logger := log.WithFunc("hibernation.Reconciler.reconcileWake") r.announceRetryFromFailed(hib, cocoonv1.HibernationDesireWake) - // Clear hibernate=true before the cloned-and-running fast-path. A pod that - // is already awake but still carries hibernate residue would otherwise take - // the fast-path with the annotation left set; a later Desire=Hibernate then - // no-ops PatchHibernateState, letting the CR flip to Hibernated against a - // stale tag without vk-cocoon ever taking a fresh snapshot. if meta.ReadHibernateState(pod) { if err := commonk8s.PatchHibernateState(ctx, r.Client, pod, false); err != nil { return ctrl.Result{}, fmt.Errorf("clear hibernate annotation: %w", err)