From c516b6a776e69ab68b7cc9da910c081aea32ebd3 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 1 Jun 2026 19:44:41 -0700 Subject: [PATCH 1/5] feat(planner): detect sidecar image drift on Running nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the existing image-drift code path to also detect sidecar image drift. Same update plan executes — ApplyStatefulSet re-renders the STS with the new sidecar container image, ReplacePod cycles, ObserveImage stamps. The new code only adds the drift trigger; the rest of the infrastructure already handled sidecar rollouts naturally. Why this matters: - SND-side templateHash already includes Spec.Sidecar.Image, so SND-level rollouts fire on per-node overrides. But it does NOT include the controller-wide SEI_SIDECAR_IMAGE default. When a controller deploy bumps that env, no SND template changes — SeiNodes silently keep running the old sidecar until something else triggers a rollout. - This PR closes that gap at the SeiNode planner level. Changes: - New Status.CurrentSidecarImage field on SeiNode CR (omitempty, no schema break). Stamped jointly with CurrentImage by ObserveImage when the StatefulSet rollout completes — uses the effective sidecar image (Spec.Sidecar.Image override, else Platform.SidecarImage default). - task.EffectiveSidecarImage exported helper; same resolution logic as noderesource.go and bootstrap_resources.go (those duplicates left for a future consolidation pass). - New sidecarImageDrifted(node, platform) helper. Empty Status.CurrentSidecarImage treated as "not yet observed, no drift" — prevents fleet-wide rollout on first reconcile after upgrade. - Each mode planner's buildRunningPlan: imageDrifted(node) || sidecarImageDrifted(node, p.platform). - imageDriftMessage extended to name which image drifted (seid only, sidecar only, or both) so operators reading the condition can tell the source of the rollout. - NodeResolver gains a Platform field; plannerForMode becomes a method that threads Platform into each mode planner's struct field. Reconciler wires Platform on NodeResolver in cmd/main.go. Pre-implementation cross-review: kubernetes-specialist + platform-engineer both ALIGN_AND_IMPLEMENT. Both independently flagged the empty-status backfill guard as load-bearing and the SND templateHash gap as the real motivator for this change. Tests: - TestFullPlanner_SidecarDriftFromPlatformDefault_UpdateProgression - TestFullPlanner_SidecarDriftFromOverride_UpdateProgression - TestFullPlanner_CombinedDrift_SingleUpdatePlan — seid AND sidecar drift share one update plan; condition message names both. - TestFullPlanner_NoCurrentSidecarImage_NoDrift — pins the backfill guard. - TestImageDriftMessage_NamesWhichDrifted — pins the diagnostic split. - TestObserveImage_RolloutComplete_StampsSidecarFromPlatformDefault - TestObserveImage_RolloutComplete_StampsSidecarFromOverride Co-Authored-By: Claude Opus 4.7 (1M context) --- api/v1alpha1/seinode_types.go | 10 +++ cmd/main.go | 5 +- config/crd/sei.io_seinodes.yaml | 10 +++ internal/planner/archive.go | 6 +- internal/planner/full.go | 6 +- internal/planner/node_update_test.go | 110 ++++++++++++++++++++++++++- internal/planner/planner.go | 52 ++++++++++--- internal/planner/replay.go | 6 +- internal/planner/validator.go | 6 +- internal/task/observe_image.go | 24 +++++- internal/task/observe_image_test.go | 38 +++++++++ manifests/sei.io_seinodes.yaml | 10 +++ 12 files changed, 258 insertions(+), 25 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index d82f0cb..aa3cd17 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -332,6 +332,16 @@ type SeiNodeStatus struct { // +optional CurrentImage string `json:"currentImage,omitempty"` + // CurrentSidecarImage is the sidecar container image observed running + // on the owned StatefulSet. Stamped jointly with CurrentImage when the + // rollout completes. Compared against the effective sidecar image + // (spec.sidecar.image, else the controller's SEI_SIDECAR_IMAGE) to + // detect drift independent of seid changes. Empty until the first + // observe — empty is treated as "not yet observed, no drift" so a + // controller upgrade doesn't fleet-roll every node on first reconcile. + // +optional + CurrentSidecarImage string `json:"currentSidecarImage,omitempty"` + // +listType=map // +listMapKey=type // +optional diff --git a/cmd/main.go b/cmd/main.go index e149c38..bfa9146 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -191,7 +191,10 @@ func main() { Scheme: mgr.GetScheme(), Recorder: nodeRecorder, Platform: platformCfg, - Planner: &planner.NodeResolver{BuildSidecarClient: buildSidecarClient}, + Planner: &planner.NodeResolver{ + BuildSidecarClient: buildSidecarClient, + Platform: platformCfg, + }, PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ ConfigFor: func(_ context.Context, node *seiv1alpha1.SeiNode) task.ExecutionConfig { return task.ExecutionConfig{ diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index 1281022..6c8bdaf 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -813,6 +813,16 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarImage: + description: |- + CurrentSidecarImage is the sidecar container image observed running + on the owned StatefulSet. Stamped jointly with CurrentImage when the + rollout completes. Compared against the effective sidecar image + (spec.sidecar.image, else the controller's SEI_SIDECAR_IMAGE) to + detect drift independent of seid changes. Empty until the first + observe — empty is treated as "not yet observed, no drift" so a + controller upgrade doesn't fleet-roll every node on first reconcile. + type: string phase: description: Phase is the high-level lifecycle state. enum: diff --git a/internal/planner/archive.go b/internal/planner/archive.go index 04c30a6..6a667c1 100644 --- a/internal/planner/archive.go +++ b/internal/planner/archive.go @@ -8,10 +8,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type archiveNodePlanner struct { + platform platform.Config } func (p *archiveNodePlanner) Mode() string { return string(seiconfig.ModeArchive) } @@ -40,8 +42,8 @@ func (p *archiveNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1. // buildRunningPlan returns the update plan for a Running archive node. // Same shape as full nodes (no extra validation gates). func (p *archiveNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if imageDrifted(node) { - setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) + if imageDrifted(node) || sidecarImageDrifted(node, p.platform) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node, p.platform)) prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/full.go b/internal/planner/full.go index ebb64f5..cd0c35d 100644 --- a/internal/planner/full.go +++ b/internal/planner/full.go @@ -7,10 +7,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type fullNodePlanner struct { + platform platform.Config } func (p *fullNodePlanner) Mode() string { return string(seiconfig.ModeFull) } @@ -49,8 +51,8 @@ func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas // nil if no drift. pelletier/go-toml/v2 does not preserve comments on // re-encode — the first config-patch erases operator-added comments. func (p *fullNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if imageDrifted(node) { - setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) + if imageDrifted(node) || sidecarImageDrifted(node, p.platform) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node, p.platform)) prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 12a6104..e199a0c 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -10,14 +10,22 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" "github.com/sei-protocol/sei-k8s-controller/internal/task" ) const ( - testImageV2 = "sei:v2.0.0" - testExternalAddrAtl = "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656" + testImageV2 = "sei:v2.0.0" + testExternalAddrAtl = "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656" + testSidecarImageV1 = "ghcr.io/sei-protocol/seictl@sha256:1111" + testSidecarImageV2 = "ghcr.io/sei-protocol/seictl@sha256:2222" + testSidecarOverrideV = "ghcr.io/sei-protocol/seictl@sha256:3333" ) +func platformWithSidecar(image string) platform.Config { + return platform.Config{SidecarImage: image} +} + // runningFullNode returns a SeiNode in the Running phase with currentImage matching spec.image. func runningFullNode() *seiv1alpha1.SeiNode { return &seiv1alpha1.SeiNode{ @@ -165,6 +173,104 @@ func TestArchivePlanner_ImageDrift_UpdateProgression(t *testing.T) { })) } +// --- sidecar image drift tests --- + +// sidecarDriftedNode returns a runningFullNode with CurrentSidecarImage +// populated so the empty-no-drift backfill guard doesn't fire; the test +// then mutates Spec.Sidecar / platform to drive drift. +func sidecarDriftedNode() *seiv1alpha1.SeiNode { + node := runningFullNode() + node.Status.CurrentSidecarImage = testSidecarImageV1 + return node +} + +// Effective sidecar image = platform default when Spec.Sidecar is nil. +// Bumping the platform default while CurrentSidecarImage stays at v1 +// must trigger a drift-routed update plan. +func TestFullPlanner_SidecarDriftFromPlatformDefault_UpdateProgression(t *testing.T) { + g := NewWithT(t) + node := sidecarDriftedNode() + p := platformWithSidecar(testSidecarImageV2) + + plan, err := (&fullNodePlanner{platform: p}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil(), "sidecar drift should trigger update plan") + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) +} + +// Spec.Sidecar.Image override takes precedence over the platform default. +func TestFullPlanner_SidecarDriftFromOverride_UpdateProgression(t *testing.T) { + g := NewWithT(t) + node := sidecarDriftedNode() + node.Spec.Sidecar = &seiv1alpha1.SidecarConfig{Image: testSidecarOverrideV} + p := platformWithSidecar(testSidecarImageV1) + + plan, err := (&fullNodePlanner{platform: p}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil(), "sidecar override drift should trigger update plan") + g.Expect(plan.Tasks).To(HaveLen(7)) +} + +// Combined drift — single update plan covers both. +func TestFullPlanner_CombinedDrift_SingleUpdatePlan(t *testing.T) { + g := NewWithT(t) + node := sidecarDriftedNode() + node.Spec.Image = testImageV2 + p := platformWithSidecar(testSidecarImageV2) + + plan, err := (&fullNodePlanner{platform: p}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + g.Expect(plan.Tasks).To(HaveLen(7), "one plan covers both drifts") + + cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Message).To(ContainSubstring("seid spec=")) + g.Expect(cond.Message).To(ContainSubstring("sidecar spec=")) +} + +// Backfill guard: empty CurrentSidecarImage = "not yet observed, no drift." +// Without this, a controller upgrade fleet-rolls every existing SeiNode on +// first reconcile before ObserveImage backfills the field. +func TestFullPlanner_NoCurrentSidecarImage_NoDrift(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + p := platformWithSidecar(testSidecarImageV2) + + plan, err := (&fullNodePlanner{platform: p}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).To(BeNil(), "empty CurrentSidecarImage must NOT trigger drift") +} + +// Diagnostic message names which image drifted. +func TestImageDriftMessage_NamesWhichDrifted(t *testing.T) { + g := NewWithT(t) + p := platformWithSidecar(testSidecarImageV2) + + seidOnly := sidecarDriftedNode() + seidOnly.Spec.Image = testImageV2 + g.Expect(imageDriftMessage(seidOnly, platformWithSidecar(testSidecarImageV1))).To(And( + ContainSubstring("image drift detected"), + Not(ContainSubstring("sidecar")))) + + sidecarOnly := sidecarDriftedNode() + g.Expect(imageDriftMessage(sidecarOnly, p)).To(ContainSubstring("sidecar image drift detected")) + + both := sidecarDriftedNode() + both.Spec.Image = testImageV2 + g.Expect(imageDriftMessage(both, p)).To(And( + ContainSubstring("seid spec="), + ContainSubstring("sidecar spec="))) +} + // Replayer shares the full/archive update shape — symmetry test. func TestReplayerPlanner_ImageDrift_UpdateProgression(t *testing.T) { g := NewWithT(t) diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 4bea23a..0ea5846 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -21,6 +21,7 @@ import ( seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/controller/observability" "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" "github.com/sei-protocol/sei-k8s-controller/internal/task" ) @@ -128,6 +129,9 @@ func isConditionTrue(group *seiv1alpha1.SeiNodeDeployment, condType string) bool type NodeResolver struct { // Nil factory skips the sidecar probe; used by tests. BuildSidecarClient func(node *seiv1alpha1.SeiNode) (task.SidecarClient, error) + // Platform supplies the controller-wide sidecar image fallback used + // by sidecarImageDrifted when Spec.Sidecar.Image is unset. + Platform platform.Config } func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNode) error { @@ -145,7 +149,7 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod handleTerminalPlan(ctx, node) - mode, err := plannerForMode(node) + mode, err := p.plannerForMode(node) if err != nil { return err } @@ -262,17 +266,19 @@ func planFailureMessage(plan *seiv1alpha1.TaskPlan) string { } // plannerForMode returns the appropriate NodePlanner based on which mode -// sub-spec is populated on the SeiNode. -func plannerForMode(node *seiv1alpha1.SeiNode) (NodePlanner, error) { +// sub-spec is populated on the SeiNode. Constructor — passes the resolver's +// Platform config into each planner so they can resolve the effective +// sidecar image for drift detection without taking a cluster dependency. +func (r *NodeResolver) plannerForMode(node *seiv1alpha1.SeiNode) (NodePlanner, error) { switch { case node.Spec.FullNode != nil: - return &fullNodePlanner{}, nil + return &fullNodePlanner{platform: r.Platform}, nil case node.Spec.Archive != nil: - return &archiveNodePlanner{}, nil + return &archiveNodePlanner{platform: r.Platform}, nil case node.Spec.Replayer != nil: - return &replayerPlanner{}, nil + return &replayerPlanner{platform: r.Platform}, nil case node.Spec.Validator != nil: - return &validatorPlanner{}, nil + return &validatorPlanner{platform: r.Platform}, nil default: return nil, fmt.Errorf("no mode sub-spec set on SeiNode %s/%s", node.Namespace, node.Name) } @@ -733,10 +739,36 @@ func imageDrifted(node *seiv1alpha1.SeiNode) bool { return node.Spec.Image != node.Status.CurrentImage } +// sidecarImageDrifted reports whether the effective sidecar image (per-node +// override or controller-wide default) diverges from what was last observed. +// Empty Status.CurrentSidecarImage means "not yet observed" and is treated +// as no-drift so a controller upgrade doesn't fleet-roll every node before +// ObserveImage has had a chance to backfill the field. +func sidecarImageDrifted(node *seiv1alpha1.SeiNode, p platform.Config) bool { + if node.Status.CurrentSidecarImage == "" { + return false + } + return task.EffectiveSidecarImage(node, p) != node.Status.CurrentSidecarImage +} + // imageDriftMessage formats the NodeUpdateInProgress message every mode -// planner stamps before an image-drift-triggered assembleUpdatePlan call. -func imageDriftMessage(node *seiv1alpha1.SeiNode) string { - return fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage) +// planner stamps before an update plan. Names which image(s) drifted so an +// operator reading the condition can tell seid bumps from sidecar bumps. +func imageDriftMessage(node *seiv1alpha1.SeiNode, p platform.Config) string { + seid := imageDrifted(node) + sc := sidecarImageDrifted(node, p) + switch { + case seid && sc: + return fmt.Sprintf("image drift detected: seid spec=%s current=%s; sidecar spec=%s current=%s", + node.Spec.Image, node.Status.CurrentImage, + task.EffectiveSidecarImage(node, p), node.Status.CurrentSidecarImage) + case sc: + return fmt.Sprintf("sidecar image drift detected: spec=%s current=%s", + task.EffectiveSidecarImage(node, p), node.Status.CurrentSidecarImage) + default: + return fmt.Sprintf("image drift detected: spec=%s current=%s", + node.Spec.Image, node.Status.CurrentImage) + } } // externalAddressPatch is the config.toml patch that stamps the diff --git a/internal/planner/replay.go b/internal/planner/replay.go index 594fce0..c6bc75b 100644 --- a/internal/planner/replay.go +++ b/internal/planner/replay.go @@ -7,10 +7,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type replayerPlanner struct { + platform platform.Config } func (p *replayerPlanner) Mode() string { return string(seiconfig.ModeFull) } @@ -52,8 +54,8 @@ func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas // buildRunningPlan returns the update plan for a Running replayer node. // Same shape as full and archive. func (p *replayerPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if imageDrifted(node) { - setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) + if imageDrifted(node) || sidecarImageDrifted(node, p.platform) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node, p.platform)) prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/validator.go b/internal/planner/validator.go index 602239f..da3226e 100644 --- a/internal/planner/validator.go +++ b/internal/planner/validator.go @@ -7,10 +7,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type validatorPlanner struct { + platform platform.Config } func (p *validatorPlanner) Mode() string { return string(seiconfig.ModeValidator) } @@ -117,8 +119,8 @@ func (p *validatorPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Ta // malformed secret aborts with a clear controller-side error rather than // a kubelet volume-mount failure on the recreated pod. func (p *validatorPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if imageDrifted(node) { - setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) + if imageDrifted(node) || sidecarImageDrifted(node, p.platform) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node, p.platform)) prog := []string{ task.TaskTypeValidateSigningKey, task.TaskTypeValidateNodeKey, diff --git a/internal/task/observe_image.go b/internal/task/observe_image.go index 68cb18f..ca6b510 100644 --- a/internal/task/observe_image.go +++ b/internal/task/observe_image.go @@ -10,6 +10,7 @@ import ( "k8s.io/apimachinery/pkg/types" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" ) const TaskTypeObserveImage = "observe-image" @@ -43,9 +44,10 @@ func deserializeObserveImage(id string, params json.RawMessage, cfg ExecutionCon } // Execute polls the StatefulSet rollout. If the rollout is complete, stamps -// status.currentImage on the owning SeiNode and marks the task complete. -// If the rollout is still in progress, returns nil — the executor will -// re-invoke on the next reconcile since the task remains Pending. +// status.currentImage and status.currentSidecarImage on the owning SeiNode +// and marks the task complete. If the rollout is still in progress, returns +// nil — the executor will re-invoke on the next reconcile since the task +// remains Pending. func (e *observeImageExecution) Execute(ctx context.Context) error { node, err := ResourceAs[*seiv1alpha1.SeiNode](e.cfg) if err != nil { @@ -69,12 +71,26 @@ func (e *observeImageExecution) Execute(ctx context.Context) error { return nil } - // Rollout complete — stamp currentImage in-memory. + // Rollout complete — stamp both currentImage fields in-memory. STS + // rollout-completion is container-agnostic (UpdatedReplicas covers the + // whole pod spec), so seid and sidecar containers roll together. node.Status.CurrentImage = node.Spec.Image + node.Status.CurrentSidecarImage = EffectiveSidecarImage(node, e.cfg.Platform) e.complete() return nil } +// EffectiveSidecarImage returns the sidecar container image actually +// rendered onto the StatefulSet: the per-SeiNode override if set, else +// the controller-wide default from SEI_SIDECAR_IMAGE. Mirrors the +// resolution at noderesource.go and bootstrap_resources.go. +func EffectiveSidecarImage(node *seiv1alpha1.SeiNode, p platform.Config) string { + if node.Spec.Sidecar != nil && node.Spec.Sidecar.Image != "" { + return node.Spec.Sidecar.Image + } + return p.SidecarImage +} + // Status returns the cached execution status. func (e *observeImageExecution) Status(_ context.Context) ExecutionStatus { return e.DefaultStatus() diff --git a/internal/task/observe_image_test.go b/internal/task/observe_image_test.go index 6576071..e941f4c 100644 --- a/internal/task/observe_image_test.go +++ b/internal/task/observe_image_test.go @@ -13,6 +13,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" ) func observeImageNode() *seiv1alpha1.SeiNode { @@ -81,6 +82,43 @@ func TestObserveImage_RolloutComplete_SetsCurrentImage(t *testing.T) { g.Expect(node.Status.CurrentImage).To(Equal("sei:v2.0.0")) } +// Rollout completion stamps both currentImage and currentSidecarImage. +// Sidecar value falls back to the platform default when Spec.Sidecar is unset. +func TestObserveImage_RolloutComplete_StampsSidecarFromPlatformDefault(t *testing.T) { + g := NewWithT(t) + node := observeImageNode() + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + } + cfg := observeImageCfg(t, node, sts) + cfg.Platform = platform.Config{SidecarImage: "seictl@sha256:default"} + + exec := newObserveImageExec(t, cfg) + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(node.Status.CurrentImage).To(Equal("sei:v2.0.0")) + g.Expect(node.Status.CurrentSidecarImage).To(Equal("seictl@sha256:default")) +} + +// Spec.Sidecar.Image override beats the platform default. +func TestObserveImage_RolloutComplete_StampsSidecarFromOverride(t *testing.T) { + g := NewWithT(t) + node := observeImageNode() + node.Spec.Sidecar = &seiv1alpha1.SidecarConfig{Image: "seictl@sha256:override"} + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + } + cfg := observeImageCfg(t, node, sts) + cfg.Platform = platform.Config{SidecarImage: "seictl@sha256:default"} + + exec := newObserveImageExec(t, cfg) + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(node.Status.CurrentSidecarImage).To(Equal("seictl@sha256:override")) +} + func TestObserveImage_RollingUpdate_ReturnsRunning(t *testing.T) { g := NewWithT(t) node := observeImageNode() diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index 1281022..6c8bdaf 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -813,6 +813,16 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarImage: + description: |- + CurrentSidecarImage is the sidecar container image observed running + on the owned StatefulSet. Stamped jointly with CurrentImage when the + rollout completes. Compared against the effective sidecar image + (spec.sidecar.image, else the controller's SEI_SIDECAR_IMAGE) to + detect drift independent of seid changes. Empty until the first + observe — empty is treated as "not yet observed, no drift" so a + controller upgrade doesn't fleet-roll every node on first reconcile. + type: string phase: description: Phase is the high-level lifecycle state. enum: From d120f60d163fcbf9b55307a35de16d05d118691e Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 1 Jun 2026 19:48:58 -0700 Subject: [PATCH 2/5] docs(planner): note sidecarImageDrifted backfill guard short-circuits for tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per K8s-specialist review on PR #375 — one-line addition to the doc comment explaining why planner tests that construct zero-value planners (empty Platform.SidecarImage) don't accidentally trip drift detection. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/planner.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 0ea5846..e39a8f3 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -743,7 +743,9 @@ func imageDrifted(node *seiv1alpha1.SeiNode) bool { // override or controller-wide default) diverges from what was last observed. // Empty Status.CurrentSidecarImage means "not yet observed" and is treated // as no-drift so a controller upgrade doesn't fleet-roll every node before -// ObserveImage has had a chance to backfill the field. +// ObserveImage has had a chance to backfill the field. The early return +// also keeps planner tests that construct zero-value planners (empty +// Platform.SidecarImage) from accidentally tripping drift. func sidecarImageDrifted(node *seiv1alpha1.SeiNode, p platform.Config) bool { if node.Status.CurrentSidecarImage == "" { return false From a420484326fb325c4f2815aa2818c240515b87a6 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 1 Jun 2026 19:59:59 -0700 Subject: [PATCH 3/5] fix(envtest): wire Platform on NodeResolver to match ExecutionConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cursor Bugbot catch: the envtest suite at suite_test.go:164 constructed NodeResolver without Platform while ExecutionConfig.Platform at line 184 was populated with platformCfg. After PR #375 adds Platform to NodeResolver, this asymmetry would loop sidecarImageDrifted on every reconcile: 1. ObserveImage stamps CurrentSidecarImage from ExecutionConfig.Platform (the populated value). 2. Next reconcile, the planner re-resolves the effective image from NodeResolver.Platform (the zero value, empty SidecarImage). 3. sidecarImageDrifted sees stamped non-empty != current empty → drift! 4. Queue update plan. Loop. No existing test currently exercises this path (no test runs ObserveImage to completion and re-reconciles), so the bug doesn't break CI today. But the moment an envtest does drive a node to Running + observe + re-reconcile, it would loop. Mirror the same fix shape we used for SEI_NLB_TARGET_TYPE in #373. Same fix the bot proposed; the comment on the new line documents the symmetry requirement to prevent regression. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/controller/nodedeployment/envtest/suite_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/controller/nodedeployment/envtest/suite_test.go b/internal/controller/nodedeployment/envtest/suite_test.go index e8c4b90..a3e309c 100644 --- a/internal/controller/nodedeployment/envtest/suite_test.go +++ b/internal/controller/nodedeployment/envtest/suite_test.go @@ -165,6 +165,11 @@ func run(m *testing.M) (int, error) { BuildSidecarClient: func(_ *seiv1alpha1.SeiNode) (task.SidecarClient, error) { return stubSC, nil }, + // Must match ExecutionConfig.Platform below: ObserveImage stamps + // CurrentSidecarImage from ExecutionConfig.Platform, then the + // planner re-resolves the effective image from NodeResolver.Platform. + // Asymmetric values would loop sidecarImageDrifted on every reconcile. + Platform: platformCfg, }, PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ ConfigFor: func(_ context.Context, node *seiv1alpha1.SeiNode) task.ExecutionConfig { From aa9df8cc3140f883fae10035bba76a8bba130807 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 2 Jun 2026 05:09:26 +0200 Subject: [PATCH 4/5] fix(test): mirror NodeResolver.Platform in remaining test constructors + sweep comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cursor Bugbot caught a sibling site of the envtest fix: plan_execution_test.go (lines 133, 799) and reconciler_test.go (line 56) also construct NodeResolver without Platform while their ExecutionConfig.Platform is platformtest.Config(). Same asymmetric-stamp drift loop. Same fix. Also trim a few comments added during the iteration that grew fat: - CurrentSidecarImage CRD godoc 7 → 5 lines (drop the spec.sidecar.image / SEI_SIDECAR_IMAGE restatement; the helper is named). - sidecarImageDrifted doc 7 → 4 (drop the "planner tests" sentence; the test-constructor fix obsoletes that note). - plannerForMode method doc 4 → 3 (signature speaks for itself). - A few test-case docstrings folded to one-liners. Co-Authored-By: Claude Opus 4.7 (1M context) --- api/v1alpha1/seinode_types.go | 10 ++++------ .../controller/node/plan_execution_test.go | 3 ++- internal/controller/node/reconciler_test.go | 1 + internal/planner/node_update_test.go | 14 +++++--------- internal/planner/planner.go | 18 +++++++----------- 5 files changed, 19 insertions(+), 27 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index aa3cd17..85001d0 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -333,12 +333,10 @@ type SeiNodeStatus struct { CurrentImage string `json:"currentImage,omitempty"` // CurrentSidecarImage is the sidecar container image observed running - // on the owned StatefulSet. Stamped jointly with CurrentImage when the - // rollout completes. Compared against the effective sidecar image - // (spec.sidecar.image, else the controller's SEI_SIDECAR_IMAGE) to - // detect drift independent of seid changes. Empty until the first - // observe — empty is treated as "not yet observed, no drift" so a - // controller upgrade doesn't fleet-roll every node on first reconcile. + // on the owned StatefulSet. Stamped jointly with CurrentImage on + // rollout completion. Empty means "not yet observed" and is treated + // as no-drift so a controller upgrade doesn't fleet-roll every node + // on first reconcile. // +optional CurrentSidecarImage string `json:"currentSidecarImage,omitempty"` diff --git a/internal/controller/node/plan_execution_test.go b/internal/controller/node/plan_execution_test.go index 5d00e24..f465cb7 100644 --- a/internal/controller/node/plan_execution_test.go +++ b/internal/controller/node/plan_execution_test.go @@ -132,6 +132,7 @@ func newProgressionReconciler(t *testing.T, mock *mockSidecarClient, objs ...cli Platform: platformtest.Config(), Planner: &planner.NodeResolver{ BuildSidecarClient: func(_ *seiv1alpha1.SeiNode) (task.SidecarClient, error) { return mock, nil }, + Platform: platformtest.Config(), }, PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ ConfigFor: func(_ context.Context, node *seiv1alpha1.SeiNode) task.ExecutionConfig { @@ -796,7 +797,7 @@ func TestReconcileInitializing_SidecarClientError_Requeues(t *testing.T) { Scheme: s, Recorder: record.NewFakeRecorder(100), Platform: platformtest.Config(), - Planner: &planner.NodeResolver{}, + Planner: &planner.NodeResolver{Platform: platformtest.Config()}, PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ ConfigFor: func(_ context.Context, n *seiv1alpha1.SeiNode) task.ExecutionConfig { return task.ExecutionConfig{ diff --git a/internal/controller/node/reconciler_test.go b/internal/controller/node/reconciler_test.go index 93d232c..6f547be 100644 --- a/internal/controller/node/reconciler_test.go +++ b/internal/controller/node/reconciler_test.go @@ -55,6 +55,7 @@ func newNodeReconcilerWithSidecar(t *testing.T, mock *mockSidecarClient, objs .. Platform: platformtest.Config(), Planner: &planner.NodeResolver{ BuildSidecarClient: func(_ *seiv1alpha1.SeiNode) (task.SidecarClient, error) { return mock, nil }, + Platform: platformtest.Config(), }, PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ ConfigFor: func(_ context.Context, node *seiv1alpha1.SeiNode) task.ExecutionConfig { diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index e199a0c..bb07399 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -175,18 +175,15 @@ func TestArchivePlanner_ImageDrift_UpdateProgression(t *testing.T) { // --- sidecar image drift tests --- -// sidecarDriftedNode returns a runningFullNode with CurrentSidecarImage -// populated so the empty-no-drift backfill guard doesn't fire; the test -// then mutates Spec.Sidecar / platform to drive drift. +// sidecarDriftedNode primes CurrentSidecarImage so the backfill guard +// doesn't fire; the test then drives drift via Spec.Sidecar or platform. func sidecarDriftedNode() *seiv1alpha1.SeiNode { node := runningFullNode() node.Status.CurrentSidecarImage = testSidecarImageV1 return node } -// Effective sidecar image = platform default when Spec.Sidecar is nil. -// Bumping the platform default while CurrentSidecarImage stays at v1 -// must trigger a drift-routed update plan. +// Platform default drift (Spec.Sidecar nil) triggers an update plan. func TestFullPlanner_SidecarDriftFromPlatformDefault_UpdateProgression(t *testing.T) { g := NewWithT(t) node := sidecarDriftedNode() @@ -237,9 +234,8 @@ func TestFullPlanner_CombinedDrift_SingleUpdatePlan(t *testing.T) { g.Expect(cond.Message).To(ContainSubstring("sidecar spec=")) } -// Backfill guard: empty CurrentSidecarImage = "not yet observed, no drift." -// Without this, a controller upgrade fleet-rolls every existing SeiNode on -// first reconcile before ObserveImage backfills the field. +// Backfill guard: empty CurrentSidecarImage must not fire drift, else a +// controller upgrade fleet-rolls every node before ObserveImage backfills. func TestFullPlanner_NoCurrentSidecarImage_NoDrift(t *testing.T) { g := NewWithT(t) node := runningFullNode() diff --git a/internal/planner/planner.go b/internal/planner/planner.go index e39a8f3..73a9f0f 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -265,10 +265,9 @@ func planFailureMessage(plan *seiv1alpha1.TaskPlan) string { return unknownValue } -// plannerForMode returns the appropriate NodePlanner based on which mode -// sub-spec is populated on the SeiNode. Constructor — passes the resolver's -// Platform config into each planner so they can resolve the effective -// sidecar image for drift detection without taking a cluster dependency. +// plannerForMode returns the appropriate NodePlanner for the SeiNode's +// mode sub-spec, threaded with the resolver's Platform config so each +// planner can resolve the effective sidecar image for drift detection. func (r *NodeResolver) plannerForMode(node *seiv1alpha1.SeiNode) (NodePlanner, error) { switch { case node.Spec.FullNode != nil: @@ -739,13 +738,10 @@ func imageDrifted(node *seiv1alpha1.SeiNode) bool { return node.Spec.Image != node.Status.CurrentImage } -// sidecarImageDrifted reports whether the effective sidecar image (per-node -// override or controller-wide default) diverges from what was last observed. -// Empty Status.CurrentSidecarImage means "not yet observed" and is treated -// as no-drift so a controller upgrade doesn't fleet-roll every node before -// ObserveImage has had a chance to backfill the field. The early return -// also keeps planner tests that construct zero-value planners (empty -// Platform.SidecarImage) from accidentally tripping drift. +// sidecarImageDrifted reports whether the effective sidecar image diverges +// from what was last observed. Empty Status.CurrentSidecarImage means +// "not yet observed" and is treated as no-drift so a controller upgrade +// doesn't fleet-roll every node before ObserveImage backfills the field. func sidecarImageDrifted(node *seiv1alpha1.SeiNode, p platform.Config) bool { if node.Status.CurrentSidecarImage == "" { return false From fe254a9e05e7001ad91a8c1c523c3bf0692c25de Mon Sep 17 00:00:00 2001 From: bdchatham Date: Tue, 2 Jun 2026 05:11:05 +0200 Subject: [PATCH 5/5] chore(manifests): regenerate CRDs after CurrentSidecarImage godoc trim Co-Authored-By: Claude Opus 4.7 (1M context) --- config/crd/sei.io_seinodes.yaml | 10 ++++------ manifests/sei.io_seinodes.yaml | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index 6c8bdaf..159a0f3 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -816,12 +816,10 @@ spec: currentSidecarImage: description: |- CurrentSidecarImage is the sidecar container image observed running - on the owned StatefulSet. Stamped jointly with CurrentImage when the - rollout completes. Compared against the effective sidecar image - (spec.sidecar.image, else the controller's SEI_SIDECAR_IMAGE) to - detect drift independent of seid changes. Empty until the first - observe — empty is treated as "not yet observed, no drift" so a - controller upgrade doesn't fleet-roll every node on first reconcile. + on the owned StatefulSet. Stamped jointly with CurrentImage on + rollout completion. Empty means "not yet observed" and is treated + as no-drift so a controller upgrade doesn't fleet-roll every node + on first reconcile. type: string phase: description: Phase is the high-level lifecycle state. diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index 6c8bdaf..159a0f3 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -816,12 +816,10 @@ spec: currentSidecarImage: description: |- CurrentSidecarImage is the sidecar container image observed running - on the owned StatefulSet. Stamped jointly with CurrentImage when the - rollout completes. Compared against the effective sidecar image - (spec.sidecar.image, else the controller's SEI_SIDECAR_IMAGE) to - detect drift independent of seid changes. Empty until the first - observe — empty is treated as "not yet observed, no drift" so a - controller upgrade doesn't fleet-roll every node on first reconcile. + on the owned StatefulSet. Stamped jointly with CurrentImage on + rollout completion. Empty means "not yet observed" and is treated + as no-drift so a controller upgrade doesn't fleet-roll every node + on first reconcile. type: string phase: description: Phase is the high-level lifecycle state.