From 35b0289124ed02b5c842f1c4ca2041a81ba3ebbf Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Mon, 16 Mar 2026 11:26:42 +0100 Subject: [PATCH] fix(Boxcutter): Re-resolve bundle when progression deadline is exceeded during rollout Generated-by: Cursor/Claude --- .../clusterextension_controller_test.go | 169 +++++++++++++++++- .../clusterextension_reconcile_steps.go | 41 ++++- test/e2e/features/update.feature | 33 ++++ 3 files changed, 235 insertions(+), 8 deletions(-) diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index 6aa55c5f9..4c443fcb1 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -680,7 +680,8 @@ func TestClusterExtensionBoxcutterApplierFailsDoesNotLeakDeprecationErrors(t *te cl, reconciler := newClientAndReconciler(t, func(d *deps) { // Boxcutter keeps a rolling revision when apply fails. We mirror that state so the test uses - // the same inputs the runtime would see. + // the same inputs the runtime would see. The rolling-out revision has no ProgressDeadlineExceeded + // condition, so the controller reuses it without engaging the resolver. d.RevisionStatesGetter = &MockRevisionStatesGetter{ RevisionStates: &controllers.RevisionStates{ RollingOut: []*controllers.RevisionMetadata{{}}, @@ -768,6 +769,172 @@ func TestClusterExtensionBoxcutterApplierFailsDoesNotLeakDeprecationErrors(t *te require.NoError(t, cl.DeleteAllOf(ctx, &ocv1.ClusterExtension{})) } +// TestClusterExtensionProgressDeadlineExceededReResolvesBundle verifies that when a +// rolling-out revision has exceeded its progression deadline, the controller re-resolves +// from the catalog. This is the primary mechanism for recovering from stuck rollouts. +// +// Scenario: +// - A revision for v1.0.2 is rolling out but has exceeded its progression deadline +// - The ClusterExtension spec requests v1.0.1 (admin changed spec to recover) +// - The controller detects ProgressDeadlineExceeded and re-resolves from catalog +// - The resolver returns v1.0.1, which is used for the new rollout +func TestClusterExtensionProgressDeadlineExceededReResolvesBundle(t *testing.T) { + require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=true", features.BoxcutterRuntime))) + t.Cleanup(func() { + require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=false", features.BoxcutterRuntime))) + }) + + resolverCalled := false + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "nginx88138", + Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.0-nginxolm88138", + BundleMetadata: ocv1.BundleMetadata{ + Name: "nginx88138.v1.0.0", + Version: "1.0.0", + }, + }, + RollingOut: []*controllers.RevisionMetadata{{ + RevisionName: "extension-88138-2", + Package: "nginx88138", + Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.2-nginx88138", + BundleMetadata: ocv1.BundleMetadata{ + Name: "nginx88138.v1.0.2", + Version: "1.0.2", + }, + Conditions: []metav1.Condition{{ + Type: ocv1.ClusterExtensionRevisionTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + }}, + }}, + }, + } + d.Resolver = resolve.Func(func(ctx context.Context, ext *ocv1.ClusterExtension, installedBundle *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolverCalled = true + v := bundle.VersionRelease{ + Version: bsemver.MustParse("1.0.1"), + } + return &declcfg.Bundle{ + Name: "nginx88138.v1.0.1", + Package: "nginx88138", + Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.1-nginxolm88138", + }, &v, nil, nil + }) + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("cluster-extension-test-%s", rand.String(8))} + + clusterExtension := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "nginx88138", + Version: "1.0.1", + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{ + Name: "default", + }, + }, + } + require.NoError(t, cl.Create(ctx, clusterExtension)) + + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Equal(t, ctrl.Result{}, res) + require.NoError(t, err) + + require.True(t, resolverCalled, + "resolver should be called because the rolling out revision exceeded its progression deadline") + + require.NoError(t, cl.Get(ctx, extKey, clusterExtension)) + + installedCond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, installedCond) + require.Equal(t, metav1.ConditionTrue, installedCond.Status) + require.Contains(t, installedCond.Message, "1.0.1") + + require.NoError(t, cl.DeleteAllOf(ctx, &ocv1.ClusterExtension{})) +} + +// TestClusterExtensionRollingOutNoDeadlineExceededReusesRevision verifies that when a +// revision is actively rolling out and has NOT exceeded its progression deadline, +// the controller reuses the existing revision without re-resolving from the catalog, +// regardless of whether the spec has changed. +func TestClusterExtensionRollingOutNoDeadlineExceededReusesRevision(t *testing.T) { + require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=true", features.BoxcutterRuntime))) + t.Cleanup(func() { + require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=false", features.BoxcutterRuntime))) + }) + + resolverCalled := false + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + RollingOut: []*controllers.RevisionMetadata{{ + Package: "nginx88138", + Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.2-nginx88138", + BundleMetadata: ocv1.BundleMetadata{ + Name: "nginx88138.v1.0.2", + Version: "1.0.2", + }, + }}, + }, + } + d.Resolver = resolve.Func(func(ctx context.Context, ext *ocv1.ClusterExtension, installedBundle *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolverCalled = true + v := bundle.VersionRelease{ + Version: bsemver.MustParse("1.0.1"), + } + return &declcfg.Bundle{ + Name: "nginx88138.v1.0.1", + Package: "nginx88138", + Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.1-nginxolm88138", + }, &v, nil, nil + }) + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("cluster-extension-test-%s", rand.String(8))} + + clusterExtension := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "nginx88138", + Version: "1.0.1", + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{ + Name: "default", + }, + }, + } + require.NoError(t, cl.Create(ctx, clusterExtension)) + + _, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + + require.False(t, resolverCalled, + "resolver should NOT be called: rollout is active and progression deadline has not been exceeded, "+ + "even though the spec (v1.0.1) differs from the rolling out revision (v1.0.2)") + + require.NoError(t, cl.DeleteAllOf(ctx, &ocv1.ClusterExtension{})) +} + func TestValidateClusterExtension(t *testing.T) { tests := []struct { name string diff --git a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go index 14ad71785..a00922ad4 100644 --- a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go +++ b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go @@ -140,15 +140,34 @@ func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) - // If already rolling out, use existing revision and set deprecation to Unknown (no catalog check) + // When revisions are actively rolling out, the decision to re-engage the resolver + // is driven by the progression deadline — not by spec or catalog changes. + // + // If the latest rolling-out revision has NOT exceeded its progression deadline, + // we continue with the current rollout. Any spec changes the user made will be + // picked up once the rollout completes or the progression deadline is exceeded. + // + // If the latest rolling-out revision HAS exceeded its progression deadline, + // we re-resolve from the catalog. At that point the resolver will use the + // current spec (which may have been updated by the admin) and the installed + // bundle metadata to find the next appropriate version. if len(state.revisionStates.RollingOut) > 0 { - installedBundleName := "" - if state.revisionStates.Installed != nil { - installedBundleName = state.revisionStates.Installed.Name + latestRollingOut := state.revisionStates.RollingOut[len(state.revisionStates.RollingOut)-1] + + if !hasProgressDeadlineExceeded(latestRollingOut) { + installedBundleName := "" + if state.revisionStates.Installed != nil { + installedBundleName = state.revisionStates.Installed.Name + } + SetDeprecationStatus(ext, installedBundleName, nil, false) + state.resolvedRevisionMetadata = latestRollingOut + return nil, nil } - SetDeprecationStatus(ext, installedBundleName, nil, false) - state.resolvedRevisionMetadata = state.revisionStates.RollingOut[0] - return nil, nil + + l.Info("progression deadline exceeded for rolling out revision, re-resolving bundle", + "revision", latestRollingOut.RevisionName, + "version", latestRollingOut.Version, + ) } // Resolve a new bundle from the catalog @@ -199,6 +218,14 @@ func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { } } +// hasProgressDeadlineExceeded checks whether a rolling-out revision has exceeded +// its progression deadline. This is the primary trigger for re-engaging the resolver +// during an active rollout. +func hasProgressDeadlineExceeded(rm *RevisionMetadata) bool { + cnd := apimeta.FindStatusCondition(rm.Conditions, ocv1.ClusterExtensionRevisionTypeProgressing) + return cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded +} + // handleResolutionError handles the case when bundle resolution fails. // // Decision logic (evaluated in order): diff --git a/test/e2e/features/update.feature b/test/e2e/features/update.feature index e1b4becca..54f372b64 100644 --- a/test/e2e/features/update.feature +++ b/test/e2e/features/update.feature @@ -243,3 +243,36 @@ Feature: Update ClusterExtension And ClusterExtensionRevision "${NAME}-2" reports Progressing as True with Reason RollingOut And ClusterExtensionRevision "${NAME}-2" reports Available as False with Reason ProbeFailure + @BoxcutterRuntime + @ProgressDeadline + Scenario: Progression deadline triggers re-resolution when rollout is stuck + Given min value for ClusterExtension .spec.progressDeadlineMinutes is set to 1 + And min value for ClusterExtensionRevision .spec.progressDeadlineMinutes is set to 1 + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + version: 1.0.2 + upgradeConstraintPolicy: SelfCertified + progressDeadlineMinutes: 1 + """ + And ClusterExtensionRevision "${NAME}-1" reports Available as False with Reason ProbeFailure + When ClusterExtension is updated to version "1.0.1" + And ClusterExtensionRevision "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded + Then ClusterExtension is rolled out + And ClusterExtension is available + And bundle "test-operator.1.0.1" is installed in version "1.0.1" +