Emit Kubernetes Warning events on the ClusterSummary from ClusterSummaryReconciler.

controllers/clustersummary_controller.go: proceedDeployingClusterSummary records a
"Conflict" event when r.deploy returns a deployer.ConflictError whose message is not
already among the FailureMessages snapshotted (via the new collectFailureMessages
helper) before the deploy call.

controllers/clustersummary_deployer.go: handleDeployerError records a
"FailedNonRetriable" event for NonRetriableError, for TemplateInstantiationError, and
on the branch that reports the maximum number of consecutive errors was reached.

diff --git a/controllers/clustersummary_controller.go b/controllers/clustersummary_controller.go
index eba726b9..6b24f6b6 100644
--- a/controllers/clustersummary_controller.go
+++ b/controllers/clustersummary_controller.go
@@ -531,12 +531,23 @@ func (r *ClusterSummaryReconciler) prepareForDeployment(ctx context.Context,
 func (r *ClusterSummaryReconciler) proceedDeployingClusterSummary(ctx context.Context,
 	clusterSummaryScope *scope.ClusterSummaryScope, logger logr.Logger) (reconcile.Result, error) {
 
+	// Snapshot existing failure messages before deploying so we can detect
+	// new conflicts vs. ongoing ones and avoid re-raising an event every retry.
+	preDeployFailures := collectFailureMessages(clusterSummaryScope.ClusterSummary)
+
 	err := r.deploy(ctx, clusterSummaryScope, logger)
 	if err != nil {
 		var conflictErr *deployer.ConflictError
 		ok := errors.As(err, &conflictErr)
 		if ok {
 			logger.V(logs.LogInfo).Error(err, "failed to deploy because of conflict")
+			if _, alreadyKnown := preDeployFailures[conflictErr.Error()]; !alreadyKnown {
+				clusterSummary := clusterSummaryScope.ClusterSummary
+				r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "Conflict",
+					configv1beta1.ClusterSummaryKind, "Conflict detected for cluster %s %s/%s: %s",
+					clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
+					clusterSummary.Spec.ClusterName, conflictErr.Error())
+			}
 			r.setNextReconcileTime(clusterSummaryScope, r.ConflictRetryTime)
 			return reconcile.Result{Requeue: true, RequeueAfter: r.ConflictRetryTime}, nil
 		}
@@ -2095,3 +2106,16 @@ func getClusterSummaryWithInstantiatedCharts(ctx context.Context, cs *configv1be
 
 	return &csCopy, nil
 }
+
+// collectFailureMessages returns the set of FailureMessages currently recorded
+// across all FeatureSummaries. Used to detect whether a conflict is new (not
+// yet in the status) versus ongoing (already reported in a previous reconcile).
+func collectFailureMessages(cs *configv1beta1.ClusterSummary) map[string]struct{} {
+	msgs := make(map[string]struct{}, len(cs.Status.FeatureSummaries))
+	for i := range cs.Status.FeatureSummaries {
+		if msg := cs.Status.FeatureSummaries[i].FailureMessage; msg != nil {
+			msgs[*msg] = struct{}{}
+		}
+	}
+	return msgs
+}
diff --git a/controllers/clustersummary_deployer.go b/controllers/clustersummary_deployer.go
index 0b1eb90f..f6edfc2d 100644
--- a/controllers/clustersummary_deployer.go
+++ b/controllers/clustersummary_deployer.go
@@ -238,17 +238,29 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context,
 func (r *ClusterSummaryReconciler) handleDeployerError(deployerError error, clusterSummaryScope *scope.ClusterSummaryScope,
 	f feature, currentHash []byte, logger logr.Logger) (bool, error) {
 
+	clusterSummary := clusterSummaryScope.ClusterSummary
+
 	// Check if error is a NonRetriableError type
 	var nonRetriableError *configv1beta1.NonRetriableError
 	if errors.As(deployerError, &nonRetriableError) {
 		nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
 		r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
+		r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "FailedNonRetriable",
+			configv1beta1.ClusterSummaryKind,
+			"Feature %s for cluster %s %s/%s failed with non-retriable error: %s",
+			f.id, clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
+			clusterSummary.Spec.ClusterName, deployerError.Error())
 		return true, nil
 	}
 	var templateError *configv1beta1.TemplateInstantiationError
 	if errors.As(deployerError, &templateError) {
 		nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
 		r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
+		r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "FailedNonRetriable",
+			configv1beta1.ClusterSummaryKind,
+			"Feature %s for cluster %s %s/%s failed to instantiate template: %s",
+			f.id, clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
+			clusterSummary.Spec.ClusterName, deployerError.Error())
 		return true, nil
 	}
 	var healthCheckError *clusterops.HealthCheckError
@@ -261,6 +273,11 @@ func (r *ClusterSummaryReconciler) handleDeployerError(deployerError error, clus
 		nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
 		resultError := errors.New("the maximum number of consecutive errors has been reached")
 		r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
+		r.eventRecorder.Eventf(clusterSummary, nil, corev1.EventTypeWarning, "FailedNonRetriable",
+			configv1beta1.ClusterSummaryKind,
+			"Feature %s for cluster %s %s/%s will no longer be retried: maximum consecutive failures reached",
+			f.id, clusterSummary.Spec.ClusterType, clusterSummary.Spec.ClusterNamespace,
+			clusterSummary.Spec.ClusterName)
 		return true, nil
 	}
 