From 6bcd8225cbe30bc822c3f0bd145a323f7fded9c1 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 29 May 2026 17:54:12 -0500 Subject: [PATCH 1/3] fix: Guard nil Instance controller status when counting replicas WorkloadDeployment reconciliation read instance.Status.Controller.ObservedTemplateHash to count current replicas without checking that Status.Controller (a nilable pointer) was populated. Infra providers set that field independently of the Programmed condition, so an instance could report Programmed=True with Status.Controller still nil. The dereference panicked, and because it ran before the status write, the deployment's status froze at its create-time values and the reconcile hot-looped -- surfacing as a permanently Unavailable workload with 0 ready replicas even though its instances were running. Guard the dereference so reconciliation completes and the deployment reports accurate readiness. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/workloaddeployment_controller.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/controller/workloaddeployment_controller.go b/internal/controller/workloaddeployment_controller.go index 9b17266e..def9341a 100644 --- a/internal/controller/workloaddeployment_controller.go +++ b/internal/controller/workloaddeployment_controller.go @@ -261,7 +261,10 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates( } if apimeta.IsStatusConditionTrue(instance.Status.Conditions, computev1alpha.InstanceProgrammed) { - if instance.Status.Controller.ObservedTemplateHash == templateHash { + // Status.Controller is a pointer that the infra provider may not have + // populated yet even after it sets Programmed=True. Guard the deref to + // avoid a panic that would abort the reconcile before the status write. + if instance.Status.Controller != nil && instance.Status.Controller.ObservedTemplateHash == templateHash { currentReplicas++ } } From 6583d48bd2f40b12288eaa16d973fe2e7d0cc122 Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 29 May 2026 17:54:13 -0500 Subject: [PATCH 2/3] fix: Stamp project WorkloadDeployment UID on instance projections Instance projections copied the workload-deployment-uid label verbatim from the cell-plane Instance, where it carries the cell/Karmada WorkloadDeployment UID. That value never matches the project-plane WorkloadDeployment UID, so label-selector lookups in the project cluster (e.g. the CLI CITY column) failed to resolve. Overwrite the label on the projection with the owning project-side WorkloadDeployment's UID, which the projector already resolves for the owner reference. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/controller/instance_projector.go | 11 +++++++--- .../controller/instance_projector_test.go | 22 ++++++++++++++++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/internal/controller/instance_projector.go b/internal/controller/instance_projector.go index fa0b69b6..c1f5fa1d 100644 --- a/internal/controller/instance_projector.go +++ b/internal/controller/instance_projector.go @@ -5,6 +5,7 @@ package controller import ( "context" "fmt" + "maps" "strings" "time" @@ -152,9 +153,13 @@ func (r *InstanceProjector) Reconcile(ctx context.Context, req ctrl.Request) (ct if projection.Labels == nil { projection.Labels = make(map[string]string) } - for k, v := range downstreamInstance.Labels { - projection.Labels[k] = v - } + maps.Copy(projection.Labels, downstreamInstance.Labels) + // Overwrite the WD UID label with the project-cluster WD UID. The + // downstream Instance carries the cell-plane WD UID (assigned by Karmada + // when it propagated the WD), which never matches the project WD UID. + // Consumers doing label-selector lookups by WorkloadDeploymentUIDLabel + // (e.g. CLI CITY column) must see the project-side UID. + projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel] = string(ownerWD.UID) projection.Spec = downstreamInstance.Spec diff --git a/internal/controller/instance_projector_test.go b/internal/controller/instance_projector_test.go index 7dcc8168..e77ed5f4 100644 --- a/internal/controller/instance_projector_test.go +++ b/internal/controller/instance_projector_test.go @@ -368,9 +368,17 @@ func TestInstanceProjector_Reconcile(t *testing.T) { require.NoError(t, err, "expected projection to exist in project namespace") - // Labels should be copied from the Karmada instance. + // Labels should be copied from the Karmada instance, with one + // deliberate exception: WorkloadDeploymentUIDLabel is overwritten with + // the project-cluster WD UID (not the edge/Karmada UID) so that + // label-selector lookups in the project cluster return correct results. if tt.karmadaInstance != nil { for k, v := range tt.karmadaInstance.Labels { + if k == computev1alpha.WorkloadDeploymentUIDLabel { + // Checked separately below — the projection must carry + // the project-cluster UID, not the cell/Karmada UID. + continue + } assert.Equal(t, v, projection.Labels[k], "projection label %q should match Karmada instance label", k) } @@ -383,10 +391,18 @@ func TestInstanceProjector_Reconcile(t *testing.T) { tt.karmadaInstance.Labels[computev1alpha.WorkloadUIDLabel], projection.Labels[computev1alpha.WorkloadUIDLabel], "WorkloadUIDLabel must be propagated to the projection") + // WorkloadDeploymentUIDLabel must carry the PROJECT-cluster WD UID, + // not the cell/Karmada UID. The projector overwrites this label after + // copying the rest. Verify the project UID is present and that the + // edge UID was NOT passed through. assert.Equal(t, - tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + string(projTestWDUID), projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel], - "WorkloadDeploymentUIDLabel must be propagated to the projection") + "WorkloadDeploymentUIDLabel must be the project-cluster WD UID on the projection") + assert.NotEqual(t, + string(projTestEdgeWDUID), + projection.Labels[computev1alpha.WorkloadDeploymentUIDLabel], + "WorkloadDeploymentUIDLabel must NOT be the edge/Karmada WD UID on the projection") assert.Equal(t, tt.karmadaInstance.Labels[computev1alpha.WorkloadDeploymentNameLabel], projection.Labels[computev1alpha.WorkloadDeploymentNameLabel], From d25bdf3e12c8de94f6fd694bc6ed0b8d12c9d03e Mon Sep 17 00:00:00 2001 From: Scot Wells Date: Fri, 29 May 2026 17:54:13 -0500 Subject: [PATCH 3/3] feat: Sync WorkloadDeployment status to the project plane Federated WorkloadDeployment status was aggregated onto the Karmada hub object but never propagated back to the project-namespace deployment that users and the Workload controller read. The federator only watched the project-side object, so its downstream status sync never fired reactively, leaving the project deployment's status empty and the parent Workload stuck at 0/0 replicas. Add a WorkloadDeploymentStatusSyncer on the management plane that watches Karmada WorkloadDeployments and writes their aggregated status back to the originating project deployment, mirroring how the InstanceProjector resolves the project cluster and namespace. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/main.go | 11 ++ .../workloaddeployment_federator.go | 7 + .../workloaddeployment_status_syncer.go | 158 ++++++++++++++++++ 3 files changed, 176 insertions(+) create mode 100644 internal/controller/workloaddeployment_status_syncer.go diff --git a/cmd/main.go b/cmd/main.go index 01d3eddd..60e8a564 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -489,6 +489,17 @@ func setupManagementControllers(mgr mcmanager.Manager, federationClient client.C return nil, fmt.Errorf("InstanceProjector: %w", err) } + // WorkloadDeploymentStatusSyncer watches WDs on the Karmada hub and + // reactively syncs their aggregated status back to the project cluster. + // This complements WorkloadDeploymentFederator.syncStatusFromDownstream, + // which only fires when the project-side WD changes (spec updates/restarts). + if err = (&controller.WorkloadDeploymentStatusSyncer{ + FederationClient: federationClient, + MCManager: mgr, + }).SetupWithManager(federationMgr); err != nil { + return nil, fmt.Errorf("WorkloadDeploymentStatusSyncer: %w", err) + } + return []manager.Runnable{federationMgr}, nil } diff --git a/internal/controller/workloaddeployment_federator.go b/internal/controller/workloaddeployment_federator.go index 9c736cf0..adb799c7 100644 --- a/internal/controller/workloaddeployment_federator.go +++ b/internal/controller/workloaddeployment_federator.go @@ -318,6 +318,13 @@ func (r *WorkloadDeploymentFederator) ensurePropagationPolicy( // syncStatusFromDownstream reads the aggregated status of the WorkloadDeployment // from the downstream namespace and writes it back to the project-namespace // object. It is a no-op when the downstream object does not yet exist. +// +// Belt-and-suspenders: WorkloadDeploymentStatusSyncer (registered with +// federationMgr) is the primary reactive path — it fires whenever Karmada +// updates the hub WD status. This call ensures status is also pulled during +// spec-change reconciles (e.g. manager restarts, propagation re-syncs). +// Removal of this call is deferred to a separate cleanup PR once the syncer +// is proven in production. func (r *WorkloadDeploymentFederator) syncStatusFromDownstream( ctx context.Context, projectClient client.Client, diff --git a/internal/controller/workloaddeployment_status_syncer.go b/internal/controller/workloaddeployment_status_syncer.go new file mode 100644 index 00000000..9ba52c3d --- /dev/null +++ b/internal/controller/workloaddeployment_status_syncer.go @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package controller + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager" + "sigs.k8s.io/multicluster-runtime/pkg/multicluster" + + computev1alpha "go.datum.net/compute/api/v1alpha" + "go.miloapis.com/milo/pkg/downstreamclient" +) + +// WorkloadDeploymentStatusSyncer watches WorkloadDeployments on the Karmada +// federation hub and syncs their aggregated status back to the corresponding +// project-cluster WorkloadDeployment. +// +// It is registered with the federationMgr (a standard manager.Manager pointed +// at the Karmada control plane) so it reacts to status changes written by +// Karmada's statusAggregation interpreter. This complements the existing +// syncStatusFromDownstream call in WorkloadDeploymentFederator, which only runs +// when the project-side WD changes (spec updates, restarts). The syncer makes +// status propagation reactive: status flows to the project cluster as soon as +// Karmada aggregates it from the cell. +// +// Namespace resolution: the Karmada WD lives in ns-. +// That namespace is labeled by WorkloadDeploymentFederator.ensureDownstreamNamespace +// with UpstreamOwnerClusterNameLabel and UpstreamOwnerNamespaceLabel. The syncer +// reads those labels to locate the project cluster and namespace, then looks up +// the project WD by name (which is stable across all planes). +type WorkloadDeploymentStatusSyncer struct { + // FederationClient reads WorkloadDeployments from the Karmada federation + // control plane. + FederationClient client.Client + + // MCManager provides project-cluster clients via GetCluster. + MCManager mcmanager.Manager +} + +// +kubebuilder:rbac:groups=compute.datumapis.com,resources=workloaddeployments/status,verbs=get;update;patch + +func (r *WorkloadDeploymentStatusSyncer) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("workloaddeployment", req.NamespacedName) + + // 1. Fetch the WorkloadDeployment from the Karmada federation hub. + var karmadaWD computev1alpha.WorkloadDeployment + if err := r.FederationClient.Get(ctx, req.NamespacedName, &karmadaWD); err != nil { + if apierrors.IsNotFound(err) { + // Deleted from the hub — no projection to update. + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting Karmada WorkloadDeployment: %w", err) + } + + // 2. Read the Karmada namespace to resolve the upstream cluster name. + // WorkloadDeploymentFederator.ensureDownstreamNamespace stamps the namespace + // with UpstreamOwnerClusterNameLabel ("cluster-") and + // UpstreamOwnerNamespaceLabel (project namespace name). + var ns corev1.Namespace + if err := r.FederationClient.Get(ctx, client.ObjectKey{Name: req.Namespace}, &ns); err != nil { + if apierrors.IsNotFound(err) { + // Namespace not yet labeled or removed — not an error; skip quietly. + logger.Info("Karmada namespace not found, skipping status sync", "namespace", req.Namespace) + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting Karmada namespace %q: %w", req.Namespace, err) + } + + encodedClusterName, ok := ns.Labels[downstreamclient.UpstreamOwnerClusterNameLabel] + if !ok { + // Namespace not yet stamped by the federator (e.g. WD predates the label). + logger.Info("Karmada namespace missing upstream-cluster-name label, skipping status sync", + "namespace", req.Namespace) + return ctrl.Result{}, nil + } + + // 3. Decode the cluster name ("cluster-" with "/" encoded as "_"). + clusterName := strings.TrimPrefix(encodedClusterName, "cluster-") + clusterName = strings.ReplaceAll(clusterName, "_", "/") + + // 4. Resolve the project namespace from the WD's own label (stamped by + // upsertDownstreamDeployment). Fall back to the namespace label if absent. + targetNamespace := karmadaWD.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + if targetNamespace == "" { + targetNamespace = ns.Labels[downstreamclient.UpstreamOwnerNamespaceLabel] + } + if targetNamespace == "" { + logger.Info("cannot resolve project namespace for Karmada WorkloadDeployment, skipping status sync", + "namespace", req.Namespace, "name", req.Name) + return ctrl.Result{}, nil + } + + // 5. Obtain the project cluster client. + projectCluster, err := r.MCManager.GetCluster(ctx, multicluster.ClusterName(clusterName)) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed getting project cluster %q: %w", clusterName, err) + } + projectClient := projectCluster.GetClient() + + // 6. Fetch the project-cluster WD by name. The WD name is stable across all + // planes (project, Karmada, cell) and is the correct cross-plane identifier. + var projectWD computev1alpha.WorkloadDeployment + if err := projectClient.Get(ctx, client.ObjectKey{ + Namespace: targetNamespace, + Name: req.Name, + }, &projectWD); err != nil { + if apierrors.IsNotFound(err) { + // Project WD may not exist yet (ordering race) or may have been + // deleted. Not an error — the federator will handle cleanup. + logger.Info("project WorkloadDeployment not found, skipping status sync", + "cluster", clusterName, "namespace", targetNamespace, "name", req.Name) + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("failed getting project WorkloadDeployment %s/%s in cluster %q: %w", + targetNamespace, req.Name, clusterName, err) + } + + // 7. No-op when status is already equal — avoid spurious writes. + if equality.Semantic.DeepEqual(projectWD.Status, karmadaWD.Status) { + return ctrl.Result{}, nil + } + + // 8. Write the Karmada-aggregated status to the project WD. + // Use Status().Update() rather than Patch() so that zero-value int32 fields + // (currentReplicas, readyReplicas) are always included in the request body. + // MergeFrom omits unchanged zero-value fields, which would silently drop + // required status sub-fields on the project side. + projectWD.Status = karmadaWD.Status + if err := projectClient.Status().Update(ctx, &projectWD); err != nil { + return ctrl.Result{}, fmt.Errorf("failed updating project WorkloadDeployment status: %w", err) + } + + logger.Info("synced WorkloadDeployment status from Karmada hub to project cluster", + "cluster", clusterName, "namespace", targetNamespace) + + return ctrl.Result{}, nil +} + +// SetupWithManager registers the WorkloadDeploymentStatusSyncer with +// federationMgr, a standard manager.Manager pointed at the Karmada federation +// control plane. It watches WorkloadDeployments on the hub and reacts when their +// status is updated by Karmada's statusAggregation interpreter. +func (r *WorkloadDeploymentStatusSyncer) SetupWithManager(federationMgr manager.Manager) error { + return ctrl.NewControllerManagedBy(federationMgr). + For(&computev1alpha.WorkloadDeployment{}). + Named("workload-deployment-status-syncer"). + Complete(r) +}