From 7896384b1ae5b17496ccd989b36d6a4e14a405e7 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:45:16 +0400 Subject: [PATCH] feat(controller): wire client-go leader-election so HA scaling is safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today the serviceoffer-controller is pinned at replicas: 1 with a "Do not scale" comment in x402.yaml. The RBAC for leases is already granted (x402.yaml:176-178) — pre-positioned and unused. An accidental `kubectl scale --replicas=2` or HPA misconfiguration produces split-brain finalizers and double on-chain ERC-8004 registration (real gas spend + duplicate registry entries). This wires client-go tools/leaderelection so multi-replica deployment is safe-by-correctness, not safe-by-comment. - cmd/serviceoffer-controller/main.go: - Read POD_NAME / POD_NAMESPACE from downward API env. - Acquire Lease "serviceoffer-controller" in POD_NAMESPACE before running the reconcile loop. - On lost leadership, os.Exit(1) — kubelet restarts the pod which re-elects from scratch. - --leader-elect flag (default true) so local dev can bypass. - x402.yaml: - Add downward-API POD_NAME env to the controller Deployment (POD_NAMESPACE was already wired). - Update the "Do not scale" comment to "Single replica by default; bumping to 2+ is now safe — leader election prevents split-brain on the reconcile loop." - Lease parameters chosen for fast failover on k3d (lease=30s, renew=20s, retry=5s). Tunable via flag if a multi-zone deployment ever needs longer. Uses client-go directly rather than controller-runtime Manager to minimize churn — controller is currently raw client-go workqueues, not controller-runtime. Migration to controller-runtime is a separate much larger workstream and not necessary just for leader election. --- cmd/serviceoffer-controller/main.go | 78 +++++++++++++++++- cmd/serviceoffer-controller/main_test.go | 81 +++++++++++++++++++ .../infrastructure/base/templates/x402.yaml | 9 ++- 3 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 cmd/serviceoffer-controller/main_test.go diff --git a/cmd/serviceoffer-controller/main.go b/cmd/serviceoffer-controller/main.go index 8fc01a42..6e81cd65 100644 --- a/cmd/serviceoffer-controller/main.go +++ b/cmd/serviceoffer-controller/main.go @@ -7,15 +7,27 @@ import ( "os" "os/signal" "syscall" + "time" "github.com/ObolNetwork/obol-stack/internal/serviceoffercontroller" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" +) + +const ( + defaultLockNamespace = "x402" + leaseName = "serviceoffer-controller" + leaseDuration = 30 * time.Second + renewDeadline = 20 * time.Second + retryPeriod = 5 * time.Second ) func main() { kubeconfig := flag.String("kubeconfig", "", "Path to kubeconfig for out-of-cluster runs") workers := flag.Int("workers", 1, "Number of reconcile workers") + leaderElect := flag.Bool("leader-elect", true, "Acquire a Lease before running the reconcile loop (disable for local dev)") flag.Parse() cfg, err := loadConfig(*kubeconfig) @@ -31,9 +43,71 @@ func main() { ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer cancel() - if err := controller.Run(ctx, *workers); err != nil { - log.Fatalf("run controller: %v", err) + if !*leaderElect { + if err := controller.Run(ctx, *workers); err != nil { + log.Fatalf("run controller: %v", err) + } + return + } + + runWithLeaderElection(ctx, cfg, controller, *workers) +} + +func runWithLeaderElection(ctx context.Context, cfg *rest.Config, controller *serviceoffercontroller.Controller, workers int) { + podName := os.Getenv("POD_NAME") + if podName == "" { + // Fall back so local dev (go run ./cmd/serviceoffer-controller --leader-elect=false) + // still works if someone forgets the flag. Identity must be unique across + // candidates — in real deployments the downward API supplies the pod name. + podName = "serviceoffer-controller-local" } + + lockNamespace := os.Getenv("POD_NAMESPACE") + if lockNamespace == "" { + lockNamespace = defaultLockNamespace + } + + lock, err := resourcelock.NewFromKubeconfig( + resourcelock.LeasesResourceLock, + lockNamespace, + leaseName, + resourcelock.ResourceLockConfig{ + Identity: podName, + }, + cfg, + renewDeadline, + ) + if err != nil { + log.Fatalf("create lease lock: %v", err) + } + + leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: leaseDuration, + RenewDeadline: renewDeadline, + RetryPeriod: retryPeriod, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(ctx context.Context) { + log.Printf("serviceoffer-controller: became leader %s", podName) + if err := controller.Run(ctx, workers); err != nil { + log.Printf("controller run: %v", err) + } + }, + OnStoppedLeading: func() { + // On lost leadership exit non-zero so the kubelet restarts the + // pod and the next election starts from a clean state. Trying + // to keep running without the lease would race the new leader. + log.Printf("serviceoffer-controller: lost leadership %s", podName) + os.Exit(1) + }, + OnNewLeader: func(identity string) { + if identity != podName { + log.Printf("serviceoffer-controller: new leader is %s", identity) + } + }, + }, + }) } func loadConfig(kubeconfig string) (*rest.Config, error) { diff --git a/cmd/serviceoffer-controller/main_test.go b/cmd/serviceoffer-controller/main_test.go new file mode 100644 index 00000000..addb8856 --- /dev/null +++ b/cmd/serviceoffer-controller/main_test.go @@ -0,0 +1,81 @@ +package main + +import ( + "os" + "path/filepath" + "testing" +) + +// TestLoadConfig_FromKubeconfigFile asserts loadConfig parses an explicit +// kubeconfig path. This is the local-dev codepath used when --leader-elect=false. +func TestLoadConfig_FromKubeconfigFile(t *testing.T) { + dir := t.TempDir() + kc := filepath.Join(dir, "kubeconfig") + if err := os.WriteFile(kc, []byte(minimalKubeconfig), 0o600); err != nil { + t.Fatalf("write kubeconfig: %v", err) + } + + cfg, err := loadConfig(kc) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.Host != "https://example.invalid:6443" { + t.Fatalf("unexpected host: %q", cfg.Host) + } +} + +// TestLoadConfig_FromKubeconfigEnv mirrors the path used when KUBECONFIG is set +// (e.g. obol kubectl/helm passthrough during local dev). +func TestLoadConfig_FromKubeconfigEnv(t *testing.T) { + dir := t.TempDir() + kc := filepath.Join(dir, "kubeconfig") + if err := os.WriteFile(kc, []byte(minimalKubeconfig), 0o600); err != nil { + t.Fatalf("write kubeconfig: %v", err) + } + + t.Setenv("KUBECONFIG", kc) + cfg, err := loadConfig("") + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.Host != "https://example.invalid:6443" { + t.Fatalf("unexpected host: %q", cfg.Host) + } +} + +// TestLeaderElectionDefaults locks in the lease parameters chosen for fast +// failover on single-node k3d. If you tune these for a multi-zone deployment, +// update this test and the PR-description rationale. +func TestLeaderElectionDefaults(t *testing.T) { + if leaseDuration <= renewDeadline { + t.Fatalf("leaseDuration (%s) must exceed renewDeadline (%s)", leaseDuration, renewDeadline) + } + if renewDeadline <= retryPeriod { + t.Fatalf("renewDeadline (%s) must exceed retryPeriod (%s)", renewDeadline, retryPeriod) + } + if leaseName != "serviceoffer-controller" { + t.Fatalf("leaseName drifted from RBAC + Deployment expectation: %q", leaseName) + } + if defaultLockNamespace != "x402" { + t.Fatalf("defaultLockNamespace drifted from infrastructure manifest: %q", defaultLockNamespace) + } +} + +const minimalKubeconfig = `apiVersion: v1 +kind: Config +clusters: +- name: test + cluster: + server: https://example.invalid:6443 + insecure-skip-tls-verify: true +contexts: +- name: test + context: + cluster: test + user: test +current-context: test +users: +- name: test + user: + token: test-token +` diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..56d75737 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -271,7 +271,10 @@ metadata: labels: app: serviceoffer-controller spec: - replicas: 1 # Do not scale — multiple replicas race on ERC-8004 on-chain registration + # Single replica by default; bumping to 2+ is now safe — leader election + # (client-go Lease in this namespace) prevents split-brain on the reconcile + # loop and the resulting double on-chain ERC-8004 registration. + replicas: 1 selector: matchLabels: app: serviceoffer-controller @@ -286,6 +289,10 @@ spec: image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e imagePullPolicy: IfNotPresent env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: