From c1c79d1dd8c389f1916ccc51951ef4a6084f2f5f Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 14:14:17 +0400 Subject: [PATCH 1/2] fix(stack): tolerate transient helm repo update failures, keep cluster running MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit helm aborts the WHOLE `helm repo update` if any single registered repo fails (e.g. kubernetes-dashboard started returning 404 in 2025). That cascaded: helmfile sync propagated exit 1, syncDefaults called Down() as "cleanup", and `obol stack up` ended up running `k3d cluster delete` — destroying PVCs, agent wallets, and registered services for a transient external-repo flake. Two layers of defense: A. Tolerant repo-update preflight (helmcmd.UpdateRepos): run `helm repo update --fail-on-repo-update-fail=false ` before helmfile, then pass --skip-deps so helmfile doesn't re-run its own strict update. Only the repos our embedded helmfile.yaml declares are touched; unrelated repos in the user's global helm config are ignored. B. Don't stop the cluster on helmfile failure: remove the Down() call from the error branch in syncDefaults. A failed sync is almost always recoverable in place; tearing down the cluster destroys unrelated state. We surface inspect/retry hints instead. Tests: - helmcmd: ParseHelmfileRepos / UpdateRepos tolerant-flag wiring (fake helm). - stack: preflightHelmRepos success + failure paths against a fake helm that exits 1 on `repo update` (mirrors the real kubernetes-dashboard incident). - stack: source-level regression guard against re-introducing Down(cfg, u) in syncDefaults. Confirmed the guard fires against the pre-fix code. --- internal/helmcmd/helmcmd.go | 107 ++++++++++++++++++ internal/helmcmd/helmcmd_test.go | 163 +++++++++++++++++++++++++++- internal/stack/backend_k3d.go | 8 +- internal/stack/stack.go | 97 +++++++++++++++-- internal/stack/stack_test.go | 180 +++++++++++++++++++++++++++++++ 5 files changed, 542 insertions(+), 13 deletions(-) diff --git a/internal/helmcmd/helmcmd.go b/internal/helmcmd/helmcmd.go index dd91cbb8..81c893d5 100644 --- a/internal/helmcmd/helmcmd.go +++ b/internal/helmcmd/helmcmd.go @@ -11,10 +11,13 @@ package helmcmd import ( "fmt" + "os" "os/exec" "regexp" "strconv" "strings" + + "gopkg.in/yaml.v3" ) // versionRE matches the major number in `helm version --short` output, e.g. @@ -63,3 +66,107 @@ func SyncFlagsForVersion(helmBinary string) []string { } return []string{"--sync-args=--force-conflicts"} } + +// helmfileRepo mirrors the shape of each entry under the top-level +// `repositories:` key in a helmfile.yaml. Only the fields we need are decoded; +// extra keys (oci, username, passwordRef, ...) are ignored. +type helmfileRepo struct { + Name string `yaml:"name"` + URL string `yaml:"url"` +} + +// helmfileDoc is the minimal shape of helmfile.yaml we care about for the +// repo-update preflight: just the `repositories:` block. +type helmfileDoc struct { + Repositories []helmfileRepo `yaml:"repositories"` +} + +// ParseHelmfileRepos extracts (name, url) entries from a helmfile.yaml file. +// Repos without both a name and a URL (e.g. OCI-only refs) are skipped — they +// are not added via `helm repo add` so they don't participate in the +// `helm repo update` path that this package guards against. +func ParseHelmfileRepos(helmfilePath string) ([]helmfileRepo, error) { + data, err := os.ReadFile(helmfilePath) + if err != nil { + return nil, fmt.Errorf("read helmfile %s: %w", helmfilePath, err) + } + return parseHelmfileReposBytes(data) +} + +func parseHelmfileReposBytes(data []byte) ([]helmfileRepo, error) { + var doc helmfileDoc + if err := yaml.Unmarshal(data, &doc); err != nil { + return nil, fmt.Errorf("parse helmfile yaml: %w", err) + } + + out := make([]helmfileRepo, 0, len(doc.Repositories)) + for _, r := range doc.Repositories { + if strings.TrimSpace(r.Name) == "" || strings.TrimSpace(r.URL) == "" { + continue + } + out = append(out, r) + } + return out, nil +} + +// ManagedRepoNames returns just the repo names from a helmfile.yaml. These are +// the only repos this stack is responsible for keeping up to date; everything +// else in the user's global `helm repo list` belongs to other tools. +func ManagedRepoNames(helmfilePath string) ([]string, error) { + repos, err := ParseHelmfileRepos(helmfilePath) + if err != nil { + return nil, err + } + names := make([]string, 0, len(repos)) + for _, r := range repos { + names = append(names, r.Name) + } + return names, nil +} + +// EnsureRepos registers each (name, url) pair via `helm repo add --force-update` +// so that a fresh host without `helm repo add` for our managed repos still gets +// them registered before we ask helm to update them by name. Best-effort: +// failures are returned for visibility but should not be treated as fatal by +// callers (the subsequent `helm repo update` will surface real problems). +func EnsureRepos(helmBinary string, repos []helmfileRepo) error { + var firstErr error + for _, r := range repos { + cmd := exec.Command(helmBinary, "repo", "add", "--force-update", r.Name, r.URL) + if out, err := cmd.CombinedOutput(); err != nil { + if firstErr == nil { + firstErr = fmt.Errorf("helm repo add %s %s: %w (%s)", r.Name, r.URL, err, strings.TrimSpace(string(out))) + } + } + } + return firstErr +} + +// UpdateRepos runs `helm repo update ` and, when the helm version +// supports it, passes --fail-on-repo-update-fail=false so that a single dead +// repo (e.g. a tertiary repository that started serving 404) doesn't abort the +// whole update. +// +// Behaviour: +// - helm 3.14+ (where --fail-on-repo-update-fail exists): the flag is passed +// and the returned error is nil even if individual repos in `names` fail. +// - older helm: the flag is omitted and the error surfaces normally. +// +// The targeted form (`helm repo update `) is important: it limits the +// update to repos this stack actually needs, so unrelated dead repos in the +// user's global helm config can't break us even on helm versions that lack the +// tolerant flag. +func UpdateRepos(helmBinary string, names []string) ([]byte, error) { + if len(names) == 0 { + return nil, nil + } + args := []string{"repo", "update"} + if major, err := MajorVersion(helmBinary); err == nil && major >= 3 { + args = append(args, "--fail-on-repo-update-fail=false") + } + args = append(args, names...) + + cmd := exec.Command(helmBinary, args...) + out, err := cmd.CombinedOutput() + return out, err +} diff --git a/internal/helmcmd/helmcmd_test.go b/internal/helmcmd/helmcmd_test.go index 4e355240..5fb14740 100644 --- a/internal/helmcmd/helmcmd_test.go +++ b/internal/helmcmd/helmcmd_test.go @@ -1,6 +1,11 @@ package helmcmd -import "testing" +import ( + "os" + "path/filepath" + "reflect" + "testing" +) func TestParseMajor(t *testing.T) { cases := []struct { @@ -31,3 +36,159 @@ func TestParseMajor(t *testing.T) { } } } + +// TestParseHelmfileReposBytes is the table-driven happy/edge path for the +// helmfile repo extractor that the tolerant-update preflight depends on. +func TestParseHelmfileReposBytes(t *testing.T) { + cases := []struct { + name string + in string + want []helmfileRepo + }{ + { + name: "minimal valid helmfile with multiple repos", + in: ` +repositories: + - name: traefik + url: https://traefik.github.io/charts + - name: prometheus-community + url: https://prometheus-community.github.io/helm-charts +releases: + - name: ignored + chart: ./ignored +`, + want: []helmfileRepo{ + {Name: "traefik", URL: "https://traefik.github.io/charts"}, + {Name: "prometheus-community", URL: "https://prometheus-community.github.io/helm-charts"}, + }, + }, + { + name: "entries missing name or url are skipped (OCI-only refs etc.)", + in: ` +repositories: + - name: traefik + url: https://traefik.github.io/charts + - name: oci-only + - url: https://nameless.example/charts +`, + want: []helmfileRepo{ + {Name: "traefik", URL: "https://traefik.github.io/charts"}, + }, + }, + { + name: "no repositories key returns empty slice (not nil-failure)", + in: ` +releases: [] +`, + want: []helmfileRepo{}, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := parseHelmfileReposBytes([]byte(tc.in)) + if err != nil { + t.Fatalf("parseHelmfileReposBytes: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("repos = %#v, want %#v", got, tc.want) + } + }) + } +} + +func TestParseHelmfileRepos_FromFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "helmfile.yaml") + body := ` +repositories: + - name: obol + url: https://obolnetwork.github.io/helm-charts/ +` + if err := os.WriteFile(path, []byte(body), 0o600); err != nil { + t.Fatalf("write helmfile: %v", err) + } + + repos, err := ParseHelmfileRepos(path) + if err != nil { + t.Fatalf("ParseHelmfileRepos: %v", err) + } + if len(repos) != 1 || repos[0].Name != "obol" { + t.Fatalf("unexpected repos: %#v", repos) + } + + names, err := ManagedRepoNames(path) + if err != nil { + t.Fatalf("ManagedRepoNames: %v", err) + } + if !reflect.DeepEqual(names, []string{"obol"}) { + t.Fatalf("names = %#v, want [obol]", names) + } +} + +// TestUpdateRepos_NoNamesIsNoop ensures we don't shell out for an empty repo +// list — the preflight should silently do nothing rather than running +// `helm repo update` with no args (which would update every globally +// registered repo, defeating the whole point of the fix). +func TestUpdateRepos_NoNamesIsNoop(t *testing.T) { + out, err := UpdateRepos("/nonexistent/helm", nil) + if err != nil { + t.Fatalf("UpdateRepos(nil) returned err: %v", err) + } + if out != nil { + t.Fatalf("UpdateRepos(nil) returned output: %q", out) + } +} + +// TestUpdateRepos_TolerantArgsConstructed verifies that the tolerant flag is +// passed when invoking against a real helm-3-style version response. We use a +// fake helm binary written into the temp dir so the test stays hermetic. The +// fake records its argv to a sentinel file so we can assert on it. +func TestUpdateRepos_TolerantArgsConstructed(t *testing.T) { + if os.Getenv("GOOS") == "windows" { + t.Skip("shell-script fake binary not supported on windows") + } + + dir := t.TempDir() + helm := filepath.Join(dir, "helm") + argLog := filepath.Join(dir, "args.log") + + // Fake helm: + // - `version --short` → "v3.20.1\n" + // - any other args → append to args.log and exit 0 + script := `#!/bin/sh +if [ "$1" = "version" ] && [ "$2" = "--short" ]; then + echo "v3.20.1" + exit 0 +fi +echo "$@" >> "` + argLog + `" +exit 0 +` + if err := os.WriteFile(helm, []byte(script), 0o755); err != nil { //nolint:gosec // test binary + t.Fatalf("write fake helm: %v", err) + } + + if _, err := UpdateRepos(helm, []string{"traefik", "obol"}); err != nil { + t.Fatalf("UpdateRepos: %v", err) + } + + logged, err := os.ReadFile(argLog) + if err != nil { + t.Fatalf("read args log: %v", err) + } + got := string(logged) + for _, want := range []string{"repo", "update", "--fail-on-repo-update-fail=false", "traefik", "obol"} { + if !contains(got, want) { + t.Fatalf("expected %q in fake helm argv, got: %s", want, got) + } + } +} + +func contains(haystack, needle string) bool { + for i := 0; i+len(needle) <= len(haystack); i++ { + if haystack[i:i+len(needle)] == needle { + return true + } + } + return false +} diff --git a/internal/stack/backend_k3d.go b/internal/stack/backend_k3d.go index f2c5c10e..9a8baa70 100644 --- a/internal/stack/backend_k3d.go +++ b/internal/stack/backend_k3d.go @@ -107,10 +107,10 @@ func (b *K3dBackend) Up(cfg *config.Config, u *ui.UI, stackID string) ([]byte, e // existing-cluster and fresh-create branches. `k3d cluster start` does // not auto-restart standalone registry containers attached via // `--registry-use` at create time — it only starts the cluster's own - // nodes. Without this call, every retry after a `cluster stop` (or after - // the failure-recovery Down() call in syncDefaults) falls back to direct - // upstream pulls and re-fetches every image, costing minutes per - // attempt. + // nodes. Without this call, every retry after a `cluster stop` (or any + // other path that left the registry containers in an `Exited` state) + // falls back to direct upstream pulls and re-fetches every image, + // costing minutes per attempt. if os.Getenv("OBOL_DEVELOPMENT") == "true" { setup, setupErr := ensureDevRegistries(cfg, u) if setupErr != nil { diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 0f4b111f..bfe5a783 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -364,8 +364,25 @@ func GetStackID(cfg *config.Config) string { return getStackID(cfg) } -// syncDefaults deploys the default infrastructure using helmfile -// If deployment fails, the cluster is automatically stopped via Down() +// syncDefaults deploys the default infrastructure using helmfile. +// +// On helmfile failure we deliberately leave the cluster running. Historically +// this code path called Down() to "clean up", which destroyed all running +// state (PVCs, agent wallets, registered services) any time helmfile sync +// failed for a transient reason — most painfully when a third-party helm +// repo that the user happens to have registered globally (e.g. +// kubernetes-dashboard, which started returning 404 in mid-2025) failed +// `helm repo update`. The cascading failure went: +// +// helm repo update -> exit 1 +// helmfile sync -> exit 1 +// obol stack up -> k3d cluster delete (oops) +// +// A failed helmfile sync should leave the cluster intact so the user can +// inspect, fix, and rerun without losing unrelated state. The tolerant +// repo-update preflight below (helmcmd.UpdateRepos with +// --fail-on-repo-update-fail=false, plus --skip-deps on the sync) also makes +// the dead-tertiary-repo case stop hurting users in the first place. func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir string) error { defaultsHelmfilePath := filepath.Join(cfg.ConfigDir, "defaults") helmfilePath := filepath.Join(defaultsHelmfilePath, "helmfile.yaml") @@ -380,6 +397,15 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s u.Warnf("Failed to migrate defaults helmfile hostnames: %v", err) } + helmBinary := filepath.Join(cfg.BinDir, "helm") + + // Pre-update only the repos our helmfile depends on, tolerating per-repo + // failures so an unrelated dead repo in the user's global helm config + // (e.g. kubernetes-dashboard returning 404) can't abort our sync. Then + // pass --skip-deps to helmfile so it doesn't run its own strict + // `helm repo update` against every globally-registered repo. + skipDeps := preflightHelmRepos(cfg, u, helmBinary, helmfilePath) + helmfileArgs := []string{ "--file", helmfilePath, "--kubeconfig", kubeconfigPath, @@ -396,7 +422,10 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s u.Dim("Active quick tunnel detected — skipping cloudflared chart to preserve the URL") } helmfileArgs = append(helmfileArgs, "sync") - helmfileArgs = append(helmfileArgs, helmcmd.SyncFlagsForVersion(filepath.Join(cfg.BinDir, "helm"))...) + if skipDeps { + helmfileArgs = append(helmfileArgs, "--skip-deps") + } + helmfileArgs = append(helmfileArgs, helmcmd.SyncFlagsForVersion(helmBinary)...) helmfileCmd := exec.Command(filepath.Join(cfg.BinDir, "helmfile"), helmfileArgs...) helmfileCmd.Env = append(os.Environ(), "KUBECONFIG="+kubeconfigPath, @@ -414,7 +443,15 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s Name: "Deploying default infrastructure", Cmd: helmfileCmd, }); err != nil { - u.Warn("Helmfile sync failed, stopping cluster") + // Do NOT stop the cluster here. A failed helmfile sync is almost + // always recoverable in place (transient repo flake, single bad + // release, etc.), and tearing down the cluster destroys unrelated + // state the user cares about. Surface the error with a clear retry + // hint instead. + u.Warn("Helmfile sync failed — cluster left running so you can inspect and retry.") + u.Dim(" Inspect: obol kubectl get pods -A") + u.Dim(" Retry: obol stack up") + u.Dim(" Tear down only if you really want to: obol stack down") if previousLiteLLMConfig != "" { if restoreErr := restoreLiteLLMConfig(cfg, kubeconfigPath, previousLiteLLMConfig); restoreErr != nil { @@ -422,10 +459,6 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s } } - if downErr := Down(cfg, u); downErr != nil { - u.Warnf("Failed to stop cluster during cleanup: %v", downErr) - } - return fmt.Errorf("failed to apply defaults helmfile: %w", err) } @@ -502,6 +535,54 @@ func syncDefaults(cfg *config.Config, u *ui.UI, kubeconfigPath string, dataDir s return nil } +// preflightHelmRepos ensures the helm repos our embedded helmfile depends on +// are registered, then runs a tolerant `helm repo update` against just those +// repos. Returns true when the preflight succeeded (and so the helmfile sync +// can safely pass --skip-deps), false on any failure (in which case helmfile +// runs its own — non-tolerant — repo update as a fallback). +// +// This is the core of the "tolerate a single dead repo" fix: helm aborts the +// whole `helm repo update` if any one repo errors, and the user's global helm +// config may contain unrelated repos (e.g. kubernetes-dashboard, which +// started serving 404 in 2025) that we have no control over. By updating +// only our managed repos with --fail-on-repo-update-fail=false, a transient +// failure in any single one of them no longer blocks stack up. +// +// Non-fatal: any failure here degrades to "let helmfile do its thing". +func preflightHelmRepos(cfg *config.Config, u *ui.UI, helmBinary, helmfilePath string) bool { + repos, err := helmcmd.ParseHelmfileRepos(helmfilePath) + if err != nil || len(repos) == 0 { + return false + } + + // `helm repo add --force-update` is idempotent and updates the URL if it + // changed. Failures here are surfaced as a warning but don't block the + // preflight — the subsequent update step is the one that matters. + if err := helmcmd.EnsureRepos(helmBinary, repos); err != nil { + u.Dim(fmt.Sprintf("helm repo add (best-effort): %v", err)) + } + + names := make([]string, 0, len(repos)) + for _, r := range repos { + names = append(names, r.Name) + } + + if out, err := helmcmd.UpdateRepos(helmBinary, names); err != nil { + // helm aborted despite --fail-on-repo-update-fail=false (or the flag + // wasn't supported). Surface the output for debuggability but DON'T + // fail the whole sync — let helmfile try its own dependency + // resolution. If the failure is real, helmfile will report it; if + // it's a transient flake, the chart cache may already be warm. + u.Dim(fmt.Sprintf("helm repo update for managed repos failed: %v", err)) + if len(out) > 0 { + u.Dim(strings.TrimSpace(string(out))) + } + return false + } + + return true +} + // claudeTipIfRelevant prints a hint when the user has Claude Code installed // but the Obol skills plugin is not yet usable in their setup. Best-effort // and silent on any error — a missing or malformed Claude config must never diff --git a/internal/stack/stack_test.go b/internal/stack/stack_test.go index 0d7cca31..5dd93072 100644 --- a/internal/stack/stack_test.go +++ b/internal/stack/stack_test.go @@ -1032,3 +1032,183 @@ func TestWarnIfNoChatModel_EmitsWarnForWildcardOnly(t *testing.T) { t.Fatalf("wildcard-only list should trigger warn, got: %q", stderr.String()) } } + +// fakeHelmScript returns a sh script body that fakes a `helm` binary with +// configurable behaviour for repo update. It logs every invocation to +// invokeLog so callers can assert on what helm was asked to do. +// +// - `helm version --short`: prints "v3.20.1" (a version that supports +// --fail-on-repo-update-fail=false). +// - `helm repo add ...`: exits 0 (idempotent registration). +// - `helm repo update ...`: exits with repoUpdateExit. Stdout is empty, +// stderr mimics helm's real "failed to update the following repositories" +// message when repoUpdateExit != 0. +// - Anything else: exits 0. +func fakeHelmScript(invokeLog string, repoUpdateExit int) string { + return `#!/bin/sh +echo "$@" >> "` + invokeLog + `" +if [ "$1" = "version" ] && [ "$2" = "--short" ]; then + echo "v3.20.1" + exit 0 +fi +if [ "$1" = "repo" ] && [ "$2" = "add" ]; then + exit 0 +fi +if [ "$1" = "repo" ] && [ "$2" = "update" ]; then + if [ "` + sprintInt(repoUpdateExit) + `" != "0" ]; then + echo "Error: failed to update the following repositories: [https://example.invalid/charts]" 1>&2 + exit ` + sprintInt(repoUpdateExit) + ` + fi + echo "...Successfully got an update from all chart repositories" + exit 0 +fi +exit 0 +` +} + +func sprintInt(n int) string { return strconv.Itoa(n) } + +// TestPreflightHelmRepos_SuccessAllowsSkipDeps verifies the happy path: +// when our managed repos update cleanly, preflightHelmRepos returns true so +// the caller can pass --skip-deps to helmfile sync. +func TestPreflightHelmRepos_SuccessAllowsSkipDeps(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("shell-script fake binary not supported on windows") + } + + dir := t.TempDir() + binDir := filepath.Join(dir, "bin") + cfgDir := filepath.Join(dir, "cfg") + for _, d := range []string{binDir, cfgDir} { + if err := os.MkdirAll(d, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", d, err) + } + } + + invokeLog := filepath.Join(dir, "helm.log") + helm := filepath.Join(binDir, "helm") + if err := os.WriteFile(helm, []byte(fakeHelmScript(invokeLog, 0)), 0o755); err != nil { //nolint:gosec // test fake + t.Fatalf("write fake helm: %v", err) + } + + helmfilePath := filepath.Join(cfgDir, "helmfile.yaml") + body := ` +repositories: + - name: traefik + url: https://traefik.github.io/charts + - name: obol + url: https://obolnetwork.github.io/helm-charts/ +` + if err := os.WriteFile(helmfilePath, []byte(body), 0o600); err != nil { + t.Fatalf("write helmfile: %v", err) + } + + cfg := &config.Config{BinDir: binDir, ConfigDir: cfgDir} + u, _, _ := newCaptureUI() + + if !preflightHelmRepos(cfg, u, helm, helmfilePath) { + t.Fatal("preflightHelmRepos should return true when helm repo update succeeds") + } + + logged, _ := os.ReadFile(invokeLog) + got := string(logged) + // Sanity: we should have updated only our managed repos by name, and + // passed the tolerant flag. + if !strings.Contains(got, "--fail-on-repo-update-fail=false") { + t.Fatalf("expected tolerant flag in helm invocation, got: %s", got) + } + if !strings.Contains(got, "traefik") || !strings.Contains(got, "obol") { + t.Fatalf("expected managed repo names in helm invocation, got: %s", got) + } +} + +// TestPreflightHelmRepos_FailureFallsBackGracefully captures the actual bug +// scenario: a tertiary repo update fails. The preflight should swallow that +// (and return false so the caller falls back to letting helmfile manage +// dependencies itself) rather than propagate a fatal error that would, in +// the old code path, have stopped the entire cluster. +func TestPreflightHelmRepos_FailureFallsBackGracefully(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("shell-script fake binary not supported on windows") + } + + dir := t.TempDir() + binDir := filepath.Join(dir, "bin") + cfgDir := filepath.Join(dir, "cfg") + for _, d := range []string{binDir, cfgDir} { + if err := os.MkdirAll(d, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", d, err) + } + } + + invokeLog := filepath.Join(dir, "helm.log") + helm := filepath.Join(binDir, "helm") + // Fake helm exits 1 on `repo update` — same shape as the real-world + // kubernetes-dashboard 404 incident that motivated this fix. + if err := os.WriteFile(helm, []byte(fakeHelmScript(invokeLog, 1)), 0o755); err != nil { //nolint:gosec // test fake + t.Fatalf("write fake helm: %v", err) + } + + helmfilePath := filepath.Join(cfgDir, "helmfile.yaml") + body := ` +repositories: + - name: traefik + url: https://traefik.github.io/charts +` + if err := os.WriteFile(helmfilePath, []byte(body), 0o600); err != nil { + t.Fatalf("write helmfile: %v", err) + } + + cfg := &config.Config{BinDir: binDir, ConfigDir: cfgDir} + u, _, _ := newCaptureUI() + + // Critical assertion: the preflight returns (without panicking, without + // returning an error type at all — by design it absorbs failure and + // signals "let helmfile try its own resolution"). + if got := preflightHelmRepos(cfg, u, helm, helmfilePath); got { + t.Fatal("preflightHelmRepos should return false when helm repo update fails") + } +} + +// TestSyncDefaults_DoesNotCallDownOnHelmfileFailure is a source-level +// regression guard for the cluster-stop-on-failure bug. The old syncDefaults +// invoked Down() (which calls `k3d cluster delete`) whenever helmfile sync +// errored, destroying user state for transient failures like a single dead +// helm repo. The fix removes that call; this test keeps it gone. +// +// We inspect the source rather than mock the entire backend stack because +// the wrong behaviour to prevent is statically visible (a Down call in the +// error branch) and the right behaviour is statically defined (no Down +// call). Behavioural drift is checked by the helmcmd unit tests above. +func TestSyncDefaults_DoesNotCallDownOnHelmfileFailure(t *testing.T) { + projectRoot := findProjectRoot() + if projectRoot == "" { + t.Skip("project root not found") + } + + src, err := os.ReadFile(filepath.Join(projectRoot, "internal/stack/stack.go")) + if err != nil { + t.Fatalf("read stack.go: %v", err) + } + + // Locate the syncDefaults function body. We bound the scan to the + // function so we don't accidentally match Down() calls in unrelated + // helpers (e.g. the `Down(cfg, u *ui.UI)` definition itself). + const fnSig = "func syncDefaults(" + start := strings.Index(string(src), fnSig) + if start < 0 { + t.Fatalf("syncDefaults function not found in stack.go") + } + const fnEndMarker = "\n// claudeTipIfRelevant" + end := strings.Index(string(src)[start:], fnEndMarker) + if end < 0 { + t.Fatalf("could not locate end of syncDefaults body") + } + body := string(src)[start : start+end] + + if strings.Contains(body, "Down(cfg, u)") { + t.Fatalf("syncDefaults must not call Down() on failure — that destroys " + + "unrelated user state when helmfile sync fails for transient reasons. " + + "See fix/tolerant-helm-repo-update.") + } +} From 0e1cdf9acbc09088208c610c3842920431ecd815 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 18:34:17 +0400 Subject: [PATCH 2/2] fix: probe helm repo update tolerant flag support --- internal/helmcmd/helmcmd.go | 31 ++++++++++++++----- internal/helmcmd/helmcmd_test.go | 52 +++++++++++++++++++++++++++++--- internal/stack/stack_test.go | 14 ++++++--- 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/internal/helmcmd/helmcmd.go b/internal/helmcmd/helmcmd.go index 81c893d5..7d04f714 100644 --- a/internal/helmcmd/helmcmd.go +++ b/internal/helmcmd/helmcmd.go @@ -142,15 +142,30 @@ func EnsureRepos(helmBinary string, repos []helmfileRepo) error { return firstErr } -// UpdateRepos runs `helm repo update ` and, when the helm version -// supports it, passes --fail-on-repo-update-fail=false so that a single dead -// repo (e.g. a tertiary repository that started serving 404) doesn't abort the -// whole update. +// RepoUpdateSupportsFailOnRepoUpdateFail reports whether the current helm +// binary accepts `helm repo update --fail-on-repo-update-fail=false`. +// +// Do not infer this from the major version. Some Helm 4 builds dropped the flag +// even though Helm 3.14+ had it, and passing an unknown flag prevents the +// targeted repo update from running at all. +func RepoUpdateSupportsFailOnRepoUpdateFail(helmBinary string) bool { + cmd := exec.Command(helmBinary, "repo", "update", "--help") + out, err := cmd.CombinedOutput() + if err != nil { + return false + } + return strings.Contains(string(out), "--fail-on-repo-update-fail") +} + +// UpdateRepos runs `helm repo update ` and, when the helm binary +// advertises support, passes --fail-on-repo-update-fail=false so that a single +// dead repo doesn't abort the whole update. // // Behaviour: -// - helm 3.14+ (where --fail-on-repo-update-fail exists): the flag is passed -// and the returned error is nil even if individual repos in `names` fail. -// - older helm: the flag is omitted and the error surfaces normally. +// - helm versions that advertise --fail-on-repo-update-fail: the flag is +// passed and the returned error is nil even if individual repos in `names` +// fail. +// - other helm versions: the flag is omitted and the error surfaces normally. // // The targeted form (`helm repo update `) is important: it limits the // update to repos this stack actually needs, so unrelated dead repos in the @@ -161,7 +176,7 @@ func UpdateRepos(helmBinary string, names []string) ([]byte, error) { return nil, nil } args := []string{"repo", "update"} - if major, err := MajorVersion(helmBinary); err == nil && major >= 3 { + if RepoUpdateSupportsFailOnRepoUpdateFail(helmBinary) { args = append(args, "--fail-on-repo-update-fail=false") } args = append(args, names...) diff --git a/internal/helmcmd/helmcmd_test.go b/internal/helmcmd/helmcmd_test.go index 5fb14740..aa68bba1 100644 --- a/internal/helmcmd/helmcmd_test.go +++ b/internal/helmcmd/helmcmd_test.go @@ -154,11 +154,11 @@ func TestUpdateRepos_TolerantArgsConstructed(t *testing.T) { argLog := filepath.Join(dir, "args.log") // Fake helm: - // - `version --short` → "v3.20.1\n" - // - any other args → append to args.log and exit 0 + // - `repo update --help` → advertises the tolerant flag + // - any other args → append to args.log and exit 0 script := `#!/bin/sh -if [ "$1" = "version" ] && [ "$2" = "--short" ]; then - echo "v3.20.1" +if [ "$1" = "repo" ] && [ "$2" = "update" ] && [ "$3" = "--help" ]; then + echo " --fail-on-repo-update-fail=false tolerate individual repo failures" exit 0 fi echo "$@" >> "` + argLog + `" @@ -184,6 +184,50 @@ exit 0 } } +// TestUpdateRepos_OmitsUnsupportedTolerantFlag covers Helm builds whose +// `repo update` command no longer accepts --fail-on-repo-update-fail. The +// preflight must still run a targeted repo update instead of failing before +// any repo is refreshed. +func TestUpdateRepos_OmitsUnsupportedTolerantFlag(t *testing.T) { + if os.Getenv("GOOS") == "windows" { + t.Skip("shell-script fake binary not supported on windows") + } + + dir := t.TempDir() + helm := filepath.Join(dir, "helm") + argLog := filepath.Join(dir, "args.log") + + script := `#!/bin/sh +if [ "$1" = "repo" ] && [ "$2" = "update" ] && [ "$3" = "--help" ]; then + echo "Usage: helm repo update [REPO1 [REPO2 ...]] [flags]" + exit 0 +fi +echo "$@" >> "` + argLog + `" +exit 0 +` + if err := os.WriteFile(helm, []byte(script), 0o755); err != nil { //nolint:gosec // test binary + t.Fatalf("write fake helm: %v", err) + } + + if _, err := UpdateRepos(helm, []string{"traefik", "obol"}); err != nil { + t.Fatalf("UpdateRepos: %v", err) + } + + logged, err := os.ReadFile(argLog) + if err != nil { + t.Fatalf("read args log: %v", err) + } + got := string(logged) + if contains(got, "--fail-on-repo-update-fail=false") { + t.Fatalf("unsupported tolerant flag was passed: %s", got) + } + for _, want := range []string{"repo", "update", "traefik", "obol"} { + if !contains(got, want) { + t.Fatalf("expected %q in fake helm argv, got: %s", want, got) + } + } +} + func contains(haystack, needle string) bool { for i := 0; i+len(needle) <= len(haystack); i++ { if haystack[i:i+len(needle)] == needle { diff --git a/internal/stack/stack_test.go b/internal/stack/stack_test.go index 5dd93072..28fcbcd5 100644 --- a/internal/stack/stack_test.go +++ b/internal/stack/stack_test.go @@ -1037,13 +1037,13 @@ func TestWarnIfNoChatModel_EmitsWarnForWildcardOnly(t *testing.T) { // configurable behaviour for repo update. It logs every invocation to // invokeLog so callers can assert on what helm was asked to do. // -// - `helm version --short`: prints "v3.20.1" (a version that supports -// --fail-on-repo-update-fail=false). -// - `helm repo add ...`: exits 0 (idempotent registration). -// - `helm repo update ...`: exits with repoUpdateExit. Stdout is empty, +// - `helm version --short`: prints "v3.20.1". +// - `helm repo update --help`: advertises --fail-on-repo-update-fail=false. +// - `helm repo add ...`: exits 0 (idempotent registration). +// - `helm repo update ...`: exits with repoUpdateExit. Stdout is empty, // stderr mimics helm's real "failed to update the following repositories" // message when repoUpdateExit != 0. -// - Anything else: exits 0. +// - Anything else: exits 0. func fakeHelmScript(invokeLog string, repoUpdateExit int) string { return `#!/bin/sh echo "$@" >> "` + invokeLog + `" @@ -1054,6 +1054,10 @@ fi if [ "$1" = "repo" ] && [ "$2" = "add" ]; then exit 0 fi +if [ "$1" = "repo" ] && [ "$2" = "update" ] && [ "$3" = "--help" ]; then + echo " --fail-on-repo-update-fail=false tolerate individual repo failures" + exit 0 +fi if [ "$1" = "repo" ] && [ "$2" = "update" ]; then if [ "` + sprintInt(repoUpdateExit) + `" != "0" ]; then echo "Error: failed to update the following repositories: [https://example.invalid/charts]" 1>&2