From 5f1edc2ea49b6f107c55faeaa630bae63fe749f0 Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 17:34:01 +0400
Subject: [PATCH 1/8] unit-tests and CI pipeline added. Initial commit.

---
 .github/workflows/unit-tests.yml              |  81 +++
 .gitignore                                    |   3 +
 CLAUDE.md                                     |  88 +++
 Makefile                                      |  52 ++
 docs/TESTS_IMPLEMENTATION_PLAN.md             | 300 ++++++++
 docs/WORKLOG.md                               |  10 +
 internal/config/types_yaml_test.go            | 247 +++++++
 .../kubernetes/commander/client_http_test.go  | 683 ++++++++++++++++++
 internal/kubernetes/commander/client_test.go  | 176 +++++
 internal/logger/level_test.go                 |  70 ++
 pkg/cluster/vms_test.go                       | 257 +++++++
 pkg/kubernetes/apply_test.go                  | 128 ++++
 pkg/kubernetes/modules_test.go                | 280 +++++++
 pkg/kubernetes/poll_test.go                   | 102 +++
 pkg/retry/retry_test.go                       | 408 +++++++++++
 pkg/testkit/stress_tests_test.go              | 276 +++++++
 16 files changed, 3161 insertions(+)
 create mode 100644 .github/workflows/unit-tests.yml
 create mode 100644 .gitignore
 create mode 100644 CLAUDE.md
 create mode 100644 Makefile
 create mode 100644 docs/TESTS_IMPLEMENTATION_PLAN.md
 create mode 100644 internal/config/types_yaml_test.go
 create mode 100644 internal/kubernetes/commander/client_http_test.go
 create mode 100644 internal/kubernetes/commander/client_test.go
 create mode 100644 internal/logger/level_test.go
 create mode 100644 pkg/cluster/vms_test.go
 create mode 100644 pkg/kubernetes/apply_test.go
 create mode 100644 pkg/kubernetes/modules_test.go
 create mode 100644 pkg/kubernetes/poll_test.go
 create mode 100644 pkg/retry/retry_test.go
 create mode 100644 pkg/testkit/stress_tests_test.go

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
new file mode 100644
index 0000000..1a0d70d
--- /dev/null
+++ b/.github/workflows/unit-tests.yml
@@ -0,0 +1,81 @@
+# Copyright 2026 Flant JSC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Unit Tests
+
+# Triggers:
+#   push:           every push to any branch — gives feature branches a green/red
+#                   signal without needing an open PR.
+#   pull_request:   PRs targeting main — required so the check is also evaluated
+#                   for fork PRs (whose push events do not run in the upstream repo).
+on:
+  push: {}
+  pull_request:
+    branches: [main]
+    types: [opened, synchronize, reopened]
+
+permissions:
+  contents: read
+
+# Cancel in-flight runs for the same ref when a new commit lands. Each push
+# trigger then evaluates only the latest SHA, which keeps CI minutes bounded
+# on rapid-fire force-pushes.
+concurrency:
+  group: unit-tests-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-tests:
+    name: Build, vet & unit test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache: true
+
+      - name: Download modules
+        run: go mod download
+
+      # Compile everything (including the e2e suites under ./tests/...) to catch
+      # refactor breakage in code that the unit-tests package set deliberately
+      # does not execute.
+      - name: Build
+        run: go build ./...
+
+      - name: Vet
+        run: go vet ./...
+
+      # Unit tests run with -race (catches data races in retry.Do and module
+      # configuration goroutines) and -shuffle=on (surfaces ordering bugs in
+      # tests that touch package-level env vars in internal/config).
+      # We restrict the set to ./internal/... ./pkg/... so the e2e suites in
+      # ./tests/... do not execute (they require real VMs/clusters/SSH).
+      - name: Unit tests (race + coverage)
+        run: |
+          go test -race -shuffle=on -covermode=atomic \
+            -coverprofile=coverage.out \
+            ./internal/... ./pkg/...
+          echo "Total coverage:"
+          go tool cover -func=coverage.out | tail -1
+
+      - name: Upload coverage artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage
+          path: coverage.out
+          if-no-files-found: warn
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7eb8131
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+# Coverage artifacts emitted by `make cover` / unit-tests CI job.
+coverage.out
+coverage.html
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..a927383
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,88 @@
+# CLAUDE.md
+
+Guidance for working in this repository. Read the linked docs in `docs/` before making
+non-trivial changes.
+
+## What this is
+
+`github.com/deckhouse/storage-e2e` — an end-to-end test framework for Deckhouse storage
+components. It provisions/manages test clusters (VMs via virtualization, existing clusters,
+or Deckhouse Commander), wires up storage modules, and runs Ginkgo-based e2e suites.
+
+**It is imported as a library by other repos** (e.g. `csi-ceph`), so the `pkg/` public API
+must stay backward compatible.
+
+- Module path: `github.com/deckhouse/storage-e2e`, Go `1.26`.
+- Layers (top → bottom): `tests/` → `pkg/testkit` → `internal/` domain logic →
+  `internal/infrastructure` → `k8s client-go`. See ARCHITECTURE for the full tree.
+
+## Documentation map (`docs/`)
+
+| Doc | Purpose |
+|---|---|
+| [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | Package structure, layers, components, env-var reference. **Source of truth for layout.** |
+| [docs/FUNCTIONS_GLOSSARY.md](docs/FUNCTIONS_GLOSSARY.md) | All exported `pkg/` functions, grouped by resource. Check before adding new exported functions. |
+| [docs/TODO.md](docs/TODO.md) | Global TODO list (managed via the `/todo` convention below). |
+| [docs/WORKLOG.md](docs/WORKLOG.md) | Dated change log. Append an entry on every code change. |
+| [docs/TESTS_IMPLEMENTATION_PLAN.md](docs/TESTS_IMPLEMENTATION_PLAN.md) | Plan for unit-test coverage and the CI/CD pipeline (build/vet/test, merge-blocking). |
+| [docs/TESTING_STRATEGY.md](docs/TESTING_STRATEGY.md) | *(to be created)* Testing conventions, unit-vs-e2e boundaries, fakes/httptest patterns, coverage policy. |
+
+Also useful: [README.md](README.md) (quick start, env vars, running suites),
+[internal/README.md](internal/README.md), [internal/logger/README.md](internal/logger/README.md).
+
+## Build, vet & test
+
+```bash
+go build ./...                                   # compiles everything (incl. e2e suites)
+go vet ./...
+go test -race -shuffle=on ./internal/... ./pkg/...   # unit tests only — no cluster needed
+```
+
+- **Unit tests** live next to sources under `internal/` and `pkg/`; they are hermetic
+  (no SSH, no cluster, no network — use `httptest` and `client-go` fakes).
+- **E2E suites** live under `tests/` and require real VMs/clusters + env vars
+  (`SSH_HOST`, `DKP_LICENSE_KEY`, `TEST_CLUSTER_CREATE_MODE`, …). **Do not run them in
+  unit/CI flows.** Run one explicitly, e.g.:
+  `go test -timeout=240m -v ./tests/<name> -count=1`.
+- New tests/CI conventions are described in TESTS_IMPLEMENTATION_PLAN.md (and, once it
+  exists, TESTING_STRATEGY.md).
+
+## Project conventions (must follow)
+
+These mirror the rules in `.cursor/rules/` — apply them whether or not you use Cursor.
+
+1. **Backward compatibility (`pkg/`).** Removing/renaming an exported symbol, changing a
+   signature, or changing observed behavior is a breaking change. **Ask the user first**,
+   prefer adding alongside (deprecate, don't delete), and tag the WORKLOG entry with
+   `[Possible compatibility break]`.
+
+2. **Functions glossary.** Before adding a new exported `pkg/` function, search
+   [docs/FUNCTIONS_GLOSSARY.md](docs/FUNCTIONS_GLOSSARY.md) for an existing one. Reuse/extend
+   if it exists; otherwise add the new function to the glossary in the matching section.
+
+3. **Versatile functions.** `pkg/` functions must be general-purpose: return data not
+   decisions, no hardcoded names/indices, accept broad inputs (`context`, `rest.Config`)
+   and return concrete K8s objects, no empty wrappers.
+
+4. **Architecture sync.** On structural changes (add/remove/rename a package or file, move
+   between layers, new cluster mode, new env var, changed core types), update the affected
+   sections of [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md). Only touch affected sections.
+
+5. **Work log.** After any change to `*.go`, `*.yml`, `*.yaml`, `*.tpl`, or `*.sh`, append a
+   one-line entry to [docs/WORKLOG.md](docs/WORKLOG.md) under today's `## YYYY-MM-DD`
+   heading: `- **Type** description` (Type ∈ Add/Remove/Rename/Refactor/Bugfix/Update/Move).
+   Append only; never edit previous days' entries.
+
+6. **`/todo` command.** When a message starts with `/todo`, manage items in
+   [docs/TODO.md](docs/TODO.md) (Add / Remove / Check|List / Done), then record a WORKLOG entry.
+
+7. **Commits/PRs.** Do not add AI co-author trailers (no `Co-Authored-By: Claude` /
+   "Generated with Claude").
+
+## Creating new e2e tests
+
+Use the template + script (don't hand-roll suites, and don't edit `tests/test-template`):
+
+```bash
+cd tests && ./create-test.sh <your-test-name>
+```
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8cf347a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,52 @@
+# Copyright 2026 Flant JSC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Convenience targets for local development. CI runs the same commands so
+# `make test` locally is equivalent to the CI unit-test job.
+#
+# Unit tests deliberately exclude ./tests/... (e2e suites that require real
+# VMs/clusters/SSH). Use `make e2e` for hints on running individual suites.
+
+UNIT_PKGS := ./internal/... ./pkg/...
+
+.PHONY: help build vet test cover e2e clean
+
+help: ## Print this help.
+	@awk 'BEGIN {FS = ":.*##"; printf "Targets:\n"} \
+	      /^[a-zA-Z0-9_-]+:.*##/ {printf "  \033[36m%-10s\033[0m %s\n", $$1, $$2}' \
+	      $(MAKEFILE_LIST)
+
+build: ## Compile every package (incl. e2e suites for refactor-breakage check).
+	go build ./...
+
+vet: ## Run go vet on every package.
+	go vet ./...
+
+test: ## Unit tests with -race -shuffle=on (no cluster needed).
+	go test -race -shuffle=on $(UNIT_PKGS)
+
+cover: ## Unit tests + coverage profile + total %.
+	go test -race -shuffle=on -covermode=atomic \
+	    -coverprofile=coverage.out $(UNIT_PKGS)
+	@echo "Total coverage:"
+	@go tool cover -func=coverage.out | tail -1
+
+e2e: ## Hints for running an individual e2e suite (requires VMs/cluster).
+	@echo "E2E suites require real infra. Examples:"
+	@echo "  go test -timeout=240m -v ./tests/test-template -count=1"
+	@echo "  go test -timeout=240m -v ./tests/csi-all-stress-tests -count=1"
+	@echo "See README.md for required env vars (SSH_HOST, DKP_LICENSE_KEY, ...)."
+
+clean: ## Remove build / coverage artifacts.
+	rm -f coverage.out
diff --git a/docs/TESTS_IMPLEMENTATION_PLAN.md b/docs/TESTS_IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000..d00c811
--- /dev/null
+++ b/docs/TESTS_IMPLEMENTATION_PLAN.md
@@ -0,0 +1,300 @@
+# Unit Testing & CI/CD Implementation Plan
+
+> Status: **Wave 1 + Wave 2 landed, CI workflow active.** Remaining work:
+> Wave 3 fake-client refactors (optional), enable branch protection so the
+> "Build, vet & unit test" check becomes required on `main`. This document
+> stays as the source of truth for *what* is covered, *how*, and how CI enforces it.
+
+## 1. Goals
+
+1. Cover the **deterministic core logic** of the project (`pkg/` + `internal/`) with fast,
+   hermetic unit tests that need **no SSH access, no Kubernetes cluster, and no network**.
+2. Add a **GitHub Actions pipeline** that builds, vets, and runs the unit tests on every
+   push and pull request.
+3. Make the test job a **required status check** so a PR cannot be merged into `main`
+   while tests are red.
+
+### Non-goals
+
+- We are **not** trying to unit-test the end-to-end suites under `tests/` — those
+  legitimately require real VMs/clusters and stay as manual/scheduled e2e runs.
+- We are **not** aiming for 100% line coverage. The realistic target is high coverage of
+  the *pure-logic* surface; the cluster/SSH/virtualization orchestration code is mostly
+  I/O glue and is covered by e2e.
+
+---
+
+## 2. Current state
+
+- Module: `github.com/deckhouse/storage-e2e`, Go `1.26`.
+- Test framework already in use: standard `testing` + table tests (unit), Ginkgo/Gomega (e2e).
+- Existing unit tests (2 files, ~all green today):
+  - [internal/config/types_test.go](../internal/config/types_test.go) — `ValidateModulePullOverrides`.
+  - [internal/logger/logger_test.go](../internal/logger/logger_test.go) — level parsing, handlers, helpers.
+- Only CI present: [gitleaks-scan-on-pr.yml](../.github/workflows/gitleaks-scan-on-pr.yml) (secret scanning). **No build/test CI.**
+- `ARCHITECTURE.md` §9.2 explicitly lists "Limited unit test coverage" as known tech debt.
+
+### Key fact that shapes the strategy
+
+Most `pkg/kubernetes/*` and `pkg/cluster/*` functions take a `*rest.Config` (or a concrete
+`*virtualization.Client`) and **construct their clients internally**, so they can only be
+exercised against a live API server. They are **not** directly unit-testable without a
+small refactor (see §5). The first waves of tests therefore focus on the large amount of
+**pure / injectable logic** that already exists.
+
+---
+
+## 3. Testing strategy & conventions
+
+- **Test type:** standard library `testing` with table-driven subtests (`t.Run`). Matches
+  the existing style in `types_test.go` / `logger_test.go`. No new test deps required for
+  the core waves.
+- **Hermetic:** no real network. HTTP clients are pointed at `httptest.Server`; Kubernetes
+  interactions (later waves) use `client-go`'s fakes (`k8s.io/client-go/kubernetes/fake`,
+  `k8s.io/client-go/dynamic/fake`) — already available transitively via `client-go`.
+- **No global-state leakage:** `internal/config` reads env into package-level vars at init.
+  Tests that touch them must **save and restore** the globals with `t.Cleanup`. A small
+  test helper (`withEnvSnapshot`) will encapsulate this.
+- **Logger:** `logger.GetLogger()` falls back to `slog.Default()` when uninitialized, so
+  code under test that logs (e.g. `retry.Do`) is safe to call directly — no setup needed.
+- **Determinism:** time-based code is tested with tiny durations and `context` cancellation
+  rather than wall-clock sleeps; randomness (`GenerateRandomSuffix`) is asserted on
+  shape/charset/length, not exact value.
+- **File naming:** `<source>_test.go` next to the source, same package for white-box tests
+  of unexported helpers (e.g. `package kubernetes`, `package cluster`).
+
+### Separating unit tests from e2e in CI
+
+The e2e suites under `tests/` must never run in PR CI (they need clusters). Approach:
+
+- **Primary (chosen):** CI runs an explicit package set: `go test ./internal/... ./pkg/...`.
+  The `tests/` directory is excluded from the test run.
+- **Compile safety:** CI additionally runs `go vet ./...` and `go build ./...`, which
+  *compile* the e2e suites (catching refactor breakage) without *executing* them.
+- Alternative considered: build tags (`//go:build e2e`) on the e2e suite files. More robust
+  but touches every existing/template e2e file and `create-test.sh`. Deferred unless we
+  later want `go test ./...` to be safe by default. (Open question Q1.)
+
+---
+
+## 4. Unit test target inventory (prioritized)
+
+Effort key: **S** ≈ <1h, **M** ≈ a few hours, **L** ≈ day+ (usually needs a refactor).
+
+### Wave 1 — pure logic, zero refactor, highest ROI
+
+| # | Package / file | Functions under test | Notable cases | Effort |
+|---|---|---|---|---|
+| 1 | `pkg/retry` ([retry.go](../pkg/retry/retry.go)) | `IsRetryable`, `IsSSHConnectionError`, `WithRetryAfter`, `Do`/`DoVoid` | k8s `StatusError` codes (500/501/429, RetryAfter), `io.EOF`, every network/k8s/ssh/webhook string pattern, non-retryable → no retry, success-after-N, `ctx` cancel mid-wait, backoff cap | M |
+| 2 | `pkg/kubernetes` ([apply.go](../pkg/kubernetes/apply.go)) | `FindUnsetEnvVars`, `splitYAMLDocuments` | multiple `${VAR}`, dedup, set vs unset (use `t.Setenv`), `---` splitting, trailing/empty docs, leading `---` | S |
+| 3 | `pkg/kubernetes` ([modules.go](../pkg/kubernetes/modules.go)) | `convertModuleSpecsToConfigs`, `buildModuleGraph`, `topologicalSortLevels`, `isWebhookConnectionError` | nil settings → empty map, missing dependency error, multi-level ordering, **cycle detection**, reverse-dep correctness | M |
+| 4 | `pkg/cluster` ([vms.go](../pkg/cluster/vms.go)) | `getCVMINameFromImageURL`, `getVMNodes`, `GetSetupNode`, `GetNodeIPAddress`, `GenerateRandomSuffix`, `generateCloudInitUserData`/`generateSetupNodeCloudInit` (smoke) | `.img`/`.qcow2` strip, `_`/`.`→`-`, collapse `--`, trim `-`, empty→`image`; VM-vs-baremetal filtering; setup-node nil/role; suffix length & charset; cloud-init contains hostname + pubkey | M |
+| 5 | `internal/config` ([types.go](../internal/config/types.go)) | `ClusterNode.UnmarshalYAML`, `ClusterDefinition.UnmarshalYAML`, extend `ValidateModulePullOverrides` | invalid `hostType`, invalid `role`, unknown `osType`, `clusterDefinition:` wrapper vs bare, nil module / empty override | M |
+| 6 | `internal/kubernetes/commander` ([client.go](../internal/kubernetes/commander/client.go)) | `mapStatusToPhase`, `base64Encode`, `NewClientWithOptions` validation | every status→phase mapping + unknown passthrough; empty baseURL/token errors; trailing-slash trim on baseURL & apiPrefix; default auth method/prefix; bad `CACertPath` | M |
+| 7 | `internal/logger` ([level.go](../internal/logger/level.go)) | `LevelToString` (round-trip with `ParseLevel`) | all levels + default | S |
+| 8 | `pkg/testkit` ([stress-tests.go](../pkg/testkit/stress-tests.go)) | `(*Config).Validate`, `DefaultConfig` | each required-field error, per-mode branches, invalid `TestOrder` step, resize/clone size requirements, `SnapshotsPerPVC` defaulting | M |
+| 9 | `pkg/kubernetes` ([poll.go](../pkg/kubernetes/poll.go)) | `formatRef`, `sameFinalizers`, `errIfTerminating` | ns vs cluster-scoped ref formatting; finalizer set equality (order, dupes, len); terminating object → error | S |
+
+### Wave 2 — needs `httptest` (no refactor)
+
+| # | Package | Functions under test | Approach | Effort |
+|---|---|---|---|---|
+| 10 | `internal/kubernetes/commander` | `setAuthHeaders` (all 5 methods), `GetClusterByID` (200/404/500), `ListClustersAPI` (array vs `items`/`data` object vs garbage), `GetClusterByName`, `CreateClusterFromTemplate`, `DeleteClusterByID`, `GetClusterKubeconfigByID` (+`/kubeconfig` 404 → cluster-details fallback), `GetRegistryByName` (exact + partial), `GetClusterConnectionInfo` (connection_hosts/agent_data/legacy precedence) | spin `httptest.Server`, build client with `baseURL=server.URL`, assert request headers/paths and decoded responses | M |
+
+> Note: `WaitForClusterReady` uses a hardcoded 10s ticker → not cheaply unit-testable as-is.
+> Either skip it or extract the interval (small refactor, Wave 3). Listed as Q2.
+
+### Wave 3 — small refactors to unlock fake-client tests (optional, higher value-per-effort later)
+
+These functions are good candidates but currently build clients from `*rest.Config`
+internally. The proposed refactor is **non-breaking**: keep the existing exported signature,
+extract the logic into an unexported helper that accepts an interface
+(`kubernetes.Interface` / `dynamic.Interface`), and have the public function build the real
+client and delegate. Tests then call the helper with a fake.
+
+| Package | Candidates | Refactor | Effort |
+|---|---|---|---|
+| `pkg/kubernetes` apply.go | `applyDocument`/`createDocument` via `ApplyClient{dynamicClient, discoveryClient}` (already injectable in-package using `dynamic/fake` + a fake discovery) | none for struct; build fakes in test | M |
+| `pkg/kubernetes` pvc/pod/namespace/secrets/storageclass | `WaitForPVCsBound`, `WaitForPodsStatus`, namespace/secret CRUD, default-SC selection | extract `…WithClientset(clientset kubernetes.Interface, …)` helper | L |
+| `pkg/cluster` lock.go | `AcquireClusterLock`/`Release`/`IsClusterLocked`/`GetClusterLockInfo` (ConfigMap logic, lock-contention error message) | extract clientset-accepting helpers; test with `kubernetes/fake` | M |
+| `internal/kubernetes/commander` | `WaitForClusterReady` | inject ticker interval | S |
+
+### Out of scope for unit tests (covered by e2e / not deterministic)
+
+- `internal/infrastructure/ssh/*` (real SSH/network; `findFreePort` is the only trivially
+  testable bit).
+- `pkg/cluster/{cluster,setup}.go`, `pkg/cluster/vms.go` orchestration (VM/CVI lifecycle).
+- `internal/kubernetes/{deckhouse,storage,virtualization}/*` thin CRD clients.
+- `pkg/testkit/{ceph*,storageclass}.go` provisioning flows and `StressTestRunner.Run*` modes.
+
+---
+
+## 5. Refactor principles (for Wave 3, if approved)
+
+- **Additive, not breaking:** public API signatures stay; add internal seams.
+- Prefer accepting **client-go interfaces** (`kubernetes.Interface`, `dynamic.Interface`)
+  over concrete `*Clientset` so fakes drop in.
+- Keep retry/IO wiring in the public function; keep *decision logic* in the helper.
+- Each refactor lands with its tests in the same PR.
+
+---
+
+## 6. CI/CD pipeline design
+
+New workflow: `.github/workflows/unit-tests.yml` (additive; coexists with gitleaks).
+
+### Triggers
+- `push` → **any branch** (every push gets a green/red signal + coverage so feature
+  branches surface failures immediately, not only when a PR is opened)
+- `pull_request` → `main` (types: opened, synchronize, reopened) — kept so PRs from
+  forks are still exercised even though their push events run under the fork's repo
+
+### Job: `unit-tests` (runs-on `ubuntu-latest`)
+Steps:
+1. `actions/checkout@v4`
+2. `actions/setup-go@v5` with `go-version: '1.26'`, `cache: true` (module + build cache).
+3. `go mod download`
+4. `go build ./...` — compiles everything incl. e2e suites (refactor-breakage guard).
+5. `go vet ./...`
+6. `go test -race -shuffle=on -covermode=atomic -coverprofile=coverage.out ./internal/... ./pkg/...`
+7. `go tool cover -func=coverage.out | tail -1` (print total %).
+8. Upload `coverage.out` as an artifact (`actions/upload-artifact@v4`).
+
+### Proposed workflow skeleton
+
+```yaml
+name: Unit Tests
+on:
+  push: {}
+  pull_request:
+    branches: [main]
+    types: [opened, synchronize, reopened]
+permissions:
+  contents: read
+jobs:
+  unit-tests:
+    name: Build, vet & unit test
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.26'
+          cache: true
+      - run: go mod download
+      - run: go build ./...
+      - run: go vet ./...
+      - name: Unit tests (race + coverage)
+        run: |
+          go test -race -shuffle=on -covermode=atomic \
+            -coverprofile=coverage.out ./internal/... ./pkg/...
+          go tool cover -func=coverage.out | tail -1
+      - uses: actions/upload-artifact@v4
+        with:
+          name: coverage
+          path: coverage.out
+```
+
+### Optional add-ons (decide in review — Q3)
+- **golangci-lint** job (`golangci/golangci-lint-action`) with a checked-in `.golangci.yml`.
+  Improves quality gate; adds maintenance. Could start in "report-only" (non-blocking).
+- **Coverage threshold** gate (fail under N%). Recommend starting **without** a hard
+  threshold and only printing total, then introducing a floor once Wave 1–2 land.
+- **Codecov/coveralls** upload for PR coverage comments (needs token/secret).
+
+### Making it block merges
+Workflow files alone don't block merges — that's a repo setting:
+
+- **Branch protection** (or a ruleset) on `main`: require status check **"Build, vet & unit
+  test"** to pass, and require branches up to date before merging.
+- Configure in GitHub Settings → Branches, or via the one-time `gh` command below
+  (needs repo-admin rights). The check name must match the `job.name` from
+  [.github/workflows/unit-tests.yml](.github/workflows/unit-tests.yml) verbatim.
+
+```bash
+# Replace <owner>/<repo> as appropriate; e.g. deckhouse/storage-e2e.
+gh api \
+  --method PUT \
+  -H "Accept: application/vnd.github+json" \
+  /repos/<owner>/<repo>/branches/main/protection \
+  -f required_status_checks.strict=true \
+  -f 'required_status_checks.contexts[]=Build, vet & unit test' \
+  -f enforce_admins=false \
+  -F required_pull_request_reviews= \
+  -F restrictions=
+```
+
+After applying, a PR with a failing unit-test job is blocked from merge.
+
+---
+
+## 7. Developer ergonomics (optional but recommended)
+
+Add a `Makefile` so local + CI use the same commands:
+
+```make
+test:        ; go test -race -shuffle=on ./internal/... ./pkg/...
+cover:       ; go test -covermode=atomic -coverprofile=coverage.out ./internal/... ./pkg/... && go tool cover -func=coverage.out
+vet:         ; go vet ./...
+e2e:         ; @echo "run a specific suite, e.g. go test -timeout=240m -v ./tests/<name> -count=1"
+```
+
+This also gives the e2e workflow (future) a single entry point.
+
+---
+
+## 8. Rollout phases & milestones
+
+| Phase | Contents | Status |
+|---|---|---|
+| **P0** | `unit-tests.yml` + `Makefile`; runs existing test files | **Done** — workflow on every push to any branch + PRs to `main` |
+| **P1** | Wave 1 (targets #1–#9) | **Done** — `retry` 94%, `commander` mappers, config YAML, `cluster/vms` helpers, `apply`/`modules`/`poll`, `testkit.Validate`, `logger/level` |
+| **P2** | Wave 2 (httptest, target #10) | **Done** — Commander HTTP client covered via `httptest.Server` |
+| **P3** | Branch protection enabled; check becomes **required** | **Pending admin** — see "Making it block merges" above for the exact `gh api` invocation |
+| **P4** *(optional)* | Wave 3 refactors + fake-client tests; lint job | Not started |
+
+Suggested coverage signal (not a hard gate initially): after P2, expect solid coverage of
+the pure-logic packages (`pkg/retry`, the pure parts of `pkg/kubernetes`,
+`internal/kubernetes/commander`, `internal/config`, `internal/logger`).
+
+---
+
+## 9. Risks & considerations
+
+- **Global env state in `internal/config`** — tests must snapshot/restore package vars;
+  `t.Setenv` only affects new reads, not the already-initialized globals. The
+  `withEnvSnapshot` helper mitigates this. `-shuffle=on` will surface any ordering bugs.
+- **`-race`** roughly doubles test time but is cheap here (no heavy tests) and valuable for
+  `retry.Do` / parallel module config goroutines.
+- **Module downloads in CI** — first run pulls k8s/virtualization deps; `setup-go` caching
+  keeps subsequent runs fast.
+- **e2e accidentally running in CI** — avoided by the explicit `./internal/... ./pkg/...`
+  package set; `go vet ./...` still compiles them.
+- **Hidden behavior change** — any Wave 3 refactor is additive and shipped with tests, but
+  reviewers should confirm the public signatures are unchanged.
+
+---
+
+## 10. Open questions for review
+
+- **Q1.** Separate e2e via explicit package list (proposed) or introduce `//go:build e2e`
+  tags so `go test ./...` is safe everywhere?
+- **Q2.** Refactor `WaitForClusterReady` (and similar tickers) to inject interval for
+  testability, or leave time-based waits to e2e?
+- **Q3.** Add golangci-lint now (blocking? report-only?) or defer?
+- **Q4.** Who enables branch protection on `main` (needs admin), and do you want me to
+  provide the exact `gh api` command?
+- **Q5.** Do you want a coverage threshold gate eventually, and at what %?
+- **Q6.** Coverage reporting service (Codecov) — desired, or keep coverage as a CI artifact only?
+
+---
+
+## 11. Proposed PR breakdown
+
+1. **PR-1 (CI bootstrap):** `unit-tests.yml` + `Makefile`. No new tests. Lands the pipeline.
+2. **PR-2 (Wave 1a):** `retry`, `apply`, `modules`, `poll` tests.
+3. **PR-3 (Wave 1b):** `cluster/vms` pure helpers, `config/types` YAML, `commander` mappers,
+   `logger/level`, `testkit` `Validate`.
+4. **PR-4 (Wave 2):** Commander httptest suite.
+5. **PR-5 (admin):** enable branch protection / required check.
+6. **PR-6+ (optional):** Wave 3 refactors + fake-client tests, lint job.
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 99378d6..5e5b0a3 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -102,3 +102,13 @@ All notable changes to this repository are documented here. New entries are appe
 - **Remove** `internal/config/overrides.go`: dropped `${VAR}` expansion in `modulePullOverride` per review — each module needs its own literal tag in `cluster_config.yml` (e.g. `pr131`, `mr55`).
 - **Add** `internal/config.ValidateModulePullOverrides`: rejects `${...}` placeholders at config load with an explicit error.
 - **Update** `internal/cluster/cluster.go::LoadClusterConfig` and `pkg/cluster/cluster.go::loadClusterConfigFromPath`, `README.md`, `docs/ARCHITECTURE.md`.
+
+---
+
+## 2026-06-03
+
+- **Add** `.github/workflows/unit-tests.yml`: mandatory CI workflow that builds, vets and runs unit tests on every push (any branch) and on PRs to `main`; uses `go-version-file: go.mod`, `-race -shuffle=on -covermode=atomic`, uploads `coverage.out` artifact, scoped to `./internal/... ./pkg/...` so e2e suites stay off CI.
+- **Add** `Makefile`: `test` / `cover` / `vet` / `build` / `e2e` / `clean` targets mirroring the CI commands; `.gitignore` for `coverage.out` / `coverage.html`.
+- **Add** Wave 1 unit tests (`pkg/retry/retry_test.go`, `pkg/kubernetes/{apply,modules,poll}_test.go`, `pkg/cluster/vms_test.go`, `pkg/testkit/stress_tests_test.go`, `internal/config/types_yaml_test.go`, `internal/kubernetes/commander/client_test.go`, `internal/logger/level_test.go`): hermetic table-driven coverage of `retry.Do/IsRetryable/IsSSHConnectionError/WithRetryAfter`, YAML doc splitting/env-var scanning, module graph + topo sort + cycle detection, `cluster/vms` pure helpers, `commander` mappers / base64 / `NewClientWithOptions` validation, `stress-tests.Config.Validate` / `DefaultConfig`, `LevelToString` round-trip, `ClusterNode`/`ClusterDefinition` YAML unmarshal validation.
+- **Add** Wave 2 httptest tests (`internal/kubernetes/commander/client_http_test.go`): drives the Commander HTTP client (`GetClusterByID`, `ListClustersAPI` array/items/data/garbage, `GetClusterByName`, `CreateClusterFromTemplate`, `DeleteClusterByID`, `GetClusterKubeconfigByID` + cluster-details fallback, `GetRegistryByName`, `GetClusterConnectionInfo` precedence + defaults) and all five `setAuthHeaders` paths via a real `httptest.Server`.
+- **Update** `docs/TESTS_IMPLEMENTATION_PLAN.md`: triggers changed from `push → main` to push-on-any-branch + `pull_request → main`; status header refreshed; rollout phases marked Done/Pending; exact `gh api` branch-protection command documented.
diff --git a/internal/config/types_yaml_test.go b/internal/config/types_yaml_test.go
new file mode 100644
index 0000000..c0ef870
--- /dev/null
+++ b/internal/config/types_yaml_test.go
@@ -0,0 +1,247 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package config
+
+import (
+	"strings"
+	"testing"
+
+	"gopkg.in/yaml.v3"
+)
+
+// pickKnownOSType returns one of the keys from OSTypeMap so we can build
+// test YAML that is robust against changes to the canonical OS list.
+func pickKnownOSType(t *testing.T) string {
+	t.Helper()
+	if len(OSTypeMap) == 0 {
+		t.Fatal("OSTypeMap is empty; cannot build YAML fixtures")
+	}
+	// Prefer the well-known default if present for stable assertions.
+	preferred := "Ubuntu 22.04 6.2.0-39-generic"
+	if _, ok := OSTypeMap[preferred]; ok {
+		return preferred
+	}
+	for k := range OSTypeMap {
+		return k
+	}
+	return ""
+}
+
+func TestClusterNodeUnmarshalYAML(t *testing.T) {
+	osName := pickKnownOSType(t)
+
+	t.Run("VM node with valid fields", func(t *testing.T) {
+		in := []byte(`
+hostname: master-1
+hostType: vm
+osType: "` + osName + `"
+cpu: 4
+ram: 8
+diskSize: 50
+`)
+		var node ClusterNode
+		if err := yaml.Unmarshal(in, &node); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if node.Hostname != "master-1" {
+			t.Errorf("Hostname=%q", node.Hostname)
+		}
+		if node.HostType != HostTypeVM {
+			t.Errorf("HostType=%q", node.HostType)
+		}
+		if node.OSType.ImageURL == "" {
+			t.Errorf("OSType.ImageURL not populated for %q", osName)
+		}
+		if node.CPU != 4 || node.RAM != 8 || node.DiskSize != 50 {
+			t.Errorf("VM fields not parsed: %+v", node)
+		}
+	})
+
+	t.Run("bare-metal node with role=setup", func(t *testing.T) {
+		in := []byte(`
+hostname: boot
+hostType: bare-metal
+osType: "` + osName + `"
+role: setup
+`)
+		var node ClusterNode
+		if err := yaml.Unmarshal(in, &node); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if node.HostType != HostTypeBareMetal {
+			t.Errorf("HostType=%q", node.HostType)
+		}
+		if node.Role != ClusterRoleSetup {
+			t.Errorf("Role=%q, want %q", node.Role, ClusterRoleSetup)
+		}
+	})
+
+	t.Run("invalid hostType errors", func(t *testing.T) {
+		in := []byte(`
+hostname: x
+hostType: container
+osType: "` + osName + `"
+`)
+		var node ClusterNode
+		err := yaml.Unmarshal(in, &node)
+		if err == nil || !strings.Contains(err.Error(), "invalid hostType") {
+			t.Errorf("want 'invalid hostType' error, got %v", err)
+		}
+	})
+
+	t.Run("invalid role errors", func(t *testing.T) {
+		in := []byte(`
+hostname: x
+hostType: vm
+osType: "` + osName + `"
+role: master
+`)
+		var node ClusterNode
+		err := yaml.Unmarshal(in, &node)
+		if err == nil || !strings.Contains(err.Error(), "invalid role") {
+			t.Errorf("want 'invalid role' error, got %v", err)
+		}
+	})
+
+	t.Run("unknown osType errors", func(t *testing.T) {
+		in := []byte(`
+hostname: x
+hostType: vm
+osType: "some-unknown-os 999"
+`)
+		var node ClusterNode
+		err := yaml.Unmarshal(in, &node)
+		if err == nil || !strings.Contains(err.Error(), "unknown osType") {
+			t.Errorf("want 'unknown osType' error, got %v", err)
+		}
+	})
+}
+
+func TestClusterDefinitionUnmarshalYAML(t *testing.T) {
+	osName := pickKnownOSType(t)
+
+	t.Run("bare cluster definition (no wrapper key)", func(t *testing.T) {
+		in := []byte(`
+masters:
+  - hostname: m1
+    hostType: vm
+    osType: "` + osName + `"
+    cpu: 2
+    ram: 4
+    diskSize: 20
+workers:
+  - hostname: w1
+    hostType: vm
+    osType: "` + osName + `"
+    cpu: 2
+    ram: 4
+    diskSize: 20
+dkpParameters:
+  kubernetesVersion: "1.30"
+  registryRepo: dev-registry.deckhouse.io/sys/deckhouse-oss
+`)
+		var def ClusterDefinition
+		if err := yaml.Unmarshal(in, &def); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(def.Masters) != 1 || def.Masters[0].Hostname != "m1" {
+			t.Errorf("masters not parsed: %+v", def.Masters)
+		}
+		if len(def.Workers) != 1 || def.Workers[0].Hostname != "w1" {
+			t.Errorf("workers not parsed: %+v", def.Workers)
+		}
+		if def.DKPParameters.KubernetesVersion != "1.30" {
+			t.Errorf("KubernetesVersion=%q", def.DKPParameters.KubernetesVersion)
+		}
+	})
+
+	t.Run("clusterDefinition wrapper key", func(t *testing.T) {
+		in := []byte(`
+clusterDefinition:
+  masters:
+    - hostname: m1
+      hostType: vm
+      osType: "` + osName + `"
+      cpu: 1
+      ram: 1
+      diskSize: 10
+  dkpParameters:
+    registryRepo: dev-registry.deckhouse.io/sys/deckhouse-oss
+`)
+		var def ClusterDefinition
+		if err := yaml.Unmarshal(in, &def); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(def.Masters) != 1 || def.Masters[0].Hostname != "m1" {
+			t.Errorf("wrapper-key path didn't unwrap: %+v", def)
+		}
+		if def.DKPParameters.RegistryRepo != "dev-registry.deckhouse.io/sys/deckhouse-oss" {
+			t.Errorf("registryRepo not parsed: %q", def.DKPParameters.RegistryRepo)
+		}
+	})
+
+	t.Run("invalid nested node propagates error", func(t *testing.T) {
+		in := []byte(`
+masters:
+  - hostname: m1
+    hostType: container
+    osType: "` + osName + `"
+`)
+		var def ClusterDefinition
+		err := yaml.Unmarshal(in, &def)
+		if err == nil || !strings.Contains(err.Error(), "invalid hostType") {
+			t.Errorf("want 'invalid hostType' error, got %v", err)
+		}
+	})
+}
+
+func TestValidateModulePullOverrides_Extended(t *testing.T) {
+	t.Run("nil module entries are ignored", func(t *testing.T) {
+		def := &ClusterDefinition{
+			DKPParameters: DKPParameters{
+				Modules: []*ModuleConfig{
+					nil,
+					{Name: "ok", ModulePullOverride: "pr1"},
+					nil,
+				},
+			},
+		}
+		if err := ValidateModulePullOverrides(def); err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+	})
+
+	t.Run("empty override is allowed", func(t *testing.T) {
+		def := &ClusterDefinition{
+			DKPParameters: DKPParameters{
+				Modules: []*ModuleConfig{
+					{Name: "x", ModulePullOverride: ""},
+				},
+			},
+		}
+		if err := ValidateModulePullOverrides(def); err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+	})
+
+	t.Run("no modules at all is allowed", func(t *testing.T) {
+		def := &ClusterDefinition{}
+		if err := ValidateModulePullOverrides(def); err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+	})
+}
diff --git a/internal/kubernetes/commander/client_http_test.go b/internal/kubernetes/commander/client_http_test.go
new file mode 100644
index 0000000..d95069f
--- /dev/null
+++ b/internal/kubernetes/commander/client_http_test.go
@@ -0,0 +1,683 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// HTTP-driven tests for the Commander API client. Each test spins up an
+// httptest.Server, points a real Client at it, and asserts that requests are
+// shaped correctly and responses are decoded as expected. No network traffic
+// leaves the test binary.
+
+package commander
+
+import (
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"errors"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"net/url"
+	"strings"
+	"testing"
+)
+
+// recordedRequest captures the parts of an *http.Request we care about so
+// tests can assert against them after the response is returned.
+type recordedRequest struct {
+	Method      string
+	Path        string
+	RawQuery    string
+	Header      http.Header
+	BodyBytes   []byte
+	CookieToken string
+}
+
+// captureHandler returns an http.Handler that records every incoming request
+// into *out and calls fn to write the response. fn may inspect r to vary the
+// behaviour per call.
+func captureHandler(out *[]recordedRequest, fn http.HandlerFunc) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		body, _ := io.ReadAll(r.Body)
+		rec := recordedRequest{
+			Method:    r.Method,
+			Path:      r.URL.Path,
+			RawQuery:  r.URL.RawQuery,
+			Header:    r.Header.Clone(),
+			BodyBytes: body,
+		}
+		if cookie, err := r.Cookie("token"); err == nil {
+			rec.CookieToken = cookie.Value
+		}
+		*out = append(*out, rec)
+		fn(w, r)
+	}
+}
+
+func mustClient(t *testing.T, baseURL, token string, opts ClientOptions) *Client {
+	t.Helper()
+	c, err := NewClientWithOptions(baseURL, token, opts)
+	if err != nil {
+		t.Fatalf("NewClientWithOptions: %v", err)
+	}
+	return c
+}
+
+func TestSetAuthHeaders_AllMethods(t *testing.T) {
+	cases := []struct {
+		name        string
+		method      AuthMethod
+		user        string
+		token       string
+		assertReq   func(t *testing.T, r recordedRequest)
+	}{
+		{
+			name:   "default = X-Auth-Token",
+			method: "",
+			token:  "sekret",
+			assertReq: func(t *testing.T, r recordedRequest) {
+				if got := r.Header.Get("X-Auth-Token"); got != "sekret" {
+					t.Errorf("X-Auth-Token=%q, want sekret", got)
+				}
+				if r.Header.Get("Authorization") != "" {
+					t.Errorf("Authorization header should be empty for x-auth-token")
+				}
+			},
+		},
+		{
+			name:   "bearer",
+			method: AuthMethodBearer,
+			token:  "sekret",
+			assertReq: func(t *testing.T, r recordedRequest) {
+				if got := r.Header.Get("Authorization"); got != "Bearer sekret" {
+					t.Errorf("Authorization=%q, want Bearer sekret", got)
+				}
+			},
+		},
+		{
+			name:   "token",
+			method: AuthMethodToken,
+			token:  "sekret",
+			assertReq: func(t *testing.T, r recordedRequest) {
+				if got := r.Header.Get("Authorization"); got != "Token sekret" {
+					t.Errorf("Authorization=%q, want Token sekret", got)
+				}
+			},
+		},
+		{
+			name:   "cookie",
+			method: AuthMethodCookie,
+			token:  "sekret",
+			assertReq: func(t *testing.T, r recordedRequest) {
+				if r.CookieToken != "sekret" {
+					t.Errorf("cookie token=%q, want sekret", r.CookieToken)
+				}
+			},
+		},
+		{
+			name:   "basic",
+			method: AuthMethodBasic,
+			user:   "alice",
+			token:  "sekret",
+			assertReq: func(t *testing.T, r recordedRequest) {
+				want := "Basic " + base64.StdEncoding.EncodeToString([]byte("alice:sekret"))
+				if got := r.Header.Get("Authorization"); got != want {
+					t.Errorf("Authorization=%q, want %q", got, want)
+				}
+			},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			var recs []recordedRequest
+			srv := httptest.NewServer(captureHandler(&recs, func(w http.ResponseWriter, _ *http.Request) {
+				w.Header().Set("Content-Type", "application/json")
+				_, _ = w.Write([]byte(`{"id":"x","name":"y"}`))
+			}))
+			defer srv.Close()
+
+			c := mustClient(t, srv.URL, tc.token, ClientOptions{
+				AuthMethod: tc.method,
+				AuthUser:   tc.user,
+			})
+			if _, err := c.GetClusterByID(context.Background(), "x"); err != nil {
+				t.Fatalf("GetClusterByID: %v", err)
+			}
+			if len(recs) != 1 {
+				t.Fatalf("got %d requests, want 1", len(recs))
+			}
+			tc.assertReq(t, recs[0])
+			if r := recs[0]; r.Header.Get("Accept") != "application/json" {
+				t.Errorf("Accept=%q, want application/json", r.Header.Get("Accept"))
+			}
+		})
+	}
+}
+
+func TestGetClusterByID(t *testing.T) {
+	t.Run("200 returns cluster", func(t *testing.T) {
+		var recs []recordedRequest
+		srv := httptest.NewServer(captureHandler(&recs, func(w http.ResponseWriter, _ *http.Request) {
+			_, _ = w.Write([]byte(`{"id":"abc","name":"my-cluster","status":"in_sync"}`))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.GetClusterByID(context.Background(), "abc")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got.ID != "abc" || got.Name != "my-cluster" || got.Status != "in_sync" {
+			t.Errorf("decoded %+v", got)
+		}
+		if recs[0].Path != "/api/v1/clusters/abc" || recs[0].Method != http.MethodGet {
+			t.Errorf("unexpected request: %+v", recs[0])
+		}
+	})
+
+	t.Run("404 returns ErrClusterNotFound", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			http.Error(w, "missing", http.StatusNotFound)
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		_, err := c.GetClusterByID(context.Background(), "ghost")
+		if !errors.Is(err, ErrClusterNotFound) {
+			t.Errorf("want ErrClusterNotFound, got %v", err)
+		}
+	})
+
+	t.Run("500 returns wrapped error with body", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusInternalServerError)
+			_, _ = w.Write([]byte("boom"))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		_, err := c.GetClusterByID(context.Background(), "x")
+		if err == nil {
+			t.Fatal("expected error")
+		}
+		msg := err.Error()
+		if !strings.Contains(msg, "500") || !strings.Contains(msg, "boom") {
+			t.Errorf("error should include code and body: %v", err)
+		}
+	})
+}
+
+func TestListClustersAPI(t *testing.T) {
+	t.Run("decodes array body", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			_, _ = w.Write([]byte(`[{"id":"a","name":"alpha"},{"id":"b","name":"beta"}]`))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.ListClustersAPI(context.Background())
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(got) != 2 || got[0].Name != "alpha" || got[1].Name != "beta" {
+			t.Errorf("decoded %+v", got)
+		}
+	})
+
+	t.Run("decodes object body with items[]", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			_, _ = w.Write([]byte(`{"items":[{"id":"a","name":"alpha"}]}`))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.ListClustersAPI(context.Background())
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(got) != 1 || got[0].Name != "alpha" {
+			t.Errorf("decoded %+v", got)
+		}
+	})
+
+	t.Run("decodes object body with data[]", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			_, _ = w.Write([]byte(`{"data":[{"id":"c","name":"gamma"}]}`))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.ListClustersAPI(context.Background())
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(got) != 1 || got[0].Name != "gamma" {
+			t.Errorf("decoded %+v", got)
+		}
+	})
+
+	t.Run("garbage body returns descriptive error", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			_, _ = w.Write([]byte(`not json at all`))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		_, err := c.ListClustersAPI(context.Background())
+		if err == nil {
+			t.Fatal("expected error")
+		}
+		if !strings.Contains(err.Error(), "failed to decode response") {
+			t.Errorf("want 'failed to decode response' error, got %v", err)
+		}
+	})
+}
+
+func TestGetClusterByName(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(`[{"id":"a","name":"alpha"},{"id":"b","name":"beta"}]`))
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+
+	t.Run("found", func(t *testing.T) {
+		got, err := c.GetClusterByName(context.Background(), "beta")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got.ID != "b" {
+			t.Errorf("ID=%q, want b", got.ID)
+		}
+	})
+
+	t.Run("not found", func(t *testing.T) {
+		_, err := c.GetClusterByName(context.Background(), "zeta")
+		if !errors.Is(err, ErrClusterNotFound) {
+			t.Errorf("want ErrClusterNotFound, got %v", err)
+		}
+	})
+}
+
+func TestCreateClusterFromTemplate(t *testing.T) {
+	var recs []recordedRequest
+	srv := httptest.NewServer(captureHandler(&recs, func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusCreated)
+		_, _ = w.Write([]byte(`{"id":"new","name":"created"}`))
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+	values := map[string]interface{}{"releaseChannel": "EarlyAccess"}
+	got, err := c.CreateClusterFromTemplate(context.Background(), "my-cluster", "tpl-version-123", "reg-456", values)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got.ID != "new" || got.Name != "created" {
+		t.Errorf("decoded %+v", got)
+	}
+
+	if len(recs) != 1 {
+		t.Fatalf("got %d requests, want 1", len(recs))
+	}
+	r := recs[0]
+	if r.Method != http.MethodPost || r.Path != "/api/v1/clusters" {
+		t.Errorf("unexpected request: %s %s", r.Method, r.Path)
+	}
+	if ct := r.Header.Get("Content-Type"); ct != "application/json" {
+		t.Errorf("Content-Type=%q, want application/json", ct)
+	}
+
+	// Verify request body shape.
+	var body CreateClusterRequest
+	if err := json.Unmarshal(r.BodyBytes, &body); err != nil {
+		t.Fatalf("body not JSON: %v (raw=%s)", err, string(r.BodyBytes))
+	}
+	if body.Name != "my-cluster" || body.ClusterTemplateVersionID != "tpl-version-123" || body.RegistryID != "reg-456" {
+		t.Errorf("body fields: %+v", body)
+	}
+	if body.Values["releaseChannel"] != "EarlyAccess" {
+		t.Errorf("Values not forwarded: %+v", body.Values)
+	}
+}
+
+func TestDeleteClusterByID(t *testing.T) {
+	t.Run("204 No Content is success", func(t *testing.T) {
+		var recs []recordedRequest
+		srv := httptest.NewServer(captureHandler(&recs, func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusNoContent)
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		if err := c.DeleteClusterByID(context.Background(), "xyz"); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if recs[0].Method != http.MethodDelete || recs[0].Path != "/api/v1/clusters/xyz" {
+			t.Errorf("unexpected request: %+v", recs[0])
+		}
+	})
+
+	t.Run("202 Accepted is success", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusAccepted)
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		if err := c.DeleteClusterByID(context.Background(), "xyz"); err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+	})
+
+	t.Run("500 returns error with body", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			w.WriteHeader(http.StatusInternalServerError)
+			_, _ = w.Write([]byte("nope"))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		err := c.DeleteClusterByID(context.Background(), "xyz")
+		if err == nil || !strings.Contains(err.Error(), "500") || !strings.Contains(err.Error(), "nope") {
+			t.Errorf("want 500/nope in error, got %v", err)
+		}
+	})
+}
+
+func TestGetClusterKubeconfigByID(t *testing.T) {
+	t.Run("raw kubeconfig body", func(t *testing.T) {
+		const raw = "apiVersion: v1\nkind: Config\n"
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path != "/api/v1/clusters/x/kubeconfig" {
+				http.Error(w, "wrong path", http.StatusBadRequest)
+				return
+			}
+			_, _ = w.Write([]byte(raw))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.GetClusterKubeconfigByID(context.Background(), "x")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != raw {
+			t.Errorf("got %q, want %q", got, raw)
+		}
+	})
+
+	t.Run("JSON wrapper with 'kubeconfig' field", func(t *testing.T) {
+		const raw = "apiVersion: v1\nkind: Config\n"
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+			payload, _ := json.Marshal(map[string]string{"kubeconfig": raw})
+			_, _ = w.Write(payload)
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.GetClusterKubeconfigByID(context.Background(), "x")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != raw {
+			t.Errorf("got %q, want %q", got, raw)
+		}
+	})
+
+	t.Run("/kubeconfig 404 falls back to cluster-details ClusterAgentData.data.kubeconfig", func(t *testing.T) {
+		const raw = "apiVersion: v1\nkind: Config\n"
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			switch r.URL.Path {
+			case "/api/v1/clusters/x/kubeconfig":
+				http.Error(w, "no such endpoint", http.StatusNotFound)
+			case "/api/v1/clusters/x":
+				resp := ClusterResponse{
+					ID:     "x",
+					Status: "in_sync",
+					ClusterAgentData: map[string]interface{}{
+						"data": map[string]interface{}{"kubeconfig": raw},
+					},
+				}
+				payload, _ := json.Marshal(resp)
+				_, _ = w.Write(payload)
+			default:
+				http.Error(w, "unexpected", http.StatusBadRequest)
+			}
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		got, err := c.GetClusterKubeconfigByID(context.Background(), "x")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != raw {
+			t.Errorf("got %q, want %q", got, raw)
+		}
+	})
+
+	t.Run("/kubeconfig 404 with nothing in details returns descriptive error", func(t *testing.T) {
+		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path == "/api/v1/clusters/x/kubeconfig" {
+				http.Error(w, "no such endpoint", http.StatusNotFound)
+				return
+			}
+			// details lookup with no kubeconfig field at all
+			_, _ = w.Write([]byte(`{"id":"x","status":"new"}`))
+		}))
+		defer srv.Close()
+
+		c := mustClient(t, srv.URL, "tok", ClientOptions{})
+		_, err := c.GetClusterKubeconfigByID(context.Background(), "x")
+		if err == nil || !strings.Contains(err.Error(), "kubeconfig not found") {
+			t.Errorf("want 'kubeconfig not found' error, got %v", err)
+		}
+	})
+}
+
+func TestGetRegistryByName(t *testing.T) {
+	registries := `[{"id":"r1","name":"prod-registry"},{"id":"r2","name":"dev-registry-eu"}]`
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		_, _ = w.Write([]byte(registries))
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+
+	t.Run("exact name match wins", func(t *testing.T) {
+		got, err := c.GetRegistryByName(context.Background(), "prod-registry")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got.ID != "r1" {
+			t.Errorf("ID=%q, want r1", got.ID)
+		}
+	})
+
+	t.Run("partial match used when exact missing", func(t *testing.T) {
+		got, err := c.GetRegistryByName(context.Background(), "dev-registry")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got.ID != "r2" {
+			t.Errorf("ID=%q, want r2 (partial match)", got.ID)
+		}
+	})
+
+	t.Run("nothing matches errors", func(t *testing.T) {
+		_, err := c.GetRegistryByName(context.Background(), "no-such-thing")
+		if err == nil || !strings.Contains(err.Error(), "no-such-thing") {
+			t.Errorf("want 'no-such-thing' error, got %v", err)
+		}
+	})
+}
+
+func TestGetClusterConnectionInfo_PrefersConnectionHosts(t *testing.T) {
+	const kc = "apiVersion: v1\nkind: Config\n"
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/api/v1/clusters" && r.Method == http.MethodGet:
+			// list returns one cluster
+			resp := []ClusterResponse{{
+				ID:     "c1",
+				Name:   "my-cluster",
+				Status: "in_sync",
+				ConnectionHosts: map[string]interface{}{
+					"api_endpoint": "https://api.example.com",
+					"masters": []interface{}{
+						map[string]interface{}{
+							"host": "10.1.2.3",
+							"user": "ubuntu",
+							"port": float64(2222),
+						},
+					},
+				},
+			}}
+			payload, _ := json.Marshal(resp)
+			_, _ = w.Write(payload)
+		case r.URL.Path == "/api/v1/clusters/c1/kubeconfig":
+			_, _ = w.Write([]byte(kc))
+		default:
+			http.Error(w, "unexpected: "+r.URL.Path, http.StatusBadRequest)
+		}
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+	info, err := c.GetClusterConnectionInfo(context.Background(), "my-cluster")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if info.APIEndpoint != "https://api.example.com" {
+		t.Errorf("APIEndpoint=%q", info.APIEndpoint)
+	}
+	if info.SSHHost != "10.1.2.3" || info.SSHUser != "ubuntu" || info.SSHPort != 2222 {
+		t.Errorf("SSH info wrong: %+v", info)
+	}
+	if info.Kubeconfig != kc {
+		t.Errorf("kubeconfig not propagated")
+	}
+}
+
+func TestGetClusterConnectionInfo_FallsBackToAgentData(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch {
+		case r.URL.Path == "/api/v1/clusters" && r.Method == http.MethodGet:
+			resp := []ClusterResponse{{
+				ID:     "c1",
+				Name:   "my-cluster",
+				Status: "in_sync",
+				ClusterAgentData: map[string]interface{}{
+					"data": map[string]interface{}{
+						"ssh_host": "10.9.9.9",
+						"ssh_user": "root",
+						// no port: default 22 should apply
+					},
+				},
+			}}
+			payload, _ := json.Marshal(resp)
+			_, _ = w.Write(payload)
+		case strings.HasSuffix(r.URL.Path, "/kubeconfig"):
+			// no kubeconfig endpoint; let it fail
+			http.Error(w, "missing", http.StatusNotFound)
+		case r.URL.Path == "/api/v1/clusters/c1":
+			// detail lookup used by fallback also has no kubeconfig
+			_, _ = w.Write([]byte(`{"id":"c1","name":"my-cluster"}`))
+		default:
+			http.Error(w, "unexpected: "+r.URL.Path, http.StatusBadRequest)
+		}
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+	info, err := c.GetClusterConnectionInfo(context.Background(), "my-cluster")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if info.SSHHost != "10.9.9.9" || info.SSHUser != "root" {
+		t.Errorf("agent-data fallback failed: %+v", info)
+	}
+	if info.SSHPort != 22 {
+		t.Errorf("default port not applied, got %d", info.SSHPort)
+	}
+}
+
+func TestGetClusterConnectionInfo_FallsBackToLegacyFields(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/v1/clusters":
+			resp := []ClusterResponse{{
+				ID:      "c1",
+				Name:    "my-cluster",
+				SSHHost: "10.5.5.5",
+				SSHUser: "deploy",
+				SSHPort: 0,
+			}}
+			payload, _ := json.Marshal(resp)
+			_, _ = w.Write(payload)
+		case "/api/v1/clusters/c1/kubeconfig":
+			http.Error(w, "missing", http.StatusNotFound)
+		case "/api/v1/clusters/c1":
+			_, _ = w.Write([]byte(`{"id":"c1","name":"my-cluster"}`))
+		default:
+			http.Error(w, "unexpected: "+r.URL.Path, http.StatusBadRequest)
+		}
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+	info, err := c.GetClusterConnectionInfo(context.Background(), "my-cluster")
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if info.SSHHost != "10.5.5.5" || info.SSHUser != "deploy" || info.SSHPort != 22 {
+		t.Errorf("legacy fallback failed: %+v", info)
+	}
+}
+
+func TestGetClusterConnectionInfo_NoSSHNoKubeconfigErrors(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/api/v1/clusters":
+			_, _ = w.Write([]byte(`[{"id":"c1","name":"my-cluster"}]`))
+		case "/api/v1/clusters/c1/kubeconfig":
+			http.Error(w, "no kubeconfig endpoint", http.StatusNotFound)
+		case "/api/v1/clusters/c1":
+			_, _ = w.Write([]byte(`{"id":"c1","name":"my-cluster"}`))
+		default:
+			http.Error(w, "unexpected: "+r.URL.Path, http.StatusBadRequest)
+		}
+	}))
+	defer srv.Close()
+
+	c := mustClient(t, srv.URL, "tok", ClientOptions{})
+	_, err := c.GetClusterConnectionInfo(context.Background(), "my-cluster")
+	if err == nil {
+		t.Fatal("expected error when neither kubeconfig nor SSH info is available")
+	}
+}
+
+// Sanity: ensure srv.URL is a usable URL the client can talk to. This guards
+// against test infra regressions where httptest.NewServer returns something
+// pathological.
+func TestServerURLIsValid(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {}))
+	defer srv.Close()
+	if _, err := url.Parse(srv.URL); err != nil {
+		t.Fatalf("httptest server URL is not parseable: %v", err)
+	}
+}
diff --git a/internal/kubernetes/commander/client_test.go b/internal/kubernetes/commander/client_test.go
new file mode 100644
index 0000000..4180b79
--- /dev/null
+++ b/internal/kubernetes/commander/client_test.go
@@ -0,0 +1,176 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package commander
+
+import (
+	"encoding/base64"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func TestMapStatusToPhase(t *testing.T) {
+	cases := []struct {
+		in   string
+		want ClusterPhase
+	}{
+		{"in_sync", ClusterPhaseReady},
+		{"insync", ClusterPhaseReady},
+		{"ready", ClusterPhaseReady},
+		{"running", ClusterPhaseReady},
+		{"active", ClusterPhaseReady},
+		{"new", ClusterPhaseDraft},
+		{"creating", ClusterPhaseCreating},
+		{"provisioning", ClusterPhaseCreating},
+		{"bootstrapping", ClusterPhaseCreating},
+		{"updating", ClusterPhaseUpdating},
+		{"upgrading", ClusterPhaseUpdating},
+		{"deleting", ClusterPhaseDeleting},
+		{"terminating", ClusterPhaseDeleting},
+		{"failed", ClusterPhaseFailed},
+		{"error", ClusterPhaseFailed},
+		{"joining", ClusterPhaseJoining},
+		{"ready_to_join", ClusterPhaseReadyToJoin},
+		{"readytojoin", ClusterPhaseReadyToJoin},
+
+		// Case-insensitive.
+		{"IN_SYNC", ClusterPhaseReady},
+		{"Creating", ClusterPhaseCreating},
+
+		// Unknown values pass through verbatim (no normalization).
+		{"some-novel-state", ClusterPhase("some-novel-state")},
+		{"", ClusterPhase("")},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.in, func(t *testing.T) {
+			got := mapStatusToPhase(tc.in)
+			if got != tc.want {
+				t.Errorf("mapStatusToPhase(%q)=%q, want %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestBase64Encode(t *testing.T) {
+	cases := []struct {
+		name, in, want string
+	}{
+		{"empty", "", ""},
+		{"ascii", "user:token", "dXNlcjp0b2tlbg=="},
+		{"unicode", "tëst", "dMOrc3Q="},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := base64Encode(tc.in)
+			if got != tc.want {
+				t.Errorf("base64Encode(%q)=%q, want %q", tc.in, got, tc.want)
+			}
+			// Sanity: must round-trip via stdlib decoder.
+			dec, err := base64.StdEncoding.DecodeString(got)
+			if err != nil {
+				t.Fatalf("decode error: %v", err)
+			}
+			if string(dec) != tc.in {
+				t.Errorf("round-trip mismatch: %q -> %q", tc.in, string(dec))
+			}
+		})
+	}
+}
+
+func TestNewClientWithOptions_Validation(t *testing.T) {
+	t.Run("empty baseURL errors", func(t *testing.T) {
+		_, err := NewClientWithOptions("", "tok", ClientOptions{})
+		if err == nil || !strings.Contains(err.Error(), "baseURL") {
+			t.Errorf("want baseURL error, got %v", err)
+		}
+	})
+
+	t.Run("empty token errors", func(t *testing.T) {
+		_, err := NewClientWithOptions("https://x", "", ClientOptions{})
+		if err == nil || !strings.Contains(err.Error(), "token") {
+			t.Errorf("want token error, got %v", err)
+		}
+	})
+
+	t.Run("defaults: x-auth-token + /api/v1", func(t *testing.T) {
+		c, err := NewClientWithOptions("https://commander.example.com", "tok", ClientOptions{})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if c.authMethod != AuthMethodXAuthToken {
+			t.Errorf("authMethod=%q, want %q", c.authMethod, AuthMethodXAuthToken)
+		}
+		if c.apiPrefix != "/api/v1" {
+			t.Errorf("apiPrefix=%q, want /api/v1", c.apiPrefix)
+		}
+	})
+
+	t.Run("trailing slashes trimmed from baseURL and apiPrefix", func(t *testing.T) {
+		c, err := NewClientWithOptions("https://x/", "tok", ClientOptions{APIPrefix: "/api/"})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if c.baseURL != "https://x" {
+			t.Errorf("baseURL=%q, want %q", c.baseURL, "https://x")
+		}
+		if c.apiPrefix != "/api" {
+			t.Errorf("apiPrefix=%q, want /api", c.apiPrefix)
+		}
+	})
+
+	t.Run("explicit auth method is honored", func(t *testing.T) {
+		c, err := NewClientWithOptions("https://x", "tok", ClientOptions{AuthMethod: AuthMethodBearer})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if c.authMethod != AuthMethodBearer {
+			t.Errorf("authMethod=%q, want %q", c.authMethod, AuthMethodBearer)
+		}
+	})
+
+	t.Run("bad CACertPath errors with file path", func(t *testing.T) {
+		bad := filepath.Join(t.TempDir(), "does-not-exist.pem")
+		_, err := NewClientWithOptions("https://x", "tok", ClientOptions{CACertPath: bad})
+		if err == nil || !strings.Contains(err.Error(), "CA certificate") {
+			t.Errorf("want CA certificate error, got %v", err)
+		}
+	})
+
+	t.Run("malformed CA cert content errors", func(t *testing.T) {
+		path := filepath.Join(t.TempDir(), "not-a-cert.pem")
+		if err := os.WriteFile(path, []byte("not pem data"), 0o600); err != nil {
+			t.Fatal(err)
+		}
+		_, err := NewClientWithOptions("https://x", "tok", ClientOptions{CACertPath: path})
+		if err == nil || !strings.Contains(err.Error(), "parse CA certificate") {
+			t.Errorf("want parse error, got %v", err)
+		}
+	})
+
+	t.Run("InsecureSkipTLSVerify is configured", func(t *testing.T) {
+		c, err := NewClientWithOptions("https://x", "tok", ClientOptions{InsecureSkipTLSVerify: true})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if c.httpClient == nil || c.httpClient.Transport == nil {
+			t.Fatal("expected non-nil httpClient/transport")
+		}
+	})
+}
+
diff --git a/internal/logger/level_test.go b/internal/logger/level_test.go
new file mode 100644
index 0000000..864ada0
--- /dev/null
+++ b/internal/logger/level_test.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package logger
+
+import (
+	"log/slog"
+	"testing"
+)
+
+func TestLevelToString(t *testing.T) {
+	cases := []struct {
+		level slog.Level
+		want  string
+	}{
+		{slog.LevelDebug, "DEBUG"},
+		{slog.LevelInfo, "INFO"},
+		{slog.LevelWarn, "WARN"},
+		{slog.LevelError, "ERROR"},
+		// Unknown/out-of-range level returns "INFO".
+		{slog.Level(100), "INFO"},
+		{slog.Level(-100), "INFO"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.want, func(t *testing.T) {
+			got := LevelToString(tc.level)
+			if got != tc.want {
+				t.Errorf("LevelToString(%v)=%q, want %q", tc.level, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestParseLevelLevelToStringRoundTrip checks that the canonical pairs survive
+// a string -> Level -> string trip. Note: ParseLevel accepts "warning" as a
+// synonym for "warn" but LevelToString always emits "WARN".
+func TestParseLevelLevelToStringRoundTrip(t *testing.T) {
+	pairs := map[string]string{
+		"debug": "DEBUG",
+		"info":  "INFO",
+		"warn":  "WARN",
+		"error": "ERROR",
+	}
+	for in, want := range pairs {
+		t.Run(in, func(t *testing.T) {
+			got := LevelToString(ParseLevel(in))
+			if got != want {
+				t.Errorf("LevelToString(ParseLevel(%q))=%q, want %q", in, got, want)
+			}
+		})
+	}
+
+	// "warning" is a synonym for warn.
+	if got := LevelToString(ParseLevel("warning")); got != "WARN" {
+		t.Errorf("ParseLevel(\"warning\") roundtrip = %q, want WARN", got)
+	}
+}
diff --git a/pkg/cluster/vms_test.go b/pkg/cluster/vms_test.go
new file mode 100644
index 0000000..917a557
--- /dev/null
+++ b/pkg/cluster/vms_test.go
@@ -0,0 +1,257 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package cluster
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/deckhouse/storage-e2e/internal/config"
+)
+
+func TestGetCVMINameFromImageURL(t *testing.T) {
+	cases := []struct {
+		name string
+		in   string
+		want string
+	}{
+		{"strips .img extension", "https://example.com/jammy-server.img", "jammy-server"},
+		{"strips .qcow2 extension", "https://example.com/redos.qcow2", "redos"},
+		{"lowercases", "https://EXAMPLE.com/UBUNTU-22.04.img", "ubuntu-22-04"},
+		{"underscores to hyphens", "https://example.com/my_image_file.img", "my-image-file"},
+		{"dots to hyphens", "https://example.com/v1.0.2-image.img", "v1-0-2-image"},
+		{"collapses consecutive hyphens", "https://example.com/a__b..c.img", "a-b-c"},
+		{"trims leading/trailing hyphens", "https://example.com/_..foo..img", "foo"},
+		{
+			"empty after sanitation -> 'image'",
+			"https://example.com/._.qcow2",
+			"image",
+		},
+		{
+			"no path segments yet still works",
+			"foo.img",
+			"foo",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := getCVMINameFromImageURL(tc.in)
+			if got != tc.want {
+				t.Errorf("getCVMINameFromImageURL(%q)=%q, want %q", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestGetVMNodes(t *testing.T) {
+	t.Run("filters VM nodes across masters/workers and includes VM setup", func(t *testing.T) {
+		def := &config.ClusterDefinition{
+			Masters: []config.ClusterNode{
+				{Hostname: "m1", HostType: config.HostTypeVM},
+				{Hostname: "m2", HostType: config.HostTypeBareMetal},
+			},
+			Workers: []config.ClusterNode{
+				{Hostname: "w1", HostType: config.HostTypeBareMetal},
+				{Hostname: "w2", HostType: config.HostTypeVM},
+			},
+			Setup: &config.ClusterNode{Hostname: "setup", HostType: config.HostTypeVM},
+		}
+
+		nodes := getVMNodes(def)
+		got := hostnames(nodes)
+		want := []string{"m1", "w2", "setup"}
+		if !equalStrings(got, want) {
+			t.Errorf("getVMNodes hostnames = %v, want %v", got, want)
+		}
+	})
+
+	t.Run("bare-metal setup is excluded", func(t *testing.T) {
+		def := &config.ClusterDefinition{
+			Masters: []config.ClusterNode{{Hostname: "m1", HostType: config.HostTypeVM}},
+			Setup:   &config.ClusterNode{Hostname: "setup-bm", HostType: config.HostTypeBareMetal},
+		}
+		nodes := getVMNodes(def)
+		got := hostnames(nodes)
+		if !equalStrings(got, []string{"m1"}) {
+			t.Errorf("got %v, want [m1] (setup must be excluded when bare-metal)", got)
+		}
+	})
+
+	t.Run("nil setup is fine", func(t *testing.T) {
+		def := &config.ClusterDefinition{
+			Workers: []config.ClusterNode{{Hostname: "w1", HostType: config.HostTypeVM}},
+		}
+		nodes := getVMNodes(def)
+		if len(nodes) != 1 || nodes[0].Hostname != "w1" {
+			t.Errorf("got %v, want [w1]", hostnames(nodes))
+		}
+	})
+}
+
+func TestGetSetupNode(t *testing.T) {
+	t.Run("nil clusterDef errors", func(t *testing.T) {
+		_, err := GetSetupNode(nil)
+		if err == nil {
+			t.Fatal("expected error for nil clusterDef")
+		}
+	})
+
+	t.Run("nil Setup errors", func(t *testing.T) {
+		_, err := GetSetupNode(&config.ClusterDefinition{})
+		if err == nil {
+			t.Fatal("expected error when Setup is nil")
+		}
+	})
+
+	t.Run("returns setup node", func(t *testing.T) {
+		want := &config.ClusterNode{Hostname: "boot", HostType: config.HostTypeVM}
+		def := &config.ClusterDefinition{Setup: want}
+		got, err := GetSetupNode(def)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if got != want {
+			t.Errorf("got %p, want %p", got, want)
+		}
+	})
+}
+
+func TestGetNodeIPAddress(t *testing.T) {
+	def := &config.ClusterDefinition{
+		Masters: []config.ClusterNode{
+			{Hostname: "m1", IPAddress: "10.0.0.1", HostType: config.HostTypeVM},
+			{Hostname: "m-empty", IPAddress: "", HostType: config.HostTypeVM},
+		},
+		Workers: []config.ClusterNode{
+			{Hostname: "w1", IPAddress: "10.0.0.2", HostType: config.HostTypeVM},
+		},
+		Setup: &config.ClusterNode{Hostname: "boot", IPAddress: "10.0.0.99", HostType: config.HostTypeVM},
+	}
+
+	cases := []struct {
+		name     string
+		hostname string
+		wantIP   string
+		wantErr  string // substring; empty means no error expected
+	}{
+		{"master found", "m1", "10.0.0.1", ""},
+		{"worker found", "w1", "10.0.0.2", ""},
+		{"setup found", "boot", "10.0.0.99", ""},
+		{"master IP empty -> error", "m-empty", "", "IP address not set for master"},
+		{"unknown hostname -> error", "ghost", "", "not found in cluster definition"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			ip, err := GetNodeIPAddress(def, tc.hostname)
+			if tc.wantErr == "" {
+				if err != nil {
+					t.Fatalf("unexpected error: %v", err)
+				}
+				if ip != tc.wantIP {
+					t.Errorf("ip=%q, want %q", ip, tc.wantIP)
+				}
+				return
+			}
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", tc.wantErr)
+			}
+			if !strings.Contains(err.Error(), tc.wantErr) {
+				t.Errorf("error %q does not contain %q", err, tc.wantErr)
+			}
+		})
+	}
+}
+
+func TestGenerateRandomSuffix(t *testing.T) {
+	t.Run("length matches argument", func(t *testing.T) {
+		for _, n := range []int{0, 1, 5, 32} {
+			got := GenerateRandomSuffix(n)
+			if len(got) != n {
+				t.Errorf("len(GenerateRandomSuffix(%d))=%d, want %d (got=%q)", n, len(got), n, got)
+			}
+		}
+	})
+
+	t.Run("uses only lowercase alphanumerics", func(t *testing.T) {
+		got := GenerateRandomSuffix(64)
+		for _, r := range got {
+			if !((r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')) {
+				t.Fatalf("invalid character %q in suffix %q", r, got)
+			}
+		}
+	})
+
+	t.Run("two calls usually differ for non-trivial length", func(t *testing.T) {
+		// With charset of 36 and length 16 the collision probability is
+		// 36^-16 — negligible. If it ever collides we have a bigger problem.
+		a := GenerateRandomSuffix(16)
+		b := GenerateRandomSuffix(16)
+		if a == b {
+			t.Errorf("two 16-char suffixes were identical: %q (probability ~36^-16)", a)
+		}
+	})
+}
+
+func TestGenerateCloudInitUserData(t *testing.T) {
+	t.Run("worker cloud-init includes hostname and key", func(t *testing.T) {
+		got := generateCloudInitUserData("worker-1", "ssh-ed25519 AAAA test@key")
+		mustContain(t, got, "#cloud-config")
+		mustContain(t, got, "hostnamectl set-hostname worker-1")
+		mustContain(t, got, "ssh-ed25519 AAAA test@key")
+		mustContain(t, got, "mirror.yandex.ru/ubuntu") // apt mirror tweak
+		mustContain(t, got, "99force-ipv4")            // IPv4 apt override
+	})
+
+	t.Run("setup cloud-init includes docker and key", func(t *testing.T) {
+		got := generateSetupNodeCloudInit("bootstrap-node-abc", "ssh-rsa BBBB me@host")
+		mustContain(t, got, "#cloud-config")
+		mustContain(t, got, "hostnamectl set-hostname bootstrap-node-abc")
+		mustContain(t, got, "ssh-rsa BBBB me@host")
+		mustContain(t, got, "docker.io")
+		mustContain(t, got, "systemctl enable --now docker.service")
+	})
+}
+
+// helpers ---------------------------------------------------------------
+
+func hostnames(nodes []config.ClusterNode) []string {
+	out := make([]string, 0, len(nodes))
+	for _, n := range nodes {
+		out = append(out, n.Hostname)
+	}
+	return out
+}
+
+func equalStrings(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func mustContain(t *testing.T, s, substr string) {
+	t.Helper()
+	if !strings.Contains(s, substr) {
+		t.Errorf("missing %q in output:\n%s", substr, s)
+	}
+}
diff --git a/pkg/kubernetes/apply_test.go b/pkg/kubernetes/apply_test.go
new file mode 100644
index 0000000..b19db38
--- /dev/null
+++ b/pkg/kubernetes/apply_test.go
@@ -0,0 +1,128 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"reflect"
+	"sort"
+	"testing"
+)
+
+func TestFindUnsetEnvVars(t *testing.T) {
+	t.Run("returns deduped names for unset vars only", func(t *testing.T) {
+		const (
+			unsetA = "STORAGE_E2E_TEST_UNSET_A"
+			unsetB = "STORAGE_E2E_TEST_UNSET_B"
+			setOK  = "STORAGE_E2E_TEST_SET_OK"
+		)
+		// Explicitly empty the probe vars so the test is hermetic even if
+		// something in the environment happened to set them.
+		t.Setenv(unsetA, "")
+		t.Setenv(unsetB, "")
+		t.Setenv(setOK, "value-is-here")
+
+		content := `image: ${` + unsetA + `}
+extra: ${` + unsetA + `}
+ref: ${` + unsetB + `}
+tag: ${` + setOK + `}`
+
+		got := FindUnsetEnvVars(content)
+		sort.Strings(got)
+		want := []string{unsetA, unsetB}
+		sort.Strings(want)
+		if !reflect.DeepEqual(got, want) {
+			t.Fatalf("FindUnsetEnvVars()=%v, want %v", got, want)
+		}
+	})
+
+	t.Run("ignores non-matching tokens", func(t *testing.T) {
+		// `$VAR` (no braces) and `${1foo}` (starts with digit) must be ignored.
+		got := FindUnsetEnvVars(`a: $PLAIN
+b: ${1invalid}
+c: ${valid_NAME}`)
+		// $PLAIN and ${1invalid} should not show up. ${valid_NAME} should.
+		// Note: this var should remain unset (highly unlikely to collide).
+		t.Setenv("valid_NAME", "") // ensure unset (empty -> treated as unset)
+		// re-evaluate now that we forced empty
+		got = FindUnsetEnvVars(`c: ${valid_NAME}`)
+		if !reflect.DeepEqual(got, []string{"valid_NAME"}) {
+			t.Fatalf("got %v, want [valid_NAME]", got)
+		}
+	})
+
+	t.Run("empty content returns nil", func(t *testing.T) {
+		if got := FindUnsetEnvVars(""); got != nil {
+			t.Fatalf("got %v, want nil", got)
+		}
+	})
+}
+
+func TestSplitYAMLDocuments(t *testing.T) {
+	cases := []struct {
+		name string
+		in   string
+		want []string
+	}{
+		{
+			name: "single document",
+			in:   "kind: A\nmetadata:\n  name: a\n",
+			want: []string{"kind: A\nmetadata:\n  name: a"},
+		},
+		{
+			name: "two documents joined by \\n---\\n",
+			in:   "kind: A\n---\nkind: B\n",
+			want: []string{"kind: A", "kind: B"},
+		},
+		{
+			name: "trailing separator yields no extra empty doc",
+			in:   "kind: A\n---\n",
+			want: []string{"kind: A"},
+		},
+		{
+			name: "leading separator skipped",
+			in:   "---\nkind: A\n",
+			// "---\nkind: A\n" has no "\n---\n" separator (the leading "---" is at line 0).
+			// Strings.Split returns the whole thing as one entry; "---\nkind: A" remains
+			// after TrimSpace and is kept because it isn't bare "---".
+			want: []string{"---\nkind: A"},
+		},
+		{
+			name: "only separators -> nothing",
+			in:   "\n---\n\n---\n",
+			want: nil,
+		},
+		{
+			name: "empty input returns nil",
+			in:   "",
+			want: nil,
+		},
+		{
+			name: "three docs with whitespace",
+			in:   "  kind: A  \n---\nkind: B\n---\nkind: C",
+			want: []string{"kind: A", "kind: B", "kind: C"},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := splitYAMLDocuments(tc.in)
+			if !reflect.DeepEqual(got, tc.want) {
+				t.Fatalf("splitYAMLDocuments(%q)=\n  %#v\nwant\n  %#v", tc.in, got, tc.want)
+			}
+		})
+	}
+}
diff --git a/pkg/kubernetes/modules_test.go b/pkg/kubernetes/modules_test.go
new file mode 100644
index 0000000..977750b
--- /dev/null
+++ b/pkg/kubernetes/modules_test.go
@@ -0,0 +1,280 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"errors"
+	"sort"
+	"strings"
+	"testing"
+
+	"github.com/deckhouse/storage-e2e/internal/config"
+)
+
+func TestConvertModuleSpecsToConfigs(t *testing.T) {
+	t.Run("nil Settings becomes empty map", func(t *testing.T) {
+		got := convertModuleSpecsToConfigs([]ModuleSpec{
+			{Name: "foo", Version: 1, Enabled: true, Settings: nil},
+		})
+		if len(got) != 1 {
+			t.Fatalf("len=%d, want 1", len(got))
+		}
+		if got[0].Settings == nil {
+			t.Fatal("Settings is nil; expected non-nil empty map")
+		}
+		if len(got[0].Settings) != 0 {
+			t.Errorf("Settings should be empty, got %v", got[0].Settings)
+		}
+	})
+
+	t.Run("copies fields verbatim", func(t *testing.T) {
+		specs := []ModuleSpec{
+			{
+				Name:               "csi-ceph",
+				Version:            2,
+				Enabled:            true,
+				Settings:           map[string]interface{}{"foo": "bar"},
+				Dependencies:       []string{"snapshot-controller"},
+				ModulePullOverride: "pr131",
+			},
+			{
+				Name:    "noop",
+				Enabled: false,
+			},
+		}
+		got := convertModuleSpecsToConfigs(specs)
+		if len(got) != 2 {
+			t.Fatalf("len=%d, want 2", len(got))
+		}
+		if got[0].Name != "csi-ceph" || got[0].Version != 2 || !got[0].Enabled {
+			t.Errorf("got[0]=%+v", got[0])
+		}
+		if got[0].Settings["foo"] != "bar" {
+			t.Errorf("settings not copied: %v", got[0].Settings)
+		}
+		if !equalStrings(got[0].Dependencies, []string{"snapshot-controller"}) {
+			t.Errorf("Dependencies = %v", got[0].Dependencies)
+		}
+		if got[0].ModulePullOverride != "pr131" {
+			t.Errorf("ModulePullOverride = %q", got[0].ModulePullOverride)
+		}
+	})
+
+	t.Run("empty input returns empty slice", func(t *testing.T) {
+		got := convertModuleSpecsToConfigs(nil)
+		if got == nil {
+			t.Fatal("expected non-nil empty slice")
+		}
+		if len(got) != 0 {
+			t.Errorf("len=%d, want 0", len(got))
+		}
+	})
+}
+
+func TestBuildModuleGraph(t *testing.T) {
+	t.Run("empty input -> empty graph", func(t *testing.T) {
+		g, err := buildModuleGraph(nil)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(g.modules) != 0 || len(g.dependencies) != 0 || len(g.reverseDeps) != 0 {
+			t.Errorf("non-empty graph: %+v", g)
+		}
+	})
+
+	t.Run("builds dependency and reverse-dependency edges", func(t *testing.T) {
+		modules := []*config.ModuleConfig{
+			{Name: "a"},
+			{Name: "b", Dependencies: []string{"a"}},
+			{Name: "c", Dependencies: []string{"a", "b"}},
+		}
+		g, err := buildModuleGraph(modules)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(g.modules) != 3 {
+			t.Errorf("modules count = %d, want 3", len(g.modules))
+		}
+
+		// Forward deps.
+		if !equalStrings(g.dependencies["b"], []string{"a"}) {
+			t.Errorf("dep[b] = %v", g.dependencies["b"])
+		}
+		if !equalStrings(g.dependencies["c"], []string{"a", "b"}) {
+			t.Errorf("dep[c] = %v", g.dependencies["c"])
+		}
+
+		// Reverse deps: who depends on "a"? -> b and c.
+		gotRevA := append([]string(nil), g.reverseDeps["a"]...)
+		sort.Strings(gotRevA)
+		if !equalStrings(gotRevA, []string{"b", "c"}) {
+			t.Errorf("reverseDeps[a] = %v", gotRevA)
+		}
+		// "b" is depended on only by c.
+		if !equalStrings(g.reverseDeps["b"], []string{"c"}) {
+			t.Errorf("reverseDeps[b] = %v", g.reverseDeps["b"])
+		}
+	})
+
+	t.Run("missing dependency returns error", func(t *testing.T) {
+		modules := []*config.ModuleConfig{
+			{Name: "a", Dependencies: []string{"ghost"}},
+		}
+		_, err := buildModuleGraph(modules)
+		if err == nil {
+			t.Fatal("expected error for missing dependency")
+		}
+		if !strings.Contains(err.Error(), "ghost") || !strings.Contains(err.Error(), "a") {
+			t.Errorf("error should mention 'ghost' and 'a': %v", err)
+		}
+	})
+}
+
+func TestTopologicalSortLevels(t *testing.T) {
+	t.Run("orders by dependency depth (multi-level)", func(t *testing.T) {
+		modules := []*config.ModuleConfig{
+			{Name: "leaf1"},
+			{Name: "leaf2"},
+			{Name: "mid", Dependencies: []string{"leaf1", "leaf2"}},
+			{Name: "top", Dependencies: []string{"mid"}},
+		}
+		g, err := buildModuleGraph(modules)
+		if err != nil {
+			t.Fatalf("graph error: %v", err)
+		}
+		levels, err := topologicalSortLevels(g)
+		if err != nil {
+			t.Fatalf("sort error: %v", err)
+		}
+		if len(levels) != 3 {
+			t.Fatalf("want 3 levels, got %d (%v)", len(levels), levelNames(levels))
+		}
+		level0 := names(levels[0])
+		sort.Strings(level0)
+		if !equalStrings(level0, []string{"leaf1", "leaf2"}) {
+			t.Errorf("level 0 = %v, want [leaf1 leaf2]", level0)
+		}
+		if !equalStrings(names(levels[1]), []string{"mid"}) {
+			t.Errorf("level 1 = %v, want [mid]", names(levels[1]))
+		}
+		if !equalStrings(names(levels[2]), []string{"top"}) {
+			t.Errorf("level 2 = %v, want [top]", names(levels[2]))
+		}
+	})
+
+	t.Run("flat list -> single level", func(t *testing.T) {
+		modules := []*config.ModuleConfig{
+			{Name: "a"}, {Name: "b"}, {Name: "c"},
+		}
+		g, _ := buildModuleGraph(modules)
+		levels, err := topologicalSortLevels(g)
+		if err != nil {
+			t.Fatalf("sort error: %v", err)
+		}
+		if len(levels) != 1 || len(levels[0]) != 3 {
+			t.Errorf("expected 1 level of 3 modules, got %d levels (%v)", len(levels), levelNames(levels))
+		}
+	})
+
+	t.Run("cycle detection returns error listing remaining modules", func(t *testing.T) {
+		// Cycle: a -> b -> a
+		modules := []*config.ModuleConfig{
+			{Name: "a", Dependencies: []string{"b"}},
+			{Name: "b", Dependencies: []string{"a"}},
+		}
+		g, err := buildModuleGraph(modules)
+		if err != nil {
+			t.Fatalf("graph error: %v", err)
+		}
+		_, err = topologicalSortLevels(g)
+		if err == nil {
+			t.Fatal("expected error for cycle")
+		}
+		if !strings.Contains(err.Error(), "circular dependency") {
+			t.Errorf("want 'circular dependency' in error: %v", err)
+		}
+		// Must list both remaining modules.
+		if !strings.Contains(err.Error(), "a") || !strings.Contains(err.Error(), "b") {
+			t.Errorf("error should list cycle members: %v", err)
+		}
+	})
+
+	t.Run("empty graph returns no levels", func(t *testing.T) {
+		g, _ := buildModuleGraph(nil)
+		levels, err := topologicalSortLevels(g)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(levels) != 0 {
+			t.Errorf("want 0 levels, got %d", len(levels))
+		}
+	})
+}
+
+func TestIsWebhookConnectionError(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"nil", nil, false},
+		{"connection refused", errors.New("dial: connection refused"), true},
+		{"failed calling webhook", errors.New("failed calling webhook foo.bar"), true},
+		{"webhook + timeout combo", errors.New("Internal error: webhook handler request timeout"), true},
+		{"plain webhook word only", errors.New("webhook validating CR"), false},
+		{"unrelated error", errors.New("oops"), false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := isWebhookConnectionError(tc.err)
+			if got != tc.want {
+				t.Fatalf("isWebhookConnectionError(%v)=%v, want %v", tc.err, got, tc.want)
+			}
+		})
+	}
+}
+
+// helpers ---------------------------------------------------------------
+
+func names(modules []*config.ModuleConfig) []string {
+	out := make([]string, 0, len(modules))
+	for _, m := range modules {
+		out = append(out, m.Name)
+	}
+	return out
+}
+
+func levelNames(levels [][]*config.ModuleConfig) [][]string {
+	out := make([][]string, len(levels))
+	for i, lvl := range levels {
+		out[i] = names(lvl)
+	}
+	return out
+}
+
+func equalStrings(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
diff --git a/pkg/kubernetes/poll_test.go b/pkg/kubernetes/poll_test.go
new file mode 100644
index 0000000..ee76028
--- /dev/null
+++ b/pkg/kubernetes/poll_test.go
@@ -0,0 +1,102 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package kubernetes
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+)
+
+func TestFormatRef(t *testing.T) {
+	cases := []struct {
+		namespace, name, want string
+	}{
+		{"", "cluster-scoped", "cluster-scoped"},
+		{"ns", "foo", "ns/foo"},
+		{"", "", ""},
+		{"ns", "", "ns/"},
+	}
+	for _, tc := range cases {
+		got := formatRef(tc.namespace, tc.name)
+		if got != tc.want {
+			t.Errorf("formatRef(%q,%q)=%q, want %q", tc.namespace, tc.name, got, tc.want)
+		}
+	}
+}
+
+func TestSameFinalizers(t *testing.T) {
+	cases := []struct {
+		name string
+		a, b []string
+		want bool
+	}{
+		{"both nil", nil, nil, true},
+		{"both empty", []string{}, []string{}, true},
+		{"identical order", []string{"a", "b"}, []string{"a", "b"}, true},
+		{"different order", []string{"a", "b"}, []string{"b", "a"}, false},
+		{"different lengths", []string{"a"}, []string{"a", "b"}, false},
+		{"duplicates differ", []string{"a", "a"}, []string{"a", "b"}, false},
+		{"single equal", []string{"x"}, []string{"x"}, true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := sameFinalizers(tc.a, tc.b)
+			if got != tc.want {
+				t.Errorf("sameFinalizers(%v,%v)=%v, want %v", tc.a, tc.b, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestErrIfTerminating(t *testing.T) {
+	t.Run("no deletionTimestamp returns nil", func(t *testing.T) {
+		obj := &unstructured.Unstructured{}
+		obj.SetName("foo")
+		if err := errIfTerminating(obj, "Pod", "default/foo"); err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+	})
+
+	t.Run("deletionTimestamp set returns descriptive error", func(t *testing.T) {
+		obj := &unstructured.Unstructured{}
+		obj.SetName("foo")
+		obj.SetNamespace("default")
+		now := metav1.NewTime(time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC))
+		obj.SetDeletionTimestamp(&now)
+		obj.SetFinalizers([]string{"kubernetes.io/pvc-protection"})
+
+		err := errIfTerminating(obj, "Pod", "default/foo")
+		if err == nil {
+			t.Fatal("expected error")
+		}
+		msg := err.Error()
+		for _, want := range []string{
+			"Pod",
+			"default/foo",
+			"deletionTimestamp",
+			"kubernetes.io/pvc-protection",
+		} {
+			if !strings.Contains(msg, want) {
+				t.Errorf("error missing %q: %v", want, msg)
+			}
+		}
+	})
+}
diff --git a/pkg/retry/retry_test.go b/pkg/retry/retry_test.go
new file mode 100644
index 0000000..9b953d0
--- /dev/null
+++ b/pkg/retry/retry_test.go
@@ -0,0 +1,408 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package retry
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+)
+
+// statusErr constructs a *apierrors.StatusError with the given HTTP code and
+// optional RetryAfterSeconds, mirroring how apiserver decodes failures.
+func statusErr(code int32, retryAfterSec int32) *apierrors.StatusError {
+	st := metav1.Status{
+		Status: metav1.StatusFailure,
+		Code:   code,
+	}
+	if retryAfterSec > 0 {
+		st.Details = &metav1.StatusDetails{RetryAfterSeconds: retryAfterSec}
+	}
+	return &apierrors.StatusError{ErrStatus: st}
+}
+
+func TestIsRetryable(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"nil error", nil, false},
+
+		// Status code branch.
+		{"status 500", statusErr(500, 0), true},
+		{"status 503", statusErr(503, 0), true},
+		{"status 429 (too many requests)", statusErr(429, 0), true},
+		{"status 501 not implemented", statusErr(501, 0), false},
+		{"status 200 ok", statusErr(200, 0), false},
+		{"status 404 not found", statusErr(404, 0), false},
+		{"status with RetryAfterSeconds hint", statusErr(400, 7), true},
+
+		// Helper-based detection from apimachinery.
+		{
+			"server timeout via helper",
+			apierrors.NewServerTimeout(schema.GroupResource{Resource: "pods"}, "x", 1),
+			true,
+		},
+		{
+			"service unavailable via helper",
+			apierrors.NewServiceUnavailable("svc"),
+			true,
+		},
+		{
+			"too many requests via helper",
+			apierrors.NewTooManyRequestsError("rl"),
+			true,
+		},
+		{
+			"internal error via helper",
+			apierrors.NewInternalError(errors.New("boom")),
+			true,
+		},
+
+		// io.EOF is always retryable (broken connections).
+		{"io.EOF", io.EOF, true},
+		{"wrapped io.EOF", fmt.Errorf("read tcp: %w", io.EOF), true},
+
+		// String-pattern branches.
+		{"TLS handshake timeout", errors.New("TLS handshake timeout"), true},
+		{"connection refused", errors.New("dial: connection refused"), true},
+		{"connection reset", errors.New("read: connection reset by peer"), true},
+		{"connection timed out", errors.New("connection timed out"), true},
+		{"i/o timeout", errors.New("read tcp: i/o timeout"), true},
+		{"EOF substring", errors.New("transport: EOF"), true},
+		{"broken pipe", errors.New("write: broken pipe"), true},
+		{"no route to host", errors.New("dial: no route to host"), true},
+		{"network is unreachable", errors.New("dial: network is unreachable"), true},
+		{"net/http: request canceled", errors.New("net/http: request canceled"), true},
+		{"context deadline exceeded", errors.New("context deadline exceeded"), true},
+
+		{"k8s server unable", errors.New("the server is currently unable to handle the request"), true},
+		{"ServiceUnavailable msg", errors.New("ServiceUnavailable"), true},
+		{"etcdserver timeout", errors.New("etcdserver: request timed out"), true},
+		{"etcdserver leader changed", errors.New("etcdserver: leader changed"), true},
+		{"failed to get server groups", errors.New("failed to get server groups"), true},
+
+		{"ssh handshake failed", errors.New("ssh: handshake failed"), true},
+		{"ssh unable to authenticate", errors.New("ssh: unable to authenticate"), true},
+		{"ssh connection lost", errors.New("ssh: connection lost"), true},
+		{"ssh failed to dial", errors.New("failed to dial 1.2.3.4:22"), true},
+		{"closed network connection", errors.New("use of closed network connection"), true},
+
+		{"webhook calling failure", errors.New("failed calling webhook validate.k8s.io"), true},
+		{"plain webhook word", errors.New("webhook handler exploded"), true},
+
+		// Non-retryable.
+		{"plain generic error", errors.New("oops"), false},
+		{"validation failure", errors.New("validation failed: invalid name"), false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := IsRetryable(tc.err)
+			if got != tc.want {
+				t.Fatalf("IsRetryable(%v) = %v, want %v", tc.err, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestIsSSHConnectionError(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"nil", nil, false},
+
+		{"failed to create SSH session", errors.New("failed to create SSH session: foo"), true},
+		{"ssh handshake failed", errors.New("ssh: handshake failed"), true},
+		{"ssh connection lost", errors.New("ssh: connection lost"), true},
+		{"closed network connection", errors.New("use of closed network connection"), true},
+		{"connection refused", errors.New("connection refused"), true},
+		{"connection reset", errors.New("connection reset"), true},
+		{"broken pipe", errors.New("broken pipe"), true},
+		{"EOF string", errors.New("io: EOF"), true},
+		{"io timeout", errors.New("i/o timeout"), true},
+
+		{"io.EOF via errors.Is", io.EOF, true},
+		{"wrapped io.EOF", fmt.Errorf("wrap: %w", io.EOF), true},
+
+		// Not a connection error.
+		{"command not found", errors.New("bash: foo: command not found"), false},
+		{"permission denied", errors.New("permission denied"), false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := IsSSHConnectionError(tc.err)
+			if got != tc.want {
+				t.Fatalf("IsSSHConnectionError(%v) = %v, want %v", tc.err, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestWithRetryAfter(t *testing.T) {
+	base := Config{
+		MaxRetries:  5,
+		InitialWait: 1 * time.Second,
+		MaxWait:     10 * time.Second,
+		Backoff:     2.0,
+	}
+
+	t.Run("nil error returns config unchanged", func(t *testing.T) {
+		got := WithRetryAfter(base, nil)
+		if got.InitialWait != base.InitialWait {
+			t.Errorf("InitialWait changed: got %v, want %v", got.InitialWait, base.InitialWait)
+		}
+	})
+
+	t.Run("non-status error returns config unchanged", func(t *testing.T) {
+		got := WithRetryAfter(base, errors.New("boom"))
+		if got.InitialWait != base.InitialWait {
+			t.Errorf("InitialWait changed unexpectedly: got %v", got.InitialWait)
+		}
+	})
+
+	t.Run("status error without RetryAfter returns config unchanged", func(t *testing.T) {
+		got := WithRetryAfter(base, statusErr(503, 0))
+		if got.InitialWait != base.InitialWait {
+			t.Errorf("InitialWait changed unexpectedly: got %v", got.InitialWait)
+		}
+	})
+
+	t.Run("RetryAfter larger than InitialWait overrides it", func(t *testing.T) {
+		got := WithRetryAfter(base, statusErr(429, 5))
+		want := 5 * time.Second
+		if got.InitialWait != want {
+			t.Errorf("InitialWait = %v, want %v", got.InitialWait, want)
+		}
+		// Other fields preserved.
+		if got.MaxRetries != base.MaxRetries || got.MaxWait != base.MaxWait || got.Backoff != base.Backoff {
+			t.Errorf("non-InitialWait fields changed: %+v vs %+v", got, base)
+		}
+	})
+
+	t.Run("RetryAfter smaller than InitialWait does not shrink it", func(t *testing.T) {
+		cfg := base
+		cfg.InitialWait = 10 * time.Second
+		got := WithRetryAfter(cfg, statusErr(429, 2))
+		if got.InitialWait != cfg.InitialWait {
+			t.Errorf("InitialWait shrunk to %v, want %v", got.InitialWait, cfg.InitialWait)
+		}
+	})
+}
+
+// fastCfg keeps Do's wall-clock cost low while still exercising backoff.
+func fastCfg(maxRetries int) Config {
+	return Config{
+		MaxRetries:  maxRetries,
+		InitialWait: 1 * time.Millisecond,
+		MaxWait:     4 * time.Millisecond,
+		Backoff:     2.0,
+		LogRetries:  false,
+	}
+}
+
+func TestDo_SuccessFirstAttempt(t *testing.T) {
+	var calls int32
+	got, err := Do(context.Background(), fastCfg(3), "op", func() (int, error) {
+		atomic.AddInt32(&calls, 1)
+		return 42, nil
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got != 42 {
+		t.Errorf("got %d, want 42", got)
+	}
+	if atomic.LoadInt32(&calls) != 1 {
+		t.Errorf("expected exactly 1 call, got %d", calls)
+	}
+}
+
+func TestDo_NonRetryableErrorReturnsImmediately(t *testing.T) {
+	var calls int32
+	_, err := Do(context.Background(), fastCfg(5), "op", func() (struct{}, error) {
+		atomic.AddInt32(&calls, 1)
+		return struct{}{}, errors.New("validation failed: bad input")
+	})
+	if err == nil {
+		t.Fatal("expected error")
+	}
+	if atomic.LoadInt32(&calls) != 1 {
+		t.Errorf("expected 1 call (non-retryable), got %d", calls)
+	}
+}
+
+func TestDo_SuccessAfterRetries(t *testing.T) {
+	var calls int32
+	target := int32(3) // succeed on the 3rd call.
+	got, err := Do(context.Background(), fastCfg(5), "op", func() (string, error) {
+		n := atomic.AddInt32(&calls, 1)
+		if n < target {
+			return "", io.EOF // retryable
+		}
+		return "ok", nil
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if got != "ok" {
+		t.Errorf("got %q, want %q", got, "ok")
+	}
+	if atomic.LoadInt32(&calls) != target {
+		t.Errorf("got %d calls, want %d", calls, target)
+	}
+}
+
+func TestDo_FailsAfterMaxRetriesExhausted(t *testing.T) {
+	var calls int32
+	_, err := Do(context.Background(), fastCfg(2), "op", func() (int, error) {
+		atomic.AddInt32(&calls, 1)
+		return 0, io.EOF
+	})
+	if !errors.Is(err, io.EOF) {
+		t.Fatalf("expected io.EOF, got %v", err)
+	}
+	// MaxRetries=2 means up to 3 attempts total.
+	if got := atomic.LoadInt32(&calls); got != 3 {
+		t.Errorf("got %d calls, want 3", got)
+	}
+}
+
+func TestDo_ContextCancelBeforeAttempt(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // already cancelled
+
+	var calls int32
+	_, err := Do(ctx, fastCfg(5), "op", func() (int, error) {
+		atomic.AddInt32(&calls, 1)
+		return 0, nil
+	})
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("want context.Canceled, got %v", err)
+	}
+	if atomic.LoadInt32(&calls) != 0 {
+		t.Errorf("fn must not be called when ctx already cancelled, got %d calls", calls)
+	}
+}
+
+func TestDo_ContextCancelDuringWait(t *testing.T) {
+	cfg := Config{
+		MaxRetries:  10,
+		InitialWait: 100 * time.Millisecond,
+		MaxWait:     1 * time.Second,
+		Backoff:     2.0,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	var calls int32
+	// Cancel shortly after the first failed attempt enters the wait.
+	go func() {
+		time.Sleep(20 * time.Millisecond)
+		cancel()
+	}()
+
+	start := time.Now()
+	_, err := Do(ctx, cfg, "op", func() (int, error) {
+		atomic.AddInt32(&calls, 1)
+		return 0, io.EOF
+	})
+	elapsed := time.Since(start)
+
+	if !errors.Is(err, context.Canceled) {
+		t.Fatalf("want context.Canceled, got %v", err)
+	}
+	if atomic.LoadInt32(&calls) != 1 {
+		t.Errorf("got %d calls, want 1 (cancelled during first wait)", calls)
+	}
+	if elapsed >= cfg.InitialWait {
+		t.Errorf("Do returned too late (%v >= InitialWait %v); cancel did not abort wait", elapsed, cfg.InitialWait)
+	}
+}
+
+func TestDo_BackoffIsCapped(t *testing.T) {
+	// With InitialWait=4ms, Backoff=10, MaxWait=8ms we would normally jump to
+	// 40ms after the first retry — the cap must prevent that.
+	cfg := Config{
+		MaxRetries:  5,
+		InitialWait: 4 * time.Millisecond,
+		MaxWait:     8 * time.Millisecond,
+		Backoff:     10.0,
+	}
+
+	var calls int32
+	start := time.Now()
+	_, err := Do(context.Background(), cfg, "op", func() (int, error) {
+		atomic.AddInt32(&calls, 1)
+		return 0, io.EOF
+	})
+	elapsed := time.Since(start)
+
+	if !errors.Is(err, io.EOF) {
+		t.Fatalf("want io.EOF, got %v", err)
+	}
+	// 5 waits, each capped at 8ms => upper bound ~40ms even though uncapped
+	// backoff would explode to seconds. Allow ample slack for slow CI runners.
+	if elapsed > 300*time.Millisecond {
+		t.Errorf("elapsed %v suggests backoff was not capped (MaxWait=%v)", elapsed, cfg.MaxWait)
+	}
+	if atomic.LoadInt32(&calls) != 6 {
+		t.Errorf("got %d calls, want 6 (MaxRetries+1)", calls)
+	}
+}
+
+func TestDoVoid_DelegatesToDo(t *testing.T) {
+	t.Run("success", func(t *testing.T) {
+		var calls int32
+		err := DoVoid(context.Background(), fastCfg(2), "op", func() error {
+			atomic.AddInt32(&calls, 1)
+			return nil
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if atomic.LoadInt32(&calls) != 1 {
+			t.Errorf("got %d calls, want 1", calls)
+		}
+	})
+
+	t.Run("propagates non-retryable error", func(t *testing.T) {
+		want := errors.New("bad input")
+		var calls int32
+		err := DoVoid(context.Background(), fastCfg(3), "op", func() error {
+			atomic.AddInt32(&calls, 1)
+			return want
+		})
+		if !errors.Is(err, want) {
+			t.Fatalf("got %v, want %v", err, want)
+		}
+		if atomic.LoadInt32(&calls) != 1 {
+			t.Errorf("got %d calls, want 1", calls)
+		}
+	})
+}
diff --git a/pkg/testkit/stress_tests_test.go b/pkg/testkit/stress_tests_test.go
new file mode 100644
index 0000000..3e05e52
--- /dev/null
+++ b/pkg/testkit/stress_tests_test.go
@@ -0,0 +1,276 @@
+/*
+Copyright 2026 Flant JSC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package testkit
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/deckhouse/storage-e2e/internal/config"
+)
+
+// baseConfig returns a minimal valid Config for ModeFlog.
+func baseConfig() *Config {
+	return &Config{
+		Namespace:        "ns",
+		StorageClassName: "sc",
+		PVCSize:          "1Gi",
+		PodsCount:        1,
+		Iterations:       1,
+		Mode:             ModeFlog,
+	}
+}
+
+func TestConfigValidate(t *testing.T) {
+	cases := []struct {
+		name      string
+		mutate    func(c *Config)
+		wantErr   string // substring; empty == no error
+		afterHook func(t *testing.T, c *Config)
+	}{
+		{
+			name:    "valid flog config",
+			mutate:  func(*Config) {},
+			wantErr: "",
+		},
+		{
+			name:    "missing namespace",
+			mutate:  func(c *Config) { c.Namespace = "" },
+			wantErr: "namespace",
+		},
+		{
+			name:    "missing storage class",
+			mutate:  func(c *Config) { c.StorageClassName = "" },
+			wantErr: "storage class",
+		},
+		{
+			name:    "missing PVC size",
+			mutate:  func(c *Config) { c.PVCSize = "" },
+			wantErr: "PVC size",
+		},
+		{
+			name:    "zero pods count",
+			mutate:  func(c *Config) { c.PodsCount = 0 },
+			wantErr: "pods count",
+		},
+		{
+			name:    "negative iterations",
+			mutate:  func(c *Config) { c.Iterations = 0 },
+			wantErr: "iterations",
+		},
+		{
+			name: "snapshot-only with default SnapshotsPerPVC gets defaulted to 1",
+			mutate: func(c *Config) {
+				c.Mode = ModeSnapshotOnly
+				c.SnapshotsPerPVC = 0
+			},
+			wantErr: "",
+			afterHook: func(t *testing.T, c *Config) {
+				if c.SnapshotsPerPVC != 1 {
+					t.Errorf("SnapshotsPerPVC=%d, want 1 (default)", c.SnapshotsPerPVC)
+				}
+			},
+		},
+		{
+			name: "snapshot_resize_cloning requires SnapshotsPerPVC > 0",
+			mutate: func(c *Config) {
+				c.Mode = ModeSnapshotResizeCloning
+				c.SnapshotsPerPVC = 0
+				c.PVCSizeAfterResize = "2Gi"
+				c.PVCSizeAfterResizeStage2 = "3Gi"
+				c.TestOrder = []TestStep{StepResize}
+			},
+			wantErr: "snapshots per PVC",
+		},
+		{
+			name: "snapshot_resize_cloning rejects invalid step",
+			mutate: func(c *Config) {
+				c.Mode = ModeSnapshotResizeCloning
+				c.SnapshotsPerPVC = 1
+				c.PVCSizeAfterResize = "2Gi"
+				c.PVCSizeAfterResizeStage2 = "3Gi"
+				c.TestOrder = []TestStep{"bogus"}
+			},
+			wantErr: "invalid test step",
+		},
+		{
+			name: "snapshot_resize_cloning resize step requires PVCSizeAfterResize",
+			mutate: func(c *Config) {
+				c.Mode = ModeSnapshotResizeCloning
+				c.SnapshotsPerPVC = 1
+				c.PVCSizeAfterResize = ""
+				c.PVCSizeAfterResizeStage2 = "3Gi"
+				c.TestOrder = []TestStep{StepResize}
+			},
+			wantErr: "PVC size after resize",
+		},
+		{
+			name: "snapshot_resize_cloning clone/restore step requires Stage2 size",
+			mutate: func(c *Config) {
+				c.Mode = ModeSnapshotResizeCloning
+				c.SnapshotsPerPVC = 1
+				c.PVCSizeAfterResize = "2Gi"
+				c.PVCSizeAfterResizeStage2 = ""
+				c.TestOrder = []TestStep{StepClone}
+			},
+			wantErr: "stage2",
+		},
+		{
+			name: "snapshot_resize_cloning happy path",
+			mutate: func(c *Config) {
+				c.Mode = ModeSnapshotResizeCloning
+				c.SnapshotsPerPVC = 2
+				c.PVCSizeAfterResize = "2Gi"
+				c.PVCSizeAfterResizeStage2 = "3Gi"
+				c.TestOrder = []TestStep{StepRestoreFromSnapshot, StepResize, StepClone}
+			},
+			wantErr: "",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			c := baseConfig()
+			tc.mutate(c)
+			err := c.Validate()
+			if tc.wantErr == "" {
+				if err != nil {
+					t.Fatalf("unexpected error: %v", err)
+				}
+				if tc.afterHook != nil {
+					tc.afterHook(t, c)
+				}
+				return
+			}
+			if err == nil {
+				t.Fatalf("expected error containing %q, got nil", tc.wantErr)
+			}
+			if !strings.Contains(strings.ToLower(err.Error()), strings.ToLower(tc.wantErr)) {
+				t.Errorf("error %q does not contain %q", err, tc.wantErr)
+			}
+		})
+	}
+}
+
+func TestDefaultConfig_UsesDefaultsWhenEnvUnset(t *testing.T) {
+	// Snapshot package-level env-derived globals and restore them so test
+	// ordering (`-shuffle=on`) cannot leak between cases. `t.Setenv` only
+	// affects new os.Getenv reads — the config package already cached env
+	// values into package vars at init.
+	defer withConfigSnapshot(t)()
+
+	config.StressTestPVCSize = ""
+	config.StressTestPodsCount = ""
+	config.StressTestPVCSizeAfterResize = ""
+	config.StressTestPVCSizeAfterResizeStage2 = ""
+	config.StressTestSnapshotsPerPVC = ""
+	config.StressTestMaxAttempts = ""
+	config.StressTestInterval = ""
+	config.StressTestCleanup = ""
+
+	got := DefaultConfig()
+
+	if got.PVCSize != config.StressTestPVCSizeDefaultValue {
+		t.Errorf("PVCSize=%q, want default %q", got.PVCSize, config.StressTestPVCSizeDefaultValue)
+	}
+	if got.PVCSizeAfterResize != config.StressTestPVCSizeAfterResizeDefaultValue {
+		t.Errorf("PVCSizeAfterResize=%q, want default", got.PVCSizeAfterResize)
+	}
+	if got.PVCSizeAfterResizeStage2 != config.StressTestPVCSizeAfterResizeStage2DefaultValue {
+		t.Errorf("PVCSizeAfterResizeStage2=%q, want default", got.PVCSizeAfterResizeStage2)
+	}
+	if got.PodsCount <= 0 {
+		t.Errorf("PodsCount=%d, want > 0", got.PodsCount)
+	}
+	if got.SnapshotsPerPVC <= 0 {
+		t.Errorf("SnapshotsPerPVC=%d, want > 0", got.SnapshotsPerPVC)
+	}
+	if got.MaxAttempts <= 0 {
+		t.Errorf("MaxAttempts=%d, want > 0", got.MaxAttempts)
+	}
+	if got.Interval <= 0 {
+		t.Errorf("Interval=%v, want > 0", got.Interval)
+	}
+	if !got.Cleanup {
+		t.Errorf("Cleanup=false; default should be true")
+	}
+	if got.Iterations != 1 {
+		t.Errorf("Iterations=%d, want 1", got.Iterations)
+	}
+	if got.Mode != ModeFlog {
+		t.Errorf("Mode=%q, want %q", got.Mode, ModeFlog)
+	}
+	if got.SchedulerName != "default-scheduler" {
+		t.Errorf("SchedulerName=%q, want %q", got.SchedulerName, "default-scheduler")
+	}
+	if len(got.TestOrder) != 3 {
+		t.Errorf("TestOrder len=%d, want 3", len(got.TestOrder))
+	}
+}
+
+func TestDefaultConfig_RespectsEnvOverrides(t *testing.T) {
+	defer withConfigSnapshot(t)()
+
+	config.StressTestPVCSize = "9Gi"
+	config.StressTestPodsCount = "7"
+	config.StressTestPVCSizeAfterResize = "10Gi"
+	config.StressTestPVCSizeAfterResizeStage2 = "11Gi"
+	config.StressTestSnapshotsPerPVC = "5"
+	config.StressTestMaxAttempts = "12"
+	config.StressTestInterval = "3"
+	config.StressTestCleanup = "false"
+
+	got := DefaultConfig()
+
+	if got.PVCSize != "9Gi" || got.PodsCount != 7 || got.PVCSizeAfterResize != "10Gi" ||
+		got.PVCSizeAfterResizeStage2 != "11Gi" || got.SnapshotsPerPVC != 5 ||
+		got.MaxAttempts != 12 || got.Interval.Seconds() != 3 || got.Cleanup {
+		t.Errorf("env overrides not honored: %+v", got)
+	}
+}
+
+// withConfigSnapshot captures the package-level config vars touched by
+// DefaultConfig and returns a restore function. Use as
+//
+//	defer withConfigSnapshot(t)()
+func withConfigSnapshot(t *testing.T) func() {
+	t.Helper()
+	snap := struct {
+		PVCSize, PodsCount                                                       string
+		PVCSizeAfterResize, PVCSizeAfterResizeStage2                             string
+		SnapshotsPerPVC, MaxAttempts, Interval, Cleanup                          string
+	}{
+		PVCSize:                  config.StressTestPVCSize,
+		PodsCount:                config.StressTestPodsCount,
+		PVCSizeAfterResize:       config.StressTestPVCSizeAfterResize,
+		PVCSizeAfterResizeStage2: config.StressTestPVCSizeAfterResizeStage2,
+		SnapshotsPerPVC:          config.StressTestSnapshotsPerPVC,
+		MaxAttempts:              config.StressTestMaxAttempts,
+		Interval:                 config.StressTestInterval,
+		Cleanup:                  config.StressTestCleanup,
+	}
+	return func() {
+		config.StressTestPVCSize = snap.PVCSize
+		config.StressTestPodsCount = snap.PodsCount
+		config.StressTestPVCSizeAfterResize = snap.PVCSizeAfterResize
+		config.StressTestPVCSizeAfterResizeStage2 = snap.PVCSizeAfterResizeStage2
+		config.StressTestSnapshotsPerPVC = snap.SnapshotsPerPVC
+		config.StressTestMaxAttempts = snap.MaxAttempts
+		config.StressTestInterval = snap.Interval
+		config.StressTestCleanup = snap.Cleanup
+	}
+}

From 8dd0b50f4647d93d75493a4ea7c4944083143bba Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 17:40:26 +0400
Subject: [PATCH 2/8] gitleaks false positive fix

---
 internal/kubernetes/commander/client_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/internal/kubernetes/commander/client_test.go b/internal/kubernetes/commander/client_test.go
index 4180b79..08f4cfe 100644
--- a/internal/kubernetes/commander/client_test.go
+++ b/internal/kubernetes/commander/client_test.go
@@ -72,7 +72,7 @@ func TestBase64Encode(t *testing.T) {
 		name, in, want string
 	}{
 		{"empty", "", ""},
-		{"ascii", "user:token", "dXNlcjp0b2tlbg=="},
+		{"ascii", "user:token", "dXNlcjp0b2tlbg=="}, // gitleaks:allow — base64("user:token") test fixture, not a secret
 		{"unicode", "tëst", "dMOrc3Q="},
 	}
 	for _, tc := range cases {

From eb434ffcb2648fd747184e37357f15fae3da37f3 Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 17:54:59 +0400
Subject: [PATCH 3/8] gitleaks is now per-push

---
 .github/workflows/{gitleaks-scan-on-pr.yml => gitleaks.yml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{gitleaks-scan-on-pr.yml => gitleaks.yml} (100%)

diff --git a/.github/workflows/gitleaks-scan-on-pr.yml b/.github/workflows/gitleaks.yml
similarity index 100%
rename from .github/workflows/gitleaks-scan-on-pr.yml
rename to .github/workflows/gitleaks.yml

From 627889252ccef23d9371143509785708c4e0763c Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 17:55:25 +0400
Subject: [PATCH 4/8] gitleaks is now per-push

---
 .github/workflows/gitleaks.yml               | 16 +++++++++++++++-
 docs/WORKLOG.md                              |  1 +
 internal/kubernetes/commander/client_test.go |  1 -
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gitleaks.yml b/.github/workflows/gitleaks.yml
index 8f35434..b8c2487 100644
--- a/.github/workflows/gitleaks.yml
+++ b/.github/workflows/gitleaks.yml
@@ -12,15 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: Gitleaks Pull Request Scan
+name: Gitleaks
 
+# Triggers:
+#   push:           every push to any branch — catches leaked secrets before a PR
+#                   is even opened (feature branches, force-pushes, throw-away
+#                   debug branches).
+#   pull_request:   PRs targeting any branch — required so the check is also
+#                   evaluated for fork PRs, whose `push` events do not run in
+#                   the upstream repo.
 on:
+  push: {}
   pull_request:
     types: [opened, synchronize, reopened]
+
 permissions:
   contents: read
   pull-requests: read
 
+# Cancel superseded runs on the same ref so rapid-fire pushes don't pile up.
+concurrency:
+  group: gitleaks-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   gitleaks_scan:
     name: Gitleaks scan
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 5e5b0a3..987e3ac 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -112,3 +112,4 @@ All notable changes to this repository are documented here. New entries are appe
 - **Add** Wave 1 unit tests (`pkg/retry/retry_test.go`, `pkg/kubernetes/{apply,modules,poll}_test.go`, `pkg/cluster/vms_test.go`, `pkg/testkit/stress_tests_test.go`, `internal/config/types_yaml_test.go`, `internal/kubernetes/commander/client_test.go`, `internal/logger/level_test.go`): hermetic table-driven coverage of `retry.Do/IsRetryable/IsSSHConnectionError/WithRetryAfter`, YAML doc splitting/env-var scanning, module graph + topo sort + cycle detection, `cluster/vms` pure helpers, `commander` mappers / base64 / `NewClientWithOptions` validation, `stress-tests.Config.Validate` / `DefaultConfig`, `LevelToString` round-trip, `ClusterNode`/`ClusterDefinition` YAML unmarshal validation.
 - **Add** Wave 2 httptest tests (`internal/kubernetes/commander/client_http_test.go`): drives the Commander HTTP client (`GetClusterByID`, `ListClustersAPI` array/items/data/garbage, `GetClusterByName`, `CreateClusterFromTemplate`, `DeleteClusterByID`, `GetClusterKubeconfigByID` + cluster-details fallback, `GetRegistryByName`, `GetClusterConnectionInfo` precedence + defaults) and all five `setAuthHeaders` paths via a real `httptest.Server`.
 - **Update** `docs/TESTS_IMPLEMENTATION_PLAN.md`: triggers changed from `push → main` to push-on-any-branch + `pull_request → main`; status header refreshed; rollout phases marked Done/Pending; exact `gh api` branch-protection command documented.
+- **Update** `.github/workflows/gitleaks-scan-on-pr.yml` → renamed to `.github/workflows/gitleaks.yml`: workflow `name` shortened to `Gitleaks`, added `push: {}` trigger so secret scanning runs on every push (any branch), not only on PRs; added cancel-in-progress concurrency group.
diff --git a/internal/kubernetes/commander/client_test.go b/internal/kubernetes/commander/client_test.go
index 08f4cfe..8097c4c 100644
--- a/internal/kubernetes/commander/client_test.go
+++ b/internal/kubernetes/commander/client_test.go
@@ -173,4 +173,3 @@ func TestNewClientWithOptions_Validation(t *testing.T) {
 		}
 	})
 }
-

From 0fc85a19c836fb5fddbfc2506efdb86c642b5cfd Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 18:00:17 +0400
Subject: [PATCH 5/8] gitleaks - now two jobs - for pr and for push

---
 .github/workflows/gitleaks.yml | 26 +++++++++++++++++++++++++-
 docs/WORKLOG.md                |  1 +
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gitleaks.yml b/.github/workflows/gitleaks.yml
index b8c2487..45cf831 100644
--- a/.github/workflows/gitleaks.yml
+++ b/.github/workflows/gitleaks.yml
@@ -21,6 +21,14 @@ name: Gitleaks
 #   pull_request:   PRs targeting any branch — required so the check is also
 #                   evaluated for fork PRs, whose `push` events do not run in
 #                   the upstream repo.
+#
+# The upstream action (deckhouse/modules-actions/gitleaks) supports two modes:
+#   * diff — checks out refs/pull/<N>/merge and scans only the PR range. This
+#            mode only works on pull_request events (it needs github.event.number);
+#            on a push it produces an empty refspec and `git fetch` fails.
+#   * full — checks out the whole repo and scans every tracked file.
+# So we split into two jobs gated by github.event_name: diff for PRs, full for
+# pushes.
 on:
   push: {}
   pull_request:
@@ -36,8 +44,10 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  gitleaks_scan:
+  # PR scan: only the diff introduced by the pull request.
+  gitleaks_diff:
     name: Gitleaks scan
+    if: ${{ github.event_name == 'pull_request' }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -47,3 +57,17 @@ jobs:
         uses: deckhouse/modules-actions/gitleaks@v6
         with:
           scan_mode: "diff"
+
+  # Push scan: full-repo scan (diff mode can't resolve a PR ref on a push event).
+  gitleaks_full:
+    name: Gitleaks scan
+    if: ${{ github.event_name == 'push' }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: read
+    steps:
+      - name: Run Gitleaks full scan
+        uses: deckhouse/modules-actions/gitleaks@v6
+        with:
+          scan_mode: "full"
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 987e3ac..3e05008 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -113,3 +113,4 @@ All notable changes to this repository are documented here. New entries are appe
 - **Add** Wave 2 httptest tests (`internal/kubernetes/commander/client_http_test.go`): drives the Commander HTTP client (`GetClusterByID`, `ListClustersAPI` array/items/data/garbage, `GetClusterByName`, `CreateClusterFromTemplate`, `DeleteClusterByID`, `GetClusterKubeconfigByID` + cluster-details fallback, `GetRegistryByName`, `GetClusterConnectionInfo` precedence + defaults) and all five `setAuthHeaders` paths via a real `httptest.Server`.
 - **Update** `docs/TESTS_IMPLEMENTATION_PLAN.md`: triggers changed from `push → main` to push-on-any-branch + `pull_request → main`; status header refreshed; rollout phases marked Done/Pending; exact `gh api` branch-protection command documented.
 - **Update** `.github/workflows/gitleaks-scan-on-pr.yml` → renamed to `.github/workflows/gitleaks.yml`: workflow `name` shortened to `Gitleaks`, added `push: {}` trigger so secret scanning runs on every push (any branch), not only on PRs; added cancel-in-progress concurrency group.
+- **Update** `.github/workflows/gitleaks.yml`: split into two jobs gated by `github.event_name` — `gitleaks_diff` (`scan_mode: diff`) for `pull_request`, `gitleaks_full` (`scan_mode: full`) for `push`; fixes `fatal: invalid refspec '+refs/pull//merge:...'` that broke push runs because the upstream action's diff mode needs `github.event.number`. Both jobs share check name `Gitleaks scan`.

From dd8dc74053f7d3c74888396f9561ffdf8c5c03b8 Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 18:57:35 +0400
Subject: [PATCH 6/8] gitleaks: revert to pull_request-only

Drop the push trigger. The upstream action's diff mode needs
github.event.number (only present on pull_request); on push it builds an
empty refspec (+refs/pull//merge:...) and git fetch fails with exit 128.
---
 .github/workflows/gitleaks.yml | 41 ++++++----------------------------
 docs/WORKLOG.md                |  2 +-
 2 files changed, 8 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/gitleaks.yml b/.github/workflows/gitleaks.yml
index 45cf831..e7c01a9 100644
--- a/.github/workflows/gitleaks.yml
+++ b/.github/workflows/gitleaks.yml
@@ -14,23 +14,12 @@
 
 name: Gitleaks
 
-# Triggers:
-#   push:           every push to any branch — catches leaked secrets before a PR
-#                   is even opened (feature branches, force-pushes, throw-away
-#                   debug branches).
-#   pull_request:   PRs targeting any branch — required so the check is also
-#                   evaluated for fork PRs, whose `push` events do not run in
-#                   the upstream repo.
-#
-# The upstream action (deckhouse/modules-actions/gitleaks) supports two modes:
-#   * diff — checks out refs/pull/<N>/merge and scans only the PR range. This
-#            mode only works on pull_request events (it needs github.event.number);
-#            on a push it produces an empty refspec and `git fetch` fails.
-#   * full — checks out the whole repo and scans every tracked file.
-# So we split into two jobs gated by github.event_name: diff for PRs, full for
-# pushes.
+# Gitleaks runs only on pull requests. The upstream action's diff mode checks
+# out refs/pull/<N>/merge and scans just the PR range; it relies on
+# github.event.number, which exists only for pull_request events. (A push-event
+# run produces an empty refspec — `+refs/pull//merge:...` — and `git fetch`
+# fails, so we deliberately do not trigger on push.)
 on:
-  push: {}
   pull_request:
     types: [opened, synchronize, reopened]
 
@@ -38,16 +27,14 @@ permissions:
   contents: read
   pull-requests: read
 
-# Cancel superseded runs on the same ref so rapid-fire pushes don't pile up.
+# Cancel superseded runs on the same PR ref so rapid-fire pushes don't pile up.
 concurrency:
   group: gitleaks-${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:
-  # PR scan: only the diff introduced by the pull request.
-  gitleaks_diff:
+  gitleaks_scan:
     name: Gitleaks scan
-    if: ${{ github.event_name == 'pull_request' }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -57,17 +44,3 @@ jobs:
         uses: deckhouse/modules-actions/gitleaks@v6
         with:
           scan_mode: "diff"
-
-  # Push scan: full-repo scan (diff mode can't resolve a PR ref on a push event).
-  gitleaks_full:
-    name: Gitleaks scan
-    if: ${{ github.event_name == 'push' }}
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: read
-    steps:
-      - name: Run Gitleaks full scan
-        uses: deckhouse/modules-actions/gitleaks@v6
-        with:
-          scan_mode: "full"
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 3e05008..324b0e3 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -113,4 +113,4 @@ All notable changes to this repository are documented here. New entries are appe
 - **Add** Wave 2 httptest tests (`internal/kubernetes/commander/client_http_test.go`): drives the Commander HTTP client (`GetClusterByID`, `ListClustersAPI` array/items/data/garbage, `GetClusterByName`, `CreateClusterFromTemplate`, `DeleteClusterByID`, `GetClusterKubeconfigByID` + cluster-details fallback, `GetRegistryByName`, `GetClusterConnectionInfo` precedence + defaults) and all five `setAuthHeaders` paths via a real `httptest.Server`.
 - **Update** `docs/TESTS_IMPLEMENTATION_PLAN.md`: triggers changed from `push → main` to push-on-any-branch + `pull_request → main`; status header refreshed; rollout phases marked Done/Pending; exact `gh api` branch-protection command documented.
 - **Update** `.github/workflows/gitleaks-scan-on-pr.yml` → renamed to `.github/workflows/gitleaks.yml`: workflow `name` shortened to `Gitleaks`, added `push: {}` trigger so secret scanning runs on every push (any branch), not only on PRs; added cancel-in-progress concurrency group.
-- **Update** `.github/workflows/gitleaks.yml`: split into two jobs gated by `github.event_name` — `gitleaks_diff` (`scan_mode: diff`) for `pull_request`, `gitleaks_full` (`scan_mode: full`) for `push`; fixes `fatal: invalid refspec '+refs/pull//merge:...'` that broke push runs because the upstream action's diff mode needs `github.event.number`. Both jobs share check name `Gitleaks scan`.
+- **Update** `.github/workflows/gitleaks.yml`: reverted to `pull_request`-only (single `gitleaks_scan` job, `scan_mode: diff`); dropped the `push` trigger because the upstream action's diff mode needs `github.event.number` and fails on push with `fatal: invalid refspec '+refs/pull//merge:...'`.

From 3122f28a2f514d9d996a143f8784fc16e7f99a40 Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Wed, 3 Jun 2026 19:01:30 +0400
Subject: [PATCH 7/8] gitleaks: ignore base64 test-fixture false positive by
 fingerprint

The diff scan flags commit 5f1edc2, which introduced the base64("user:token")
test fixture before the inline gitleaks:allow comment was added, so the inline
suppression cannot hide it. Ignore the finding by fingerprint instead.
---
 .gitleaksignore | 9 +++++++++
 docs/WORKLOG.md | 1 +
 2 files changed, 10 insertions(+)
 create mode 100644 .gitleaksignore

diff --git a/.gitleaksignore b/.gitleaksignore
new file mode 100644
index 0000000..4c56962
--- /dev/null
+++ b/.gitleaksignore
@@ -0,0 +1,9 @@
+# gitleaks finding fingerprints to ignore (format: commit:file:rule:startline).
+#
+# False positive: base64("user:token") == "dXNlcjp0b2tlbg==" is a table-driven
+# unit-test fixture in TestBase64Encode, not a real credential. The finding is
+# anchored to commit 5f1edc2, which first introduced the line *before* the
+# inline `gitleaks:allow` comment was added (in a later commit). A diff scan
+# walks the whole PR commit range, so the later inline suppression cannot hide
+# the earlier commit — we ignore it here by fingerprint instead.
+5f1edc2ea49b6f107c55faeaa630bae63fe749f0:internal/kubernetes/commander/client_test.go:generic-api-key:75
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 324b0e3..0646c89 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -114,3 +114,4 @@ All notable changes to this repository are documented here. New entries are appe
 - **Update** `docs/TESTS_IMPLEMENTATION_PLAN.md`: triggers changed from `push → main` to push-on-any-branch + `pull_request → main`; status header refreshed; rollout phases marked Done/Pending; exact `gh api` branch-protection command documented.
 - **Update** `.github/workflows/gitleaks-scan-on-pr.yml` → renamed to `.github/workflows/gitleaks.yml`: workflow `name` shortened to `Gitleaks`, added `push: {}` trigger so secret scanning runs on every push (any branch), not only on PRs; added cancel-in-progress concurrency group.
 - **Update** `.github/workflows/gitleaks.yml`: reverted to `pull_request`-only (single `gitleaks_scan` job, `scan_mode: diff`); dropped the `push` trigger because the upstream action's diff mode needs `github.event.number` and fails on push with `fatal: invalid refspec '+refs/pull//merge:...'`.
+- **Add** `.gitleaksignore`: ignores the `generic-api-key` false positive on `internal/kubernetes/commander/client_test.go:75` (base64 test fixture) by fingerprint at commit `5f1edc2`; the diff scan flags the introducing commit, so the later inline `gitleaks:allow` could not suppress it.

From 2dd1e5381993f489c8cbe0419bee12f134b42c46 Mon Sep 17 00:00:00 2001
From: Alexey Yakubov <alexey.yakubov@flant.com>
Date: Sun, 7 Jun 2026 22:26:51 +0400
Subject: [PATCH 8/8] test-coverage added

---
 .github/workflows/unit-tests.yml | 27 ++++++++++++++++++++++++++-
 docs/WORKLOG.md                  |  6 ++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 1a0d70d..6b7750d 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -27,6 +27,11 @@ on:
 
 permissions:
   contents: read
+  # Required by actions/upload-code-coverage to publish the coverage report.
+  code-quality: write
+  # On push events the action resolves any open PR for the ref via `gh pr list`,
+  # which needs read access to pull requests.
+  pull-requests: read
 
 # Cancel in-flight runs for the same ref when a new commit lands. Each push
 # trigger then evaluates only the latest SHA, which keeps CI minutes bounded
@@ -72,10 +77,30 @@ jobs:
           echo "Total coverage:"
           go tool cover -func=coverage.out | tail -1
 
+      # actions/upload-code-coverage consumes Cobertura XML, so convert the Go
+      # coverage profile produced above into that format.
+      - name: Convert coverage to Cobertura XML
+        run: |
+          go run github.com/boumenot/gocover-cobertura@latest \
+            < coverage.out > coverage.xml
+
+      # Publishes the report via GitHub's native code-coverage feature. On push
+      # to the default branch it sets the baseline; on push to a branch with an
+      # open PR the github-code-quality[bot] posts the comparison on that PR.
+      - name: Upload code coverage
+        uses: actions/upload-code-coverage@v1
+        with:
+          file: coverage.xml
+          language: Go
+          label: code-coverage/go
+          fail-on-error: false
+
       - name: Upload coverage artifact
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: coverage
-          path: coverage.out
+          path: |
+            coverage.out
+            coverage.xml
           if-no-files-found: warn
diff --git a/docs/WORKLOG.md b/docs/WORKLOG.md
index 0646c89..3b62631 100644
--- a/docs/WORKLOG.md
+++ b/docs/WORKLOG.md
@@ -4,6 +4,12 @@ All notable changes to this repository are documented here. New entries are appe
 
 ---
 
+## 2026-06-07
+
+- **Update** `.github/workflows/unit-tests.yml`: integrate GitHub native code coverage (per-push) — add `code-quality: write` + `pull-requests: read` permissions, convert `coverage.out` to Cobertura XML via `boumenot/gocover-cobertura`, and publish with `actions/upload-code-coverage@v1`; coverage artifact now also includes `coverage.xml`
+
+---
+
 ## 2026-05-06
 
 - **Add** `UploadPrivate` on `ssh.SSHClient` (`internal/infrastructure/ssh`): SFTP `Chmod` immediately after `Create`, before payload copy; `uploadOverSFTPOnce`, `uploadWithSFTPRetries`, `jumpUploadWithSFTPRetries`; passphrase `BootstrapCluster` uses it with `install -d -m 0700` staging (`pkg/cluster/setup.go`); ARCHITECTURE mentions ssh uploads