diff --git a/.github/release-template.md b/.github/release-template.md index c67dc086..7172a71b 100644 --- a/.github/release-template.md +++ b/.github/release-template.md @@ -96,6 +96,19 @@ repositories or docs.] ## Breaking changes / Migration notes - [Delete this section if there are no breaking changes.] +- **Pre-release tester warning**: If you ran an unreleased marketplace or + chart-consolidation branch before this release, `obol stack up` may fail + with Helm `invalid ownership metadata` errors for resources or namespaces + that moved into the `base` chart. This is not a supported production + migration path. Back up anything you need from the local test stack, then + recreate it: + + ```bash + obol stack down + obol stack purge --force + obol stack init + obol stack up + ``` ## Known issues diff --git a/.github/workflows/helm-template-smoke.yml b/.github/workflows/helm-template-smoke.yml new file mode 100644 index 00000000..9320e05a --- /dev/null +++ b/.github/workflows/helm-template-smoke.yml @@ -0,0 +1,116 @@ +name: Helm Template Smoke + +on: + pull_request: + branches: [ main ] + paths: + - 'internal/embed/infrastructure/**' + - '.github/workflows/helm-template-smoke.yml' + push: + branches: [ main ] + paths: + - 'internal/embed/infrastructure/**' + - '.github/workflows/helm-template-smoke.yml' + +permissions: + contents: read + +jobs: + helm-template-smoke: + name: helm template embedded chart + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Set up Helm + uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1 + with: + version: v3.20.1 # match obolup.sh pinned version + + - name: helm template ./base + run: | + # Render the embedded `base` chart and fail on Go-template parse + # errors. Catches bugs like the unescaped `{{ $labels }}` in + # PrometheusRule annotations that broke `helm upgrade base` on + # every `obol stack up` (see PR #527). 
`go test ./...` does not + # exercise Helm rendering, so this is the only pre-merge gate + # for chart parse errors. + # + # The base chart contains `{{PLACEHOLDER}}` strings (e.g. + # `{{OLLAMA_HOST_IP}}`, `{{CLUSTER_ID}}`) that are substituted + # by `internal/defaults/defaults.go::InfrastructureReplacements` + # before helmfile runs. Helm's Go-template parser would treat + # them as actions and fail, so we substitute stub values into + # a working copy first — mirroring what `obol stack init` does. + set -euo pipefail + workdir="$(mktemp -d)" + cp -R internal/embed/infrastructure/base "$workdir/base" + # Mirror internal/defaults InfrastructureReplacements with CI stubs. + find "$workdir/base" -type f -name '*.yaml' -print0 \ + | xargs -0 sed -i \ + -e 's/{{OLLAMA_HOST_IP}}/127.0.0.1/g' \ + -e 's/{{OLLAMA_HOST}}/localhost/g' \ + -e 's/{{CLUSTER_ID}}/ci-helm-smoke/g' + # Match values passed by helmfile.yaml `releases[base]`. + helm template base "$workdir/base" \ + --set dataDir=/data \ + --set network=mainnet \ + > "$workdir/base-rendered.yaml" + + # Kubernetes object identity must be unique within one rendered + # chart. Helm will happily render duplicate apiVersion/kind/name + # tuples and leave the actual outcome to manifest ordering; this + # caught the duplicated obol-frontend ClusterRole/Binding review bug. 
+ awk ' + function flush() { + if (api && kind && name) { + key = api "/" kind "/" ns "/" name + count[key]++ + } + api = kind = name = ns = ""; inmeta = 0 + } + /^---/ { flush(); next } + /^apiVersion:/ { api = $2; next } + /^kind:/ { kind = $2; next } + /^metadata:/ { inmeta = 1; next } + inmeta && /^ name:/ { name = $2; next } + inmeta && /^ namespace:/ { ns = $2; next } + /^[^ ]/ && $0 !~ /^(apiVersion|kind|metadata):/ { inmeta = 0 } + END { + flush() + for (k in count) { + if (count[k] > 1) { + print count[k] " " k + dup = 1 + } + } + exit dup + }' "$workdir/base-rendered.yaml" + + - name: helm template ./cloudflared + run: | + # The cloudflared chart has no placeholder substitution and uses + # default values from values.yaml. + set -euo pipefail + helm template cloudflared internal/embed/infrastructure/cloudflared \ + > /dev/null + + - name: helm lint ./base + run: | + set -euo pipefail + workdir="$(mktemp -d)" + cp -R internal/embed/infrastructure/base "$workdir/base" + find "$workdir/base" -type f -name '*.yaml' -print0 \ + | xargs -0 sed -i \ + -e 's/{{OLLAMA_HOST_IP}}/127.0.0.1/g' \ + -e 's/{{OLLAMA_HOST}}/localhost/g' \ + -e 's/{{CLUSTER_ID}}/ci-helm-smoke/g' + helm lint "$workdir/base" \ + --set dataDir=/data \ + --set network=mainnet + + - name: helm lint ./cloudflared + run: | + set -euo pipefail + helm lint internal/embed/infrastructure/cloudflared diff --git a/.github/workflows/lint-test.yaml b/.github/workflows/lint-test.yaml index 8a251e97..3c5ee256 100644 --- a/.github/workflows/lint-test.yaml +++ b/.github/workflows/lint-test.yaml @@ -4,6 +4,9 @@ on: pull_request: branches: [ main ] +permissions: + contents: read + jobs: lint-test: runs-on: ubuntu-latest @@ -43,3 +46,31 @@ jobs: - name: Run chart-testing (install) run: ct install --target-branch ${{ github.event.repository.default_branch }} + + generate-check: + name: CRD generation up-to-date + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: 
actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Set up Go + uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version-file: 'go.mod' + + - name: Set up just + uses: extractions/setup-just@dd310ad5a97d8e7b41793f8ef055398d51ad4de6 # v2.0.2 + + - name: Regenerate CRDs + DeepCopy + run: just generate + + - name: Fail if regeneration changed any tracked files + run: | + if [ -n "$(git status --porcelain)" ]; then + echo "::error::CRD manifests or DeepCopy methods are out of date." + echo "::error::Run 'just generate' locally and commit the result." + git status + git --no-pager diff + exit 1 + fi diff --git a/CLAUDE.md b/CLAUDE.md index 04a71396..b29a9d46 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -342,6 +342,8 @@ A registry digest pin instead of `:latest` on the verifier means your dev rewrit For a fuller debug catalog with symptom→fix mapping, see `.agents/skills/obol-stack-dev/references/release-smoke-debugging.md`. +For observability architecture decisions (Prometheus retention vs. on-chain canonical record, counter-reset semantics, recording-rule naming, label conventions, CRD versioning stance, `clamp_min` epsilon), see `docs/observability.md` — read this before adding a new metric, recording rule, or proposing counter persistence. + ### Security: Tunnel Exposure The Cloudflare tunnel exposes the cluster to the public internet. Only x402-gated endpoints and discovery metadata should be reachable via the tunnel hostname. Internal services (frontend, eRPC, LiteLLM, monitoring) MUST have `hostnames: ["obol.stack"]` on their HTTPRoutes to restrict them to local access. diff --git a/Dockerfile.serviceoffer-controller b/Dockerfile.serviceoffer-controller index 5214a93a..09f6935b 100644 --- a/Dockerfile.serviceoffer-controller +++ b/Dockerfile.serviceoffer-controller @@ -5,6 +5,6 @@ RUN go mod download COPY . . 
RUN CGO_ENABLED=0 go build -o /serviceoffer-controller ./cmd/serviceoffer-controller -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian12:nonroot COPY --from=builder /serviceoffer-controller /serviceoffer-controller ENTRYPOINT ["/serviceoffer-controller"] diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 379fb2d9..b06508f7 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -2331,8 +2331,25 @@ Examples: func sellStopCommand(cfg *config.Config) *cli.Command { return &cli.Command{ Name: "stop", - Usage: "Pause a ServiceOffer without deleting it", + Usage: "Drain a ServiceOffer gracefully (advertises wind-down via discovery, then tears down the route)", ArgsUsage: "", + Description: `Marks a ServiceOffer as draining. While draining: + - The offer stays in /skill.md and /.well-known/agent-registration.json + with available=false and a drainEndsAt timestamp, so external + discovery (and ERC-8004 reputation scorers) can see the wind-down. + - The HTTPRoute and x402 payment gate STAY UP for the grace period + so buyers can complete in-flight payments. + - When the grace period elapses, the controller tears down the route + and marks PaymentGateReady/RoutePublished False with reason=Drained. + +The ServiceOffer CR itself is preserved — use 'obol sell delete' to +remove it entirely (which also tombstones the ERC-8004 record). + +Flags: + --grace 30m Override the grace period (default 1h). + --force Skip the drain window (equivalent to --grace 0). Use + this when the abrupt-teardown behavior of the old + pause annotation is required for behavior parity.`, Flags: []cli.Flag{ &cli.StringFlag{ Name: "namespace", @@ -2340,6 +2357,16 @@ func sellStopCommand(cfg *config.Config) *cli.Command { Usage: "Namespace of the ServiceOffer", Required: true, }, + &cli.DurationFlag{ + Name: "grace", + Usage: "Drain grace period (e.g. 30m, 2h). 
Defaults to 1h.", + Value: monetizeapi.DefaultDrainGracePeriod, + }, + &cli.BoolFlag{ + Name: "force", + Aliases: []string{"now"}, + Usage: "Skip the drain window and tear the route down on the next reconcile (alias: --now)", + }, }, Action: func(ctx context.Context, cmd *cli.Command) error { u := getUI(cmd) @@ -2352,19 +2379,37 @@ func sellStopCommand(cfg *config.Config) *cli.Command { return err } ns := cmd.String("namespace") + grace := cmd.Duration("grace") + if cmd.Bool("force") { + grace = 0 + } + if grace < 0 { + return errors.New("--grace must be >= 0") + } - u.Infof("Stopping the service offering %s/%s...", ns, name) - - removePricingRoute(cfg, u, name) - - patchJSON := `{"status":{"conditions":[{"type":"Ready","status":"False","reason":"Stopped","message":"Offer stopped by user"}]}}` - err := kubectlRun(cfg, "patch", "serviceoffers.obol.org", name, "-n", ns, - "--type=merge", "-p", patchJSON) - if err != nil { - return fmt.Errorf("failed to pause serviceoffer: %w", err) + now := time.Now().UTC() + drainEndsAt := now.Add(grace) + + // metav1.Duration JSON-marshals as the string form (e.g. + // "1h0m0s"), and metav1.Time marshals as RFC3339. We can + // emit a tiny JSON merge patch (--type=merge) directly without + // importing the meta types into the CLI. + patchJSON := fmt.Sprintf( + `{"spec":{"drainAt":%q,"drainGracePeriod":%q}}`, + now.Format(time.RFC3339), + grace.String(), + ) + if err := kubectlRun(cfg, "patch", "serviceoffers.obol.org", name, "-n", ns, + "--type=merge", "-p", patchJSON); err != nil { + return fmt.Errorf("failed to drain serviceoffer: %w", err) } - u.Successf("Service offering %s/%s stopped.", ns, name) + if grace == 0 { + u.Successf("ServiceOffer %s/%s draining; route will be removed on the next reconcile (--force).", ns, name) + } else { + u.Successf("ServiceOffer %s/%s draining; route will be removed at %s.", ns, name, drainEndsAt.Format(time.RFC3339)) + } + u.Infof("In-flight buyers can complete payments until then. 
Run `obol sell delete %s -n %s` to fully remove.", name, ns) return nil }, } @@ -2518,8 +2563,6 @@ func sellDeleteCommand(cfg *config.Config) *cli.Command { } } - removePricingRoute(cfg, u, name) - // Identity-level registration ownership lives in the AgentIdentity // CR and is managed by the controller. The CLI no longer patches // the registration ConfigMap here; deleting the ServiceOffer is @@ -4126,7 +4169,3 @@ func manifestNSName(manifest map[string]any) (string, string) { return ns, name } -// removePricingRoute is a no-op retained for compatibility. -// The serviceoffer-controller now manages pricing routes via the ServiceOffer -// informer; static ConfigMap routes are no longer used. -func removePricingRoute(_ *config.Config, _ *ui.UI, _ string) {} diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 9f3991ea..4055fa2c 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -626,9 +626,20 @@ func TestSellStop_Structure(t *testing.T) { stop := findSubcommand(t, cmd, "stop") flags := flagMap(stop) - requireFlags(t, flags, "namespace") + requireFlags(t, flags, "namespace", "grace", "force") assertFlagRequired(t, flags, "namespace") assertFlagHasAlias(t, flags, "namespace", "n") + // --now is the documented alias for --force; if it disappears, + // scripted operators that rely on it break silently. 
+ assertFlagHasAlias(t, flags, "force", "now") + + graceFlag, ok := flags["grace"].(*cli.DurationFlag) + if !ok { + t.Fatalf("--grace should be *cli.DurationFlag, got %T", flags["grace"]) + } + if graceFlag.Value != monetizeapi.DefaultDrainGracePeriod { + t.Errorf("--grace default = %v, want %v", graceFlag.Value, monetizeapi.DefaultDrainGracePeriod) + } } func TestSellDelete_Structure(t *testing.T) { diff --git a/cmd/serviceoffer-controller/main.go b/cmd/serviceoffer-controller/main.go index 8fc01a42..28be8287 100644 --- a/cmd/serviceoffer-controller/main.go +++ b/cmd/serviceoffer-controller/main.go @@ -7,15 +7,27 @@ import ( "os" "os/signal" "syscall" + "time" "github.com/ObolNetwork/obol-stack/internal/serviceoffercontroller" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" +) + +const ( + defaultLockNamespace = "x402" + leaseName = "serviceoffer-controller" + leaseDuration = 30 * time.Second + renewDeadline = 20 * time.Second + retryPeriod = 5 * time.Second ) func main() { kubeconfig := flag.String("kubeconfig", "", "Path to kubeconfig for out-of-cluster runs") workers := flag.Int("workers", 1, "Number of reconcile workers") + leaderElect := flag.Bool("leader-elect", true, "Acquire a Lease before running the reconcile loop (disable for local dev)") flag.Parse() cfg, err := loadConfig(*kubeconfig) @@ -31,9 +43,79 @@ func main() { ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer cancel() - if err := controller.Run(ctx, *workers); err != nil { - log.Fatalf("run controller: %v", err) + if !*leaderElect { + if err := controller.Run(ctx, *workers); err != nil { + log.Fatalf("run controller: %v", err) + } + return + } + + runWithLeaderElection(ctx, cfg, controller, *workers) +} + +func runWithLeaderElection(ctx context.Context, cfg *rest.Config, controller *serviceoffercontroller.Controller, workers int) { + podName 
:= os.Getenv("POD_NAME") + if podName == "" { + // Fall back so local dev (go run ./cmd/serviceoffer-controller --leader-elect=false) + // still works if someone forgets the flag. Identity must be unique across + // candidates — in real deployments the downward API supplies the pod name. + podName = "serviceoffer-controller-local" + } + + lockNamespace := os.Getenv("POD_NAMESPACE") + if lockNamespace == "" { + lockNamespace = defaultLockNamespace + } + + lock, err := resourcelock.NewFromKubeconfig( + resourcelock.LeasesResourceLock, + lockNamespace, + leaseName, + resourcelock.ResourceLockConfig{ + Identity: podName, + }, + cfg, + renewDeadline, + ) + if err != nil { + log.Fatalf("create lease lock: %v", err) + } + + leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: leaseDuration, + RenewDeadline: renewDeadline, + RetryPeriod: retryPeriod, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(ctx context.Context) { + log.Printf("serviceoffer-controller: became leader %s", podName) + if err := controller.Run(ctx, workers); err != nil { + log.Printf("controller run: %v", err) + os.Exit(controllerRunExitCode(err)) + } + }, + OnStoppedLeading: func() { + // On lost leadership exit non-zero so the kubelet restarts the + // pod and the next election starts from a clean state. Trying + // to keep running without the lease would race the new leader. 
+ log.Printf("serviceoffer-controller: lost leadership %s", podName) + os.Exit(1) + }, + OnNewLeader: func(identity string) { + if identity != podName { + log.Printf("serviceoffer-controller: new leader is %s", identity) + } + }, + }, + }) +} + +func controllerRunExitCode(err error) int { + if err != nil { + return 1 } + return 0 } func loadConfig(kubeconfig string) (*rest.Config, error) { diff --git a/cmd/serviceoffer-controller/main_test.go b/cmd/serviceoffer-controller/main_test.go new file mode 100644 index 00000000..5a1badb4 --- /dev/null +++ b/cmd/serviceoffer-controller/main_test.go @@ -0,0 +1,91 @@ +package main + +import ( + "errors" + "os" + "path/filepath" + "testing" +) + +// TestLoadConfig_FromKubeconfigFile asserts loadConfig parses an explicit +// kubeconfig path. This is the local-dev codepath used when --leader-elect=false. +func TestLoadConfig_FromKubeconfigFile(t *testing.T) { + dir := t.TempDir() + kc := filepath.Join(dir, "kubeconfig") + if err := os.WriteFile(kc, []byte(minimalKubeconfig), 0o600); err != nil { + t.Fatalf("write kubeconfig: %v", err) + } + + cfg, err := loadConfig(kc) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.Host != "https://example.invalid:6443" { + t.Fatalf("unexpected host: %q", cfg.Host) + } +} + +// TestLoadConfig_FromKubeconfigEnv mirrors the path used when KUBECONFIG is set +// (e.g. obol kubectl/helm passthrough during local dev). +func TestLoadConfig_FromKubeconfigEnv(t *testing.T) { + dir := t.TempDir() + kc := filepath.Join(dir, "kubeconfig") + if err := os.WriteFile(kc, []byte(minimalKubeconfig), 0o600); err != nil { + t.Fatalf("write kubeconfig: %v", err) + } + + t.Setenv("KUBECONFIG", kc) + cfg, err := loadConfig("") + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.Host != "https://example.invalid:6443" { + t.Fatalf("unexpected host: %q", cfg.Host) + } +} + +// TestLeaderElectionDefaults locks in the lease parameters chosen for fast +// failover on single-node k3d. 
If you tune these for a multi-zone deployment, +// update this test and the PR-description rationale. +func TestLeaderElectionDefaults(t *testing.T) { + if leaseDuration <= renewDeadline { + t.Fatalf("leaseDuration (%s) must exceed renewDeadline (%s)", leaseDuration, renewDeadline) + } + if renewDeadline <= retryPeriod { + t.Fatalf("renewDeadline (%s) must exceed retryPeriod (%s)", renewDeadline, retryPeriod) + } + if leaseName != "serviceoffer-controller" { + t.Fatalf("leaseName drifted from RBAC + Deployment expectation: %q", leaseName) + } + if defaultLockNamespace != "x402" { + t.Fatalf("defaultLockNamespace drifted from infrastructure manifest: %q", defaultLockNamespace) + } +} + +func TestControllerRunExitCode(t *testing.T) { + if got := controllerRunExitCode(nil); got != 0 { + t.Fatalf("controllerRunExitCode(nil) = %d, want 0", got) + } + if got := controllerRunExitCode(errors.New("informer died")); got != 1 { + t.Fatalf("controllerRunExitCode(error) = %d, want 1", got) + } +} + +const minimalKubeconfig = `apiVersion: v1 +kind: Config +clusters: +- name: test + cluster: + server: https://example.invalid:6443 + insecure-skip-tls-verify: true +contexts: +- name: test + context: + cluster: test + user: test +current-context: test +users: +- name: test + user: + token: test-token +` diff --git a/cmd/x402-verifier/main.go b/cmd/x402-verifier/main.go index d9538c22..e1d52f63 100644 --- a/cmd/x402-verifier/main.go +++ b/cmd/x402-verifier/main.go @@ -60,6 +60,13 @@ func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // File-sourced routes are populated synchronously by LoadConfig above, + // so they are "loaded" as soon as NewVerifier returns. The kube branch + // below flips this flag only after the first informer apply succeeds. 
+ if *routeSource == "file" { + v.MarkRoutesLoaded() + } + if *watch { switch *routeSource { case "file": @@ -76,7 +83,7 @@ func main() { log.Fatalf("load kube route source config: %v", err) } go func() { - if err := x402verifier.WatchServiceOffers(ctx, kubeCfg, accumulator.SetRoutes); err != nil { + if err := x402verifier.WatchServiceOffers(ctx, kubeCfg, accumulator.SetRoutes, v.MarkRoutesLoaded); err != nil { log.Printf("x402-serviceoffer-source: stopped: %v", err) } }() diff --git a/docs/guides/monetize-inference.md b/docs/guides/monetize-inference.md index 2fdf3680..775ba812 100644 --- a/docs/guides/monetize-inference.md +++ b/docs/guides/monetize-inference.md @@ -572,15 +572,34 @@ obol sell status my-qwen --namespace llm obol sell status ``` -### Pausing +### Draining -Pause an offer without deleting it: +Stop an offer gracefully so buyers can wind down before the route disappears: ```bash -obol sell stop my-qwen --namespace llm +obol sell stop my-qwen --namespace llm # default: 1h grace +obol sell stop my-qwen --namespace llm --grace 30m # custom grace +obol sell stop my-qwen --namespace llm --force # tear down immediately ``` -The CR and any ERC-8004 registration remain intact. Re-create the offer with the same name to restart. +`obol sell stop` sets `spec.drainAt` and `spec.drainGracePeriod` on the ServiceOffer. While the offer is +draining: + +- `/skill.md` and `/.well-known/agent-registration.json` advertise the offer + with `available: false` and `drainEndsAt: <RFC3339 timestamp>`, so external discovery + (and ERC-8004 reputation scorers) can react before traffic disappears. +- The HTTPRoute and x402 payment gate stay up so in-flight buyers can complete + payments. +- When the grace period elapses, the controller tears down the route and marks + `Draining=False` reason=Drained. + +The ServiceOffer CR and any ERC-8004 registration remain intact. Use +`obol sell delete` to remove the offer entirely. 
+ +`--force` (alias: `--now`) skips the drain window — useful when you want the +abrupt-teardown behavior of the legacy `obol.org/paused` annotation, for +example to reclaim the path immediately. Note that abrupt teardown is a worse +reputation signal for on-chain buyers than a graceful drain. ### Cleanup @@ -815,7 +834,7 @@ manifest. Do not paper over smoke-test failures with an ad hoc patch. | `obol sell http --wallet ... --chain ... --per-request ... --upstream ... --port ...` | Create a ServiceOffer and register by default | | `obol sell list` | List all ServiceOffers | | `obol sell status <name> -n <ns>` | Show conditions for an offer | -| `obol sell stop <name> -n <ns>` | Pause an offer without deleting it | +| `obol sell stop <name> -n <ns> [--grace 1h] [--force]` | Drain an offer (advertise wind-down via discovery, then tear down the route after the grace period). `--force`/`--now` skips the grace window. | | `obol sell delete <name> -n <ns>` | Delete an offer and cleanup | | `obol sell status` | Show cluster pricing and registration | | `obol sell register --private-key-file ...` | Advanced/manual registration or repair path | diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 00000000..0b98e449 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,369 @@ +# Observability architecture + +Operator-facing reference for how Obol Stack records, queries, and reasons about +payment-flow telemetry. Read this before adding a new metric, a new recording +rule, or proposing "let's persist the counter to a PVC." + +## TL;DR + +- **Prometheus is for recent operational telemetry**, bounded by TSDB retention + (currently 8d in our cluster values). +- **On-chain settlement TXs are the canonical record for lifetime financial + state.** Every settled x402 payment leaves an immutable on-chain trace via + `X-PAYMENT-RESPONSE` (settle tx hash, asset, amount, payer, payee). +- **Counters reset on every pod restart. That is intentional.** Prometheus + counters are per-process by design. 
Use `increase()` / `rate()` at query + time — they detect resets in the TSDB and stitch ranges back together. +- Recording rules use `<level>:<metric>:<operations>` (Prometheus convention) + and **name the window** in the rule (`7d_by_offer`, not `lifetime_by_offer`). +- Div-by-zero guards use a small epsilon (`1e-9`), **never `1.0`**. +- CRDs stay on `v1alpha1` during active dev — the alpha promise IS "no compat", + and we have no external operators yet. + +If you find yourself asking "how do we compute lifetime revenue for offer X +since the project started," the answer is **not** a recording rule — it is a +chain indexer over settle TXs. + +--- + +## Why counters reset (and why that's fine) + +Prometheus counters are stored per-process. When a verifier pod restarts (rollout, +node drain, OOM, image bump), the in-memory counter goes back to zero. This is +**not** a bug to engineer around at write time. The Prometheus query engine +already knows about it: + +- `rate(counter[5m])` and `increase(counter[5m])` perform **reset detection**: + if the last sample is less than the previous sample inside the window, the + engine assumes a reset and stitches the two ranges together rather than + emitting a negative delta. +- This is the well-documented "counter reset semantics." See Robust Perception: + *Avoiding the counter-reset undercount* — the canonical writeup of why you + must always range-query counters rather than `sum()`'ing them raw. + +The corollary is the rule that bit us in PR #530: + +> Never write a recording rule of the form `sum(my_counter_total) by (...)`. +> Always write `sum(increase(my_counter_total[<window>])) by (...)`. + +`sum(counter)` collapses to "whatever value the live samples currently hold," +which means **every pod restart silently zeros the recorded series**. The +expert review caught a recording rule shipped in that exact broken form; +PR #530 swapped it to `increase()` over an explicit window. 
+ +--- + +## The thin-layer architecture + +``` + +------------------------------+ + | x402-verifier (stateless) | + | - in-memory counters | + | - labels: | + | offer_namespace, | + | offer_name, chain, | + | asset_symbol | + +---------------+--------------+ + | + | /metrics scrape (Prometheus) + v + +------------------------------+ + | Prometheus TSDB (retention) | + | - 8d rolling window | + | - reset detection built in | + +---------------+--------------+ + | + | recording rules with increase() + v + +------------------------------+ + | Pre-aggregated series | + | x402:revenue:7d_by_offer + | x402:revenue:7d_by_offer_chain + +---------------+--------------+ + | + | PromQL queries + v + +------------------------------+ + | Frontend / dashboards | + | - reads pre-aggregated | + | - cheap, scoped to window | + +------------------------------+ + + + Parallel canonical path (for lifetime financial truth): + + +------------------------------+ + | x402-buyer / facilitator | + +---------------+--------------+ + | + | settle tx (on-chain) + v + +------------------------------+ + | Base / Base Sepolia | + | ERC-20 Transfer events | + | X-PAYMENT-RESPONSE header | + | carries settle tx hash | + +------------------------------+ + | + | chain indexer / explorer + v + +------------------------------+ + | Lifetime per-offer revenue | + | "since first deploy" answer | + +------------------------------+ +``` + +The two paths answer **different questions**: + +- Prometheus answers "what is the system doing in the last N hours/days?" with + cheap, second-resolution queries and label-faceting. +- On-chain answers "what was every payment that ever settled for offer X?" with + immutability and full historical depth, at the cost of being slower and + requiring an indexer. + +Mixing them is a category error. Don't try to make Prometheus answer the +lifetime question, and don't try to make the chain answer "what is the current +402-rate this minute?" 
+ +--- + +## When NOT to add persistence to the counter itself + +Three options come up repeatedly in design discussions. We rejected all three +for the current use case: + +### PVC-backed verifier state +**Why it's tempting**: counters survive restart, no `increase()` gymnastics. + +**Why we rejected it**: it bolts a stateful primitive onto a stateless +component. `x402-verifier` is currently safe to scale, rollout, evict, and +re-image freely. A PVC turns every restart into a sequence-recovery problem +(double-counting on a torn write, undercounting on a crash before flush). +Prometheus already solves reset detection correctly; we'd be reimplementing +it badly and introducing a new failure mode. + +### Pushgateway +**Why it's tempting**: "decouple short-lived job state from scrape." + +**Why we rejected it**: Pushgateway is for batch-job final values, not for +long-running services. Using it for a live verifier inverts the ownership +model (Pushgateway becomes the source of truth, verifier becomes a writer), +loses per-pod identity, and adds a single-point-of-failure that, if it +restarts, **also** zeros the counter — without `rate()` knowing about it. + +### OTel collector with `cumulativetodelta` +**Why it's tempting**: collector-side reset stitching, hand off deltas to a +downstream store. + +**Why we rejected it**: it solves a problem we don't have (we're not sending +deltas to a backend that needs them), at the cost of a new infrastructure +component to operate. For a single-operator local-k3d stack, this is over- +engineering. If we ever export to an OTel-native backend, revisit. + +--- + +## When you WOULD want persistence + +The only legitimate driver is an explicit **billing or compliance requirement +to report "totals since first deploy" that exceeds Prometheus retention.** + +We do not have this requirement today. If we ever do: + +1. **Derive it from on-chain TXs**, not from metrics. 
Every paid request leaves + an `X-PAYMENT-RESPONSE` with a settle tx hash; an indexer over those is the + canonical answer. +2. Only fall back to a persisted counter if for some reason the chain trace is + unavailable for the offer in question — and even then, treat the indexed + chain data as the source of truth and the counter as a soft mirror. + +The architecture review's framing was right: if you find yourself wanting +Prometheus to answer a lifetime question, you've picked the wrong tool. + +--- + +## Recording rule conventions + +Naming follows the standard Prometheus pattern: + +``` +<level>:<metric>:<operations> +``` + +Examples we ship: + +- `x402:revenue:7d_by_offer` — paid request count aggregated to the offer + level over the last 7d. The frontend multiplies this by the ServiceOffer + price table to display revenue. +- `x402:revenue:7d_by_offer_chain_asset_symbol` — same window, retaining + chain and settlement-token facets for per-token and per-chain views. + +Rules: + +1. **Name the window in the rule.** `7d_by_offer` is honest; `lifetime_by_offer` + is a lie (Prometheus has no "lifetime"). The window in the name must match + the window in the expression. +2. **Use `increase()` over an explicit range, not `sum()` of the raw counter.** + See PR #530 — the original rule did `sum by (offer) (charged_requests_total)` + and silently zeroed every time the verifier pod restarted. The fixed rule is + `sum by (offer_namespace, offer_name) (increase(obol_x402_verifier_charged_requests_total[7d]))`. +3. **Keep the window aligned with retention.** Recording a `30d` rule with 8d + retention is a footgun: the rule only ever sees ~8d of samples, so it silently under-reports the window its name claims. + +--- + +## Label conventions + +Labels are the query interface. The rule of thumb: + +- **Add a label if it's an attribute you'd want to facet by directly and the + cardinality is bounded.** "Bounded" means you can write down all possible + values: chains, asset symbols, offer names. 
Not user addresses, not request + IDs, not arbitrary route paths beyond what the offer CR enumerates. +- **Don't add a label that multiplies cardinality.** Every unique combination + of label values is a separate time series in TSDB. A label that adds 100 + values multiplies storage by 100×. + +Concrete examples: + +| Label | Source | Why include it | +|-------------------|------------------|-------------------------------------------| +| `offer_namespace` | offer CR meta | Tenancy facet | +| `offer_name` | offer CR meta | Per-offer breakdown | +| `chain` | offer CR payment | "Revenue by chain" is a real question | +| `asset_symbol` | offer CR payment | Added in PR #531 — per-token facet | + +`chain` and `asset_symbol` are both CR-derived (operator-set, bounded) and +query-meaningful ("how much USDC vs OBOL did we earn on Base last week?"). They +both belong. PR #531 added `asset_symbol` for exactly this reason — the prior +schema collapsed all asset types into one bucket. + +Anti-pattern: labeling by `payer_address` or `tx_hash`. Those are unbounded and +belong on the chain trace, not on the metric. + +--- + +## CRD versioning: stay on `v1alpha1` during active dev + +For the current single-operator local-stack development, the alpha-stays-alpha +approach matches the design intent. Concretely: + +- **While in active dev with no external operators, stay on `v1alpha1` and edit + the schema in place.** The alpha promise IS "no compat" — that's the whole + point of the version channel. Renaming a field, dropping a field, tightening + validation: all fair game at `v1alpha1`. +- **Bump to `v1alpha2` only when** you need both versions to coexist briefly to + validate a conversion path (which requires standing up a conversion webhook), + or to checkpoint a major redesign you want to land alongside the old shape. +- **Graduate to `v1beta1` only when** all three are true: + 1. The schema has been stable for ~2 releases (no breaking edits). + 2. 
An external operator has committed to depending on it. + 3. You're committing to backwards-compat for at least one release, with + deprecation warnings for any field you eventually want to remove. + +The architecture review surfaced "should we graduate to `v1beta1`?" as a flag. +That was a "what if we ship externally" hypothetical, not an action item — and +graduating prematurely locks us into compat overhead before the schema has +earned it. The current ServiceOffer / RegistrationRequest / PurchaseRequest +CRDs all stay on `v1alpha1` until the three conditions above hold. + +--- + +## `clamp_min(..., 1)` is an anti-pattern + +Div-by-zero guards in PromQL exist because dividing by an empty counter +produces a `NaN`. The naive fix is: + +```promql +# WRONG +my_success_rate + / +ignoring(...) clamp_min(my_request_total, 1) +``` + +That `1` is poison under low traffic. Suppose the real request rate over 5m is +3 successful out of 4 total (75%). With `clamp_min(..., 1)` and a window in +which the counter shows `0` total requests (e.g. between scrapes), the formula +returns `3/1 = 3.0` — a 300% success rate that breaks any alert downstream of +it. More commonly: the **denominator is clamped to 1 when it should be e.g. +0.5**, and your "success rate" reports half its real value, **causing +low-traffic alerts to under-report and stay silent during exactly the windows +when traffic is degraded**. + +The fix is to use an epsilon that's small enough never to dominate the real +denominator: + +```promql +# RIGHT +my_success_rate + / +ignoring(...) clamp_min(my_request_total, 1e-9) +``` + +`1e-9` keeps the division finite without distorting the result. Pick `1e-9` (or +smaller) as the project-wide epsilon and use it consistently. **Never `1.0`, +never `0.001`, never "a reasonable small number" — pick the smallest value that +avoids NaN and stick with it.** + +This was fixed in the same review pass that produced this doc. 
Future +contributors: if you write a guarded division, the epsilon is `1e-9`. + +--- + +## Cross-references + +### Code + +- `internal/x402/metrics.go` — verifier metric definitions + (`obol_x402_verifier_requests_total`, `_payment_required_total`, + `_payment_verified_total`, `_payment_failed_total`, `_charged_requests_total`). +- `internal/x402/verifier.go` — `prometheusLabels()` controls the verifier + label set; this is the canonical place to add a new bounded label. +- `internal/x402/buyer/metrics.go` — buyer-side counters + (`payment_attempts`, `payment_success_total`, `payment_failure_total`, + `confirm_spend_failure_total`, `payment_unsettled_confirmations`) plus + gauges (`auth_remaining`, `auth_spent`, `active_model_mappings`). +- `internal/x402/buyer/proxy.go` — `prometheusLabels()` for the buyer side. + +### Infrastructure + +- `internal/embed/infrastructure/values/monitoring.yaml.gotmpl` — Prometheus + values, including retention and recording rule wiring. +- `internal/embed/infrastructure/base/templates/x402.yaml` — verifier + Deployment, ServiceMonitor / PodMonitor. + +### Pull requests that shaped this + +- **PR #527** — `fix(prometheus-rules): escape PromQL $labels for Helm + rendering`. Helm was interpreting `$labels` as a Helm template variable and + blanking it; the fix is to escape so the literal `$labels` reaches the + Prometheus rule engine. +- **PR #530** — `fix(prometheus-rules): use increase() for the per-offer + revenue rule`. The original rule did `sum(counter)`, which silently zeroed + on verifier restart. Now uses `sum(increase(counter[7d]))` per the rules + above. +- **PR #531** — `feat(x402-metrics): add asset_symbol label for per-token + queries`. Unlocks "USDC vs OBOL revenue by chain" without needing a + downstream join. 
+ +### Reports + +- The OBOL parity integration test report — see `plans/` for the most recent + `release-smoke-hardening-*.md` and `post-490-integration-*.md` entries that + reference the metric audits behind PRs #527 / #530 / #531. + +--- + +## Quick checklist for the next change + +Before opening a PR that touches metrics: + +- [ ] New label is bounded and CR-derived (or otherwise enumerable). +- [ ] No label that could grow unbounded (payer address, tx hash, free-form + path beyond CR enumeration). +- [ ] New recording rule uses `increase()` over an explicit window. +- [ ] Window in the rule name matches window in the expression + (no `lifetime_*`). +- [ ] Window is within Prometheus retention. +- [ ] Any guarded division uses `1e-9` as the clamp floor. +- [ ] If the new metric tries to answer a "lifetime" question, you've stopped + and reconsidered using on-chain data instead. diff --git a/go.mod b/go.mod index 98f67795..28171102 100644 --- a/go.mod +++ b/go.mod @@ -15,19 +15,22 @@ require ( github.com/hf/nitrite v0.0.0-20241225144000-c2d5d3c4f303 github.com/hf/nsm v0.0.0-20220930140112-cd181bd646b9 github.com/mattn/go-isatty v0.0.20 - github.com/prometheus/client_golang v1.15.0 - github.com/prometheus/client_model v0.3.0 - github.com/prometheus/common v0.42.0 + github.com/prometheus/client_golang v1.19.1 + github.com/prometheus/client_model v0.6.1 + github.com/prometheus/common v0.55.0 github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 github.com/shopspring/decimal v1.3.1 github.com/urfave/cli/v2 v2.27.5 github.com/urfave/cli/v3 v3.6.2 golang.org/x/crypto v0.46.0 + golang.org/x/net v0.48.0 golang.org/x/sys v0.39.0 golang.org/x/term v0.38.0 gopkg.in/yaml.v3 v3.0.1 + k8s.io/api v0.34.1 k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 + sigs.k8s.io/controller-tools v0.16.5 ) require ( @@ -48,21 +51,22 @@ require ( github.com/crate-crypto/go-ipa v0.0.0-20240724233137-53bbb0ceb27a // indirect github.com/cucumber/gherkin/go/v26 v26.2.0 // indirect 
github.com/cucumber/messages/go/v21 v21.0.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/ethereum/c-kzg-4844/v2 v2.1.5 // indirect github.com/ethereum/go-verkle v0.2.2 // indirect + github.com/fatih/color v1.18.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/logr v1.4.3 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/gobuffalo/flect v1.0.3 // indirect github.com/gofrs/uuid v4.3.1+incompatible // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-configfs-tsm v0.2.2 // indirect @@ -72,24 +76,25 @@ require ( github.com/hashicorp/go-memdb v1.3.4 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/holiman/uint256 v1.3.2 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.1 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect 
github.com/muesli/termenv v0.16.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/procfs v0.9.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible // indirect + github.com/spf13/cobra v1.9.1 // indirect github.com/spf13/pflag v1.0.10 // indirect github.com/supranational/blst v0.3.16 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect @@ -98,22 +103,24 @@ require ( github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.uber.org/multierr v1.11.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect - golang.org/x/net v0.48.0 // indirect + golang.org/x/mod v0.30.0 // indirect golang.org/x/oauth2 v0.32.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.39.0 // indirect google.golang.org/protobuf v1.36.11 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - k8s.io/api v0.34.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + k8s.io/apiextensions-apiserver v0.31.2 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect - sigs.k8s.io/json 
v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect diff --git a/go.sum b/go.sum index c451ad0f..fd84f4a3 100644 --- a/go.sum +++ b/go.sum @@ -45,6 +45,7 @@ github.com/coinbase/x402/go v0.0.0-20260331075907-bff876de232a/go.mod h1:8xt63HO github.com/consensys/gnark-crypto v0.19.2 h1:qrEAIXq3T4egxqiliFFoNrepkIWVEeIYwt3UL0fvS80= github.com/consensys/gnark-crypto v0.19.2/go.mod h1:rT23F0XSZqE0mUA0+pRtnL56IbPxs6gp4CeRsBk4XS0= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/crate-crypto/go-eth-kzg v1.4.0 h1:WzDGjHk4gFg6YzV0rJOAsTK4z3Qkz5jd4RE3DAvPFkg= @@ -60,8 +61,9 @@ github.com/cucumber/messages/go/v21 v21.0.1 h1:wzA0LxwjlWQYZd32VTlAVDTkW6inOFmSM github.com/cucumber/messages/go/v21 v21.0.1/go.mod h1:zheH/2HS9JLVFukdrsPWoPdmUtmYQAQPLk7w5vWsk5s= github.com/cucumber/messages/go/v22 v22.0.0/go.mod h1:aZipXTKc0JnjCsXrJnuZpWhtay93k7Rn3Dee7iyPJjs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dchest/siphash v1.2.3 h1:QXwFc8cFOR2dSa/gE6o/HokBMWtLUaNDVd+22aKHeEA= github.com/dchest/siphash v1.2.3/go.mod h1:0NvQU092bT0ipiFN++/rXm69QG9tVxLAlQHIXMPAkHc= github.com/deckarep/golang-set/v2 v2.8.0 h1:swm0rlPCmdWn9mESxKOjWk8hXSqoxOp+ZlfuyaAdFlQ= @@ -88,6 +90,8 @@ github.com/ethereum/go-ethereum v1.16.7 h1:qeM4TvbrWK0UC0tgkZ7NiRsmBGwsjqc64BHo2 github.com/ethereum/go-ethereum v1.16.7/go.mod h1:Fs6QebQbavneQTYcA39PEKv2+zIjX7rPUZ14DER46wk= github.com/ethereum/go-verkle v0.2.2 h1:I2W0WjnrFUIzzVPwm8ykY+7pL2d4VhlsePn4j7cnFk8= github.com/ethereum/go-verkle v0.2.2/go.mod h1:M3b90YRnzqKyyzBEWJGqj8Qff4IDeXnzFw0P9bFw3uk= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/ferranbt/fastssz v0.1.4 h1:OCDB+dYDEQDvAgtAGnTSidK1Pe2tW3nFV40XyMkTeDY= github.com/ferranbt/fastssz v0.1.4/go.mod h1:Ea3+oeoRGGLGm5shYAeDgu6PGUlcvQhE2fILyD9+tGg= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= @@ -99,8 +103,8 @@ github.com/gballet/go-libpcsclite v0.0.0-20190607065134-2772fd86a8ff h1:tY80oXqG github.com/gballet/go-libpcsclite v0.0.0-20190607065134-2772fd86a8ff/go.mod h1:x7DCsMOv1taUwEWCzT4cmDeAkigA5/QCwUodaVOe8Ww= github.com/getsentry/sentry-go v0.27.0 h1:Pv98CIbtB3LkMWmXi4Joa5OOcwbmnX88sF5qbK3r3Ps= github.com/getsentry/sentry-go v0.27.0/go.mod h1:lc76E2QywIyW8WuBnwl8Lc4bkmQH4+w1gwTf25trprY= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 
h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= @@ -114,6 +118,8 @@ github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+Gr github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gobuffalo/flect v1.0.3 h1:xeWBM2nui+qnVvNM4S3foBhCAL2XgPU+a7FdpelbTq4= +github.com/gobuffalo/flect v1.0.3/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E= github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0= github.com/gofrs/uuid v4.2.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= @@ -123,10 +129,6 @@ github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI= github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= -github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= @@ -177,6 +179,7 @@ github.com/holiman/uint256 v1.3.2 
h1:a9EgMPSC1AAaj1SZL5zIQD3WbwTuHrMGOerLjGmM/TA github.com/holiman/uint256 v1.3.2/go.mod h1:EOMSn4q6Nyt9P6efbI3bueV4e1b3dGlUCXeiRV4ng7E= github.com/huin/goupnp v1.3.0 h1:UvLUlWDNpoUdYzb2TCn+MuTWtcjXKSza2n6CBdQ0xXc= github.com/huin/goupnp v1.3.0/go.mod h1:gnGPsThkYa7bFi/KWmEysQRf48l2dvR5bxr2OFckNX8= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/influxdata/influxdb-client-go/v2 v2.4.0 h1:HGBfZYStlx3Kqvsv1h2pJixbCl/jhnFtxpKFAv9Tu5k= github.com/influxdata/influxdb-client-go/v2 v2.4.0/go.mod h1:vLNHdxTJkIf2mSLvGrpj8TCcISApPoXkaxP8g9uRlW8= @@ -217,8 +220,6 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= -github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/minio/sha256-simd v1.0.0 h1:v1ta+49hkWZyvaKwrQB8elexRqm6Y0aMLjCNsrYxo6g= github.com/minio/sha256-simd v1.0.0/go.mod h1:OuYzVNI5vcoYIAmbIvHPl3N3jUzVedXbKy5RFepssQM= github.com/mitchellh/mapstructure v1.4.1 h1:CpVNEelQCZBooIPDn+AR3NpivK/TIKU8bDxdASFVQag= @@ -235,8 +236,12 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/nxadm/tail v1.4.8 
h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= +github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= @@ -257,16 +262,17 @@ github.com/pion/transport/v3 v3.0.1 h1:gDTlPJwROfSfz6QfSi0ZmeCSkFcnWWiiR9ES0ouAN github.com/pion/transport/v3 v3.0.1/go.mod h1:UY7kiITrlMv7/IKgd5eTUcaahZx5oUN3l9SzK5f5xE0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= -github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= -github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= -github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= -github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= -github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= -github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= -github.com/prometheus/procfs v0.9.0/go.mod 
h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= @@ -283,7 +289,10 @@ github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible/go github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag 
v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= @@ -323,8 +332,8 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -338,6 +347,8 @@ golang.org/x/lint v0.0.0-20201208152925-83fdc39ff7b5/go.mod h1:3xt1FjdF8hUf6vQPI golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= +golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -346,7 
+357,6 @@ golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -379,6 +389,10 @@ golang.org/x/tools v0.0.0-20210105210202-9ed45478a130/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/tools/go/expect v0.1.1-deprecated h1:jpBZDwmgPhXsKZC6WhL20P4b/wmnpsEAGHaNy0n/rJM= +golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= +golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= +golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGNv1ib0J382/DPCRS/BPnsGebyM1Gj5VSDpG8= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -388,12 +402,14 @@ google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j 
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -401,18 +417,22 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.31.2 h1:W8EwUb8+WXBLu56ser5IudT2cOho0gAKeTOnywBLxd0= +k8s.io/apiextensions-apiserver v0.31.2/go.mod h1:i+Geh+nGCJEGiCGR3MlBDkS7koHIIKWVfWeRFiOsUcM= k8s.io/apimachinery 
v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-tools v0.16.5 h1:5k9FNRqziBPwqr17AMEPPV/En39ZBplLAdOwwQHruP4= +sigs.k8s.io/controller-tools v0.16.5/go.mod h1:8vztuRVzs8IuuJqKqbXCSlXcw+lkAv/M2sTpg55qjMY= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod 
h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt new file mode 100644 index 00000000..8c47ec39 --- /dev/null +++ b/hack/boilerplate.go.txt @@ -0,0 +1 @@ +// Code generated by controller-gen. DO NOT EDIT. diff --git a/internal/embed/embed_buyer_state_test.go b/internal/embed/embed_buyer_state_test.go new file mode 100644 index 00000000..e7ed4ae2 --- /dev/null +++ b/internal/embed/embed_buyer_state_test.go @@ -0,0 +1,92 @@ +package embed + +import ( + "testing" +) + +// TestBuyerStatePVC asserts that x402-buyer's /state is backed by a PVC +// (not an emptyDir), and that the litellm Deployment uses the Recreate +// strategy so the RWO PVC can be remounted without overlap. +// +// Regression: emptyDir lost consumed.json on every pod restart, causing +// the buyer to re-spend already-consumed auths from the ConfigMap pool +// and cascading into facilitator 400s ("nonce already used") until a +// manual `buy.py process --all` reseeded. +func TestBuyerStatePVC(t *testing.T) { + data, err := ReadInfrastructureFile("base/templates/llm.yaml") + if err != nil { + t.Fatalf("ReadInfrastructureFile: %v", err) + } + + docs := multiDoc(data) + + // PVC must exist in the llm namespace with RWO + local-path storage class. 
+ pvc := findDocByName(docs, "PersistentVolumeClaim", "x402-buyer-state") + if pvc == nil { + t.Fatal("PersistentVolumeClaim 'x402-buyer-state' missing from llm.yaml") + } + + if ns := nested(pvc, "metadata", "namespace"); ns != "llm" { + t.Errorf("PVC namespace = %v, want llm", ns) + } + + modes, ok := nested(pvc, "spec", "accessModes").([]any) + if !ok || len(modes) != 1 || modes[0] != "ReadWriteOnce" { + t.Errorf("PVC accessModes = %v, want [ReadWriteOnce]", modes) + } + + if sc := nested(pvc, "spec", "storageClassName"); sc != "local-path" { + t.Errorf("PVC storageClassName = %v, want local-path", sc) + } + + if storage := nested(pvc, "spec", "resources", "requests", "storage"); storage == nil { + t.Error("PVC missing spec.resources.requests.storage") + } + + // litellm Deployment volume entry must reference the PVC, not emptyDir. + dep := findDocByName(docs, "Deployment", "litellm") + if dep == nil { + t.Fatal("litellm Deployment missing from llm.yaml") + } + + volumes, ok := nested(dep, "spec", "template", "spec", "volumes").([]any) + if !ok { + t.Fatal("litellm Deployment has no volumes") + } + + var stateVolume map[string]any + for _, v := range volumes { + vm, ok := v.(map[string]any) + if !ok { + continue + } + if vm["name"] == "x402-buyer-state" { + stateVolume = vm + break + } + } + + if stateVolume == nil { + t.Fatal("litellm Deployment missing 'x402-buyer-state' volume") + } + + if _, isEmptyDir := stateVolume["emptyDir"]; isEmptyDir { + t.Error("x402-buyer-state is still emptyDir — must be persistentVolumeClaim to survive pod restarts") + } + + pvcRef, ok := stateVolume["persistentVolumeClaim"].(map[string]any) + if !ok { + t.Fatal("x402-buyer-state volume is not backed by persistentVolumeClaim") + } + + if claim := pvcRef["claimName"]; claim != "x402-buyer-state" { + t.Errorf("persistentVolumeClaim.claimName = %v, want x402-buyer-state", claim) + } + + // Strategy must be Recreate so the new pod waits for the old pod to + // release the RWO PVC 
before mounting. RollingUpdate with maxSurge>0 + // would block indefinitely. + if strat := nested(dep, "spec", "strategy", "type"); strat != "Recreate" { + t.Errorf("litellm Deployment strategy.type = %v, want Recreate (RWO PVC cannot be co-mounted during surge)", strat) + } +} diff --git a/internal/embed/embed_image_pin_test.go b/internal/embed/embed_image_pin_test.go index 36dd1676..1517bf76 100644 --- a/internal/embed/embed_image_pin_test.go +++ b/internal/embed/embed_image_pin_test.go @@ -136,3 +136,135 @@ func TestEmbeddedImages_NoNewLatestTags(t *testing.T) { strings.Join(stale, "\n ")) } } + +// TestEmbeddedImages_NamedImagesAreDigestPinned guards the @sha256: discipline +// for the cluster-side container images that ship as part of the embedded +// infrastructure. Tag-only refs (e.g. `:b13254e`) are vulnerable to mutable-tag +// rewrites — the class of supply-chain bug CLAUDE.md pitfall #12 documented +// after a real local-cluster incident. +// +// Adding a new image to this list MUST be accompanied by an `@sha256:` +// suffix on the `image:` line (or, for Helm value files, on the `tag:` field +// such that the rendered manifest produces `:@sha256:`). +// +// To regenerate a digest: +// +// docker buildx imagetools inspect : --format '{{ .Manifest.Digest }}' +func TestEmbeddedImages_NamedImagesAreDigestPinned(t *testing.T) { + cases := []struct { + file string + // repo is the substring used to locate the relevant line. The match + // is line-scoped — the line must also contain @sha256: to pass. 
+ repo string + }{ + // internal/embed/infrastructure/base/templates/x402.yaml + {file: "base/templates/x402.yaml", repo: "ghcr.io/obolnetwork/x402-verifier"}, + {file: "base/templates/x402.yaml", repo: "ghcr.io/obolnetwork/serviceoffer-controller"}, + // internal/embed/infrastructure/base/templates/llm.yaml + {file: "base/templates/llm.yaml", repo: "ghcr.io/obolnetwork/litellm"}, + {file: "base/templates/llm.yaml", repo: "ghcr.io/obolnetwork/x402-buyer"}, + } + + for _, tc := range cases { + t.Run(tc.repo, func(t *testing.T) { + data, err := ReadInfrastructureFile(tc.file) + if err != nil { + t.Fatalf("read %s: %v", tc.file, err) + } + + var ( + found bool + offenders []string + ) + + scanner := bufio.NewScanner(bytes.NewReader(data)) + scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024) + + lineNum := 0 + for scanner.Scan() { + lineNum++ + line := scanner.Text() + + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "#") { + continue + } + // Must look like a Kubernetes container `image:` field, not a + // random doc-comment or env var. + if !strings.Contains(trimmed, "image:") { + continue + } + if !strings.Contains(line, tc.repo) { + continue + } + + found = true + if !strings.Contains(line, "@sha256:") { + offenders = append(offenders, + fmt.Sprintf("%s:%d → %q lacks @sha256: digest pin", tc.file, lineNum, strings.TrimSpace(line))) + } + } + + if err := scanner.Err(); err != nil { + t.Fatalf("scan %s: %v", tc.file, err) + } + + if !found { + t.Fatalf("no image: line containing %q found in %s — has the image been renamed or moved? "+ + "Update this test alongside the manifest change.", tc.repo, tc.file) + } + + if len(offenders) > 0 { + t.Fatalf("digest-pin discipline broken in %s:\n %s\n\n"+ + "Pin the image as `:@sha256:`. 
Resolve with:\n"+ + " docker buildx imagetools inspect %s: --format '{{ .Manifest.Digest }}'", + tc.file, strings.Join(offenders, "\n "), tc.repo) + } + }) + } +} + +// TestEmbeddedImages_CloudflaredHelmTagIsDigestPinned covers the cloudflared +// chart, which uses the Helm idiom `image.repository` + `image.tag` rather +// than a literal `image:` line. The chart template renders +// `:`; embedding `@sha256:` inside `.tag` produces +// a valid digest-pinned ref at render time and preserves the same +// mutable-tag protection. +func TestEmbeddedImages_CloudflaredHelmTagIsDigestPinned(t *testing.T) { + data, err := ReadInfrastructureFile("cloudflared/values.yaml") + if err != nil { + t.Fatalf("read cloudflared/values.yaml: %v", err) + } + + var tagLine string + + scanner := bufio.NewScanner(bytes.NewReader(data)) + scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024) + + for scanner.Scan() { + line := scanner.Text() + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "#") { + continue + } + if strings.HasPrefix(trimmed, "tag:") { + tagLine = line + break + } + } + + if err := scanner.Err(); err != nil { + t.Fatalf("scan cloudflared/values.yaml: %v", err) + } + + if tagLine == "" { + t.Fatal("no `tag:` field found in cloudflared/values.yaml — chart layout changed; update this test.") + } + + if !strings.Contains(tagLine, "@sha256:") { + t.Fatalf("cloudflared image tag is not digest-pinned: %q\n\n"+ + "Pin it as `tag: \"@sha256:\"`. 
Resolve with:\n"+ + " docker buildx imagetools inspect cloudflare/cloudflared: --format '{{ .Manifest.Digest }}'", + strings.TrimSpace(tagLine)) + } +} diff --git a/internal/embed/infrastructure/base/templates/agent-crd.yaml b/internal/embed/infrastructure/base/templates/agent-crd.yaml index 510d4d0c..8338c0d6 100644 --- a/internal/embed/infrastructure/base/templates/agent-crd.yaml +++ b/internal/embed/infrastructure/base/templates/agent-crd.yaml @@ -1,12 +1,9 @@ --- -# Agent CRD -# Declarative spec for an Obol Stack agent (Hermes today, OpenClaw later). -# Decouples agent lifecycle from selling — `obol sell agent ` references -# an existing Agent rather than provisioning one inline. Internal manager -# agents with RBAC can also create Agent resources to spawn sub-agents. apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: agents.obol.org spec: group: obol.org @@ -14,103 +11,151 @@ spec: kind: Agent listKind: AgentList plural: agents - singular: agent shortNames: - - ag + - ag + singular: agent scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Runtime - type: string - jsonPath: .spec.runtime - - name: Model - type: string - jsonPath: .status.pinnedModel - - name: Wallet - type: string - jsonPath: .status.walletAddress - - name: Phase - type: string - jsonPath: .status.phase - - name: Ready - type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - properties: - runtime: - type: string - enum: - - hermes - default: hermes - description: "Agent runtime (only hermes today; openclaw planned)" - model: + - additionalPrinterColumns: + - jsonPath: .spec.runtime + name: Runtime + type: string + - jsonPath: .status.pinnedModel 
+ name: Model + type: string + - jsonPath: .status.walletAddress + name: Wallet + type: string + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Agent is the declarative spec for an Obol Stack agent (Hermes today, + OpenClaw later). Decouples agent lifecycle from selling: `obol sell + agent ` references an existing Agent rather than provisioning + one inline. Internal manager agents with RBAC can also create Agent + resources to spawn sub-agents. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + model: + description: |- + LiteLLM model name to pin. Empty = controller picks cluster + top-of-rank on first deploy and writes status.pinnedModel. + maxLength: 256 + type: string + objective: + description: |- + Operator-supplied objective text. Substituted into the SOUL.md + template by the seeder on first write. Agent owns SOUL.md after that. + maxLength: 4096 + type: string + runtime: + default: hermes + description: Agent runtime (only hermes today; openclaw planned). 
+ enum: + - hermes + type: string + skills: + description: |- + Allow-listed skills written to the per-agent skills dir on first + reconcile. Agent can edit afterwards; this is a seed, not a sandbox. + items: + maxLength: 64 + pattern: ^[a-z0-9][a-z0-9-]*$ type: string - maxLength: 256 - description: "LiteLLM model name to pin. Empty = controller picks cluster top-of-rank on first deploy and writes status.pinnedModel." - skills: - type: array - maxItems: 64 - items: - type: string - pattern: "^[a-z0-9][a-z0-9-]*$" - maxLength: 64 - description: "Allow-listed skills written to the per-agent skills dir on first reconcile. Agent can edit afterwards; this is a seed, not a sandbox." - objective: - type: string - maxLength: 4096 - description: "Operator-supplied objective text. Substituted into the SOUL.md template by the seeder on first write. Agent owns SOUL.md after that." - wallet: - type: object + maxItems: 64 + type: array + wallet: + properties: + create: + default: false + description: |- + Provision a per-namespace remote-signer keystore. Address is + published in status.walletAddress. + type: boolean + type: object + type: object + status: + properties: + conditions: + items: properties: - create: - type: boolean - default: false - description: "Provision a per-namespace remote-signer keystore. Address is published in status.walletAddress." - status: - type: object - properties: - observedGeneration: - type: integer - format: int64 - phase: - type: string - description: "Pending | Provisioning | Ready | Failed" - pinnedModel: - type: string - description: "Actual model the agent is using (= spec.model when set, otherwise the auto-picked top-of-rank)." - walletAddress: - type: string - pattern: "^(0x[0-9a-fA-F]{40})?$" - description: "Agent's signing address when wallet.create=true. Empty otherwise." - endpoint: - type: string - description: "Cluster-internal URL for the agent runtime (e.g. http://hermes.agent-quant.svc.cluster.local:8642)." 
- conditions: - type: array - items: - type: object - properties: - type: - type: string - status: - type: string - reason: - type: string - message: - type: string - lastTransitionTime: - type: string - format: date-time + lastTransitionTime: + description: Last time the condition transitioned. + format: date-time + type: string + message: + description: Human-readable message with details. + type: string + reason: + description: Machine-readable reason for the condition. + type: string + status: + description: Status of the condition. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: Condition type. + type: string + required: + - status + - type + type: object + type: array + endpoint: + description: |- + Cluster-internal URL for the agent runtime (e.g. + http://hermes.agent-quant.svc.cluster.local:8642). + type: string + observedGeneration: + format: int64 + type: integer + phase: + description: Pending | Provisioning | Ready | Failed + type: string + pinnedModel: + description: |- + Actual model the agent is using (= spec.model when set, otherwise + the auto-picked top-of-rank). + type: string + walletAddress: + description: Agent's signing address when wallet.create=true. Empty + otherwise. + pattern: ^(0x[0-9a-fA-F]{40})?$ + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml b/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml index 29ad8c03..fa9fc2bc 100644 --- a/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml +++ b/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml @@ -1,13 +1,9 @@ --- -# AgentIdentity CRD -# Durable ERC-8004 agent identity document. Outlives ServiceOffers: when the -# last offer is deleted, the controller renders a tombstone (active:false, -# x402Support:false) instead of removing the registration document. 
The -# canonical operator identity lives at x402/default and status.registrations -# records the on-chain agentId for each registered chain. apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: agentidentities.obol.org spec: group: obol.org @@ -15,48 +11,73 @@ spec: kind: AgentIdentity listKind: AgentIdentityList plural: agentidentities - singular: agentidentity shortNames: - - aid + - aid + singular: agentidentity scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Chains - type: string - jsonPath: .status.registrations[*].chain - - name: AgentIDs - type: string - jsonPath: .status.registrations[*].agentId - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - status: - type: object - properties: - registrations: - type: array - description: "Per-chain ERC-8004 registrations for this identity document." - items: - type: object - required: - - chain - - agentId - properties: - chain: - type: string - maxLength: 64 - description: "ERC-8004 registration chain alias." - agentId: - type: string - description: "On-chain ERC-721 tokenId on the given chain." + - additionalPrinterColumns: + - jsonPath: .status.registrations[*].chain + name: Chains + type: string + - jsonPath: .status.registrations[*].agentId + name: AgentIDs + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AgentIdentity is the durable, on-chain identity an operator controls in + the ERC-8004 Identity Registry. 
A single AgentIdentity outlives + ServiceOffers: deleting the last ServiceOffer that references it does + not delete the NFT, the published registration document, or the + recorded agentId; instead the renderer publishes a tombstone + (active:false, x402Support:false) so external observers still see the + historical record. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + type: object + status: + properties: + registrations: + description: Per-chain ERC-8004 registrations for this identity document. + items: + properties: + agentId: + description: On-chain ERC-721 tokenId on the given chain. + type: string + chain: + description: ERC-8004 registration chain alias. + maxLength: 64 + type: string + required: + - agentId + - chain + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/erpc.yaml b/internal/embed/infrastructure/base/templates/erpc.yaml new file mode 100644 index 00000000..635665d3 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/erpc.yaml @@ -0,0 +1,100 @@ +# Relocated from helmfile.yaml `erpc-httproute`, `erpc-x402-middleware`, +# and `erpc-metadata` bedag/raw releases. 
These resources live alongside +# their workload (eRPC in the `erpc` namespace) instead of inlined in +# helmfile so the chart layout is the single source of truth for what +# ships in the erpc namespace. +# +# CRD prerequisites: +# - HTTPRoute -> gateway.networking.k8s.io/v1 (shipped by the Traefik +# v38+ chart's bundled CRDs) +# - Middleware -> traefik.io/v1alpha1 (shipped by the Traefik chart) +# `base` now declares `needs: [traefik/traefik]` in helmfile.yaml to +# guarantee CRDs are present before these templates apply. +# +# The eRPC Deployment + Service themselves still come from the upstream +# `ethereum/erpc` Helm chart (separate release in helmfile.yaml); only +# the routing + discovery metadata is owned here. + +--- +# eRPC namespace. Pre-created here so resources in this file (HTTPRoute, +# Middleware, ConfigMap) can apply during the `base` release without +# waiting for the `erpc` upstream chart release to create it. The `erpc` +# release still sets `createNamespace: true` — kubectl apply on an +# existing namespace is a no-op. +apiVersion: v1 +kind: Namespace +metadata: + name: erpc + +--- +# eRPC HTTPRoute — gates /rpc through the x402-payment Middleware and +# restricts the route to the obol.stack hostname so it cannot be reached +# via the public cloudflared tunnel (see CLAUDE.md "Security: Tunnel +# Exposure"). Removing the hostnames restriction is a critical security +# regression. +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: erpc + namespace: erpc +spec: + hostnames: + - "obol.stack" + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + rules: + - matches: + - path: + type: PathPrefix + value: /rpc + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: x402-payment + backendRefs: + - name: erpc + port: 80 + +--- +# x402 Middleware for the eRPC namespace (ForwardAuth -> central +# verifier). 
Always deployed; the verifier returns 200 for routes with +# no pricing rules. +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: x402-payment + namespace: erpc +spec: + forwardAuth: + address: http://x402-verifier.x402.svc.cluster.local:8080/verify + authResponseHeaders: + - X-Payment-Response + +--- +# eRPC metadata ConfigMap for frontend discovery. `.Values.network` +# resolves against the `network` value passed to the `base` release +# (default "mainnet", overridable via helmfile state values). +apiVersion: v1 +kind: ConfigMap +metadata: + name: erpc-metadata + namespace: erpc + labels: + app.kubernetes.io/part-of: obol.stack + obol.stack/id: default + obol.stack/app: erpc +data: + metadata.json: | + { + "network": "{{ .Values.network }}", + "endpoints": { + "rpc": { + "external": "http://obol.stack/rpc/{{ .Values.network }}", + "internal": "http://erpc.erpc.svc.cluster.local/rpc/{{ .Values.network }}" + } + } + } diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..ec02c0d8 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -21,6 +21,15 @@ apiVersion: v1 kind: Namespace metadata: name: llm + labels: + # Pod Security Standards: Restricted profile enforced at admission. + # The litellm pod (litellm + x402-buyer sidecar) runs as non-root with + # all caps dropped, seccomp=RuntimeDefault, and readOnlyRootFilesystem; + # write paths are routed to named emptyDir mounts. + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted --- # ClusterIP Service + Endpoints: routes ollama.llm.svc.cluster.local → host Ollama. 
@@ -113,6 +122,30 @@ type: Opaque stringData: LITELLM_MASTER_KEY: "sk-obol-{{CLUSTER_ID}}" +--- +# x402-buyer maintains consumed-nonce state in /state/consumed.json. +# Previously this was emptyDir, which lost state on every pod restart +# — the buyer would then attempt to re-spend already-consumed auths +# from the ConfigMap-loaded pool, cascading into 400s from the +# facilitator's nonce protection until a manual buy.py process --all. +# PVC backed by local-path (single-node k3d default storage class) +# gives crash-safety without conversion to StatefulSet. +# +# Deployment strategy: Recreate — RWO PVC can't be mounted by two +# pods, so RollingUpdate's surge would block. Recreate accepts a +# brief gap during rollout (litellm is replicas:1 anyway). +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: x402-buyer-state + namespace: llm +spec: + accessModes: [ReadWriteOnce] + storageClassName: local-path + resources: + requests: + storage: 50Mi + --- apiVersion: apps/v1 kind: Deployment @@ -126,11 +159,12 @@ spec: # is local to the sidecar pod. Scale this back out only after consumed auth # state is shared or auth pools are sharded per replica. replicas: 1 + # Recreate (not RollingUpdate) because the x402-buyer-state PVC is RWO and + # cannot be co-mounted by an overlapping new pod during surge. Litellm is + # replicas: 1 so this just trades the (currently maxSurge:1) overlap for a + # short gap during rollout — acceptable, and unavoidable with RWO storage. strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 + type: Recreate selector: matchLabels: app: litellm @@ -142,14 +176,32 @@ spec: secret.reloader.stakater.com/reload: "litellm-secrets" spec: terminationGracePeriodSeconds: 60 + # PSS Restricted: pod-level identity. 
UID/GID 65532 is the nonroot + # distroless convention; the Obol LiteLLM fork's working dirs are + # routed onto emptyDir mounts below so readOnlyRootFilesystem can + # stay on without breaking Python's tempfile / cache writes. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: litellm # Obol fork of LiteLLM with config-only model management API. # No Postgres required — /model/new and /model/delete work via # in-memory router + config.yaml persistence. # Source: https://github.com/ObolNetwork/litellm - image: ghcr.io/obolnetwork/litellm:sha-c16b156 + image: ghcr.io/obolnetwork/litellm:sha-c16b156@sha256:9f112b51ac5a57d73cdd54103fb98d24eabaddd8689a9a285884dca6456dc86e imagePullPolicy: IfNotPresent + # PSS Restricted: drop all caps, no privilege escalation, RO rootfs. + # Python writes are funneled to the emptyDir mounts below. + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] args: - --config - /etc/litellm/config.yaml @@ -167,10 +219,22 @@ spec: value: "false" - name: DISABLE_SCHEMA_UPDATE value: "true" + # Redirect Python / HF / pip cache lookups onto the writeable + # emptyDir at /home/litellm so readOnlyRootFilesystem=true holds. + - name: HOME + value: /home/litellm + - name: XDG_CACHE_HOME + value: /home/litellm/.cache + - name: HF_HOME + value: /home/litellm/.cache/huggingface volumeMounts: - name: litellm-config mountPath: /etc/litellm/config.yaml subPath: config.yaml + - name: litellm-tmp + mountPath: /tmp + - name: litellm-home + mountPath: /home/litellm startupProbe: httpGet: path: /health/readiness @@ -214,6 +278,14 @@ spec: # across flow-08/11/14/13. See internal/embed/embed_image_pin_test.go. 
image: ghcr.io/obolnetwork/x402-buyer:b13254e@sha256:446d730fefbe1860e8b3245289aa8979d765ae977b7f0eaa053543e2468313cb imagePullPolicy: IfNotPresent + # PSS Restricted: Go distroless:nonroot image already runs as + # UID 65532; only the state dir under /state needs to be writeable + # and it is backed by the x402-buyer-state PVC mount. + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] args: - --config-dir=/config/buyer-config - --auths-dir=/config/buyer-auths @@ -258,6 +330,16 @@ spec: items: - key: config.yaml path: config.yaml + # Writable /tmp for Python tempfile / multipart uploads. Sized + # modestly — LiteLLM streams responses rather than buffering them. + - name: litellm-tmp + emptyDir: + sizeLimit: 128Mi + # Writable HOME for LiteLLM's pip/HF/XDG cache lookups so the + # container can run with readOnlyRootFilesystem=true. + - name: litellm-home + emptyDir: + sizeLimit: 256Mi - name: buyer-config configMap: name: x402-buyer-config @@ -267,7 +349,8 @@ spec: name: x402-buyer-auths optional: true - name: x402-buyer-state - emptyDir: {} + persistentVolumeClaim: + claimName: x402-buyer-state --- apiVersion: policy/v1 @@ -298,3 +381,32 @@ spec: port: 4000 targetPort: http protocol: TCP + +--- +# Relocated from helmfile.yaml `llm-buyer-podmonitor` bedag/raw release. +# Lives alongside its workload (litellm + x402-buyer sidecar) instead of +# inlined in helmfile so the chart layout is the single source of truth +# for what ships in the llm namespace. The PodMonitor CRD comes from the +# monitoring release (kube-prometheus-stack), so `base` now declares a +# `needs: [monitoring/monitoring]` in helmfile.yaml to guarantee CRD +# presence before this template applies. +# +# PodMonitor (not ServiceMonitor) because the sidecar listens on a per-pod +# port (8402) that is NOT exposed via the litellm Service. +# Picked up by kube-prometheus-stack via the `release: monitoring` label. 
+apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: litellm-x402-buyer + namespace: llm + labels: + release: monitoring + app: litellm +spec: + selector: + matchLabels: + app: litellm + podMetricsEndpoints: + - port: buyer-http + path: /metrics + interval: 30s diff --git a/internal/embed/infrastructure/base/templates/obol-frontend.yaml b/internal/embed/infrastructure/base/templates/obol-frontend.yaml new file mode 100644 index 00000000..77a4c806 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/obol-frontend.yaml @@ -0,0 +1,109 @@ +# Relocated from helmfile.yaml `obol-frontend-httproute` and +# `obol-frontend-rbac` bedag/raw releases. These resources live +# alongside their workload (the obol-frontend Helm release in the +# `obol-frontend` namespace) instead of inlined in helmfile so the +# chart layout is the single source of truth for what ships in the +# obol-frontend namespace. +# +# The obol-frontend Deployment + Service themselves still come from +# the `obol/obol-app` upstream chart (separate release in +# helmfile.yaml); only the HTTPRoute and discovery RBAC are owned +# here. +# +# CRD prerequisite: HTTPRoute -> gateway.networking.k8s.io/v1 +# (shipped by the Traefik v38+ chart's bundled CRDs). `base` now +# declares `needs: [traefik/traefik]` in helmfile.yaml to guarantee +# the CRDs are present before this template applies. + +--- +# obol-frontend namespace. Pre-created here so the HTTPRoute and +# ClusterRoleBinding subject reference can resolve during the `base` +# release without waiting for the `obol-frontend` upstream chart +# release to create it. The chart release still sets +# `createNamespace: true` — kubectl apply on an existing namespace is +# a no-op. +apiVersion: v1 +kind: Namespace +metadata: + name: obol-frontend + +--- +# obol-frontend HTTPRoute. 
The `hostnames: ["obol.stack"]` restriction +# keeps the frontend UI off the public cloudflared tunnel — removing +# it is a critical security regression (see CLAUDE.md "Security: +# Tunnel Exposure"). +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: obol-frontend + namespace: obol-frontend +spec: + hostnames: + - "obol.stack" + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: obol-frontend-obol-app + port: 3000 + +--- +# obol-frontend RBAC for the pod ServiceAccount. +# +# Keep this as the single frontend RBAC template. A prior bundle carried a +# second obol-frontend-rbac.yaml template with the same ClusterRole and +# ClusterRoleBinding names, which made the rendered chart order-dependent. +# +# The frontend is local-only behind the obol.stack hostname restriction +# (the operator owns the cluster), so this is a single trust boundary. +# Defense-in-depth note: the `secrets` rule is intentionally omitted — no +# frontend code path reads them and the SA token should not have that reach. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods", "configmaps"] + verbs: ["get", "list"] + # ServiceOffer CRD — frontend sell modal creates offers + - apiGroups: ["obol.org"] + resources: ["serviceoffers", "serviceoffers/status"] + verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — My Purchases lists agent buys. Read-only: the + # agent and controller own writes. + - apiGroups: ["obol.org"] + resources: ["purchaserequests", "purchaserequests/status"] + verbs: ["get", "list", "watch"] + # RegistrationRequest CRD — listing rows surface ERC-8004 registration + # state. 
Read-only: the controller owns writes. + - apiGroups: ["obol.org"] + resources: ["registrationrequests", "registrationrequests/status"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: obol-frontend-openclaw-discovery +subjects: + - kind: ServiceAccount + name: obol-frontend + namespace: obol-frontend diff --git a/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml b/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml index 49bb7359..af32f441 100644 --- a/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml +++ b/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml @@ -1,6 +1,9 @@ +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: purchaserequests.obol.org spec: group: obol.org @@ -8,162 +11,221 @@ spec: kind: PurchaseRequest listKind: PurchaseRequestList plural: purchaserequests - singular: purchaserequest shortNames: - - pr + - pr + singular: purchaserequest scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Endpoint - type: string - jsonPath: .spec.endpoint - - name: Model - type: string - jsonPath: .spec.model - - name: Price - type: string - jsonPath: .spec.payment.price - - name: Remaining - type: integer - jsonPath: .status.remaining - - name: Spent - type: integer - jsonPath: .status.spent - - name: Ready - type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - required: [endpoint, model, count, 
payment] - properties: - endpoint: - type: string - description: "Full URL to the x402-gated inference endpoint" - model: - type: string - description: "Remote model ID (used as paid/ in LiteLLM)" - count: - type: integer - minimum: 1 - maximum: 2500 - description: "Number of pre-signed auths to create" - preSignedAuths: - type: array - description: "Pre-signed x402 payments (legacy ERC-3009 auths still supported)" - items: - type: object - properties: - id: { type: string } - payment: - type: object - x-kubernetes-preserve-unknown-fields: true - signature: { type: string } - from: { type: string } - to: { type: string } - value: { type: string } - validAfter: { type: string } - validBefore: { type: string } - nonce: { type: string } - autoRefill: - type: object + - additionalPrinterColumns: + - jsonPath: .spec.endpoint + name: Endpoint + type: string + - jsonPath: .spec.model + name: Model + type: string + - jsonPath: .spec.payment.price + name: Price + type: string + - jsonPath: .status.remaining + name: Remaining + type: integer + - jsonPath: .status.spent + name: Spent + type: integer + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + PurchaseRequest is the buyer-side request for pre-signed x402 auths + against a remote inference endpoint. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. 
+ In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + autoRefill: + description: |- + PurchaseAutoRefill drives the agent-managed auto-refill policy for a + PurchaseRequest. The reconciler reads MaxTotal + MaxSpendPerDay as + budget caps before signing more auths; without these fields populated + the agent will not auto-refill beyond the initial Count. + properties: + count: + description: Number of auths to sign on refill. + minimum: 1 + type: integer + enabled: + default: false + type: boolean + maxSpendPerDay: + description: Max micro-USDC spend per day. + type: string + maxTotal: + description: Cap total auths ever signed. + type: integer + threshold: + description: Refill when remaining < threshold. + minimum: 0 + type: integer + type: object + count: + description: Number of pre-signed auths to create. + maximum: 2500 + minimum: 1 + type: integer + endpoint: + description: Full URL to the x402-gated inference endpoint. + type: string + model: + description: Remote model ID (used as paid/ in LiteLLM). + type: string + payment: + properties: + asset: + description: ERC-20 contract address. + type: string + assetDecimals: + description: Token decimals in atomic units. + format: int64 + type: integer + assetSymbol: + description: Human-friendly token symbol (e.g. USDC, OBOL). + type: string + assetTransferMethod: + description: x402 transfer method used for this asset. + type: string + eip712Name: + description: EIP-712 domain name used for signing. + type: string + eip712Version: + description: EIP-712 domain version used for signing. + type: string + network: + type: string + payTo: + type: string + price: + description: Atomic token units per request. 
+ type: string + required: + - asset + - network + - payTo + - price + type: object + preSignedAuths: + description: Pre-signed x402 payments (legacy ERC-3009 auths still + supported). + items: + description: |- + PreSignedAuth carries a pre-signed x402 payment authorization. The + Payment map is opaque (forwarded verbatim to the buyer sidecar / x402 + facilitator) which can't be deep-copied by controller-gen; DeepCopy + methods for this type are hand-written in deepcopy_manual.go. properties: - enabled: - type: boolean - default: false - threshold: - type: integer - minimum: 0 - description: "Refill when remaining < threshold" - count: - type: integer - minimum: 1 - description: "Number of auths to sign on refill" - maxTotal: - type: integer - description: "Cap total auths ever signed" - maxSpendPerDay: + from: type: string - description: "Max micro-USDC spend per day" - payment: - type: object - required: [network, payTo, price, asset] - properties: - network: + id: + type: string + nonce: type: string - payTo: + payment: + x-kubernetes-preserve-unknown-fields: true + signature: type: string - price: + to: type: string - description: "Atomic token units per request" - asset: + validAfter: type: string - description: "ERC-20 contract address" - assetSymbol: + validBefore: type: string - description: "Human-friendly token symbol (e.g. USDC, OBOL)" - assetDecimals: - type: integer - description: "Token decimals in atomic units" - assetTransferMethod: + value: type: string - description: "x402 transfer method used for this asset" - eip712Name: + type: object + type: array + required: + - count + - endpoint + - model + - payment + type: object + status: + properties: + conditions: + items: + properties: + lastTransitionTime: + description: Last time the condition transitioned. + format: date-time + type: string + message: + description: Human-readable message with details. 
type: string - description: "EIP-712 domain name used for signing" - eip712Version: + reason: + description: Machine-readable reason for the condition. type: string - description: "EIP-712 domain version used for signing" - status: - type: object - properties: - observedGeneration: - type: integer - format: int64 - conditions: - type: array - items: - type: object - properties: - type: - type: string - status: - type: string - reason: - type: string - message: - type: string - lastTransitionTime: - type: string - format: date-time - publicModel: - type: string - description: "LiteLLM model name (paid/)" - remaining: - type: integer - spent: - type: integer - totalSigned: - type: integer - totalSpent: - type: string - probedAt: - type: string - format: date-time - probedPrice: - type: string - walletBalance: - type: string - signerAddress: - type: string + status: + description: Status of the condition. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: Condition type. + type: string + required: + - status + - type + type: object + type: array + observedGeneration: + format: int64 + type: integer + probedAt: + format: date-time + type: string + probedPrice: + type: string + publicModel: + description: LiteLLM model name (paid/). 
+ type: string + remaining: + type: integer + signerAddress: + type: string + spent: + type: integer + totalSigned: + type: integer + totalSpent: + type: string + walletBalance: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml b/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml index b6266db2..8ac1a00c 100644 --- a/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml +++ b/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml @@ -1,10 +1,9 @@ --- -# RegistrationRequest CRD -# Isolates ERC-8004 publication and on-chain side effects from the main -# ServiceOffer reconciliation loop. ServiceOffer remains the source of truth. apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: registrationrequests.obol.org spec: group: obol.org @@ -12,74 +11,95 @@ spec: kind: RegistrationRequest listKind: RegistrationRequestList plural: registrationrequests - singular: registrationrequest shortNames: - - rr + - rr + singular: registrationrequest scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Offer - type: string - jsonPath: .spec.serviceOfferName - - name: State - type: string - jsonPath: .spec.desiredState - - name: Phase - type: string - jsonPath: .status.phase - - name: AgentID - type: string - jsonPath: .status.agentId - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - required: - - serviceOfferName - - serviceOfferNamespace - - desiredState - properties: - serviceOfferName: - type: string - serviceOfferNamespace: - type: string - desiredState: - type: string - enum: - - Active - - 
Tombstoned - chain: - type: string - description: "ERC-8004 registration chain alias for this request." - status: - type: object - properties: - phase: - type: string - message: - type: string - publishedUrl: - type: string - agentId: - type: string - registrationTxHash: - type: string - registrationOwner: - type: string - registrationUri: - type: string - registrationSearchFromBlock: - type: integer - format: int64 - metadataSynced: - type: boolean + - additionalPrinterColumns: + - jsonPath: .spec.serviceOfferName + name: Offer + type: string + - jsonPath: .spec.desiredState + name: State + type: string + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .status.agentId + name: AgentID + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + RegistrationRequest isolates ERC-8004 publication and on-chain side + effects from the main ServiceOffer reconciliation loop. ServiceOffer + remains the source of truth. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + chain: + description: ERC-8004 registration chain alias for this request. 
+ type: string + desiredState: + enum: + - Active + - Tombstoned + type: string + serviceOfferName: + type: string + serviceOfferNamespace: + type: string + required: + - desiredState + - serviceOfferName + - serviceOfferNamespace + type: object + status: + properties: + agentId: + type: string + message: + type: string + metadataSynced: + type: boolean + phase: + type: string + publishedUrl: + type: string + registrationOwner: + type: string + registrationSearchFromBlock: + format: int64 + type: integer + registrationTxHash: + type: string + registrationUri: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml b/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml index 5b37ba23..7b67e13a 100644 --- a/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml +++ b/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml @@ -1,15 +1,9 @@ --- -# ServiceOffer CRD -# Defines a compute service the agent can expose, gate with x402, and register on-chain. 
-# Condition lifecycle: ModelReady -> UpstreamHealthy -> PaymentGateReady -> RoutePublished -> Registered -> Ready -# -# Field naming conventions: -# - payment.* fields align with x402 PaymentRequirements (V2): payTo, network, scheme, maxTimeoutSeconds -# - registration.* fields align with ERC-8004 AgentRegistration: name, description, services, supportedTrust -# - Human-friendly values (e.g., "base-sepolia") are used; the reconciler translates to wire format apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: serviceoffers.obol.org spec: group: obol.org @@ -17,354 +11,373 @@ spec: kind: ServiceOffer listKind: ServiceOfferList plural: serviceoffers - singular: serviceoffer shortNames: - - so + - so + singular: serviceoffer scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Type - type: string - jsonPath: .spec.type - - name: Model - type: string - jsonPath: .spec.model.name - - name: Price - type: string - jsonPath: .spec.payment.price.perRequest - - name: Network - type: string - jsonPath: .spec.payment.network - - name: Ready - type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - description: >- - ServiceOffer declares a compute service that can be exposed publicly, - gated with x402 payments, and optionally registered on an ERC-8004 - service registry. Field names align with x402 and ERC-8004 standards. - properties: - spec: - type: object - required: - - payment - # upstream is required for type=http|inference|fine-tuning but - # synthesized by the controller from Agent.status.endpoint when - # type=agent. Validation of "upstream OR agent.ref" lives in - # the controller's runtime check. 
- properties: - type: - type: string - description: >- - Service type. 'inference' enables model management; 'http' for any HTTP service; - 'agent' references an Agent CR via spec.agent.ref and the controller derives - upstream + model + skills from the agent's status. - default: "http" - enum: - - inference - - fine-tuning - - http - - agent - agent: - type: object - description: >- - Required when type='agent'. The controller resolves spec.agent.ref to the - referenced Agent CR, derives upstream from Agent.status.endpoint, and surfaces - the agent's pinned model + skills in the 402 response's extra block. - properties: - ref: - type: object - required: - - name - - namespace - properties: - name: - type: string - namespace: - type: string - model: - type: object - description: "LLM model metadata. Required when the upstream serves an LLM." - required: + - additionalPrinterColumns: + - jsonPath: .spec.type + name: Type + type: string + - jsonPath: .spec.model.name + name: Model + type: string + - jsonPath: .spec.payment.price.perRequest + name: Price + type: string + - jsonPath: .spec.payment.network + name: Network + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ServiceOffer declares a compute service that can be exposed publicly, + gated with x402 payments, and optionally registered on an ERC-8004 + service registry. Field names align with x402 and ERC-8004 standards. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + agent: + description: |- + Required when type='agent'. The controller resolves spec.agent.ref to + the referenced Agent CR, derives upstream from Agent.status.endpoint, + and surfaces the agent's pinned model + skills in the 402 response. + properties: + ref: + properties: + name: + type: string + namespace: + type: string + required: - name - - runtime - properties: - name: - type: string - description: "Model identifier (e.g. qwen3.5:35b)." - runtime: - type: string - description: "Runtime serving the model." - enum: - - ollama - - vllm - - tgi - upstream: - type: object - description: "In-cluster service that handles the actual workload." - required: - - service - namespace - - port - properties: - service: - type: string - description: "Kubernetes Service name." - namespace: - type: string - description: "Namespace of the upstream Service." - port: - type: integer - description: "Port on the upstream Service." - default: 11434 - minimum: 1 - maximum: 65535 - healthPath: - type: string - description: "HTTP path used for health probes against the upstream." - default: "/health" - payment: - type: object - description: >- - x402 payment terms. Field names align with x402 PaymentRequirements (V2): - payTo, network, scheme, maxTimeoutSeconds. - required: - - network - - payTo - - price - properties: - scheme: - type: string - description: "x402 payment scheme." 
- default: "exact" - enum: - - exact - network: + type: object + type: object + drainAt: + description: |- + DrainAt marks the offer as draining when non-nil. While the offer + is in the drain window, discovery surfaces (/skill.md and + /.well-known/agent-registration.json) advertise the offer with + available=false and drainEndsAt set, so buyers can migrate before + the route is torn down. The route + payment gate stay up until + DrainEndsAt() so in-flight payments can complete. Replaces the + legacy obol.org/paused annotation. + format: date-time + type: string + drainGracePeriod: + description: |- + DrainGracePeriod is how long after DrainAt the HTTPRoute remains + up. Defaults to DefaultDrainGracePeriod when nil. A zero duration + is honored as "tear down immediately on the next reconcile" (the + equivalent of `obol sell stop --force`). + type: string + model: + description: LLM model metadata. Required when the upstream serves + an LLM. + properties: + name: + description: Model identifier (e.g. qwen3.5:35b). + type: string + runtime: + description: Runtime serving the model. + enum: + - ollama + - vllm + - tgi + type: string + required: + - name + - runtime + type: object + path: + description: URL path prefix for the HTTPRoute, defaults to /services/. + pattern: ^/[a-zA-Z0-9/_.-]*$ + type: string + payment: + properties: + asset: + description: |- + Optional token metadata override for x402 settlement. When omitted, + the verifier uses the chain default asset. + properties: + address: + description: ERC-20 contract address. + pattern: ^0x[0-9a-fA-F]{40}$ + type: string + decimals: + description: Token decimals in atomic units. + format: int64 + maximum: 255 + minimum: 0 + type: integer + eip712Name: + description: EIP-712 domain name used by the token. + type: string + eip712Version: + description: EIP-712 domain version used by the token. + type: string + symbol: + description: Human-friendly token symbol (e.g. USDC, OBOL). 
+ type: string + transferMethod: + description: x402 transfer method for the asset. + enum: + - eip3009 + - permit2 + type: string + type: object + maxTimeoutSeconds: + default: 300 + description: 'Payment validity window in seconds (x402: maxTimeoutSeconds).' + format: int64 + type: integer + network: + description: |- + Chain identifier for payments (human-friendly). Reconciler resolves + to CAIP-2 format (e.g., "base-sepolia" → "eip155:84532"). + type: string + payTo: + description: 'USDC recipient wallet address (x402: payTo).' + pattern: ^0x[0-9a-fA-F]{40}$ + type: string + price: + description: |- + Pricing table with per-unit prices in USDC (human-readable decimals). + Which fields are applicable depends on the workload type. + properties: + perEpoch: + description: Per-training-epoch price in USDC. Fine-tuning + only. + type: string + perHour: + description: Per-compute-hour price in USDC. Fine-tuning only. + type: string + perMTok: + description: Per-million-tokens price in USDC. Inference only. + type: string + perRequest: + description: Flat per-request price in USDC. Applicable to + all types. + type: string + type: object + scheme: + default: exact + description: x402 payment scheme. + enum: + - exact + type: string + required: + - network + - payTo + - price + type: object + provenance: + additionalProperties: + type: string + description: |- + Optional provenance metadata for the service. Tracks how the model or + service was produced (e.g. autoresearch experiment data). Included in + the ERC-8004 registration document when present. + type: object + registration: + description: |- + ERC-8004 registration metadata. Field names align with the + AgentRegistration document schema (ERC-8004 spec). + properties: + description: + description: 'Agent description (ERC-8004: AgentRegistration.description).' + type: string + domains: + description: |- + OASF domains for discovery (e.g. technology/artificial_intelligence). 
+ Mapped to an OASF service entry in the registration JSON. + items: type: string - description: >- - Chain identifier for payments (human-friendly). - Reconciler resolves to CAIP-2 format (e.g., "base-sepolia" → "eip155:84532"). - payTo: + type: array + enabled: + default: false + description: If true, register on ERC-8004 after routing is live. + type: boolean + image: + description: 'Agent icon URL (ERC-8004: AgentRegistration.image).' + type: string + metadata: + additionalProperties: type: string - description: "USDC recipient wallet address (x402: payTo)." - pattern: "^0x[0-9a-fA-F]{40}$" - maxTimeoutSeconds: - type: integer - description: "Payment validity window in seconds (x402: maxTimeoutSeconds)." - default: 300 - asset: - type: object - description: >- - Optional token metadata override for x402 settlement. - When omitted, the verifier uses the chain default asset. + description: |- + Additional registration metadata published into the generated + agent-registration.json for discovery and ranking. + type: object + name: + description: 'Agent name (ERC-8004: AgentRegistration.name).' + type: string + services: + description: 'Service endpoints (ERC-8004: AgentRegistration.services[]).' + items: properties: - address: - type: string - description: "ERC-20 contract address." - pattern: "^0x[0-9a-fA-F]{40}$" - symbol: + endpoint: + description: Service URL. Auto-filled from tunnel URL if + empty. type: string - description: "Human-friendly token symbol (e.g. USDC, OBOL)." - decimals: - type: integer - description: "Token decimals in atomic units." - minimum: 0 - maximum: 255 - transferMethod: - type: string - description: "x402 transfer method for the asset." - enum: - - eip3009 - - permit2 - eip712Name: + name: + description: 'Service type: web, A2A, MCP, OASF, ENS, DID, + email.' type: string - description: "EIP-712 domain name used by the token." - eip712Version: + version: + description: Protocol version (SHOULD per ERC-8004 spec). 
type: string - description: "EIP-712 domain version used by the token." - price: + required: + - endpoint + - name type: object - description: >- - Pricing table with per-unit prices in USDC (human-readable decimals). - Which fields are applicable depends on the workload type. - properties: - perRequest: - type: string - description: "Flat per-request price in USDC. Applicable to all types." - perMTok: - type: string - description: "Per-million-tokens price in USDC. Inference only." - perHour: - type: string - description: "Per-compute-hour price in USDC. Fine-tuning only." - perEpoch: - type: string - description: "Per-training-epoch price in USDC. Fine-tuning only." - provenance: - type: object - description: >- - Optional provenance metadata for the service. Tracks how the - model or service was produced (e.g. autoresearch experiment data). - Included in the ERC-8004 registration document when present. - properties: - framework: - type: string - description: "Optimization framework (e.g. autoresearch)." - metricName: - type: string - description: "Name of the primary quality metric (e.g. val_bpb)." - metricValue: + type: array + skills: + description: |- + OASF skills for discovery (e.g. + natural_language_processing/text_generation). Mapped to an OASF + service entry in the registration JSON. + items: type: string - description: "Primary quality metric value (e.g. 0.9973)." - experimentId: + type: array + supportedTrust: + description: |- + Trust verification methods (ERC-8004: AgentRegistration.supportedTrust[]). + Valid values: reputation, crypto-economic, tee-attestation. + items: type: string - description: "Experiment or commit identifier." - trainHash: + type: array + type: object + type: + default: http + description: |- + Service type. 'inference' enables model management; 'http' for any HTTP + service; 'agent' references an Agent CR via spec.agent.ref and the + controller derives upstream + model + skills from the agent's status. 
+ enum: + - inference + - fine-tuning + - http + - agent + type: string + upstream: + description: In-cluster service that handles the actual workload. + properties: + healthPath: + default: /health + description: HTTP path used for health probes against the upstream. + type: string + namespace: + description: Namespace of the upstream Service. + type: string + port: + default: 11434 + description: Port on the upstream Service. + format: int64 + maximum: 65535 + minimum: 1 + type: integer + service: + description: Kubernetes Service name. + type: string + required: + - namespace + - port + - service + type: object + required: + - payment + type: object + status: + properties: + agentId: + description: ERC-8004 agent NFT token ID after on-chain registration. + type: string + agentResolution: + description: |- + Controller's resolved view of an agent-type offer's referenced Agent. + Populated only when type=agent and the Agent is Ready. + properties: + endpoint: + type: string + model: + type: string + runtime: + type: string + skills: + items: type: string - description: "SHA-256 hash of the training code that produced this model." - paramCount: - type: string - description: "Model parameter count (e.g. 50M, 1.3B)." - path: - type: string - description: "URL path prefix for the HTTPRoute, defaults to /services/." - pattern: "^/[a-zA-Z0-9/_.-]*$" - registration: - type: object - description: >- - ERC-8004 registration metadata. Field names align with the - AgentRegistration document schema (ERC-8004 spec). + type: array + type: object + conditions: + description: |- + Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, + RoutePublished, Registered, Ready. + items: properties: - enabled: - type: boolean - description: "If true, register on ERC-8004 after routing is live." - default: false - name: + lastTransitionTime: + description: Last time the condition transitioned. 
+ format: date-time type: string - description: "Agent name (ERC-8004: AgentRegistration.name)." - description: + message: + description: Human-readable message with details. type: string - description: "Agent description (ERC-8004: AgentRegistration.description)." - image: + reason: + description: Machine-readable reason for the condition. type: string - description: "Agent icon URL (ERC-8004: AgentRegistration.image)." - services: - type: array - description: "Service endpoints (ERC-8004: AgentRegistration.services[])." - items: - type: object - required: - - name - - endpoint - properties: - name: - type: string - description: "Service type: web, A2A, MCP, OASF, ENS, DID, email." - endpoint: - type: string - description: "Service URL. Auto-filled from tunnel URL if empty." - version: - type: string - description: "Protocol version (SHOULD per ERC-8004 spec)." - skills: - type: array - description: >- - OASF skills for discovery (e.g. natural_language_processing/text_generation). - Mapped to an OASF service entry in the registration JSON. - items: - type: string - domains: - type: array - description: >- - OASF domains for discovery (e.g. technology/artificial_intelligence). - Mapped to an OASF service entry in the registration JSON. - items: - type: string - supportedTrust: - type: array - description: >- - Trust verification methods (ERC-8004: AgentRegistration.supportedTrust[]). - Valid values: reputation, crypto-economic, tee-attestation. - items: - type: string - metadata: - type: object - description: >- - Additional registration metadata published into the generated - agent-registration.json for discovery and ranking (for example: - gpu, framework, best_val_bpb, total_experiments). - additionalProperties: - type: string - status: - type: object - properties: - conditions: - type: array - description: >- - Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, - RoutePublished, Registered, Ready. 
- items: - type: object - required: - - type - - status - properties: - type: - type: string - description: "Condition type." - status: - type: string - description: "Status of the condition." - enum: - - "True" - - "False" - - "Unknown" - reason: - type: string - description: "Machine-readable reason for the condition." - message: - type: string - description: "Human-readable message with details." - lastTransitionTime: - type: string - format: date-time - description: "Last time the condition transitioned." - endpoint: - type: string - description: "The public endpoint URL once the route is published." - agentId: - type: string - description: "ERC-8004 agent NFT token ID after on-chain registration." - registrationTxHash: - type: string - description: "Transaction hash of the ERC-8004 registration." - observedGeneration: - type: integer - format: int64 - description: "The generation observed by the controller." - agentResolution: - type: object - description: >- - Controller's resolved view of an agent-type offer's referenced - Agent. Populated only when type=agent and the Agent is Ready. - properties: - model: - type: string - skills: - type: array - items: - type: string - runtime: + status: + description: Status of the condition. + enum: + - "True" + - "False" + - Unknown type: string - endpoint: + type: + description: Condition type. type: string + required: + - status + - type + type: object + type: array + endpoint: + description: The public endpoint URL once the route is published. + type: string + observedGeneration: + description: The generation observed by the controller. + format: int64 + type: integer + registrationTxHash: + description: Transaction hash of the ERC-8004 registration. 
+ type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml new file mode 100644 index 00000000..5115a7d4 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -0,0 +1,203 @@ +--- +# Recording + alerting rules for x402 verifier traffic. +# +# Recording rules pre-aggregate the queries that the frontend's +# /api/sell/list joins use (chargedSalesByOfferAndChain, +# chargedRequests24hByOffer). The frontend reads the recorded series +# directly, which: +# * removes the `increase()` 2-sample minimum quirk (cold offers no +# longer show "0" for the first 30s after they receive traffic), +# * decouples the page from raw metric names (renaming +# obol_x402_verifier_charged_requests_total no longer breaks the UI), +# * cuts query cost on dashboards / page reloads (sum is done once at +# evaluation time, not per page-load). +# +# Alerting rules surface the two operator-meaningful failure modes the +# release-smoke flows historically caught manually. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl). +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + app: x402-verifier +spec: + groups: + - name: x402.recording + interval: 30s + rules: + # 24h charged-request count per (offer, chain). Replaces the + # frontend's `increase(charged_requests_total[24h])` query — same + # math, pre-computed every 30s. + # + # Kept unchanged for backwards compatibility. The + # _by_offer_chain_asset_symbol sibling below is the migration + # target for the frontend's per-token EarningsStrip columns. 
+ - record: x402:revenue:24h_by_offer_chain + expr: | + sum by (offer_namespace, offer_name, chain) ( + increase(obol_x402_verifier_charged_requests_total[24h]) + ) + + # 24h charged-request count per (offer, chain, asset_symbol). + # Same math as :24h_by_offer_chain but keeps the asset dimension + # so the frontend can answer "what's my OBOL revenue?" with a + # single PromQL query instead of joining metrics with the + # ServiceOffer CR. Adding asset_symbol is non-multiplicative + # because each offer pins exactly one asset (A=1 per offer). + - record: x402:revenue:24h_by_offer_chain_asset_symbol + expr: | + sum by (offer_namespace, offer_name, chain, asset_symbol) ( + increase(obol_x402_verifier_charged_requests_total[24h]) + ) + + # 7d charged-request count per (offer, chain). Powers the + # EarningsStrip per-chain × CRD price multiplication. + - record: x402:revenue:7d_by_offer_chain + expr: | + sum by (offer_namespace, offer_name, chain) ( + increase(obol_x402_verifier_charged_requests_total[7d]) + ) + + # 7d charged-request count per (offer, chain, asset_symbol). + # Sibling of :7d_by_offer_chain — once the frontend migrates to + # the per-asset rule, the EarningsStrip can drop its + # CR-join-at-query-time for per-token columns. Cardinality is + # non-multiplicative because each offer pins exactly one asset. + - record: x402:revenue:7d_by_offer_chain_asset_symbol + expr: | + sum by (offer_namespace, offer_name, chain, asset_symbol) ( + increase(obol_x402_verifier_charged_requests_total[7d]) + ) + + # 7d charged-request count per offer (chain-agnostic). Used in the + # My Listings "7d · X earned" header text and the Browse catalog + # usage badge. + # + # Why `increase()` and not `sum(counter)`: + # Prometheus counters are per-process by design — they reset to + # zero on every pod restart (rollout, OOM, eviction, node + # reschedule). A naive `sum by (...) 
(counter)` query therefore + # drops to zero whenever the verifier restarts, producing a + # misleading "0 requests" reading on offers with real on-chain + # traffic. `increase()` performs reset detection at query time + # across the samples the TSDB holds, accounting for the wraps. + # + # Why `[7d]` and not `[8d]` (matching retention): + # The TSDB is the canonical persistence layer. `increase()` + # needs samples on both sides of the window edge to do reset + # detection at the left edge; a 7d window inside 8d retention + # gives a 1-day headroom so the rule keeps working at exactly + # the moment data ages out, instead of silently producing + # NaN/undercounts at the boundary. + # + # Canonical reference: Robust Perception, "avoiding the counter- + # reset undercount". + - record: x402:revenue:7d_by_offer + expr: | + sum by (offer_namespace, offer_name) ( + increase(obol_x402_verifier_charged_requests_total[7d]) + ) + + # Settlement rate (verified / attempted) over the last hour, per + # (offer, chain). Useful for the dashboard + the alert below. + # + # The `clamp_min(..., 1e-9)` is a division-by-zero guard, not a + # traffic floor. An earlier revision used `clamp_min(..., 1)`, + # which floored the denominator at 1 req/s and silently + # distorted the ratio on low-traffic offers (e.g. verified= + # 0.001/s ÷ floored_denominator=1 ≈ 0 instead of the real + # 0.001/0.002 = 0.5). Epsilon keeps the answer accurate at any + # non-zero traffic level while still avoiding a NaN when no + # samples exist in the window. 
+ - record: x402:settlement_rate:1h_by_offer_chain + expr: | + sum by (offer_namespace, offer_name, chain) ( + rate(obol_x402_verifier_payment_verified_total[1h]) + ) + / + clamp_min( + sum by (offer_namespace, offer_name, chain) ( + rate(obol_x402_verifier_payment_required_total[1h]) + + + rate(obol_x402_verifier_payment_verified_total[1h]) + + + rate(obol_x402_verifier_payment_failed_total[1h]) + ), + 1e-9 + ) + + - name: x402.alerting + rules: + # Payment-failure ratio crossed 10% over the last hour for a paid + # route that's actually receiving traffic. Typical cause: + # facilitator unreachable, chain pruning, or seller's CA bundle + # missing (CLAUDE.md pitfall #8). + # + # The `clamp_min(..., 1e-9)` here is a div-by-zero guard only. + # A prior `clamp_min(..., 1)` floored the denominator at 1 req/s, + # which under-reports the failure ratio on light-traffic + # endpoints (failed=0.001/s ÷ floored_denominator=1 = 0.001 + # instead of the true 0.001/0.002 = 0.5) and prevented the + # alert from ever firing at sub-1 req/s. Epsilon avoids NaN + # without distorting the ratio. + - alert: X402PaymentFailureRateHigh + expr: | + ( + sum by (offer_namespace, offer_name, chain) ( + rate(obol_x402_verifier_payment_failed_total[1h]) + ) + / + clamp_min( + sum by (offer_namespace, offer_name, chain) ( + rate(obol_x402_verifier_payment_failed_total[1h]) + + + rate(obol_x402_verifier_payment_verified_total[1h]) + ), + 1e-9 + ) + ) > 0.10 + for: 10m + labels: + severity: warning + annotations: + summary: "x402 payment failures > 10% on {{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} ({{ "{{" }} $labels.chain {{ "}}" }})" + description: | + More than 10% of paid requests to + {{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} on + {{ "{{" }} $labels.chain {{ "}}" }} have failed verification over the last + hour. 
+ Check the verifier logs for x509/facilitator errors and + the seller's `ca-certificates` ConfigMap. + + # An offer returned a 402 (payment_required) in the last hour but + # recorded no charged request in that window: buyers abandon the + # flow or settlement is broken downstream. `unless ... > 0` (not + # `and ... == 0`) also fires when the charged series does not exist yet. + - alert: X402NoSettlementsAfterChallenge + expr: | + ( + sum by (offer_namespace, offer_name) ( + increase(obol_x402_verifier_payment_required_total[1h]) + ) > 0 + ) + unless + ( + sum by (offer_namespace, offer_name) ( + increase(obol_x402_verifier_charged_requests_total[1h]) + ) > 0 + ) + for: 30m + labels: + severity: warning + annotations: + summary: "{{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} returns 402 but never settles" + description: | + The x402 verifier issued 402 responses for + {{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} in the + last hour but observed no settled requests. Check the buyer + sidecar's auth pool (/status) and the facilitator's settlement + endpoint. diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..f6cdd59d 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -6,6 +6,16 @@ apiVersion: v1 kind: Namespace metadata: name: x402 + labels: + # Pod Security Standards: Restricted profile enforced at admission. + # Future Deployment edits that omit the per-pod securityContext will be + # rejected by the apiserver. Both x402-verifier and serviceoffer-controller + # run as non-root with all caps dropped, seccomp=RuntimeDefault, and + # readOnlyRootFilesystem. 
+ pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted --- # Static gateway settings plus optional manual routes. In cluster mode the @@ -200,7 +210,11 @@ metadata: labels: app: x402-verifier spec: - replicas: 2 + # Single replica — verifier holds per-pod metric registries and per-pod + # informer caches; multiple replicas produce metric series drift across + # ServiceMonitor scrape rotations and the pruneSeriesNotIn GC (metrics.go) + # becomes inconsistent. Single-node k3d gains no HA from 2 replicas. + replicas: 1 selector: matchLabels: app: x402-verifier @@ -210,10 +224,25 @@ spec: app: x402-verifier spec: serviceAccountName: x402-verifier + # PSS Restricted: pod-level identity. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: verifier - image: ghcr.io/obolnetwork/x402-verifier:b13254e + image: ghcr.io/obolnetwork/x402-verifier:b13254e@sha256:a8a7aa0ca4c35b0ddf6983fa6e3e5f8a3f64e44d8e506ebfd55e39de2bc0342d imagePullPolicy: IfNotPresent + # PSS Restricted: per-container hardening. Verifier is a Go binary + # reading two RO ConfigMaps; no writeable rootfs paths required. + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] ports: - name: http containerPort: 8080 @@ -271,7 +300,10 @@ metadata: labels: app: serviceoffer-controller spec: - replicas: 1 # Do not scale — multiple replicas race on ERC-8004 on-chain registration + # Single replica by default; bumping to 2+ is now safe — leader election + # (client-go Lease in this namespace) prevents split-brain on the reconcile + # loop and the resulting double on-chain ERC-8004 registration. 
+ replicas: 1 selector: matchLabels: app: serviceoffer-controller @@ -281,11 +313,32 @@ spec: app: serviceoffer-controller spec: serviceAccountName: serviceoffer-controller + # PSS Restricted: pod-level identity. Paired with Dockerfile + # FROM gcr.io/distroless/static-debian12:nonroot which default-runs + # as UID/GID 65532. Container escape via a Go-runtime CVE on a + # UID-0 / no-seccomp / no-cap-drop / RW-rootfs container was the + # easiest path to host pivot on k3s single-node; this closes it. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: controller - image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e + image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e@sha256:f83bd7e55bdc5d87edb49c04e7fd9257097364e2d43e769c19dfd7c8b47d07af imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: @@ -322,13 +375,27 @@ spec: protocol: TCP --- -apiVersion: policy/v1 -kind: PodDisruptionBudget +# ServiceMonitor for x402-verifier — scrapes the stable Service endpoint +# rather than per-pod IPs (which is what a PodMonitor would do). Lives +# alongside the Service it observes so adding/changing the port or +# selector here is a single-file change. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl as the serviceMonitorSelector). 
+apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor metadata: name: x402-verifier namespace: x402 + labels: + release: monitoring + app: x402-verifier spec: - minAvailable: 1 selector: matchLabels: app: x402-verifier + endpoints: + - port: http + path: /metrics + interval: 30s + scrapeTimeout: 10s diff --git a/internal/embed/infrastructure/cloudflared/values.yaml b/internal/embed/infrastructure/cloudflared/values.yaml index a41a4715..8ff1670c 100644 --- a/internal/embed/infrastructure/cloudflared/values.yaml +++ b/internal/embed/infrastructure/cloudflared/values.yaml @@ -5,7 +5,7 @@ transport: image: repository: cloudflare/cloudflared - tag: "2026.3.0" + tag: "2026.3.0@sha256:6b599ca3e974349ead3286d178da61d291961182ec3fe9c505e1dd02c8ac31b0" metrics: address: "0.0.0.0:2000" diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index aa7fc052..65008dcc 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -11,8 +11,6 @@ repositories: url: https://obolnetwork.github.io/helm-charts/ - name: ethereum url: https://ethpandaops.github.io/ethereum-helm-charts - - name: bedag - url: https://bedag.github.io/helm-charts/ - name: stakater url: https://stakater.github.io/stakater-charts @@ -27,15 +25,9 @@ values: enabled: true releases: - # Local storage provisioner (raw manifests wrapped as chart) - - name: base - namespace: kube-system - chart: ./base - values: - - dataDir: /data - - network: "{{ .Values.network }}" - - # Monitoring stack (Prometheus operator + Prometheus) + # Monitoring stack (Prometheus operator + Prometheus). Must run before + # `base` so the PodMonitor CRD exists when base/templates/llm.yaml + # applies the litellm-x402-buyer PodMonitor. 
- name: monitoring namespace: monitoring createNamespace: true @@ -44,34 +36,20 @@ releases: values: - ./values/monitoring.yaml.gotmpl - - name: llm-buyer-podmonitor - namespace: llm - createNamespace: true - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: litellm-x402-buyer - namespace: llm - labels: - release: monitoring - spec: - selector: - matchLabels: - app: litellm - podMetricsEndpoints: - - port: buyer-http - path: /metrics - interval: 30s + # NOTE: PodMonitor for litellm-x402-buyer and ServiceMonitor for + # x402-verifier moved into base/templates/llm.yaml and + # base/templates/x402.yaml respectively. They live alongside the + # workloads they observe so a port/selector edit is one-file. Kills + # two `bedag/raw` releases. kube-prometheus-stack picks them up via + # the `release: monitoring` label. + # Traefik ingress controller with Gateway API support - # Traefik v38+ bundles Gateway API CRDs in its crds/ directory + # Traefik v38+ bundles Gateway API CRDs in its crds/ directory. + # Declared before `base` so the Traefik CRDs (Middleware, + # IngressRoute, …) and Gateway API CRDs are available when base + # templates that depend on them (erpc.yaml, obol-frontend.yaml) + # apply. - name: traefik namespace: traefik createNamespace: true @@ -135,6 +113,24 @@ releases: dashboard: enabled: false + # Local storage provisioner + co-located cluster-wide manifests: + # CRDs, agent RBAC, x402 controller + verifier, LiteLLM + buyer + # PodMonitor, eRPC HTTPRoute + Middleware + metadata ConfigMap, and + # the obol-frontend HTTPRoute + discovery RBAC. The `needs` on + # traefik + monitoring guarantee the Traefik / Gateway API and + # monitoring CRDs are present before the relocated routing / + # PodMonitor templates (previously shipped as separate bedag/raw + # helmfile releases) apply. 
+ - name: base + namespace: kube-system + chart: ./base + needs: + - traefik/traefik + - monitoring/monitoring + values: + - dataDir: /data + - network: "{{ .Values.network }}" + # Cloudflare Tunnel (dormant until configured via obol tunnel login/provision). # `condition: cloudflared.enabled` lets `obol stack up` flip this off when an # active quick tunnel is already serving — re-syncing the chart kills the @@ -168,75 +164,16 @@ releases: - ./values/erpc.yaml.gotmpl # The chart exposes port 4000 (container) via Service port 4000. # In-cluster callers use erpc.erpc.svc.cluster.local:4000. + # + # The eRPC HTTPRoute, x402-payment Middleware, and erpc-metadata + # ConfigMap previously shipped as separate bedag/raw helmfile + # releases now live in base/templates/erpc.yaml. - # eRPC HTTPRoute - - name: erpc-httproute - namespace: erpc - chart: bedag/raw - version: 2.0.2 - needs: - - traefik/traefik - - erpc/erpc - values: - - resources: - - apiVersion: gateway.networking.k8s.io/v1 - kind: HTTPRoute - metadata: - name: erpc - namespace: erpc - spec: - hostnames: - - "obol.stack" - parentRefs: - - name: traefik-gateway - namespace: traefik - sectionName: web - rules: - - matches: - - path: - type: PathPrefix - value: /rpc - filters: - - type: ExtensionRef - extensionRef: - group: traefik.io - kind: Middleware - name: x402-payment - backendRefs: - - name: erpc - port: 80 - - # x402 Middleware for eRPC namespace (ForwardAuth -> central verifier). - # Always deployed; the verifier returns 200 for routes with no pricing rules. 
- - name: erpc-x402-middleware - namespace: erpc - chart: bedag/raw - version: 2.0.2 - needs: - - traefik/traefik - values: - - resources: - - apiVersion: traefik.io/v1alpha1 - kind: Middleware - metadata: - name: x402-payment - namespace: erpc - spec: - forwardAuth: - address: http://x402-verifier.x402.svc.cluster.local:8080/verify - authResponseHeaders: - - X-Payment-Response - - # eRPC metadata ConfigMap for frontend discovery - - name: erpc-metadata - namespace: erpc - chart: bedag/raw - needs: - - erpc/erpc - values: - - ./values/erpc-metadata.yaml.gotmpl - - # Obol Stack frontend + # Obol Stack frontend. + # + # The frontend HTTPRoute and discovery RBAC (ClusterRole + + # ClusterRoleBinding) previously shipped as separate bedag/raw + # helmfile releases now live in base/templates/obol-frontend.yaml. - name: obol-frontend namespace: obol-frontend createNamespace: true @@ -248,74 +185,12 @@ releases: values: - ./values/obol-frontend.yaml.gotmpl - # Obol Frontend HTTPRoute - - name: obol-frontend-httproute - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - traefik/traefik - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: gateway.networking.k8s.io/v1 - kind: HTTPRoute - metadata: - name: obol-frontend - namespace: obol-frontend - spec: - hostnames: - - "obol.stack" - parentRefs: - - name: traefik-gateway - namespace: traefik - sectionName: web - rules: - - matches: - - path: - type: PathPrefix - value: / - backendRefs: - - name: obol-frontend-obol-app - port: 3000 - - # Obol Frontend RBAC (OpenClaw instance discovery via Kubernetes API) - - name: obol-frontend-rbac - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRole - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - rules: - - apiGroups: [""] - resources: ["namespaces"] - 
verbs: ["get", "list"] - - apiGroups: [""] - resources: ["pods", "configmaps", "secrets"] - verbs: ["get", "list"] - # ServiceOffer CRD — frontend sell modal creates offers - - apiGroups: ["obol.org"] - resources: ["serviceoffers", "serviceoffers/status"] - verbs: ["get", "list", "create", "update", "patch", "delete"] - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: obol-frontend-openclaw-discovery - subjects: - - kind: ServiceAccount - name: obol-frontend - namespace: obol-frontend + # NOTE: Obol frontend HTTPRoute and obol-frontend-rbac ClusterRole + + # ClusterRoleBinding moved into base/templates/obol-frontend.yaml. + # Co-located with the workload they configure; kills two `bedag/raw` + # releases. Frontend-egress NetworkPolicy was attempted and reverted — + # on k3s + Flannel (k3d's default CNI) the kubernetes apiserver Service + # Endpoints point at the host process, outside the cluster pod/service + # CIDRs. A clean allowlist can't target the apiserver portably without + # an install-specific ipBlock for the k3s host IP. Tracking as a + # deferred hardening item. 
diff --git a/internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl b/internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl deleted file mode 100644 index fe94d8ef..00000000 --- a/internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl +++ /dev/null @@ -1,21 +0,0 @@ -resources: - - apiVersion: v1 - kind: ConfigMap - metadata: - name: erpc-metadata - namespace: erpc - labels: - app.kubernetes.io/part-of: obol.stack - obol.stack/id: default - obol.stack/app: erpc - data: - metadata.json: | - { - "network": "{{ .Values.network }}", - "endpoints": { - "rpc": { - "external": "http://obol.stack/rpc/{{ .Values.network }}", - "internal": "http://erpc.erpc.svc.cluster.local/rpc/{{ .Values.network }}" - } - } - } diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl index 18e6ba01..e440bd0d 100644 --- a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -11,7 +11,7 @@ prometheus: matchLabels: release: monitoring podMonitorNamespaceSelector: {} - retention: 6h + retention: 8d resources: requests: cpu: 100m diff --git a/internal/embed/skills/sell/references/serviceoffer-spec.md b/internal/embed/skills/sell/references/serviceoffer-spec.md index a2b6183e..5ead12d7 100644 --- a/internal/embed/skills/sell/references/serviceoffer-spec.md +++ b/internal/embed/skills/sell/references/serviceoffer-spec.md @@ -180,7 +180,13 @@ Each condition contains: ## Lifecycle Notes -- Pausing is represented via the `obol.org/paused: "true"` annotation. +- Graceful stop is represented via `spec.drainAt` (RFC3339 timestamp) and + the optional `spec.drainGracePeriod` (Go duration, e.g. `"30m"`, defaults + to `1h`). While draining, discovery surfaces advertise the offer with + `available: false` + `drainEndsAt`, and the HTTPRoute/payment gate stay + up until the grace period expires so in-flight buyers can settle. 
+ `obol sell stop --force` is the equivalent of `drainGracePeriod: 0s` — + abrupt teardown with no advertised wind-down. - Deleting a `ServiceOffer` cascades owned `Middleware` and `HTTPRoute` resources via `ownerReferences`. - Registration side effects are isolated in a child `RegistrationRequest` diff --git a/internal/erc8004/types.go b/internal/erc8004/types.go index 3e22d8c5..85463f51 100644 --- a/internal/erc8004/types.go +++ b/internal/erc8004/types.go @@ -33,11 +33,13 @@ const RegistrationType = "https://eips.ethereum.org/EIPS/eip-8004#registration-v // For OASF entries (name="OASF"), Skills and Domains provide machine-readable // taxonomy for agent discovery. See https://schema.oasf.outshift.com/ type ServiceDef struct { - Name string `json:"name"` // e.g., "web", "A2A", "MCP", "OASF" - Endpoint string `json:"endpoint,omitempty"` // full URL (omitempty for OASF entries) - Version string `json:"version,omitempty"` // protocol version (SHOULD per spec) - Skills []string `json:"skills,omitempty"` // OASF skill taxonomy paths - Domains []string `json:"domains,omitempty"` // OASF domain taxonomy paths + Name string `json:"name"` // e.g., "web", "A2A", "MCP", "OASF" + Endpoint string `json:"endpoint,omitempty"` // full URL (omitempty for OASF entries) + Version string `json:"version,omitempty"` // protocol version (SHOULD per spec) + Skills []string `json:"skills,omitempty"` // OASF skill taxonomy paths + Domains []string `json:"domains,omitempty"` // OASF domain taxonomy paths + Available *bool `json:"available,omitempty"` // false only while the service is draining + DrainEndsAt string `json:"drainEndsAt,omitempty"` // RFC3339 timestamp for draining services } // OnChainReg links the registration to its on-chain record. 
+ diff --git a/internal/erc8004/types_test.go b/internal/erc8004/types_test.go index 80bd025e..e8fdd9fb 100644 --- a/internal/erc8004/types_test.go +++ b/internal/erc8004/types_test.go @@ -179,6 +179,32 @@ func TestServiceDef_VersionOptional(t *testing.T) { } } +func TestServiceDef_DrainMetadataSerializesFalseAvailability(t *testing.T) { + available := false + svc := ServiceDef{ + Name: "web", + Endpoint: "https://example.com/services/demo", + Available: &available, + DrainEndsAt: "2026-05-24T12:00:00Z", + } + + data, err := json.Marshal(svc) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + + var m map[string]json.RawMessage + if err := json.Unmarshal(data, &m); err != nil { + t.Fatalf("unmarshal to map: %v", err) + } + if string(m["available"]) != "false" { + t.Fatalf("available = %s, want false in %s", m["available"], data) + } + if string(m["drainEndsAt"]) != `"2026-05-24T12:00:00Z"` { + t.Fatalf("drainEndsAt = %s, want timestamp in %s", m["drainEndsAt"], data) + } +} + func TestOnChainReg_AgentIDNumeric(t *testing.T) { reg := OnChainReg{ AgentID: 42, diff --git a/internal/monetizeapi/deepcopy_manual.go b/internal/monetizeapi/deepcopy_manual.go new file mode 100644 index 00000000..89636588 --- /dev/null +++ b/internal/monetizeapi/deepcopy_manual.go @@ -0,0 +1,58 @@ +package monetizeapi + +// PreSignedAuth deep-copy is hand-written because its Payment field is +// an opaque map[string]interface{} (controller-gen can't deep-copy +// untyped JSON). The type is excluded from generation via the +// object-generate=false marker in types.go. + +// DeepCopyInto copies the receiver into out. The Payment map is +// deep-copied with deepCopyJSONMap: nested map[string]interface{} and +// []interface{} values are duplicated recursively, and any other value +// is carried over as-is, so the copy shares no JSON map or slice with +// the receiver (no aliasing into caller-owned mutable structures). 
+func (in *PreSignedAuth) DeepCopyInto(out *PreSignedAuth) { + *out = *in + if in.Payment != nil { + out.Payment = deepCopyJSONMap(in.Payment) + } +} + +// DeepCopy returns a deep copy of the receiver. +func (in *PreSignedAuth) DeepCopy() *PreSignedAuth { + if in == nil { + return nil + } + out := new(PreSignedAuth) + in.DeepCopyInto(out) + return out +} + +// deepCopyJSONMap walks an opaque JSON-decoded map[string]interface{} +// tree and returns a structurally identical copy. Handles the nested +// shapes the x402 PaymentPayload uses (object, array, scalar). +func deepCopyJSONMap(in map[string]interface{}) map[string]interface{} { + if in == nil { + return nil + } + out := make(map[string]interface{}, len(in)) + for k, v := range in { + out[k] = deepCopyJSONValue(v) + } + return out +} + +func deepCopyJSONValue(v interface{}) interface{} { + switch t := v.(type) { + case map[string]interface{}: + return deepCopyJSONMap(t) + case []interface{}: + out := make([]interface{}, len(t)) + for i, item := range t { + out[i] = deepCopyJSONValue(item) + } + return out + default: + // Strings, numbers, bools, nil — value types, safe to share. + return v + } +} diff --git a/internal/monetizeapi/doc.go b/internal/monetizeapi/doc.go new file mode 100644 index 00000000..63ab340c --- /dev/null +++ b/internal/monetizeapi/doc.go @@ -0,0 +1,16 @@ +// Package monetizeapi defines the Custom Resource Definitions for the +// Obol Stack monetize subsystem. +// +// The Go types in this package are the single source of truth for the +// CRD OpenAPI schemas embedded under +// internal/embed/infrastructure/base/templates/*-crd.yaml. +// +// Edit a field or marker here, then run `just generate` to regenerate +// the CRD YAML manifests + zz_generated_deepcopy.go from kubebuilder +// markers. CI fails if the working tree is dirty after that command +// runs (see .github/workflows/lint-test.yaml::generate-check). 
+// +// +kubebuilder:object:generate=true +// +groupName=obol.org +// +versionName=v1alpha1 +package monetizeapi diff --git a/internal/monetizeapi/drain_test.go b/internal/monetizeapi/drain_test.go new file mode 100644 index 00000000..0241a49a --- /dev/null +++ b/internal/monetizeapi/drain_test.go @@ -0,0 +1,113 @@ +package monetizeapi + +import ( + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestServiceOffer_IsDraining(t *testing.T) { + t.Run("nil drainAt", func(t *testing.T) { + o := &ServiceOffer{} + if o.IsDraining() { + t.Errorf("IsDraining() = true, want false for nil drainAt") + } + }) + t.Run("set drainAt", func(t *testing.T) { + now := metav1.Now() + o := &ServiceOffer{Spec: ServiceOfferSpec{DrainAt: &now}} + if !o.IsDraining() { + t.Errorf("IsDraining() = false, want true for non-nil drainAt") + } + }) +} + +func TestServiceOffer_DrainEndsAt(t *testing.T) { + base := time.Date(2026, time.May, 1, 12, 0, 0, 0, time.UTC) + baseMeta := metav1.NewTime(base) + + cases := []struct { + name string + drain *metav1.Time + grace *metav1.Duration + want time.Time + }{ + { + name: "nil drainAt returns zero", + drain: nil, + grace: nil, + want: time.Time{}, + }, + { + name: "nil grace applies default 1h", + drain: &baseMeta, + grace: nil, + want: base.Add(time.Hour), + }, + { + name: "explicit zero grace honored", + drain: &baseMeta, + grace: &metav1.Duration{Duration: 0}, + want: base, + }, + { + name: "custom grace honored", + drain: &baseMeta, + grace: &metav1.Duration{Duration: 30 * time.Minute}, + want: base.Add(30 * time.Minute), + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + o := &ServiceOffer{Spec: ServiceOfferSpec{DrainAt: tc.drain, DrainGracePeriod: tc.grace}} + if got := o.DrainEndsAt(); !got.Equal(tc.want) { + t.Errorf("DrainEndsAt() = %v, want %v", got, tc.want) + } + }) + } +} + +func TestServiceOffer_DrainExpired(t *testing.T) { + now := time.Date(2026, time.May, 1, 12, 0, 0, 0, time.UTC) 
+ + t.Run("not draining returns false", func(t *testing.T) { + o := &ServiceOffer{} + if o.DrainExpired(now) { + t.Errorf("DrainExpired() = true, want false for non-draining offer") + } + }) + + t.Run("mid-drain returns false", func(t *testing.T) { + drainAt := metav1.NewTime(now.Add(-10 * time.Minute)) + o := &ServiceOffer{Spec: ServiceOfferSpec{ + DrainAt: &drainAt, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, + }} + if o.DrainExpired(now) { + t.Errorf("DrainExpired() = true, want false for mid-drain offer") + } + }) + + t.Run("expired returns true", func(t *testing.T) { + drainAt := metav1.NewTime(now.Add(-2 * time.Hour)) + o := &ServiceOffer{Spec: ServiceOfferSpec{ + DrainAt: &drainAt, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, + }} + if !o.DrainExpired(now) { + t.Errorf("DrainExpired() = false, want true for expired drain") + } + }) + + t.Run("force path zero grace tears down on next reconcile", func(t *testing.T) { + drainAt := metav1.NewTime(now) + o := &ServiceOffer{Spec: ServiceOfferSpec{ + DrainAt: &drainAt, + DrainGracePeriod: &metav1.Duration{Duration: 0}, + }} + if !o.DrainExpired(now) { + t.Errorf("DrainExpired() = false at now == drainAt with zero grace, want true") + } + }) +} diff --git a/internal/monetizeapi/types.go b/internal/monetizeapi/types.go index 6e905eee..db231a71 100644 --- a/internal/monetizeapi/types.go +++ b/internal/monetizeapi/types.go @@ -3,11 +3,18 @@ package monetizeapi import ( "fmt" "strings" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" ) +// DefaultDrainGracePeriod is the grace period applied to a draining +// ServiceOffer when spec.drainGracePeriod is unset. Buyers using the +// offer can complete in-flight payments and migrate to alternative +// providers within this window before the HTTPRoute is torn down. 
+const DefaultDrainGracePeriod = time.Hour + const ( Group = "obol.org" Version = "v1alpha1" @@ -29,8 +36,6 @@ const ( AgentIdentityDefaultNamespace = "x402" AgentIdentityDefaultName = "default" - PausedAnnotation = "obol.org/paused" - AgentRuntimeHermes = "hermes" AgentPhasePending = "Pending" @@ -61,6 +66,21 @@ var ( PVCGVR = schema.GroupVersionResource{Group: "", Version: "v1", Resource: "persistentvolumeclaims"} ) +// ── ServiceOffer ──────────────────────────────────────────────────────────── + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=so +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model.name` +// +kubebuilder:printcolumn:name="Price",type=string,JSONPath=`.spec.payment.price.perRequest` +// +kubebuilder:printcolumn:name="Network",type=string,JSONPath=`.spec.payment.network` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// ServiceOffer declares a compute service that can be exposed publicly, +// gated with x402 payments, and optionally registered on an ERC-8004 +// service registry. Field names align with x402 and ERC-8004 standards. type ServiceOffer struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -68,15 +88,64 @@ type ServiceOffer struct { Status ServiceOfferStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// ServiceOfferList is the list form for kubectl/list operations. 
+type ServiceOfferList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ServiceOffer `json:"items"` +} + type ServiceOfferSpec struct { - Type string `json:"type,omitempty"` - Agent ServiceOfferAgent `json:"agent,omitempty"` - Model ServiceOfferModel `json:"model,omitempty"` - Upstream ServiceOfferUpstream `json:"upstream,omitempty"` - Payment ServiceOfferPayment `json:"payment,omitempty"` - Path string `json:"path,omitempty"` - Provenance map[string]string `json:"provenance,omitempty"` + // Service type. 'inference' enables model management; 'http' for any HTTP + // service; 'agent' references an Agent CR via spec.agent.ref and the + // controller derives upstream + model + skills from the agent's status. + // +kubebuilder:default="http" + // +kubebuilder:validation:Enum=inference;fine-tuning;http;agent + Type string `json:"type,omitempty"` + + // Required when type='agent'. The controller resolves spec.agent.ref to + // the referenced Agent CR, derives upstream from Agent.status.endpoint, + // and surfaces the agent's pinned model + skills in the 402 response. + Agent ServiceOfferAgent `json:"agent,omitempty"` + + // LLM model metadata. Required when the upstream serves an LLM. + Model ServiceOfferModel `json:"model,omitempty"` + + // In-cluster service that handles the actual workload. + Upstream ServiceOfferUpstream `json:"upstream,omitempty"` + + // +kubebuilder:validation:Required + Payment ServiceOfferPayment `json:"payment"` + + // URL path prefix for the HTTPRoute, defaults to /services/. + // +kubebuilder:validation:Pattern=`^/[a-zA-Z0-9/_.-]*$` + Path string `json:"path,omitempty"` + + // Optional provenance metadata for the service. Tracks how the model or + // service was produced (e.g. autoresearch experiment data). Included in + // the ERC-8004 registration document when present. + Provenance map[string]string `json:"provenance,omitempty"` + + // ERC-8004 registration metadata. 
Field names align with the + // AgentRegistration document schema (ERC-8004 spec). Registration ServiceOfferRegistration `json:"registration,omitempty"` + + // DrainAt marks the offer as draining when non-nil. While the offer + // is in the drain window, discovery surfaces (/skill.md and + // /.well-known/agent-registration.json) advertise the offer with + // available=false and drainEndsAt set, so buyers can migrate before + // the route is torn down. The route + payment gate stay up until + // DrainEndsAt() so in-flight payments can complete. Replaces the + // legacy obol.org/paused annotation. + DrainAt *metav1.Time `json:"drainAt,omitempty"` + + // DrainGracePeriod is how long after DrainAt the HTTPRoute remains + // up. Defaults to DefaultDrainGracePeriod when nil. A zero duration + // is honored as "tear down immediately on the next reconcile" (the + // equivalent of `obol sell stop --force`). + DrainGracePeriod *metav1.Duration `json:"drainGracePeriod,omitempty"` } // ServiceOfferAgent is populated when Spec.Type == "agent". The controller @@ -88,72 +157,148 @@ type ServiceOfferAgent struct { } type ServiceOfferAgentRef struct { - Name string `json:"name,omitempty"` - Namespace string `json:"namespace,omitempty"` + // +kubebuilder:validation:Required + Name string `json:"name"` + // +kubebuilder:validation:Required + Namespace string `json:"namespace"` } type ServiceOfferModel struct { - Name string `json:"name,omitempty"` - Runtime string `json:"runtime,omitempty"` + // Model identifier (e.g. qwen3.5:35b). + // +kubebuilder:validation:Required + Name string `json:"name"` + // Runtime serving the model. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=ollama;vllm;tgi + Runtime string `json:"runtime"` } type ServiceOfferUpstream struct { - Service string `json:"service,omitempty"` - Namespace string `json:"namespace,omitempty"` - Port int64 `json:"port,omitempty"` + // Kubernetes Service name. 
+ // +kubebuilder:validation:Required + Service string `json:"service"` + // Namespace of the upstream Service. + // +kubebuilder:validation:Required + Namespace string `json:"namespace"` + // Port on the upstream Service. + // +kubebuilder:validation:Required + // +kubebuilder:default=11434 + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=65535 + Port int64 `json:"port"` + // HTTP path used for health probes against the upstream. + // +kubebuilder:default="/health" HealthPath string `json:"healthPath,omitempty"` } type ServiceOfferPayment struct { - Scheme string `json:"scheme,omitempty"` - Network string `json:"network,omitempty"` - PayTo string `json:"payTo,omitempty"` - MaxTimeoutSeconds int64 `json:"maxTimeoutSeconds,omitempty"` - Asset ServiceOfferAsset `json:"asset,omitempty"` - Price ServiceOfferPriceTable `json:"price,omitempty"` + // x402 payment scheme. + // +kubebuilder:default="exact" + // +kubebuilder:validation:Enum=exact + Scheme string `json:"scheme,omitempty"` + // Chain identifier for payments (human-friendly). Reconciler resolves + // to CAIP-2 format (e.g., "base-sepolia" → "eip155:84532"). + // +kubebuilder:validation:Required + Network string `json:"network"` + // USDC recipient wallet address (x402: payTo). + // +kubebuilder:validation:Required + // +kubebuilder:validation:Pattern=`^0x[0-9a-fA-F]{40}$` + PayTo string `json:"payTo"` + // Payment validity window in seconds (x402: maxTimeoutSeconds). + // +kubebuilder:default=300 + MaxTimeoutSeconds int64 `json:"maxTimeoutSeconds,omitempty"` + // Optional token metadata override for x402 settlement. When omitted, + // the verifier uses the chain default asset. + Asset ServiceOfferAsset `json:"asset,omitempty"` + // Pricing table with per-unit prices in USDC (human-readable decimals). + // Which fields are applicable depends on the workload type. 
+ // +kubebuilder:validation:Required + Price ServiceOfferPriceTable `json:"price"` } type ServiceOfferAsset struct { - Address string `json:"address,omitempty"` - Symbol string `json:"symbol,omitempty"` - Decimals int64 `json:"decimals,omitempty"` + // ERC-20 contract address. + // +kubebuilder:validation:Pattern=`^0x[0-9a-fA-F]{40}$` + Address string `json:"address,omitempty"` + // Human-friendly token symbol (e.g. USDC, OBOL). + Symbol string `json:"symbol,omitempty"` + // Token decimals in atomic units. + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Maximum=255 + Decimals int64 `json:"decimals,omitempty"` + // x402 transfer method for the asset. + // +kubebuilder:validation:Enum=eip3009;permit2 TransferMethod string `json:"transferMethod,omitempty"` - EIP712Name string `json:"eip712Name,omitempty"` - EIP712Version string `json:"eip712Version,omitempty"` + // EIP-712 domain name used by the token. + EIP712Name string `json:"eip712Name,omitempty"` + // EIP-712 domain version used by the token. + EIP712Version string `json:"eip712Version,omitempty"` } type ServiceOfferPriceTable struct { + // Flat per-request price in USDC. Applicable to all types. PerRequest string `json:"perRequest,omitempty"` - PerMTok string `json:"perMTok,omitempty"` - PerHour string `json:"perHour,omitempty"` - PerEpoch string `json:"perEpoch,omitempty"` + // Per-million-tokens price in USDC. Inference only. + PerMTok string `json:"perMTok,omitempty"` + // Per-compute-hour price in USDC. Fine-tuning only. + PerHour string `json:"perHour,omitempty"` + // Per-training-epoch price in USDC. Fine-tuning only. 
+ PerEpoch string `json:"perEpoch,omitempty"` } type ServiceOfferRegistration struct { - Enabled bool `json:"enabled,omitempty"` - Name string `json:"name,omitempty"` - Description string `json:"description,omitempty"` - Image string `json:"image,omitempty"` - Services []ServiceOfferService `json:"services,omitempty"` - SupportedTrust []string `json:"supportedTrust,omitempty"` - Skills []string `json:"skills,omitempty"` - Domains []string `json:"domains,omitempty"` - Metadata map[string]string `json:"metadata,omitempty"` + // If true, register on ERC-8004 after routing is live. + // +kubebuilder:default=false + Enabled bool `json:"enabled,omitempty"` + // Agent name (ERC-8004: AgentRegistration.name). + Name string `json:"name,omitempty"` + // Agent description (ERC-8004: AgentRegistration.description). + Description string `json:"description,omitempty"` + // Agent icon URL (ERC-8004: AgentRegistration.image). + Image string `json:"image,omitempty"` + // Service endpoints (ERC-8004: AgentRegistration.services[]). + Services []ServiceOfferService `json:"services,omitempty"` + // Trust verification methods (ERC-8004: AgentRegistration.supportedTrust[]). + // Valid values: reputation, crypto-economic, tee-attestation. + SupportedTrust []string `json:"supportedTrust,omitempty"` + // OASF skills for discovery (e.g. + // natural_language_processing/text_generation). Mapped to an OASF + // service entry in the registration JSON. + Skills []string `json:"skills,omitempty"` + // OASF domains for discovery (e.g. technology/artificial_intelligence). + // Mapped to an OASF service entry in the registration JSON. + Domains []string `json:"domains,omitempty"` + // Additional registration metadata published into the generated + // agent-registration.json for discovery and ranking. 
+ Metadata map[string]string `json:"metadata,omitempty"` } type ServiceOfferService struct { - Name string `json:"name,omitempty"` - Endpoint string `json:"endpoint,omitempty"` - Version string `json:"version,omitempty"` + // Service type: web, A2A, MCP, OASF, ENS, DID, email. + // +kubebuilder:validation:Required + Name string `json:"name"` + // Service URL. Auto-filled from tunnel URL if empty. + // +kubebuilder:validation:Required + Endpoint string `json:"endpoint"` + // Protocol version (SHOULD per ERC-8004 spec). + Version string `json:"version,omitempty"` } type ServiceOfferStatus struct { - Conditions []Condition `json:"conditions,omitempty"` - Endpoint string `json:"endpoint,omitempty"` - AgentID string `json:"agentId,omitempty"` - RegistrationTxHash string `json:"registrationTxHash,omitempty"` - ObservedGeneration int64 `json:"observedGeneration,omitempty"` - AgentResolution *ServiceOfferAgentResolution `json:"agentResolution,omitempty"` + // Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, + // RoutePublished, Registered, Ready. + Conditions []Condition `json:"conditions,omitempty"` + // The public endpoint URL once the route is published. + Endpoint string `json:"endpoint,omitempty"` + // ERC-8004 agent NFT token ID after on-chain registration. + AgentID string `json:"agentId,omitempty"` + // Transaction hash of the ERC-8004 registration. + RegistrationTxHash string `json:"registrationTxHash,omitempty"` + // The generation observed by the controller. + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Controller's resolved view of an agent-type offer's referenced Agent. + // Populated only when type=agent and the Agent is Ready. 
+ AgentResolution *ServiceOfferAgentResolution `json:"agentResolution,omitempty"` } // ServiceOfferAgentResolution is the controller's resolved view of an @@ -169,13 +314,35 @@ type ServiceOfferAgentResolution struct { } type Condition struct { - Type string `json:"type"` - Status string `json:"status"` - Reason string `json:"reason,omitempty"` - Message string `json:"message,omitempty"` + // Condition type. + // +kubebuilder:validation:Required + Type string `json:"type"` + // Status of the condition. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=True;False;Unknown + Status string `json:"status"` + // Machine-readable reason for the condition. + Reason string `json:"reason,omitempty"` + // Human-readable message with details. + Message string `json:"message,omitempty"` + // Last time the condition transitioned. LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` } +// ── RegistrationRequest ───────────────────────────────────────────────────── + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=rr +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Offer",type=string,JSONPath=`.spec.serviceOfferName` +// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.spec.desiredState` +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="AgentID",type=string,JSONPath=`.status.agentId` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// RegistrationRequest isolates ERC-8004 publication and on-chain side +// effects from the main ServiceOffer reconciliation loop. ServiceOffer +// remains the source of truth. 
type RegistrationRequest struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -183,11 +350,25 @@ type RegistrationRequest struct { Status RegistrationRequestStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// RegistrationRequestList is the list form for kubectl/list operations. +type RegistrationRequestList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []RegistrationRequest `json:"items"` +} + type RegistrationRequestSpec struct { - ServiceOfferName string `json:"serviceOfferName,omitempty"` - ServiceOfferNamespace string `json:"serviceOfferNamespace,omitempty"` - DesiredState string `json:"desiredState,omitempty"` - Chain string `json:"chain,omitempty"` + // +kubebuilder:validation:Required + ServiceOfferName string `json:"serviceOfferName"` + // +kubebuilder:validation:Required + ServiceOfferNamespace string `json:"serviceOfferNamespace"` + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=Active;Tombstoned + DesiredState string `json:"desiredState"` + // ERC-8004 registration chain alias for this request. + Chain string `json:"chain,omitempty"` } type RegistrationRequestStatus struct { @@ -241,12 +422,58 @@ func (o *ServiceOffer) IsAgent() bool { return o.Spec.Type == "agent" } -func (o *ServiceOffer) IsPaused() bool { - return o.Annotations != nil && o.Annotations[PausedAnnotation] == "true" +// IsDraining reports whether spec.drainAt has been set. Drained offers +// transition through three phases: pre-drain (DrainAt nil), draining +// (DrainAt set, now < DrainEndsAt), and drain-expired (DrainAt set, +// now >= DrainEndsAt). The controller keeps the route up during +// "draining" and tears it down once "drain-expired" is reached. +func (o *ServiceOffer) IsDraining() bool { + return o.Spec.DrainAt != nil +} + +// DrainEndsAt returns DrainAt + DrainGracePeriod. 
When DrainAt is nil +// the zero time is returned (caller should gate on IsDraining first). +// When DrainGracePeriod is nil the default grace period is applied; a +// zero grace period is honored as "drain ends at DrainAt", i.e. tear +// down on the next reconcile (the --force/--now path). +func (o *ServiceOffer) DrainEndsAt() time.Time { + if o.Spec.DrainAt == nil { + return time.Time{} + } + grace := DefaultDrainGracePeriod + if o.Spec.DrainGracePeriod != nil { + grace = o.Spec.DrainGracePeriod.Duration + } + return o.Spec.DrainAt.Time.Add(grace) +} + +// DrainExpired reports whether the drain grace period has elapsed. +// Returns false when the offer is not draining at all. Callers should +// use this rather than IsDraining when deciding whether to tear down +// the HTTPRoute or filter the offer from the live x402 verifier rules. +func (o *ServiceOffer) DrainExpired(now time.Time) bool { + if !o.IsDraining() { + return false + } + end := o.DrainEndsAt() + return !now.Before(end) } // ── PurchaseRequest ───────────────────────────────────────────────────────── +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=pr +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.endpoint` +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model` +// +kubebuilder:printcolumn:name="Price",type=string,JSONPath=`.spec.payment.price` +// +kubebuilder:printcolumn:name="Remaining",type=integer,JSONPath=`.status.remaining` +// +kubebuilder:printcolumn:name="Spent",type=integer,JSONPath=`.status.spent` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// PurchaseRequest is the buyer-side request for pre-signed x402 auths +// against a remote inference endpoint. 
type PurchaseRequest struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -254,57 +481,110 @@ type PurchaseRequest struct { Status PurchaseRequestStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// PurchaseRequestList is the list form for kubectl/list operations. +type PurchaseRequestList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []PurchaseRequest `json:"items"` +} + type PurchaseRequestSpec struct { - Endpoint string `json:"endpoint"` - Model string `json:"model"` - Count int `json:"count"` + // Full URL to the x402-gated inference endpoint. + // +kubebuilder:validation:Required + Endpoint string `json:"endpoint"` + // Remote model ID (used as paid/<model> in LiteLLM). + // +kubebuilder:validation:Required + Model string `json:"model"` + // Number of pre-signed auths to create. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=2500 + Count int `json:"count"` + // Pre-signed x402 payments (legacy ERC-3009 auths still supported). PreSignedAuths []PreSignedAuth `json:"preSignedAuths,omitempty"` AutoRefill PurchaseAutoRefill `json:"autoRefill,omitempty"` - Payment PurchasePayment `json:"payment"` + // +kubebuilder:validation:Required + Payment PurchasePayment `json:"payment"` } +// +kubebuilder:object:generate=false + +// PreSignedAuth carries a pre-signed x402 payment authorization. The +// Payment map is opaque (forwarded verbatim to the buyer sidecar / x402 +// facilitator) which can't be deep-copied by controller-gen; DeepCopy +// methods for this type are hand-written in deepcopy_manual.go. 
type PreSignedAuth struct { - ID string `json:"id,omitempty"` + ID string `json:"id,omitempty"` + // +kubebuilder:pruning:PreserveUnknownFields + // +kubebuilder:validation:Schemaless Payment map[string]interface{} `json:"payment,omitempty"` - Signature string `json:"signature"` - From string `json:"from"` - To string `json:"to"` - Value string `json:"value"` - ValidAfter string `json:"validAfter"` - ValidBefore string `json:"validBefore"` - Nonce string `json:"nonce"` -} - + Signature string `json:"signature,omitempty"` + From string `json:"from,omitempty"` + To string `json:"to,omitempty"` + Value string `json:"value,omitempty"` + ValidAfter string `json:"validAfter,omitempty"` + ValidBefore string `json:"validBefore,omitempty"` + Nonce string `json:"nonce,omitempty"` +} + +// PurchaseAutoRefill drives the agent-managed auto-refill policy for a +// PurchaseRequest. The reconciler reads MaxTotal + MaxSpendPerDay as +// budget caps before signing more auths; without these fields populated +// the agent will not auto-refill beyond the initial Count. type PurchaseAutoRefill struct { - Enabled bool `json:"enabled,omitempty"` - Threshold int `json:"threshold,omitempty"` - Count int `json:"count,omitempty"` + // +kubebuilder:default=false + Enabled bool `json:"enabled,omitempty"` + // Refill when remaining < threshold. + // +kubebuilder:validation:Minimum=0 + Threshold int `json:"threshold,omitempty"` + // Number of auths to sign on refill. + // +kubebuilder:validation:Minimum=1 + Count int `json:"count,omitempty"` + // Cap total auths ever signed. + MaxTotal int `json:"maxTotal,omitempty"` + // Max micro-USDC spend per day. 
+ MaxSpendPerDay string `json:"maxSpendPerDay,omitempty"` } type PurchasePayment struct { - Network string `json:"network"` - PayTo string `json:"payTo"` - Price string `json:"price"` - Asset string `json:"asset"` - AssetSymbol string `json:"assetSymbol,omitempty"` - AssetDecimals int64 `json:"assetDecimals,omitempty"` + // +kubebuilder:validation:Required + Network string `json:"network"` + // +kubebuilder:validation:Required + PayTo string `json:"payTo"` + // Atomic token units per request. + // +kubebuilder:validation:Required + Price string `json:"price"` + // ERC-20 contract address. + // +kubebuilder:validation:Required + Asset string `json:"asset"` + // Human-friendly token symbol (e.g. USDC, OBOL). + AssetSymbol string `json:"assetSymbol,omitempty"` + // Token decimals in atomic units. + AssetDecimals int64 `json:"assetDecimals,omitempty"` + // x402 transfer method used for this asset. AssetTransferMethod string `json:"assetTransferMethod,omitempty"` - EIP712Name string `json:"eip712Name,omitempty"` - EIP712Version string `json:"eip712Version,omitempty"` + // EIP-712 domain name used for signing. + EIP712Name string `json:"eip712Name,omitempty"` + // EIP-712 domain version used for signing. + EIP712Version string `json:"eip712Version,omitempty"` } type PurchaseRequestStatus struct { ObservedGeneration int64 `json:"observedGeneration,omitempty"` Conditions []Condition `json:"conditions,omitempty"` - PublicModel string `json:"publicModel,omitempty"` - Remaining int `json:"remaining,omitempty"` - Spent int `json:"spent,omitempty"` - TotalSigned int `json:"totalSigned,omitempty"` - TotalSpent string `json:"totalSpent,omitempty"` - ProbedAt string `json:"probedAt,omitempty"` - ProbedPrice string `json:"probedPrice,omitempty"` - WalletBalance string `json:"walletBalance,omitempty"` - SignerAddress string `json:"signerAddress,omitempty"` + // LiteLLM model name (paid/). 
+ PublicModel string `json:"publicModel,omitempty"` + Remaining int `json:"remaining,omitempty"` + Spent int `json:"spent,omitempty"` + TotalSigned int `json:"totalSigned,omitempty"` + TotalSpent string `json:"totalSpent,omitempty"` + // +kubebuilder:validation:Format=date-time + ProbedAt string `json:"probedAt,omitempty"` + ProbedPrice string `json:"probedPrice,omitempty"` + WalletBalance string `json:"walletBalance,omitempty"` + SignerAddress string `json:"signerAddress,omitempty"` } func (pr *PurchaseRequest) EffectiveBuyerNamespace() string { @@ -313,6 +593,21 @@ func (pr *PurchaseRequest) EffectiveBuyerNamespace() string { // ── Agent ─────────────────────────────────────────────────────────────────── +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=ag +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Runtime",type=string,JSONPath=`.spec.runtime` +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.status.pinnedModel` +// +kubebuilder:printcolumn:name="Wallet",type=string,JSONPath=`.status.walletAddress` +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// Agent is the declarative spec for an Obol Stack agent (Hermes today, +// OpenClaw later). Decouples agent lifecycle from selling: `obol sell +// agent <name>` references an existing Agent rather than provisioning +// one inline. Internal manager agents with RBAC can also create Agent +// resources to spawn sub-agents. type Agent struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -320,25 +615,58 @@ type Agent struct { Status AgentStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// AgentList is the list form for kubectl/list operations. 
+type AgentList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Agent `json:"items"` +} + type AgentSpec struct { - Runtime string `json:"runtime,omitempty"` - Model string `json:"model,omitempty"` - Skills []string `json:"skills,omitempty"` + // Agent runtime (only hermes today; openclaw planned). + // +kubebuilder:default=hermes + // +kubebuilder:validation:Enum=hermes + Runtime string `json:"runtime,omitempty"` + // LiteLLM model name to pin. Empty = controller picks cluster + // top-of-rank on first deploy and writes status.pinnedModel. + // +kubebuilder:validation:MaxLength=256 + Model string `json:"model,omitempty"` + // Allow-listed skills written to the per-agent skills dir on first + // reconcile. Agent can edit afterwards; this is a seed, not a sandbox. + // +kubebuilder:validation:MaxItems=64 + // +kubebuilder:validation:items:Pattern=`^[a-z0-9][a-z0-9-]*$` + // +kubebuilder:validation:items:MaxLength=64 + Skills []string `json:"skills,omitempty"` + // Operator-supplied objective text. Substituted into the SOUL.md + // template by the seeder on first write. Agent owns SOUL.md after that. + // +kubebuilder:validation:MaxLength=4096 Objective string `json:"objective,omitempty"` Wallet AgentWallet `json:"wallet,omitempty"` } type AgentWallet struct { + // Provision a per-namespace remote-signer keystore. Address is + // published in status.walletAddress. 
+ // +kubebuilder:default=false Create bool `json:"create,omitempty"` } type AgentStatus struct { - ObservedGeneration int64 `json:"observedGeneration,omitempty"` - Phase string `json:"phase,omitempty"` - PinnedModel string `json:"pinnedModel,omitempty"` - WalletAddress string `json:"walletAddress,omitempty"` - Endpoint string `json:"endpoint,omitempty"` - Conditions []Condition `json:"conditions,omitempty"` + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Pending | Provisioning | Ready | Failed + Phase string `json:"phase,omitempty"` + // Actual model the agent is using (= spec.model when set, otherwise + // the auto-picked top-of-rank). + PinnedModel string `json:"pinnedModel,omitempty"` + // Agent's signing address when wallet.create=true. Empty otherwise. + // +kubebuilder:validation:Pattern=`^(0x[0-9a-fA-F]{40})?$` + WalletAddress string `json:"walletAddress,omitempty"` + // Cluster-internal URL for the agent runtime (e.g. + // http://hermes.agent-quant.svc.cluster.local:8642). + Endpoint string `json:"endpoint,omitempty"` + Conditions []Condition `json:"conditions,omitempty"` } func (a *Agent) EffectiveRuntime() string { @@ -363,12 +691,22 @@ func (a *Agent) IsReady() bool { return a.Status.Phase == AgentPhaseReady } -// AgentIdentity is the durable, on-chain identity an operator controls in the -// ERC-8004 Identity Registry. A single AgentIdentity outlives ServiceOffers: -// deleting the last ServiceOffer that references it does not delete the NFT, -// the published registration document, or the recorded agentId; instead the -// renderer publishes a tombstone (active:false, x402Support:false) so external -// observers still see the historical record. 
+// ── AgentIdentity ─────────────────────────────────────────────────────────── + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=aid +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Chains",type=string,JSONPath=`.status.registrations[*].chain` +// +kubebuilder:printcolumn:name="AgentIDs",type=string,JSONPath=`.status.registrations[*].agentId` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// AgentIdentity is the durable, on-chain identity an operator controls in +// the ERC-8004 Identity Registry. A single AgentIdentity outlives +// ServiceOffers: deleting the last ServiceOffer that references it does +// not delete the NFT, the published registration document, or the +// recorded agentId; instead the renderer publishes a tombstone +// (active:false, x402Support:false) so external observers still see the +// historical record. type AgentIdentity struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -376,16 +714,31 @@ type AgentIdentity struct { Status AgentIdentityStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// AgentIdentityList is the list form for kubectl/list operations. +type AgentIdentityList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []AgentIdentity `json:"items"` +} + type AgentIdentitySpec struct { } type AgentIdentityStatus struct { + // Per-chain ERC-8004 registrations for this identity document. Registrations []AgentIdentityRegistration `json:"registrations,omitempty"` } type AgentIdentityRegistration struct { - Chain string `json:"chain,omitempty"` - AgentID string `json:"agentId,omitempty"` + // ERC-8004 registration chain alias. + // +kubebuilder:validation:Required + // +kubebuilder:validation:MaxLength=64 + Chain string `json:"chain"` + // On-chain ERC-721 tokenId on the given chain. 
+ // +kubebuilder:validation:Required + AgentID string `json:"agentId"` } func AgentIdentityAgentIDForChain(status AgentIdentityStatus, chain string) string { diff --git a/internal/monetizeapi/types_test.go b/internal/monetizeapi/types_test.go new file mode 100644 index 00000000..77a15a90 --- /dev/null +++ b/internal/monetizeapi/types_test.go @@ -0,0 +1,108 @@ +package monetizeapi + +import ( + "encoding/json" + "testing" +) + +// TestPurchaseAutoRefill_JSONRoundTrip asserts every field on +// PurchaseAutoRefill marshals to JSON and unmarshals back without loss. The +// MaxTotal + MaxSpendPerDay fields were added to match the CRD spec; this test +// pins the wire format and `omitempty` semantics so silent drift between the +// Go struct and the CRD surfaces as a test failure. +func TestPurchaseAutoRefill_JSONRoundTrip(t *testing.T) { + tests := []struct { + name string + in PurchaseAutoRefill + wantJSON string + }{ + { + name: "all fields populated", + in: PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + }, + wantJSON: `{"enabled":true,"threshold":5,"count":10,"maxTotal":100,"maxSpendPerDay":"1.50"}`, + }, + { + name: "only enabled + new caps", + in: PurchaseAutoRefill{ + Enabled: true, + MaxTotal: 42, + MaxSpendPerDay: "0.05", + }, + wantJSON: `{"enabled":true,"maxTotal":42,"maxSpendPerDay":"0.05"}`, + }, + { + name: "zero values omit every field", + in: PurchaseAutoRefill{}, + wantJSON: `{}`, + }, + { + name: "MaxSpendPerDay alone", + in: PurchaseAutoRefill{ + MaxSpendPerDay: "0.0001", + }, + wantJSON: `{"maxSpendPerDay":"0.0001"}`, + }, + { + name: "MaxTotal alone", + in: PurchaseAutoRefill{ + MaxTotal: 7, + }, + wantJSON: `{"maxTotal":7}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotJSON, err := json.Marshal(tt.in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(gotJSON) != tt.wantJSON { + t.Fatalf("marshal:\n got: %s\nwant: %s", gotJSON, 
tt.wantJSON) + } + + var roundTripped PurchaseAutoRefill + if err := json.Unmarshal(gotJSON, &roundTripped); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if roundTripped != tt.in { + t.Fatalf("round-trip mismatch:\n got: %+v\nwant: %+v", roundTripped, tt.in) + } + }) + } +} + +// TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm asserts that a JSON document +// shaped like the CRD spec deserialises into every Go field — this is the +// inverse of the marshal direction and catches accidental json-tag drift. +func TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm(t *testing.T) { + const crdJSON = `{ + "enabled": true, + "threshold": 5, + "count": 10, + "maxTotal": 100, + "maxSpendPerDay": "1.50" + }` + + want := PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + } + + var got PurchaseAutoRefill + if err := json.Unmarshal([]byte(crdJSON), &got); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if got != want { + t.Fatalf("unmarshal mismatch:\n got: %+v\nwant: %+v", got, want) + } +} diff --git a/internal/monetizeapi/zz_generated.deepcopy.go b/internal/monetizeapi/zz_generated.deepcopy.go new file mode 100644 index 00000000..3c0207f3 --- /dev/null +++ b/internal/monetizeapi/zz_generated.deepcopy.go @@ -0,0 +1,785 @@ +//go:build !ignore_autogenerated + +// Code generated by controller-gen. DO NOT EDIT. + +// Code generated by controller-gen. DO NOT EDIT. + +package monetizeapi + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Agent) DeepCopyInto(out *Agent) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Agent. 
+func (in *Agent) DeepCopy() *Agent { + if in == nil { + return nil + } + out := new(Agent) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Agent) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentity) DeepCopyInto(out *AgentIdentity) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentity. +func (in *AgentIdentity) DeepCopy() *AgentIdentity { + if in == nil { + return nil + } + out := new(AgentIdentity) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgentIdentity) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentityList) DeepCopyInto(out *AgentIdentityList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]AgentIdentity, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentityList. 
+func (in *AgentIdentityList) DeepCopy() *AgentIdentityList { + if in == nil { + return nil + } + out := new(AgentIdentityList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgentIdentityList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentityRegistration) DeepCopyInto(out *AgentIdentityRegistration) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentityRegistration. +func (in *AgentIdentityRegistration) DeepCopy() *AgentIdentityRegistration { + if in == nil { + return nil + } + out := new(AgentIdentityRegistration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentitySpec) DeepCopyInto(out *AgentIdentitySpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentitySpec. +func (in *AgentIdentitySpec) DeepCopy() *AgentIdentitySpec { + if in == nil { + return nil + } + out := new(AgentIdentitySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentityStatus) DeepCopyInto(out *AgentIdentityStatus) { + *out = *in + if in.Registrations != nil { + in, out := &in.Registrations, &out.Registrations + *out = make([]AgentIdentityRegistration, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentityStatus. 
+func (in *AgentIdentityStatus) DeepCopy() *AgentIdentityStatus { + if in == nil { + return nil + } + out := new(AgentIdentityStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentList) DeepCopyInto(out *AgentList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Agent, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentList. +func (in *AgentList) DeepCopy() *AgentList { + if in == nil { + return nil + } + out := new(AgentList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgentList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentSpec) DeepCopyInto(out *AgentSpec) { + *out = *in + if in.Skills != nil { + in, out := &in.Skills, &out.Skills + *out = make([]string, len(*in)) + copy(*out, *in) + } + out.Wallet = in.Wallet +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentSpec. +func (in *AgentSpec) DeepCopy() *AgentSpec { + if in == nil { + return nil + } + out := new(AgentSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AgentStatus) DeepCopyInto(out *AgentStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentStatus. +func (in *AgentStatus) DeepCopy() *AgentStatus { + if in == nil { + return nil + } + out := new(AgentStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentWallet) DeepCopyInto(out *AgentWallet) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentWallet. +func (in *AgentWallet) DeepCopy() *AgentWallet { + if in == nil { + return nil + } + out := new(AgentWallet) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Condition) DeepCopyInto(out *Condition) { + *out = *in + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Condition. +func (in *Condition) DeepCopy() *Condition { + if in == nil { + return nil + } + out := new(Condition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseAutoRefill) DeepCopyInto(out *PurchaseAutoRefill) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseAutoRefill. 
+func (in *PurchaseAutoRefill) DeepCopy() *PurchaseAutoRefill { + if in == nil { + return nil + } + out := new(PurchaseAutoRefill) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchasePayment) DeepCopyInto(out *PurchasePayment) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchasePayment. +func (in *PurchasePayment) DeepCopy() *PurchasePayment { + if in == nil { + return nil + } + out := new(PurchasePayment) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseRequest) DeepCopyInto(out *PurchaseRequest) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequest. +func (in *PurchaseRequest) DeepCopy() *PurchaseRequest { + if in == nil { + return nil + } + out := new(PurchaseRequest) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PurchaseRequest) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PurchaseRequestList) DeepCopyInto(out *PurchaseRequestList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]PurchaseRequest, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequestList. +func (in *PurchaseRequestList) DeepCopy() *PurchaseRequestList { + if in == nil { + return nil + } + out := new(PurchaseRequestList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PurchaseRequestList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseRequestSpec) DeepCopyInto(out *PurchaseRequestSpec) { + *out = *in + if in.PreSignedAuths != nil { + in, out := &in.PreSignedAuths, &out.PreSignedAuths + *out = make([]PreSignedAuth, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.AutoRefill = in.AutoRefill + out.Payment = in.Payment +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequestSpec. +func (in *PurchaseRequestSpec) DeepCopy() *PurchaseRequestSpec { + if in == nil { + return nil + } + out := new(PurchaseRequestSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PurchaseRequestStatus) DeepCopyInto(out *PurchaseRequestStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequestStatus. +func (in *PurchaseRequestStatus) DeepCopy() *PurchaseRequestStatus { + if in == nil { + return nil + } + out := new(PurchaseRequestStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequest) DeepCopyInto(out *RegistrationRequest) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + out.Status = in.Status +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequest. +func (in *RegistrationRequest) DeepCopy() *RegistrationRequest { + if in == nil { + return nil + } + out := new(RegistrationRequest) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RegistrationRequest) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequestList) DeepCopyInto(out *RegistrationRequestList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]RegistrationRequest, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequestList. 
+func (in *RegistrationRequestList) DeepCopy() *RegistrationRequestList { + if in == nil { + return nil + } + out := new(RegistrationRequestList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RegistrationRequestList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequestSpec) DeepCopyInto(out *RegistrationRequestSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequestSpec. +func (in *RegistrationRequestSpec) DeepCopy() *RegistrationRequestSpec { + if in == nil { + return nil + } + out := new(RegistrationRequestSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequestStatus) DeepCopyInto(out *RegistrationRequestStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequestStatus. +func (in *RegistrationRequestStatus) DeepCopy() *RegistrationRequestStatus { + if in == nil { + return nil + } + out := new(RegistrationRequestStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOffer) DeepCopyInto(out *ServiceOffer) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOffer. 
+func (in *ServiceOffer) DeepCopy() *ServiceOffer { + if in == nil { + return nil + } + out := new(ServiceOffer) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ServiceOffer) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAgent) DeepCopyInto(out *ServiceOfferAgent) { + *out = *in + out.Ref = in.Ref +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAgent. +func (in *ServiceOfferAgent) DeepCopy() *ServiceOfferAgent { + if in == nil { + return nil + } + out := new(ServiceOfferAgent) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAgentRef) DeepCopyInto(out *ServiceOfferAgentRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAgentRef. +func (in *ServiceOfferAgentRef) DeepCopy() *ServiceOfferAgentRef { + if in == nil { + return nil + } + out := new(ServiceOfferAgentRef) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAgentResolution) DeepCopyInto(out *ServiceOfferAgentResolution) { + *out = *in + if in.Skills != nil { + in, out := &in.Skills, &out.Skills + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAgentResolution. 
+func (in *ServiceOfferAgentResolution) DeepCopy() *ServiceOfferAgentResolution { + if in == nil { + return nil + } + out := new(ServiceOfferAgentResolution) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAsset) DeepCopyInto(out *ServiceOfferAsset) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAsset. +func (in *ServiceOfferAsset) DeepCopy() *ServiceOfferAsset { + if in == nil { + return nil + } + out := new(ServiceOfferAsset) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferList) DeepCopyInto(out *ServiceOfferList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ServiceOffer, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferList. +func (in *ServiceOfferList) DeepCopy() *ServiceOfferList { + if in == nil { + return nil + } + out := new(ServiceOfferList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ServiceOfferList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferModel) DeepCopyInto(out *ServiceOfferModel) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferModel. 
+func (in *ServiceOfferModel) DeepCopy() *ServiceOfferModel { + if in == nil { + return nil + } + out := new(ServiceOfferModel) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferPayment) DeepCopyInto(out *ServiceOfferPayment) { + *out = *in + out.Asset = in.Asset + out.Price = in.Price +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferPayment. +func (in *ServiceOfferPayment) DeepCopy() *ServiceOfferPayment { + if in == nil { + return nil + } + out := new(ServiceOfferPayment) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferPriceTable) DeepCopyInto(out *ServiceOfferPriceTable) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferPriceTable. +func (in *ServiceOfferPriceTable) DeepCopy() *ServiceOfferPriceTable { + if in == nil { + return nil + } + out := new(ServiceOfferPriceTable) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ServiceOfferRegistration) DeepCopyInto(out *ServiceOfferRegistration) { + *out = *in + if in.Services != nil { + in, out := &in.Services, &out.Services + *out = make([]ServiceOfferService, len(*in)) + copy(*out, *in) + } + if in.SupportedTrust != nil { + in, out := &in.SupportedTrust, &out.SupportedTrust + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Skills != nil { + in, out := &in.Skills, &out.Skills + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Domains != nil { + in, out := &in.Domains, &out.Domains + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Metadata != nil { + in, out := &in.Metadata, &out.Metadata + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferRegistration. +func (in *ServiceOfferRegistration) DeepCopy() *ServiceOfferRegistration { + if in == nil { + return nil + } + out := new(ServiceOfferRegistration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferService) DeepCopyInto(out *ServiceOfferService) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferService. +func (in *ServiceOfferService) DeepCopy() *ServiceOfferService { + if in == nil { + return nil + } + out := new(ServiceOfferService) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ServiceOfferSpec) DeepCopyInto(out *ServiceOfferSpec) { + *out = *in + out.Agent = in.Agent + out.Model = in.Model + out.Upstream = in.Upstream + out.Payment = in.Payment + if in.Provenance != nil { + in, out := &in.Provenance, &out.Provenance + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + in.Registration.DeepCopyInto(&out.Registration) + if in.DrainAt != nil { + in, out := &in.DrainAt, &out.DrainAt + *out = (*in).DeepCopy() + } + if in.DrainGracePeriod != nil { + in, out := &in.DrainGracePeriod, &out.DrainGracePeriod + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferSpec. +func (in *ServiceOfferSpec) DeepCopy() *ServiceOfferSpec { + if in == nil { + return nil + } + out := new(ServiceOfferSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferStatus) DeepCopyInto(out *ServiceOfferStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.AgentResolution != nil { + in, out := &in.AgentResolution, &out.AgentResolution + *out = new(ServiceOfferAgentResolution) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferStatus. +func (in *ServiceOfferStatus) DeepCopy() *ServiceOfferStatus { + if in == nil { + return nil + } + out := new(ServiceOfferStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ServiceOfferUpstream) DeepCopyInto(out *ServiceOfferUpstream) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferUpstream. +func (in *ServiceOfferUpstream) DeepCopy() *ServiceOfferUpstream { + if in == nil { + return nil + } + out := new(ServiceOfferUpstream) + in.DeepCopyInto(out) + return out +} diff --git a/internal/schemas/service-catalog.schema.json b/internal/schemas/service-catalog.schema.json index 6f7578ba..58dbc7c4 100644 --- a/internal/schemas/service-catalog.schema.json +++ b/internal/schemas/service-catalog.schema.json @@ -78,7 +78,8 @@ "payTo", "network", "description", - "isDemo" + "isDemo", + "available" ], "properties": { "name": { @@ -149,6 +150,18 @@ }, "isDemo": { "type": "boolean" + }, + "registrationPending": { + "type": "boolean" + }, + "available": { + "type": "boolean", + "description": "False during a drain window. Catalog consumers should treat unset as true for backwards compatibility." + }, + "drainEndsAt": { + "type": "string", + "format": "date-time", + "description": "RFC3339 timestamp at which the offer's HTTPRoute will be torn down. Set only when available=false." } } } diff --git a/internal/schemas/service_catalog.go b/internal/schemas/service_catalog.go index e53085b6..eb8bba78 100644 --- a/internal/schemas/service_catalog.go +++ b/internal/schemas/service_catalog.go @@ -39,6 +39,21 @@ type ServiceCatalogEntry struct { // know the offer is usable for x402 payments today, even though // ERC-8004 discovery via the chain still resolves to the prior state. RegistrationPending bool `json:"registrationPending,omitempty"` + + // Available is false when the offer is in its drain window. Buyers + // can still complete in-flight payments until DrainEndsAt, but + // discovery surfaces should advertise the wind-down so external + // observers can react. When false, DrainEndsAt is set to the RFC3339 + // timestamp at which the HTTPRoute will be torn down. 
Catalog + // consumers should treat unset Available (the default-true field) as + // "available" for backwards compatibility — the field is only written + // false during drain. + Available bool `json:"available"` + + // DrainEndsAt is the RFC3339 timestamp at which the offer's + // HTTPRoute will be removed. Set only when Available=false. Buyers + // SHOULD migrate to alternative providers before this time. + DrainEndsAt string `json:"drainEndsAt,omitempty"` } // ServiceCatalogAsset describes the settlement token resolved for a catalog diff --git a/internal/serviceoffercontroller/controller.go b/internal/serviceoffercontroller/controller.go index fac586b0..8226f06a 100644 --- a/internal/serviceoffercontroller/controller.go +++ b/internal/serviceoffercontroller/controller.go @@ -296,7 +296,7 @@ func (c *Controller) enqueueOfferFromRegistration(obj any) { log.Printf("serviceoffer-controller: decode offer for registration fan-out: %v", err) continue } - if offer.DeletionTimestamp != nil || offer.IsPaused() || !offer.Spec.Registration.Enabled { + if offer.DeletionTimestamp != nil || !offer.Spec.Registration.Enabled { continue } c.offerQueue.Add(offer.Namespace + "/" + offer.Name) @@ -439,8 +439,17 @@ func (c *Controller) reconcileOffer(ctx context.Context, key string) error { if !ready { setCondition(&status, "ModelReady", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") setCondition(&status, "UpstreamHealthy", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") - setCondition(&status, "PaymentGateReady", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") - setCondition(&status, "RoutePublished", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") + if offer.DrainExpired(time.Now()) { + if err := c.deleteRouteChildren(ctx, offer); err != nil { + return err + } + setCondition(&status, "Draining", "False", "Drained", fmt.Sprintf("Drain ended at %s; route torn down", offer.DrainEndsAt().UTC().Format(time.RFC3339))) 
+ setCondition(&status, "PaymentGateReady", "False", "Drained", "Offer drained; payment gate removed") + setCondition(&status, "RoutePublished", "False", "Drained", "Offer drained; route removed") + } else { + setCondition(&status, "PaymentGateReady", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") + setCondition(&status, "RoutePublished", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") + } setCondition(&status, "Ready", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") return c.updateOfferStatus(ctx, raw, status) } @@ -455,13 +464,49 @@ func (c *Controller) reconcileOffer(ctx context.Context, key string) error { return err } - if offer.IsPaused() { - if err := c.deleteRouteChildren(ctx, offer); err != nil { - return err + if offer.IsDraining() { + now := time.Now() + drainEndsAt := offer.DrainEndsAt() + if offer.DrainExpired(now) { + // Drain grace period elapsed: tear down the HTTPRoute + + // payment gate. The CR itself stays (delete is the canonical + // removal path) so external observers continue to see the + // offer in the catalog with available=false. + if err := c.deleteRouteChildren(ctx, offer); err != nil { + return err + } + setCondition(&status, "Draining", "False", "Drained", fmt.Sprintf("Drain ended at %s; route torn down", drainEndsAt.UTC().Format(time.RFC3339))) + setCondition(&status, "PaymentGateReady", "False", "Drained", "Offer drained; payment gate removed") + setCondition(&status, "RoutePublished", "False", "Drained", "Offer drained; route removed") + } else { + // Still in the drain window: keep the route + payment gate + // up so in-flight buyers can finish, but mark Draining=True + // so discovery surfaces can advertise available=false. 
+ if upstreamHealthy && isConditionTrue(status, "ModelReady") { + if err := c.reconcilePaymentGate(ctx, &status, offer); err != nil { + return err + } + if isConditionTrue(status, "PaymentGateReady") { + if err := c.reconcileRoute(ctx, &status, offer); err != nil { + return err + } + } + } else { + setCondition(&status, "PaymentGateReady", "False", "WaitingForUpstream", "Waiting for upstream health before publishing payment gate") + setCondition(&status, "RoutePublished", "False", "WaitingForPaymentGate", "Waiting for payment gate before publishing route") + } + setCondition(&status, "Draining", "True", "Draining", fmt.Sprintf("Drain ends at %s", drainEndsAt.UTC().Format(time.RFC3339))) + // Requeue at the drain expiry so the route is torn down on + // time even without any spec change in the interim. Add a + // small slack so the comparison in DrainExpired clears. + if delay := time.Until(drainEndsAt) + time.Second; delay > 0 { + c.offerQueue.AddAfter(offer.Namespace+"/"+offer.Name, delay) + } else { + c.offerQueue.Add(offer.Namespace + "/" + offer.Name) + } } - setCondition(&status, "PaymentGateReady", "False", "Paused", "Offer is paused") - setCondition(&status, "RoutePublished", "False", "Paused", "Offer is paused") } else if upstreamHealthy && isConditionTrue(status, "ModelReady") { + setCondition(&status, "Draining", "False", "Active", "Offer is active") if err := c.reconcilePaymentGate(ctx, &status, offer); err != nil { return err } @@ -471,6 +516,7 @@ func (c *Controller) reconcileOffer(ctx context.Context, key string) error { } } } else { + setCondition(&status, "Draining", "False", "Active", "Offer is active") setCondition(&status, "PaymentGateReady", "False", "WaitingForUpstream", "Waiting for upstream health before publishing payment gate") setCondition(&status, "RoutePublished", "False", "WaitingForPaymentGate", "Waiting for payment gate before publishing route") } @@ -1114,7 +1160,7 @@ func (c *Controller) reconcileSkillCatalog(ctx context.Context, 
override *moneti } readyOffers := 0 for _, offer := range offers { - if offer != nil && offer.DeletionTimestamp == nil && !offer.IsPaused() && isConditionTrue(offer.Status, "Ready") { + if offer != nil && offer.DeletionTimestamp == nil && isConditionTrue(offer.Status, "Ready") { readyOffers++ } } diff --git a/internal/serviceoffercontroller/identity_controller.go b/internal/serviceoffercontroller/identity_controller.go index 030bf013..92fcd03e 100644 --- a/internal/serviceoffercontroller/identity_controller.go +++ b/internal/serviceoffercontroller/identity_controller.go @@ -386,7 +386,10 @@ func (c *Controller) registrationOffersForIdentity(key agentIdentityKey, exclude if offer.Namespace == excludeNamespace && offer.Name == excludeName { continue } - if offer.DeletionTimestamp != nil || offer.IsPaused() || !offer.Spec.Registration.Enabled { + // Draining offers stay in the registration candidate list so + // the registration document continues to advertise them with + // available=false until the drain grace period expires. 
+ if offer.DeletionTimestamp != nil || !offer.Spec.Registration.Enabled { continue } if !isConditionTrue(offer.Status, "UpstreamHealthy") { diff --git a/internal/serviceoffercontroller/identity_render.go b/internal/serviceoffercontroller/identity_render.go index 347d0faf..89623d36 100644 --- a/internal/serviceoffercontroller/identity_render.go +++ b/internal/serviceoffercontroller/identity_render.go @@ -150,10 +150,10 @@ func buildIdentityRegistrationServices(offers []*monetizeapi.ServiceOffer, baseU baseURL = strings.TrimRight(baseURL, "/") services := make([]erc8004.ServiceDef, 0, len(offers)*2) for _, offer := range offers { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: "web", Endpoint: baseURL + offer.EffectivePath(), - }) + })) if len(offer.Spec.Registration.Skills) > 0 || len(offer.Spec.Registration.Domains) > 0 { services = append(services, erc8004.ServiceDef{ Name: "OASF", @@ -163,11 +163,11 @@ func buildIdentityRegistrationServices(offers []*monetizeapi.ServiceOffer, baseU }) } for _, svc := range offer.Spec.Registration.Services { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: svc.Name, Endpoint: svc.Endpoint, Version: svc.Version, - }) + })) } } return services diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index a733213b..0f65fa89 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -26,6 +26,39 @@ const ( servicesJSONRouteName = "obol-services-json-route" ) +// restrictedPodSecurityContext returns a Pod-level securityContext that +// satisfies the Restricted Pod Security Standard (PSS). 
PR #521 enforces +// Restricted PSS on the x402 namespace, so the controller-rendered httpd +// workloads (obol-skill-md and agentidentity-*-registration) must ship a +// compliant securityContext or they fail admission and never start. +// +// UID/GID 1000 is the canonical non-root user available in the busybox +// image used by both Deployments. fsGroup keeps the projected ConfigMap +// volumes readable by the httpd process. +func restrictedPodSecurityContext() map[string]any { + return map[string]any{ + "runAsNonRoot": true, + "runAsUser": int64(1000), + "runAsGroup": int64(1000), + "fsGroup": int64(1000), + "seccompProfile": map[string]any{ + "type": "RuntimeDefault", + }, + } +} + +// restrictedContainerSecurityContext returns a container-level +// securityContext compliant with the Restricted PSS profile: privilege +// escalation disabled and all Linux capabilities dropped. +func restrictedContainerSecurityContext() map[string]any { + return map[string]any{ + "allowPrivilegeEscalation": false, + "capabilities": map[string]any{ + "drop": []any{"ALL"}, + }, + } +} + func buildRegistrationRequest(offer *monetizeapi.ServiceOffer, desiredState string) *unstructured.Unstructured { return &unstructured.Unstructured{ Object: map[string]any{ @@ -92,11 +125,13 @@ func buildAgentIdentityRegistrationDeployment(identity *monetizeapi.AgentIdentit }, }, "spec": map[string]any{ + "securityContext": restrictedPodSecurityContext(), "containers": []any{ map[string]any{ - "name": "httpd", - "image": "busybox:1.36", - "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "name": "httpd", + "image": "busybox:1.36", + "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "securityContext": restrictedContainerSecurityContext(), "ports": []any{ map[string]any{"containerPort": int64(8080), "protocol": "TCP"}, }, @@ -259,11 +294,13 @@ func buildSkillCatalogDeployment(contentHash string) *unstructured.Unstructured }, }, "spec": map[string]any{ + "securityContext": 
restrictedPodSecurityContext(), "containers": []any{ map[string]any{ - "name": "httpd", - "image": "busybox:1.36", - "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "name": "httpd", + "image": "busybox:1.36", + "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "securityContext": restrictedContainerSecurityContext(), "ports": []any{ map[string]any{"containerPort": int64(8080), "protocol": "TCP"}, }, @@ -689,10 +726,10 @@ func buildRegistrationServices(owner *monetizeapi.ServiceOffer, offers []*moneti services := make([]erc8004.ServiceDef, 0, len(ordered)*2) for _, offer := range ordered { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: "web", Endpoint: baseURL + offer.EffectivePath(), - }) + })) if len(offer.Spec.Registration.Skills) > 0 || len(offer.Spec.Registration.Domains) > 0 { services = append(services, erc8004.ServiceDef{ Name: "OASF", @@ -702,18 +739,37 @@ func buildRegistrationServices(owner *monetizeapi.ServiceOffer, offers []*moneti }) } for _, service := range offer.Spec.Registration.Services { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: service.Name, Endpoint: service.Endpoint, Version: service.Version, - }) + })) } } return services } +func serviceDefWithDrain(offer *monetizeapi.ServiceOffer, svc erc8004.ServiceDef) erc8004.ServiceDef { + if offer == nil || !offer.IsDraining() || offer.DrainExpired(time.Now()) { + return svc + } + available := false + svc.Available = &available + svc.DrainEndsAt = offer.DrainEndsAt().UTC().Format(time.RFC3339) + return svc +} + +// offerPublishedForRegistration reports whether an offer should appear +// in the operator's ERC-8004 registration document as a live, gated +// service. 
Draining offers stay in the document with available=false +// so external observers can see the wind-down — this function filters +// them out only after the drain window has fully expired (i.e. the +// HTTPRoute is gone and there is no payment surface to advertise). func offerPublishedForRegistration(offer *monetizeapi.ServiceOffer) bool { - if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() || !offer.Spec.Registration.Enabled { + if offer == nil || offer.DeletionTimestamp != nil || !offer.Spec.Registration.Enabled { + return false + } + if offer.DrainExpired(time.Now()) { return false } return isConditionTrue(offer.Status, "ModelReady") && @@ -731,9 +787,18 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin // both /skill.md and /api/services.json, with the on-chain ERC-8004 // registration treated as informational metadata rather than a gating // signal. See offerOperationallyReady's doc comment for the rationale. + now := time.Now() var ready []*monetizeapi.ServiceOffer for _, offer := range offers { - if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { + if offer == nil || offer.DeletionTimestamp != nil { + continue + } + // Drained offers (post-grace-period) have no live route — drop + // them from the catalog entirely. Draining offers (pre-expiry) + // stay in the catalog with available=false + drainEndsAt set so + // buyers can see the wind-down via discovery before the route + // disappears. 
+ if offer.DrainExpired(now) { continue } if offerOperationallyReady(offer) { @@ -762,20 +827,25 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin } lines = append(lines, "## Services", "") - lines = append(lines, "| Service | Type | Model | Price | Endpoint |") - lines = append(lines, "|---------|------|-------|-------|----------|") + lines = append(lines, "| Service | Type | Model | Price | Available | Endpoint |") + lines = append(lines, "|---------|------|-------|-------|-----------|----------|") for _, offer := range ready { modelName := offer.Spec.Model.Name if modelName == "" { modelName = "—" } + availability := "yes" + if offer.IsDraining() { + availability = fmt.Sprintf("draining (ends %s)", offer.DrainEndsAt().UTC().Format(time.RFC3339)) + } lines = append(lines, fmt.Sprintf( - "| [%s](#%s) | %s | %s | %s | `%s%s` |", + "| [%s](#%s) | %s | %s | %s | %s | `%s%s` |", offer.Name, offer.Name, fallbackOfferType(offer), modelName, describeOfferPrice(offer), + availability, baseURL, offer.EffectivePath(), )) @@ -792,6 +862,12 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin lines = append(lines, fmt.Sprintf("- **Price**: %s", describeOfferPrice(offer))) lines = append(lines, fmt.Sprintf("- **Pay To**: `%s`", firstNonEmpty(offer.Spec.Payment.PayTo, "—"))) lines = append(lines, fmt.Sprintf("- **Network**: %s", firstNonEmpty(offer.Spec.Payment.Network, "—"))) + if offer.IsDraining() { + lines = append(lines, "- **Available**: false (draining)") + lines = append(lines, fmt.Sprintf("- **Drain ends at**: %s", offer.DrainEndsAt().UTC().Format(time.RFC3339))) + } else { + lines = append(lines, "- **Available**: true") + } description := offer.Spec.Registration.Description if description == "" { description = fmt.Sprintf("x402 payment-gated %s service", fallbackOfferType(offer)) @@ -868,9 +944,17 @@ func offerAwaitingRegistration(offer *monetizeapi.ServiceOffer) bool { func 
buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) string { baseURL = strings.TrimRight(baseURL, "/") + now := time.Now() var ready []*monetizeapi.ServiceOffer for _, offer := range offers { - if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { + if offer == nil || offer.DeletionTimestamp != nil { + continue + } + // Drained offers (post-grace-period) have no live route — drop + // them from the catalog entirely. Draining offers (pre-expiry) + // stay in the catalog with available=false + drainEndsAt set so + // buyers can react before the route disappears. + if offer.DrainExpired(now) { continue } if offerOperationallyReady(offer) { @@ -895,6 +979,12 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) modelName = offer.Status.AgentResolution.Model } + available := !offer.IsDraining() + drainEndsAt := "" + if offer.IsDraining() { + drainEndsAt = offer.DrainEndsAt().UTC().Format(time.RFC3339) + } + svc := schemas.ServiceCatalogEntry{ Name: offer.Name, Namespace: offer.Namespace, @@ -907,6 +997,8 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) Description: desc, IsDemo: offer.Namespace == "demo", RegistrationPending: offerAwaitingRegistration(offer), + Available: available, + DrainEndsAt: drainEndsAt, } raw, unit := offerPriceRawAndUnit(offer) diff --git a/internal/serviceoffercontroller/render_builders_test.go b/internal/serviceoffercontroller/render_builders_test.go index 22efa4b5..573d72af 100644 --- a/internal/serviceoffercontroller/render_builders_test.go +++ b/internal/serviceoffercontroller/render_builders_test.go @@ -5,8 +5,101 @@ import ( "testing" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// assertRestrictedPSS checks that a controller-rendered Deployment satisfies +// the Restricted Pod Security Standard. 
PR #521 enforces Restricted PSS on +// the x402 namespace, so any httpd workload missing these fields gets +// rejected at admission and never starts (Bug #3 from the 14-PR integration +// test campaign). +func assertRestrictedPSS(t *testing.T, deploymentName string, spec map[string]any) { + t.Helper() + template, _ := spec["template"].(map[string]any) + podSpec, _ := template["spec"].(map[string]any) + + psc, ok := podSpec["securityContext"].(map[string]any) + if !ok { + t.Fatalf("%s: pod spec missing securityContext", deploymentName) + } + if v, _ := psc["runAsNonRoot"].(bool); !v { + t.Errorf("%s: pod securityContext.runAsNonRoot = %v, want true", deploymentName, psc["runAsNonRoot"]) + } + if v, _ := psc["runAsUser"].(int64); v == 0 { + t.Errorf("%s: pod securityContext.runAsUser must be set to a non-zero UID", deploymentName) + } + if v, _ := psc["runAsGroup"].(int64); v == 0 { + t.Errorf("%s: pod securityContext.runAsGroup must be set to a non-zero GID", deploymentName) + } + sp, ok := psc["seccompProfile"].(map[string]any) + if !ok { + t.Errorf("%s: pod securityContext missing seccompProfile", deploymentName) + } else if t2, _ := sp["type"].(string); t2 != "RuntimeDefault" && t2 != "Localhost" { + t.Errorf("%s: pod seccompProfile.type = %q, want RuntimeDefault or Localhost", deploymentName, t2) + } + + containers, _ := podSpec["containers"].([]any) + if len(containers) == 0 { + t.Fatalf("%s: no containers in pod spec", deploymentName) + } + for _, c := range containers { + cm, _ := c.(map[string]any) + name, _ := cm["name"].(string) + csc, ok := cm["securityContext"].(map[string]any) + if !ok { + t.Errorf("%s/%s: container missing securityContext", deploymentName, name) + continue + } + if v, _ := csc["allowPrivilegeEscalation"].(bool); v { + t.Errorf("%s/%s: container allowPrivilegeEscalation = true, want false", deploymentName, name) + } + if _, present := csc["allowPrivilegeEscalation"]; !present { + t.Errorf("%s/%s: container missing 
allowPrivilegeEscalation (must be false)", deploymentName, name) + } + caps, ok := csc["capabilities"].(map[string]any) + if !ok { + t.Errorf("%s/%s: container securityContext missing capabilities", deploymentName, name) + continue + } + drop, _ := caps["drop"].([]any) + var droppedAll bool + for _, d := range drop { + if s, _ := d.(string); s == "ALL" { + droppedAll = true + } + } + if !droppedAll { + t.Errorf("%s/%s: container capabilities.drop must include \"ALL\", got %v", deploymentName, name, drop) + } + } +} + +// TestBuildSkillCatalogDeployment_RestrictedPSS verifies the skill-md +// httpd Deployment ships a Restricted-PSS-compliant securityContext. +// Regression test for the cross-PR interaction with #521 surfaced by +// the 14-PR integration test (Bug #3). +func TestBuildSkillCatalogDeployment_RestrictedPSS(t *testing.T) { + d := buildSkillCatalogDeployment("hash-x") + spec, _ := d.Object["spec"].(map[string]any) + assertRestrictedPSS(t, skillCatalogConfigMapName, spec) +} + +// TestBuildAgentIdentityRegistrationDeployment_RestrictedPSS verifies the +// agentidentity well-known/agent-registration.json publisher httpd +// Deployment ships a Restricted-PSS-compliant securityContext. +func TestBuildAgentIdentityRegistrationDeployment_RestrictedPSS(t *testing.T) { + identity := &monetizeapi.AgentIdentity{ + ObjectMeta: metav1.ObjectMeta{ + Name: monetizeapi.AgentIdentityDefaultName, + Namespace: "x402", + UID: "test-uid", + }, + } + d := buildAgentIdentityRegistrationDeployment(identity, "hash-y") + spec, _ := d.Object["spec"].(map[string]any) + assertRestrictedPSS(t, agentIdentityRegistrationName(identity), spec) +} + // TestBuildSkillCatalogConfigMap: exposes skill.md + services.json + httpd conf. 
func TestBuildSkillCatalogConfigMap(t *testing.T) { cm := buildSkillCatalogConfigMap("# Catalog", `[{"name":"a"}]`) diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index eb71891c..286e1763 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -409,6 +409,80 @@ func TestBuildRegistrationServices_IncludesOwnerWhenOwnerNotYetPublished(t *test } } +func TestBuildRegistrationServices_IncludesDrainMetadata(t *testing.T) { + drainAt := metav1.NewTime(time.Now()) + grace := metav1.Duration{Duration: time.Hour} + offer := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "draining", Namespace: "demo"}, + Spec: monetizeapi.ServiceOfferSpec{ + Path: "/services/draining", + DrainAt: &drainAt, + DrainGracePeriod: &grace, + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, + Services: []monetizeapi.ServiceOfferService{ + {Name: "A2A", Endpoint: "https://example.com/a2a", Version: "0.2.1"}, + }, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + }, + }, + } + + services := buildRegistrationServices(offer, []*monetizeapi.ServiceOffer{offer}, "https://example.com") + if len(services) != 2 { + t.Fatalf("services = %+v, want web + A2A", services) + } + for _, svc := range services { + if svc.Available == nil { + t.Fatalf("%s missing available=false drain marker: %+v", svc.Name, svc) + } + if *svc.Available { + t.Fatalf("%s available = true, want false during drain: %+v", svc.Name, svc) + } + if _, err := time.Parse(time.RFC3339, svc.DrainEndsAt); err != nil { + t.Fatalf("%s drainEndsAt = %q is not RFC3339: %v", svc.Name, svc.DrainEndsAt, err) + } + } +} + +func 
TestBuildIdentityRegistrationServices_IncludesDrainMetadata(t *testing.T) { + drainAt := metav1.NewTime(time.Now()) + grace := metav1.Duration{Duration: 30 * time.Minute} + offer := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "identity-drain", Namespace: "demo"}, + Spec: monetizeapi.ServiceOfferSpec{ + Path: "/services/identity-drain", + DrainAt: &drainAt, + DrainGracePeriod: &grace, + Registration: monetizeapi.ServiceOfferRegistration{ + Services: []monetizeapi.ServiceOfferService{ + {Name: "MCP", Endpoint: "https://example.com/mcp", Version: "2025-06-18"}, + }, + }, + }, + } + + services := buildIdentityRegistrationServices([]*monetizeapi.ServiceOffer{offer}, "https://example.com") + if len(services) != 2 { + t.Fatalf("services = %+v, want web + MCP", services) + } + for _, svc := range services { + if svc.Available == nil || *svc.Available { + t.Fatalf("%s missing available=false drain marker: %+v", svc.Name, svc) + } + if _, err := time.Parse(time.RFC3339, svc.DrainEndsAt); err != nil { + t.Fatalf("%s drainEndsAt = %q is not RFC3339: %v", svc.Name, svc.DrainEndsAt, err) + } + } +} + func TestBuildRegistrationConfigMap_PublishesAggregatedAgentRegistration(t *testing.T) { readyConditions := []monetizeapi.Condition{ {Type: "ModelReady", Status: "True"}, @@ -719,18 +793,27 @@ func TestBuildServiceCatalogJSON_AgentOfferUsesResolvedModel(t *testing.T) { } // TestBuildServiceCatalogJSON_ExcludesNonReady locks in the filter pipeline: -// nil offers, paused offers, and offers with a DeletionTimestamp must never -// leak onto the public storefront, even if they carry Ready=True. +// nil offers, drain-expired offers, and offers with a DeletionTimestamp +// must never leak onto the public storefront, even if they carry +// Ready=True. Mid-drain offers DO stay in the catalog with available=false +// and drainEndsAt set — that's the whole point of the drain replacement. 
func TestBuildServiceCatalogJSON_ExcludesNonReady(t *testing.T) { readyCond := []monetizeapi.Condition{{Type: "Ready", Status: "True"}} deleting := metav1.Now() + drainedAt := metav1.NewTime(time.Now().Add(-2 * time.Hour)) + zeroGrace := metav1.Duration{Duration: 0} + offers := []*monetizeapi.ServiceOffer{ nil, { - ObjectMeta: metav1.ObjectMeta{ - Name: "paused-svc", Namespace: "llm", - Annotations: map[string]string{monetizeapi.PausedAnnotation: "true"}, + ObjectMeta: metav1.ObjectMeta{Name: "drained-svc", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + DrainAt: &drainedAt, + DrainGracePeriod: &zeroGrace, + Payment: monetizeapi.ServiceOfferPayment{ + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.001"}, + }, }, Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, }, @@ -773,6 +856,89 @@ func TestBuildServiceCatalogJSON_ExcludesNonReady(t *testing.T) { if services[0].Name != "ready-svc" { t.Errorf("got %q, want ready-svc — filter pipeline leaked another offer", services[0].Name) } + if !services[0].Available { + t.Errorf("ready-svc.available = false, want true (offer is not draining)") + } +} + +// TestBuildServiceCatalogJSON_DrainLifecycle covers the three drain +// states explicitly: pre-drain (available=true, no drainEndsAt), mid-drain +// (in catalog, available=false, drainEndsAt populated), and drain-expired +// (filtered out of the catalog because the controller has torn down the +// underlying route). 
+func TestBuildServiceCatalogJSON_DrainLifecycle(t *testing.T) { + readyCond := []monetizeapi.Condition{{Type: "Ready", Status: "True"}} + mkOffer := func(name string) monetizeapi.ServiceOffer { + return monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "http", + Payment: monetizeapi.ServiceOfferPayment{ + Network: "base", + PayTo: "0x1111111111111111111111111111111111111111", + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.001"}, + }, + }, + Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, + } + } + + // Pre-drain. + pre := mkOffer("pre") + + // Mid-drain: drainAt = now, grace = 1h → ends ~1h from now. + midDrainAt := metav1.NewTime(time.Now()) + midGrace := metav1.Duration{Duration: time.Hour} + mid := mkOffer("mid") + mid.Spec.DrainAt = &midDrainAt + mid.Spec.DrainGracePeriod = &midGrace + + // Drain-expired. + expDrainAt := metav1.NewTime(time.Now().Add(-2 * time.Hour)) + expGrace := metav1.Duration{Duration: time.Hour} + exp := mkOffer("expired") + exp.Spec.DrainAt = &expDrainAt + exp.Spec.DrainGracePeriod = &expGrace + + jsonStr := buildServiceCatalogJSON([]*monetizeapi.ServiceOffer{&pre, &mid, &exp}, "https://example.com") + var services []schemas.ServiceCatalogEntry + if err := json.Unmarshal([]byte(jsonStr), &services); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, jsonStr) + } + if len(services) != 2 { + t.Fatalf("expected 2 services (pre + mid; expired filtered out), got %d: %+v", len(services), services) + } + + byName := map[string]schemas.ServiceCatalogEntry{} + for _, s := range services { + byName[s.Name] = s + } + if pre, ok := byName["pre"]; !ok { + t.Fatal("pre-drain offer missing from catalog") + } else { + if !pre.Available { + t.Errorf("pre.available = false, want true") + } + if pre.DrainEndsAt != "" { + t.Errorf("pre.drainEndsAt = %q, want empty", pre.DrainEndsAt) + } + } + if mid, ok := byName["mid"]; !ok { + 
t.Fatal("mid-drain offer missing from catalog") + } else { + if mid.Available { + t.Errorf("mid.available = true, want false (offer is draining)") + } + if mid.DrainEndsAt == "" { + t.Errorf("mid.drainEndsAt is empty, want RFC3339 timestamp") + } + if _, err := time.Parse(time.RFC3339, mid.DrainEndsAt); err != nil { + t.Errorf("mid.drainEndsAt = %q is not RFC3339: %v", mid.DrainEndsAt, err) + } + } + if _, ok := byName["expired"]; ok { + t.Error("drain-expired offer leaked into catalog; should be filtered") + } } // TestBuildServiceCatalogJSON_SortOrder ensures offers render in diff --git a/internal/stack/stack_test.go b/internal/stack/stack_test.go index 0d7cca31..a07932fa 100644 --- a/internal/stack/stack_test.go +++ b/internal/stack/stack_test.go @@ -434,33 +434,38 @@ func TestDockerBridgeGatewayIP(t *testing.T) { t.Logf("docker0 gateway IP: %s", ip) } +// TestHelmfile_IncludesBuyerPodMonitor asserts the litellm-x402-buyer +// PodMonitor is shipped with the stack. The PodMonitor previously lived +// as an inline `bedag/raw` release in helmfile.yaml; it now lives next +// to its workload in base/templates/llm.yaml. The chart layout (the +// `base` Helm release) renders it during `obol stack up`. 
func TestHelmfile_IncludesBuyerPodMonitor(t *testing.T) { projectRoot := findProjectRoot() if projectRoot == "" { t.Fatal("project root not found") } - data, err := os.ReadFile(filepath.Join(projectRoot, "internal/embed/infrastructure/helmfile.yaml")) + data, err := os.ReadFile(filepath.Join(projectRoot, "internal/embed/infrastructure/base/templates/llm.yaml")) if err != nil { - t.Fatalf("read helmfile: %v", err) + t.Fatalf("read llm template: %v", err) } out := string(data) if !strings.Contains(out, "kind: PodMonitor") { - t.Fatalf("helmfile missing PodMonitor:\n%s", out) + t.Fatalf("llm template missing PodMonitor:\n%s", out) } if !strings.Contains(out, "name: litellm-x402-buyer") { - t.Fatalf("helmfile missing buyer PodMonitor name:\n%s", out) + t.Fatalf("llm template missing buyer PodMonitor name:\n%s", out) } if !strings.Contains(out, "release: monitoring") { - t.Fatalf("helmfile missing monitoring label:\n%s", out) + t.Fatalf("llm template missing monitoring label:\n%s", out) } if !strings.Contains(out, "port: buyer-http") || !strings.Contains(out, "path: /metrics") { - t.Fatalf("helmfile missing buyer metrics endpoint:\n%s", out) + t.Fatalf("llm template missing buyer metrics endpoint:\n%s", out) } } @@ -486,7 +491,7 @@ func TestLLMTemplate_IncludesPaidRouteAndBuyerSidecar(t *testing.T) { `name: buyer-http`, `name: x402-buyer-config`, `name: x402-buyer-auths`, - `emptyDir: {}`, + `emptyDir:`, } { if !strings.Contains(out, want) { t.Fatalf("llm template missing %q:\n%s", want, out) diff --git a/internal/x402/buyer/metrics.go b/internal/x402/buyer/metrics.go index 0df96424..5079f0a0 100644 --- a/internal/x402/buyer/metrics.go +++ b/internal/x402/buyer/metrics.go @@ -29,35 +29,35 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_requests_total", Help: "Total requests routed through the x402 buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentAttempts: prometheus.NewCounterVec( 
prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_attempts_total", Help: "Total x402 payment attempts made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_success_total", Help: "Total successful x402 payments made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_failure_total", Help: "Total failed x402 payments attempted by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), confirmSpendFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_confirm_spend_failure_total", Help: "Successful upstream responses whose consumed-auth state could not be persisted.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), // paymentUnsettledConfirmations counts the occurrences of an upstream // returning 2xx without X-PAYMENT-RESPONSE. 
The buyer still marks the @@ -69,28 +69,28 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_payment_unsettled_confirmations_total", Help: "Upstream 2xx responses with no X-PAYMENT-RESPONSE header — auth consumed locally without observed on-chain settlement.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authRemaining: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_remaining", Help: "Remaining pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authSpent: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_spent", Help: "Consumed pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), activeModelMappings: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_active_model_mappings", Help: "Active paid model mappings loaded in the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), } diff --git a/internal/x402/buyer/metrics_test.go b/internal/x402/buyer/metrics_test.go new file mode 100644 index 00000000..9ce4fabc --- /dev/null +++ b/internal/x402/buyer/metrics_test.go @@ -0,0 +1,191 @@ +package buyer + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" +) + +// TestPrometheusLabels_ChainPropagation asserts that prometheusLabels surfaces +// the `chain` label sourced from UpstreamConfig.Network so paid-request metrics +// can be partitioned by chain (base, base-sepolia, etc.). The empty-chain case +// is also exercised so the label is always rendered cleanly even when an +// upstream has no Network set. 
+func TestPrometheusLabels_ChainPropagation(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + want map[string]string + }{ + { + name: "base-sepolia chain propagates", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + want: map[string]string{ + "upstream": "upstream-a", + "remote_model": "qwen3.5:9b", + "chain": "base-sepolia", + }, + }, + { + name: "base mainnet chain propagates", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "base", + want: map[string]string{ + "upstream": "upstream-b", + "remote_model": "qwen3.5:4b", + "chain": "base", + }, + }, + { + name: "empty chain renders cleanly", + upstream: "upstream-c", + remoteModel: "qwen3.5:1b", + chain: "", + want: map[string]string{ + "upstream": "upstream-c", + "remote_model": "qwen3.5:1b", + "chain": "", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + if len(got) != len(tt.want) { + t.Fatalf("got %d labels, want %d (%v vs %v)", len(got), len(tt.want), got, tt.want) + } + for k, v := range tt.want { + if got[k] != v { + t.Errorf("label %q = %q, want %q", k, got[k], v) + } + } + }) + } +} + +// TestMetrics_ChainLabelScrapeRoundtrip increments each of the 9 buyer +// counters/gauges using prometheusLabels and then scrapes /metrics through the +// registry's handler, asserting the `chain` label appears (with the expected +// value) on every series. 
+func TestMetrics_ChainLabelScrapeRoundtrip(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + }{ + { + name: "base-sepolia label visible on every series", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + }, + { + name: "empty chain label is present and empty", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "", + }, + } + + // Every metric registered by newMetrics carries the same {upstream, + // remote_model, chain} label set. + wantFamilies := []string{ + "obol_x402_buyer_requests_total", + "obol_x402_buyer_payment_attempts_total", + "obol_x402_buyer_payment_success_total", + "obol_x402_buyer_payment_failure_total", + "obol_x402_buyer_confirm_spend_failure_total", + "obol_x402_buyer_payment_unsettled_confirmations_total", + "obol_x402_buyer_auth_remaining", + "obol_x402_buyer_auth_spent", + "obol_x402_buyer_active_model_mappings", + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := newMetrics() + labels := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + + // Counters: incremented once each. + m.requestsTotal.With(labels).Inc() + m.paymentAttempts.With(labels).Inc() + m.paymentSuccessTotal.With(labels).Inc() + m.paymentFailureTotal.With(labels).Inc() + m.confirmSpendFailureTotal.With(labels).Inc() + m.paymentUnsettledConfirmations.With(labels).Inc() + // Gauges: stamped with arbitrary non-zero values. 
+ m.authRemaining.With(labels).Set(7) + m.authSpent.With(labels).Set(3) + m.activeModelMappings.With(labels).Set(1) + + families := scrapeBuyerMetrics(t, m) + + wantLabels := map[string]string{ + "upstream": tt.upstream, + "remote_model": tt.remoteModel, + "chain": tt.chain, + } + for _, name := range wantFamilies { + fam, ok := families[name] + if !ok { + t.Errorf("missing metric family %s", name) + continue + } + if !buyerHasSeriesWithLabels(fam, wantLabels) { + t.Errorf("metric %s missing series with labels %v", name, wantLabels) + } + } + }) + } +} + +// scrapeBuyerMetrics renders the metrics registry through its HTTP handler and +// parses the Prometheus text exposition into a name → MetricFamily map. +func scrapeBuyerMetrics(t *testing.T, m *metrics) map[string]*dto.MetricFamily { + t.Helper() + + rec := httptest.NewRecorder() + m.handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil)) + if rec.Code != http.StatusOK { + t.Fatalf("metrics status = %d, want 200", rec.Code) + } + + var parser expfmt.TextParser + families, err := parser.TextToMetricFamilies(strings.NewReader(rec.Body.String())) + if err != nil { + t.Fatalf("parse metrics: %v", err) + } + return families +} + +// buyerHasSeriesWithLabels returns true iff `family` contains at least one +// series whose label set exactly equals `want`. 
+func buyerHasSeriesWithLabels(family *dto.MetricFamily, want map[string]string) bool { + for _, metric := range family.GetMetric() { + if len(metric.GetLabel()) != len(want) { + continue + } + match := true + for _, label := range metric.GetLabel() { + if want[label.GetName()] != label.GetValue() { + match = false + break + } + } + if match { + return true + } + } + return false +} diff --git a/internal/x402/buyer/proxy.go b/internal/x402/buyer/proxy.go index afcb3d5c..b1b644e0 100644 --- a/internal/x402/buyer/proxy.go +++ b/internal/x402/buyer/proxy.go @@ -203,17 +203,18 @@ func (p *Proxy) syncMetricsLocked() { for name, upstream := range p.upstreams { signer := p.signers[name] - labels := prometheusLabels(name, upstream.remoteModel) + labels := prometheusLabels(name, upstream.remoteModel, upstream.config.Network) p.metrics.activeModelMappings.With(labels).Set(1) p.metrics.authRemaining.With(labels).Set(float64(signer.Remaining())) p.metrics.authSpent.With(labels).Set(float64(signer.Spent())) } } -func prometheusLabels(name, remoteModel string) map[string]string { +func prometheusLabels(name, remoteModel, chain string) map[string]string { return map[string]string{ "upstream": name, "remote_model": remoteModel, + "chain": chain, } } @@ -226,7 +227,7 @@ func (p *Proxy) buildUpstreamHandler(name, remoteModel string, cfg UpstreamConfi return nil, fmt.Errorf("parse upstream URL %q: %w", cfg.URL, err) } - labels := prometheusLabels(name, remoteModel) + labels := prometheusLabels(name, remoteModel, cfg.Network) rp := &httputil.ReverseProxy{ Rewrite: func(pr *httputil.ProxyRequest) { pr.SetURL(target) @@ -298,7 +299,7 @@ func (p *Proxy) handleModelRequest(w http.ResponseWriter, r *http.Request) { return io.NopCloser(bytes.NewReader(rewrittenBody)), nil } - labels := prometheusLabels(entry.name, remoteModel) + labels := prometheusLabels(entry.name, remoteModel, entry.config.Network) p.metrics.requestsTotal.With(labels).Inc() entry.handler.ServeHTTP(w, r) } diff --git 
a/internal/x402/buyer/proxy_test.go b/internal/x402/buyer/proxy_test.go index 55281c46..5b135720 100644 --- a/internal/x402/buyer/proxy_test.go +++ b/internal/x402/buyer/proxy_test.go @@ -1045,7 +1045,7 @@ func TestProxy_ModelRoutingAndMetrics(t *testing.T) { } metrics := scrapeMetricFamilies(t, proxy) - labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b"} + labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b", "chain": "base-sepolia"} assertMetricValue(t, metrics["obol_x402_buyer_requests_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_attempts_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_success_total"], labels, 1) @@ -1295,10 +1295,10 @@ func TestProxy_ReloadSkipsConsumedAuthsAndReplacesModelMapping(t *testing.T) { t.Fatalf("active model mapping series = %d, want 1", metricFamilyLen(activeMappings)) } - assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) - assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) + assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) + assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": 
"seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) } func TestProxy_ReloadSamePurchasePreservesSpentAndAppendsAuthPool(t *testing.T) { @@ -1752,6 +1752,7 @@ func TestProxy_UpstreamSuccessNoSettlementHeader_IncrementsUnsettledMetric(t *te assertMetricValue(t, family, map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } @@ -1826,6 +1827,7 @@ func TestProxy_UpstreamSuccessWithSettlementHeader_DoesNotIncrementUnsettledMetr assertMetricMissing(t, metrics["obol_x402_buyer_payment_unsettled_confirmations_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }) } @@ -1901,6 +1903,7 @@ func TestProxy_ConfirmSpendFailure_IncrementsMetric(t *testing.T) { assertMetricValue(t, metrics["obol_x402_buyer_confirm_spend_failure_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } diff --git a/internal/x402/manifest_devmode_test.go b/internal/x402/manifest_devmode_test.go deleted file mode 100644 index 7bcc3e52..00000000 --- a/internal/x402/manifest_devmode_test.go +++ /dev/null @@ -1,43 +0,0 @@ -package x402 - -import ( - "strings" - "testing" -) - -func TestX402Manifest_DevModeRewritesPins(t *testing.T) { - t.Setenv("OBOL_DEVELOPMENT", "true") - out := string(x402ManifestForApply()) - - for _, want := range []string{ - "ghcr.io/obolnetwork/x402-verifier:latest", - "ghcr.io/obolnetwork/serviceoffer-controller:latest", - } { - if !strings.Contains(out, want) { - t.Errorf("dev mode did not rewrite to %q", want) - } - } - for _, bad := range []string{ - "ghcr.io/obolnetwork/x402-verifier:b13254e", - "ghcr.io/obolnetwork/serviceoffer-controller:b13254e", - } { - if strings.Contains(out, bad) && !strings.Contains(out, ":latest@sha256:") { - // b13254e in a *comment* would be acceptable, but the regex doesn't - // match comments preceded by '#' — flag any unrewritten image: line. 
- for _, line := range strings.Split(out, "\n") { - trim := strings.TrimSpace(line) - if strings.HasPrefix(trim, "image:") && strings.Contains(trim, bad) { - t.Errorf("dev mode left immutable pin on image line: %q", line) - } - } - } - } -} - -func TestX402Manifest_ProductionPreservesPins(t *testing.T) { - t.Setenv("OBOL_DEVELOPMENT", "") - out := string(x402ManifestForApply()) - if !strings.Contains(out, "ghcr.io/obolnetwork/x402-verifier:b13254e") { - t.Error("production manifest should preserve x402-verifier:b13254e pin") - } -} diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 21d87b68..2779d148 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -10,11 +10,12 @@ import ( type verifierMetrics struct { registry *prometheus.Registry - requestsTotal *prometheus.CounterVec - paymentRequired *prometheus.CounterVec - paymentVerified *prometheus.CounterVec - paymentFailed *prometheus.CounterVec - chargedRequests *prometheus.CounterVec + requestsTotal *prometheus.CounterVec + paymentRequired *prometheus.CounterVec + paymentVerified *prometheus.CounterVec + paymentFailed *prometheus.CounterVec + chargedRequests *prometheus.CounterVec + lastPaymentSuccess *prometheus.GaugeVec } func newVerifierMetrics() *verifierMetrics { @@ -25,35 +26,42 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), paymentRequired: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: 
"obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests rejected after a provided x402 payment failed verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, + ), + lastPaymentSuccess: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "obol_x402_verifier_last_payment_success_seconds", + Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", + }, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), } @@ -63,6 +71,7 @@ func newVerifierMetrics() *verifierMetrics { m.paymentVerified, m.paymentFailed, m.chargedRequests, + m.lastPaymentSuccess, ) return m @@ -71,3 +80,67 @@ func newVerifierMetrics() *verifierMetrics { func (m *verifierMetrics) handler() http.Handler { return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{}) } + +// pruneSeriesNotIn drops every (offer_namespace, offer_name, chain, +// asset_symbol) series from the verifier's counter/gauge vecs that is not +// present in `keep`. Called from Verifier.load whenever the route set changes +// so deleted offers (e.g. 
`obol sell delete`) stop emitting stale series — +// most importantly the last_payment_success_seconds gauge, which would +// otherwise hold the deleted offer's last-success timestamp forever and +// falsely satisfy "recent activity" alerts and dashboards. +// +// Key shape: "ns\x00name\x00chain\x00asset" — \x00 is forbidden in +// Kubernetes object names, CAIP-2 chain ids, and ERC-20 symbols, so the +// byte-join can't collide. Including asset_symbol in the key means an +// asset-repin (USDC → OBOL on the same offer) prunes the old series rather +// than leaking a stale per-asset timestamp. +func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) { + vecs := []interface { + DeletePartialMatch(prometheus.Labels) int + }{ + m.requestsTotal, + m.paymentRequired, + m.paymentVerified, + m.paymentFailed, + m.chargedRequests, + m.lastPaymentSuccess, + } + + gathered, err := m.registry.Gather() + if err != nil { + return + } + for _, family := range gathered { + for _, metric := range family.GetMetric() { + labels := metric.GetLabel() + ns, name, chain, asset := "", "", "", "" + for _, l := range labels { + switch l.GetName() { + case "offer_namespace": + ns = l.GetValue() + case "offer_name": + name = l.GetValue() + case "chain": + chain = l.GetValue() + case "asset_symbol": + asset = l.GetValue() + } + } + if ns == "" && name == "" { + continue + } + if _, ok := keep[ns+"\x00"+name+"\x00"+chain+"\x00"+asset]; ok { + continue + } + match := prometheus.Labels{ + "offer_namespace": ns, + "offer_name": name, + "chain": chain, + "asset_symbol": asset, + } + for _, vec := range vecs { + vec.DeletePartialMatch(match) + } + } + } +} diff --git a/internal/x402/serviceoffer_source.go b/internal/x402/serviceoffer_source.go index f0b1999a..17005871 100644 --- a/internal/x402/serviceoffer_source.go +++ b/internal/x402/serviceoffer_source.go @@ -7,6 +7,7 @@ import ( "log" "sort" "strings" + "time" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" 
"github.com/ObolNetwork/obol-stack/internal/schemas" @@ -20,7 +21,12 @@ import ( "k8s.io/client-go/tools/cache" ) -func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]RouteRule) error) error { +// WatchServiceOffers runs the ServiceOffer + litellm-secrets informers and +// pushes rendered RouteRules to apply on every change. The optional +// onFirstApply callback is invoked exactly once after the post-cache-sync +// refresh succeeds; it is the signal that the route source has produced its +// first usable snapshot. Pass nil to skip. +func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]RouteRule) error, onFirstApply func()) error { client, err := dynamic.NewForConfig(cfg) if err != nil { return fmt.Errorf("create dynamic client: %w", err) @@ -33,17 +39,18 @@ func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]Rout offers := offerFactory.ForResource(monetizeapi.ServiceOfferGVR).Informer() secrets := secretFactory.ForResource(monetizeapi.SecretGVR).Informer() - refresh := func() { + refresh := func() (ok bool) { routes, err := routesFromStore(offers.GetStore().List(), secrets.GetStore().List()) if err != nil { log.Printf("x402-serviceoffer-source: render routes: %v", err) - return + return false } if err := apply(routes); err != nil { log.Printf("x402-serviceoffer-source: apply routes: %v", err) - return + return false } log.Printf("x402-serviceoffer-source: routes reloaded (%d routes)", len(routes)) + return true } handler := cache.ResourceEventHandlerFuncs{ @@ -60,7 +67,9 @@ func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]Rout return fmt.Errorf("wait for serviceoffer informer sync") } - refresh() + if refresh() && onFirstApply != nil { + onFirstApply() + } <-ctx.Done() return nil } @@ -85,7 +94,12 @@ func routesFromStore(offerItems, secretItems []any) ([]RouteRule, error) { if offer.Spec.Upstream.Namespace == "" { offer.Spec.Upstream.Namespace = offer.Namespace } - if 
offer.IsPaused() || !offerConditionTrue(offer.Status, "RoutePublished") { + // Draining offers keep their route up until the grace period + // expires so in-flight payments can settle. Only skip after the + // drain window has elapsed — at that point the controller has + // also torn down the HTTPRoute, so the verifier rule would + // gate traffic against a non-existent backend. + if offer.DrainExpired(time.Now()) || !offerConditionTrue(offer.Status, "RoutePublished") { continue } diff --git a/internal/x402/serviceoffer_source_test.go b/internal/x402/serviceoffer_source_test.go index 9733095e..6c825cda 100644 --- a/internal/x402/serviceoffer_source_test.go +++ b/internal/x402/serviceoffer_source_test.go @@ -3,6 +3,7 @@ package x402 import ( "encoding/base64" "testing" + "time" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -37,11 +38,16 @@ func TestRoutesFromStore(t *testing.T) { Conditions: []monetizeapi.Condition{{Type: "RoutePublished", Status: "True"}}, }, }), + // Drain-expired offer: drainAt + zero grace period in the past + // → route should already be torn down, and the verifier rule + // should be filtered out even though RoutePublished is still + // True in the cached status snapshot. 
mustOfferObject(t, monetizeapi.ServiceOffer{ - ObjectMeta: metav1.ObjectMeta{Name: "paused", Namespace: "alpha", Annotations: map[string]string{ - monetizeapi.PausedAnnotation: "true", - }}, + ObjectMeta: metav1.ObjectMeta{Name: "drained", Namespace: "alpha"}, Spec: monetizeapi.ServiceOfferSpec{ + Upstream: monetizeapi.ServiceOfferUpstream{Service: "httpbin"}, + DrainAt: &metav1.Time{Time: time.Now().Add(-2 * time.Hour)}, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, Payment: monetizeapi.ServiceOfferPayment{ Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "1"}, }, @@ -50,6 +56,23 @@ func TestRoutesFromStore(t *testing.T) { Conditions: []monetizeapi.Condition{{Type: "RoutePublished", Status: "True"}}, }, }), + // Mid-drain offer: drainAt = now, grace = 1h → still within the + // drain window, route stays up so in-flight buyers can settle. + // Should appear in the verifier rules. + mustOfferObject(t, monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "c", Namespace: "alpha"}, + Spec: monetizeapi.ServiceOfferSpec{ + Upstream: monetizeapi.ServiceOfferUpstream{Service: "httpbin"}, + DrainAt: &metav1.Time{Time: time.Now()}, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, + Payment: monetizeapi.ServiceOfferPayment{ + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.1"}, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{{Type: "RoutePublished", Status: "True"}}, + }, + }), } secrets := []any{ mustSecretObject(t, "alpha", "litellm-secrets", map[string]string{ @@ -62,11 +85,16 @@ func TestRoutesFromStore(t *testing.T) { t.Fatalf("routesFromStore: %v", err) } - if len(routes) != 2 { - t.Fatalf("len(routes) = %d, want 2", len(routes)) + if len(routes) != 3 { + t.Fatalf("len(routes) = %d, want 3", len(routes)) + } + // Expected sort order: alpha/a, alpha/c, beta/b. + // "drained" must be filtered out because its drain window expired. 
+ if routes[0].OfferName != "a" || routes[1].OfferName != "c" || routes[2].OfferName != "b" { + t.Fatalf("routes not sorted by offer identity (drained leaked?): %+v", routes) } - if routes[0].OfferName != "a" || routes[1].OfferName != "b" { - t.Fatalf("routes not sorted by offer identity: %+v", routes) + if routes[0].OfferNamespace != "alpha" || routes[1].OfferNamespace != "alpha" || routes[2].OfferNamespace != "beta" { + t.Fatalf("unexpected route namespaces: %+v", routes) } if routes[0].Pattern != "/services/a/*" { t.Fatalf("routes[0].Pattern = %q, want /services/a/*", routes[0].Pattern) @@ -83,11 +111,16 @@ func TestRoutesFromStore(t *testing.T) { if routes[0].StripPrefix != "/services/a" { t.Fatalf("routes[0].StripPrefix = %q, want /services/a", routes[0].StripPrefix) } - if routes[1].UpstreamAuth != "" { - t.Fatalf("routes[1].UpstreamAuth = %q, want empty", routes[1].UpstreamAuth) + if routes[2].UpstreamAuth != "" { + t.Fatalf("routes[2].UpstreamAuth = %q, want empty", routes[2].UpstreamAuth) + } + if routes[2].UpstreamURL != "http://httpbin.beta.svc.cluster.local:11434" { + t.Fatalf("routes[2].UpstreamURL = %q, want httpbin upstream URL", routes[2].UpstreamURL) } - if routes[1].UpstreamURL != "http://httpbin.beta.svc.cluster.local:11434" { - t.Fatalf("routes[1].UpstreamURL = %q, want httpbin upstream URL", routes[1].UpstreamURL) + // Mid-drain offer "c" stays in the rules but tracks its own + // upstream — verifies the drain window keeps the route alive. 
+ if routes[1].UpstreamURL != "http://httpbin.alpha.svc.cluster.local:11434" { + t.Fatalf("routes[1] (mid-drain) UpstreamURL = %q, want httpbin upstream URL", routes[1].UpstreamURL) } } diff --git a/internal/x402/setup.go b/internal/x402/setup.go index 562da993..8af3fe8e 100644 --- a/internal/x402/setup.go +++ b/internal/x402/setup.go @@ -4,15 +4,32 @@ import ( "encoding/json" "fmt" "os" - "regexp" + "os/exec" + "path/filepath" "strings" "github.com/ObolNetwork/obol-stack/internal/config" + stackdefaults "github.com/ObolNetwork/obol-stack/internal/defaults" "github.com/ObolNetwork/obol-stack/internal/embed" + "github.com/ObolNetwork/obol-stack/internal/helmcmd" "github.com/ObolNetwork/obol-stack/internal/kubectl" "gopkg.in/yaml.v3" ) +// x402Manifest is the raw embedded x402.yaml. It is no longer applied +// directly via kubectl — helmfile renders the same file via the `base` +// release (see EnsureVerifier). Retained as a package-level value so +// shape/content tests can assert invariants about the embedded source. +var x402Manifest = mustReadX402Manifest() + +func mustReadX402Manifest() []byte { + data, err := embed.ReadInfrastructureFile("base/templates/x402.yaml") + if err != nil { + panic(fmt.Sprintf("read embedded x402 manifest: %v", err)) + } + return data +} + const ( x402Namespace = "x402" pricingConfigMap = "x402-pricing" @@ -37,77 +54,343 @@ const ( // Used only as a hint in error messages; the actual chain is taken // from the seller's 402 response by buy.py. DefaultBuySellerChain = "base-sepolia" + + // baseReleaseName matches the helmfile release in + // internal/embed/infrastructure/helmfile.yaml whose `chart: ./base` + // renders the x402 manifests. EnsureVerifier targets this release + // via --selector so the verifier deployment is reconciled the same + // way `obol stack up` deploys it — single source of truth. 
+ baseReleaseName = "base" ) -var x402Manifest = mustReadX402Manifest() +// EnsureVerifier deploys the x402 verifier subsystem if it doesn't exist. +// Idempotent — helmfile sync is safe to run multiple times. +// +// Historical note: this used to read embed.FS x402.yaml directly and +// `kubectl apply` it, which fought helmfile's field manager and forced +// us to duplicate the dev-mode image-pin rewrite (formerly in this file, +// now lives canonically in internal/defaults/defaults.go). Driving the +// deployment through helmfile against the already-populated +// $OBOL_CONFIG_DIR/defaults/ tree picks up the canonical dev rewrite +// for free and removes the entire footgun. See CLAUDE.md pitfall #9. +func EnsureVerifier(cfg *config.Config) error { + if err := kubectl.EnsureCluster(cfg); err != nil { + return err + } -func mustReadX402Manifest() []byte { - data, err := embed.ReadInfrastructureFile("base/templates/x402.yaml") + // Refresh the defaults tree so the helmfile sync below reads the + // most recent embedded manifests. Under OBOL_DEVELOPMENT=true this + // also applies the canonical digest-pin -> :latest rewrite via + // defaults.rewriteDevDigestPins so freshly built local images are + // honored. No-op when the stamp is up to date. 
+ backendName := stackdefaults.DetectedBackendName(cfg) + stackID := stackdefaults.StackID(cfg) + if stackID == "" { + return fmt.Errorf("stack ID not found, run 'obol stack init' first") + } + if _, err := stackdefaults.RefreshInfrastructureIfChanged(cfg, backendName, stackID); err != nil { + return fmt.Errorf("refresh infrastructure defaults: %w", err) + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + snapshots, err := preserveMutableRuntimeConfigMaps(cfg, kubeconfigPath) if err != nil { - panic(fmt.Sprintf("read embedded x402 manifest: %v", err)) + return fmt.Errorf("snapshot mutable runtime configmaps: %w", err) } - return data + + if err := helmfileSyncBaseRelease(cfg); err != nil { + return fmt.Errorf("helmfile sync %s: %w", baseReleaseName, err) + } + if err := restoreMutableRuntimeConfigMaps(cfg, kubeconfigPath, snapshots); err != nil { + return fmt.Errorf("restore mutable runtime configmaps: %w", err) + } + + // Populate the CA bundle after deploying the verifier so TLS verification + // of the facilitator works immediately. Idempotent — safe to call multiple times. + bin, kc := kubectl.Paths(cfg) + populateCABundle(bin, kc) + return nil } -// devLocallyBuiltImageBases mirrors internal/defaults.devLocallyBuiltImageBases -// — duplicated here to avoid a defaults → x402 → defaults import cycle. -// Must stay in lockstep with the canonical list there. 
-var devLocallyBuiltImageBases = []string{ - "ghcr.io/obolnetwork/x402-verifier", - "ghcr.io/obolnetwork/serviceoffer-controller", - "ghcr.io/obolnetwork/x402-buyer", - "ghcr.io/obolnetwork/demo-server", - "ghcr.io/obolnetwork/obol-stack-public-storefront", +type mutableConfigMapSnapshot struct { + Name string + Namespace string + Data map[string]string } -// rewriteDevImagePinsInManifest applies the same `:tag@sha256:digest` / -// `@sha256:digest` / `:tag` → `:latest` rewrite the defaults pipeline uses, -// so kubectl-applied manifests inside EnsureVerifier honor the local-build -// path under OBOL_DEVELOPMENT=true. Without this rewrite, the embedded -// x402.yaml carrying `:b13254e` pins beats the helmfile-rendered :latest -// deployment, and the cluster runs the stale registry image regardless of -// OBOL_FORCE_REBUILD_LOCAL_DEV_IMAGES (root cause of the missing -// HandleProxy debug-log saga during flow-11 step 43 chase, May 2026). -// -// Pattern parity with internal/defaults.rewriteDevDigestPins is enforced -// by the regression test in TestX402Manifest_DevModeRewritesPins. -func rewriteDevImagePinsInManifest(data []byte) []byte { - out := data - for _, base := range devLocallyBuiltImageBases { - re := regexp.MustCompile(regexp.QuoteMeta(base) + - `(:[a-f0-9]{7,40}@sha256:[a-f0-9]{64}|@sha256:[a-f0-9]{64}|:[a-f0-9]{7,40})`) - out = re.ReplaceAll(out, []byte(base+":latest")) +var mutableRuntimeConfigMaps = []mutableConfigMapSnapshot{ + {Name: "litellm-config", Namespace: "llm"}, + {Name: "x402-buyer-config", Namespace: "llm"}, + {Name: "x402-buyer-auths", Namespace: "llm"}, +} + +// preserveMutableRuntimeConfigMaps snapshots ConfigMaps whose data is mutated +// at runtime by `obol model setup`, PurchaseRequest reconciliation, or the +// buyer auth-pool flow. `EnsureVerifier` must sync the base release so the +// verifier uses canonical Helm ownership, but the base chart contains only +// bootstrap defaults for these objects. 
Without this snapshot/restore pass, +// `obol x402 setup` can erase configured models and buyer auth state. +func preserveMutableRuntimeConfigMaps(cfg *config.Config, kubeconfigPath string) ([]mutableConfigMapSnapshot, error) { + out := make([]mutableConfigMapSnapshot, 0, len(mutableRuntimeConfigMaps)) + for _, item := range mutableRuntimeConfigMaps { + data, found, err := readConfigMapData(cfg, kubeconfigPath, item.Namespace, item.Name) + if err != nil { + return nil, err + } + if !found || len(data) == 0 { + continue + } + out = append(out, mutableConfigMapSnapshot{Name: item.Name, Namespace: item.Namespace, Data: data}) + } + return out, nil +} + +func restoreMutableRuntimeConfigMaps(cfg *config.Config, kubeconfigPath string, snapshots []mutableConfigMapSnapshot) error { + for _, snap := range snapshots { + current, _, err := readConfigMapData(cfg, kubeconfigPath, snap.Namespace, snap.Name) + if err != nil { + return err + } + data, err := mergeRuntimeConfigMapData(snap.Name, current, snap.Data) + if err != nil { + return err + } + if len(data) == 0 { + continue + } + manifest, err := configMapDataManifest(snap.Namespace, snap.Name, data) + if err != nil { + return err + } + if err := kubectl.ApplyServerSideForceConflicts(filepath.Join(cfg.BinDir, "kubectl"), kubeconfigPath, manifest, "helm"); err != nil { + return err + } + } + return nil +} + +func readConfigMapData(cfg *config.Config, kubeconfigPath, namespace, name string) (map[string]string, bool, error) { + raw, err := kubectl.Output(filepath.Join(cfg.BinDir, "kubectl"), kubeconfigPath, + "get", "configmap", name, "-n", namespace, "-o", "json") + if err != nil { + if strings.Contains(err.Error(), "not found") || strings.Contains(err.Error(), "NotFound") { + return nil, false, nil + } + return nil, false, fmt.Errorf("get configmap %s/%s: %w", namespace, name, err) + } + var obj struct { + Data map[string]string `json:"data"` + } + if err := json.Unmarshal([]byte(raw), &obj); err != nil { + return nil, 
false, fmt.Errorf("parse configmap %s/%s: %w", namespace, name, err) + } + return obj.Data, true, nil +} + +func mergeRuntimeConfigMapData(name string, current, previous map[string]string) (map[string]string, error) { + if name == "litellm-config" { + currentRaw := current["config.yaml"] + previousRaw := previous["config.yaml"] + if strings.TrimSpace(previousRaw) == "" { + return current, nil + } + if strings.TrimSpace(currentRaw) == "" { + return previous, nil + } + merged, err := mergeLiteLLMConfig(currentRaw, previousRaw) + if err != nil { + return nil, err + } + out := copyStringMap(current) + out["config.yaml"] = merged + return out, nil + } + + out := copyStringMap(previous) + for k, v := range current { + out[k] = v + } + return out, nil +} + +func mergeLiteLLMConfig(currentRaw, previousRaw string) (string, error) { + var current map[string]any + if err := yaml.Unmarshal([]byte(currentRaw), ¤t); err != nil { + return "", fmt.Errorf("parse current LiteLLM config: %w", err) + } + if current == nil { + current = map[string]any{} + } + + var previous map[string]any + if err := yaml.Unmarshal([]byte(previousRaw), &previous); err != nil { + return "", fmt.Errorf("parse previous LiteLLM config: %w", err) + } + if previous == nil { + previous = map[string]any{} + } + + merged := copyAnyMap(previous) + for key, value := range current { + merged[key] = value + } + + models, err := mergeLiteLLMModelLists(current["model_list"], previous["model_list"]) + if err != nil { + return "", err + } + if len(models) > 0 { + merged["model_list"] = models + } + + for _, key := range []string{"general_settings", "litellm_settings"} { + if liteLLMValueEmpty(current[key]) && !liteLLMValueEmpty(previous[key]) { + merged[key] = previous[key] + } + } + + mergedRaw, err := yaml.Marshal(merged) + if err != nil { + return "", fmt.Errorf("serialize merged LiteLLM config: %w", err) + } + return string(mergedRaw), nil +} + +func mergeLiteLLMModelLists(currentRaw, previousRaw any) ([]any, 
error) { + current, err := liteLLMModelList(currentRaw) + if err != nil { + return nil, fmt.Errorf("parse current LiteLLM model_list: %w", err) + } + previous, err := liteLLMModelList(previousRaw) + if err != nil { + return nil, fmt.Errorf("parse previous LiteLLM model_list: %w", err) + } + + merged := append([]any{}, current...) + byName := make(map[string]bool, len(current)) + for _, entry := range current { + if name := liteLLMModelName(entry); name != "" { + byName[name] = true + } + } + for _, entry := range previous { + name := liteLLMModelName(entry) + if name == "" { + continue + } + if byName[name] { + continue + } + byName[name] = true + merged = append(merged, entry) + } + return merged, nil +} + +func liteLLMModelList(value any) ([]any, error) { + if value == nil { + return nil, nil + } + list, ok := value.([]any) + if !ok { + return nil, fmt.Errorf("expected sequence, got %T", value) + } + return list, nil +} + +func liteLLMModelName(entry any) string { + switch typed := entry.(type) { + case map[string]any: + if name, ok := typed["model_name"].(string); ok { + return strings.TrimSpace(name) + } + case map[any]any: + if name, ok := typed["model_name"].(string); ok { + return strings.TrimSpace(name) + } + } + return "" +} + +func liteLLMValueEmpty(value any) bool { + switch typed := value.(type) { + case nil: + return true + case string: + return strings.TrimSpace(typed) == "" + case []any: + return len(typed) == 0 + case map[string]any: + return len(typed) == 0 + case map[any]any: + return len(typed) == 0 + default: + return false + } +} + +func configMapDataManifest(namespace, name string, data map[string]string) ([]byte, error) { + obj := map[string]any{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]string{ + "name": name, + "namespace": namespace, + }, + "data": data, + } + return yaml.Marshal(obj) +} + +func copyStringMap(in map[string]string) map[string]string { + out := make(map[string]string, len(in)) + for k, v := range 
in { + out[k] = v } return out } -// x402ManifestForApply returns the kubectl-apply-ready bytes, rewriting -// immutable image pins to `:latest` when OBOL_DEVELOPMENT=true so the -// in-cluster verifier/controller uses the freshly-built local image. -// In production (OBOL_DEVELOPMENT unset/false) returns the embedded -// manifest verbatim — the pins are intentional and immutable. -func x402ManifestForApply() []byte { - if os.Getenv("OBOL_DEVELOPMENT") != "true" { - return x402Manifest +func copyAnyMap(in map[string]any) map[string]any { + out := make(map[string]any, len(in)) + for k, v := range in { + out[k] = v } - return rewriteDevImagePinsInManifest(x402Manifest) + return out } -// EnsureVerifier deploys the x402 verifier subsystem if it doesn't exist. -// Idempotent — kubectl apply is safe to run multiple times. -func EnsureVerifier(cfg *config.Config) error { - if err := kubectl.EnsureCluster(cfg); err != nil { - return err +// helmfileSyncBaseRelease runs `helmfile --selector name=base sync` +// against the defaults helmfile rendered into $OBOL_CONFIG_DIR/defaults. +// This is the same invocation pattern used by `internal/stack.syncDefaults` +// and `internal/update.ApplyUpgrades`, scoped to the single release that +// owns the x402 manifests. 
+func helmfileSyncBaseRelease(cfg *config.Config) error { + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + helmfilePath := filepath.Join(cfg.ConfigDir, "defaults", "helmfile.yaml") + + if _, err := os.Stat(helmfilePath); err != nil { + return fmt.Errorf("defaults helmfile not found at %s (run 'obol stack init' first): %w", helmfilePath, err) } - bin, kc := kubectl.Paths(cfg) - fmt.Println("Applying x402 payment components...") - if err := kubectl.Apply(bin, kc, x402ManifestForApply()); err != nil { - return err + helmfileBin := filepath.Join(cfg.BinDir, "helmfile") + helmBin := filepath.Join(cfg.BinDir, "helm") + + args := []string{ + "--file", helmfilePath, + "--kubeconfig", kubeconfigPath, + "--selector", "name=" + baseReleaseName, + "sync", + } + args = append(args, helmcmd.SyncFlagsForVersion(helmBin)...) + + cmd := exec.Command(helmfileBin, args...) + cmd.Env = append(os.Environ(), + "KUBECONFIG="+kubeconfigPath, + "STACK_DATA_DIR="+cfg.DataDir, + ) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out))) } - // Populate the CA bundle after deploying the verifier so TLS verification - // of the facilitator works immediately. Idempotent — safe to call multiple times. 
- populateCABundle(bin, kc) return nil } diff --git a/internal/x402/setup_runtime_config_test.go b/internal/x402/setup_runtime_config_test.go new file mode 100644 index 00000000..bc0f22e3 --- /dev/null +++ b/internal/x402/setup_runtime_config_test.go @@ -0,0 +1,103 @@ +package x402 + +import ( + "testing" + + "gopkg.in/yaml.v3" +) + +func TestMergeRuntimeConfigMapData_LiteLLMPreservesUserModels(t *testing.T) { + current := map[string]string{"config.yaml": ` +model_list: + - model_name: paid/* + litellm_params: + model: openai/* + api_base: http://127.0.0.1:8402/v1 + api_key: unused +general_settings: + master_key: os.environ/LITELLM_MASTER_KEY +`} + previous := map[string]string{"config.yaml": ` +model_list: + - model_name: paid/qwen36 + litellm_params: + model: openai/qwen36-apex-i-compact + api_base: http://silvermesh.v1337.lan:8081/v1 + api_key: unused +litellm_settings: + drop_params: true +`} + + merged, err := mergeRuntimeConfigMapData("litellm-config", current, previous) + if err != nil { + t.Fatalf("mergeRuntimeConfigMapData: %v", err) + } + + var parsed struct { + ModelList []struct { + ModelName string `yaml:"model_name"` + } `yaml:"model_list"` + GeneralSettings map[string]any `yaml:"general_settings"` + LiteLLMSettings map[string]any `yaml:"litellm_settings"` + } + if err := yaml.Unmarshal([]byte(merged["config.yaml"]), &parsed); err != nil { + t.Fatalf("parse merged yaml: %v\n%s", err, merged["config.yaml"]) + } + + got := map[string]bool{} + for _, entry := range parsed.ModelList { + got[entry.ModelName] = true + } + for _, want := range []string{"paid/*", "paid/qwen36"} { + if !got[want] { + t.Fatalf("merged config missing model %q:\n%s", want, merged["config.yaml"]) + } + } + if parsed.GeneralSettings["master_key"] == nil { + t.Fatalf("current general_settings should be preserved:\n%s", merged["config.yaml"]) + } + if parsed.LiteLLMSettings["drop_params"] == nil { + t.Fatalf("previous litellm_settings should be restored when current is empty:\n%s", 
merged["config.yaml"]) + } +} + +func TestMergeRuntimeConfigMapData_BuyerConfigPreservesRuntimeKeys(t *testing.T) { + current := map[string]string{"new.json": `{"new":true}`} + previous := map[string]string{ + "alice.json": `{"auths":["a"]}`, + "new.json": `{"old":true}`, + } + + merged, err := mergeRuntimeConfigMapData("x402-buyer-auths", current, previous) + if err != nil { + t.Fatalf("mergeRuntimeConfigMapData: %v", err) + } + if merged["alice.json"] != previous["alice.json"] { + t.Fatalf("runtime key was not preserved: %#v", merged) + } + if merged["new.json"] != current["new.json"] { + t.Fatalf("current key should win on conflicts: %#v", merged) + } +} + +func TestConfigMapDataManifest_RendersConfigMap(t *testing.T) { + manifest, err := configMapDataManifest("llm", "x402-buyer-config", map[string]string{ + "demo.json": `{"endpoint":"http://example"}`, + }) + if err != nil { + t.Fatalf("configMapDataManifest: %v", err) + } + + var parsed struct { + APIVersion string `yaml:"apiVersion"` + Kind string `yaml:"kind"` + Metadata map[string]string `yaml:"metadata"` + Data map[string]string `yaml:"data"` + } + if err := yaml.Unmarshal(manifest, &parsed); err != nil { + t.Fatalf("manifest is not yaml: %v\n%s", err, manifest) + } + if parsed.Kind != "ConfigMap" || parsed.Metadata["namespace"] != "llm" || parsed.Data["demo.json"] == "" { + t.Fatalf("unexpected manifest: %#v\n%s", parsed, manifest) + } +} diff --git a/internal/x402/setup_structure_test.go b/internal/x402/setup_structure_test.go new file mode 100644 index 00000000..4353a0fd --- /dev/null +++ b/internal/x402/setup_structure_test.go @@ -0,0 +1,82 @@ +package x402 + +import ( + "go/parser" + "go/token" + "os" + "path/filepath" + "strings" + "testing" +) + +// TestEnsureVerifier_NoInlineRegex enforces CLAUDE.md pitfall #9 at the +// structural level: setup.go must not carry its own image-pin rewrite +// regex. 
The canonical rewrite lives in internal/defaults/defaults.go, +// applied to the helmfile-rendered tree under $OBOL_CONFIG_DIR/defaults. +// Driving the verifier deployment through helmfile (not kubectl apply +// of embed.FS) means any duplicated regex is dead code at best and a +// silent-bypass footgun at worst. +// +// If this test fires, either: +// - delete the duplicate regex from internal/x402/setup.go, or +// - if the duplicate is genuinely needed (it almost never is), move +// it behind a shared helper in internal/defaults and call that. +func TestEnsureVerifier_NoInlineRegex(t *testing.T) { + setupPath := mustResolveFile(t, "setup.go") + + data, err := os.ReadFile(setupPath) + if err != nil { + t.Fatalf("read setup.go: %v", err) + } + src := string(data) + + // Cheap textual guard first — surfaces a clear error message even when + // the AST parse below would also catch it. + if strings.Contains(src, `"regexp"`) { + t.Fatalf("internal/x402/setup.go must not import the regexp package; " + + "the image-pin rewrite belongs in internal/defaults (see CLAUDE.md pitfall #9)") + } + if strings.Contains(src, "regexp.MustCompile") || strings.Contains(src, "regexp.Compile") { + t.Fatalf("internal/x402/setup.go must not compile regexes inline; " + + "the duplicated rewrite was deleted in favor of helmfile-driven deploy") + } + + // AST-level guard: catches aliased imports (e.g. `re "regexp"`) and is + // resilient to comments that happen to contain the word "regexp". + fset := token.NewFileSet() + file, err := parser.ParseFile(fset, setupPath, data, parser.ImportsOnly) + if err != nil { + t.Fatalf("parse setup.go: %v", err) + } + for _, imp := range file.Imports { + path := strings.Trim(imp.Path.Value, `"`) + if path == "regexp" { + t.Fatalf("internal/x402/setup.go imports %q; remove the duplicated rewrite", path) + } + } +} + +// mustResolveFile locates a source file relative to this test file. 
Works +// whether `go test` is run from the package directory or from the repo root. +func mustResolveFile(t *testing.T, name string) string { + t.Helper() + // First try working directory (default for `go test ./...`). + if _, err := os.Stat(name); err == nil { + abs, err := filepath.Abs(name) + if err != nil { + t.Fatalf("abs %q: %v", name, err) + } + return abs + } + t.Fatalf("could not locate %q from %q", name, mustGetwd(t)) + return "" +} + +func mustGetwd(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + return wd +} diff --git a/internal/x402/setup_test.go b/internal/x402/setup_test.go index ff8b7652..7dba813e 100644 --- a/internal/x402/setup_test.go +++ b/internal/x402/setup_test.go @@ -258,7 +258,11 @@ func TestX402Manifest_UsesServiceOfferControllerModel(t *testing.T) { if !strings.Contains(manifest, "resources: [\"serviceoffers\"]") { t.Fatalf("x402 manifest missing serviceoffer watch RBAC:\n%s", manifest) } - if strings.Contains(manifest, "kind: ServiceMonitor") { - t.Fatalf("x402 manifest still includes legacy ServiceMonitor stanza:\n%s", manifest) + // ServiceMonitor now lives in this manifest by design — relocated here + // from a bedag/raw helmfile release so the scrape config sits next to + // the Service it observes. Assert presence so a future cleanup can't + // silently drop it. 
+ if !strings.Contains(manifest, "kind: ServiceMonitor") { + t.Fatalf("x402 manifest missing ServiceMonitor (relocated from bedag/raw helmfile in PR #513 hardening):\n%s", manifest) } } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 65374ea4..295460dd 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httputil" "net/url" + "sort" "strings" "sync/atomic" @@ -21,8 +22,29 @@ type Verifier struct { chain atomic.Pointer[ChainInfo] chains atomic.Pointer[map[string]ChainInfo] // pre-resolved: chain name → config metrics *verifierMetrics + + // routesLoaded is set true after the first route source apply completes. + // Until then HandleReadyz returns 503 so kubelet keeps the pod out of + // the Service Endpoints, preventing the "no rule -> 200 free pass" + // window during informer warmup (CLAUDE.md pitfall #14). + routesLoaded atomic.Bool + + // paidPrefixes is the list of URI prefixes the verifier KNOWS are + // paid routes (derived from cfg.Routes patterns on each load). Used + // by HandleVerify to fail-closed when a URI is under a paid prefix + // but no rule matches — the alternative (200 → ForwardAuth allow) + // would silently make the route free. + // + // Sorted by length descending so longer-prefix matches win first + // (defensive — fixes nothing today but cheap insurance). + paidPrefixes atomic.Pointer[[]string] } +// MarkRoutesLoaded signals that the route source has produced its first +// non-error apply. Idempotent. After this, HandleReadyz returns 200 +// once config is also loaded. +func (v *Verifier) MarkRoutesLoaded() { v.routesLoaded.Store(true) } + // NewVerifier creates a Verifier with the given initial configuration. 
func NewVerifier(cfg *PricingConfig) (*Verifier, error) { v := &Verifier{metrics: newVerifierMetrics()} @@ -64,6 +86,33 @@ func (v *Verifier) load(cfg *PricingConfig) error { v.chains.Store(&chains) v.config.Store(cfg) + // Derive paid-prefix tracker from the route patterns. HandleVerify + // uses this to fail-closed when a URI is under a tracked prefix but + // no rule matches (see isUnderPaidPrefix for the rationale). + prefixes := make([]string, 0, len(cfg.Routes)) + for _, r := range cfg.Routes { + prefix := patternToPrefix(r.Pattern) + if prefix != "" { + prefixes = append(prefixes, prefix) + } + } + sort.Slice(prefixes, func(i, j int) bool { return len(prefixes[i]) > len(prefixes[j]) }) + v.paidPrefixes.Store(&prefixes) + + // Drop metric series for offers that are no longer in the route set. + // Without this, deleting an offer leaves its counters + last-success + // gauge in the registry forever, polluting dashboards and silently + // keeping alerts (e.g. "no settlements after challenge") tied to dead + // labels. + live := make(map[string]struct{}, len(cfg.Routes)) + for _, r := range cfg.Routes { + if r.OfferNamespace == "" && r.OfferName == "" { + continue + } + live[r.OfferNamespace+"\x00"+r.OfferName+"\x00"+r.Network+"\x00"+r.AssetSymbol] = struct{}{} + } + v.metrics.pruneSeriesNotIn(live) + return nil } @@ -90,7 +139,17 @@ func (v *Verifier) HandleVerify(w http.ResponseWriter, r *http.Request) { rule, requirement, extensions, _, chain, asset, ok := v.matchPaidRouteFull(cfg, uri) if !ok { - // No pricing rule matches — route is free. + // Check if this URI is under a tracked paid prefix. If yes, + // the route was supposed to match but didn't — fail closed + // rather than silently make it free (Traefik ForwardAuth 200 + // means "allow"). 
+ if v.isUnderPaidPrefix(uri) { + log.Printf("x402-verifier: URI %q is under a paid prefix but no rule matches — fail closed", uri) + http.Error(w, "no rule matches; route appears to be a paid prefix with stale or missing rule", http.StatusForbidden) + + return + } + // Not under any paid prefix — legitimately free route. w.WriteHeader(http.StatusOK) return } @@ -144,6 +203,7 @@ func (v *Verifier) HandleVerify(w http.ResponseWriter, r *http.Request) { case tracker.status == http.StatusOK && r.Header.Get("X-Payment") != "": v.metrics.paymentVerified.With(labels).Inc() v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() case tracker.status == http.StatusPaymentRequired && r.Header.Get("X-Payment") != "": v.metrics.paymentFailed.With(labels).Inc() case tracker.status == http.StatusPaymentRequired: @@ -198,6 +258,7 @@ func (v *Verifier) HandleProxy(w http.ResponseWriter, r *http.Request) { v.metrics.paymentVerified.With(labels).Inc() if tracker.Header().Get("X-PAYMENT-RESPONSE") != "" { v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() } } } @@ -208,10 +269,18 @@ func (v *Verifier) HandleHealthz(w http.ResponseWriter, r *http.Request) { fmt.Fprintln(w, `{"status":"ok"}`) } -// HandleReadyz returns 200 OK if pricing config is loaded, 503 otherwise. +// HandleReadyz returns 200 OK once BOTH pricing config and the first route +// source apply have completed. Until then it returns 503 with a cause-specific +// body so kubelet keeps the pod out of Service Endpoints, preventing the +// "no rule -> 200 free pass" window during informer warmup +// (CLAUDE.md pitfall #14). 
func (v *Verifier) HandleReadyz(w http.ResponseWriter, r *http.Request) { if v.config.Load() == nil { - http.Error(w, "not ready", http.StatusServiceUnavailable) + http.Error(w, "not ready: config not loaded", http.StatusServiceUnavailable) + return + } + if !v.routesLoaded.Load() { + http.Error(w, "not ready: routes not loaded", http.StatusServiceUnavailable) return } @@ -261,6 +330,38 @@ func (v *Verifier) matchPaidRouteFull(cfg *PricingConfig, uri string) (*RouteRul return rule, requirement, extensions, prometheusLabels(rule), chain, asset, true } +// isUnderPaidPrefix reports whether uri starts with any of the URI +// prefixes the verifier knows are paid routes. Used by HandleVerify +// to fail-closed when matchRoute returns nil but the URI is still +// under a tracked prefix — i.e. the route was supposed to match but +// didn't (stale route table, code bug, etc.). +func (v *Verifier) isUnderPaidPrefix(uri string) bool { + prefixes := v.paidPrefixes.Load() + if prefixes == nil { + return false + } + for _, p := range *prefixes { + if strings.HasPrefix(uri, p) { + return true + } + } + return false +} + +// patternToPrefix converts a route Pattern like "/services/foo/*" +// into a directory-style prefix "/services/foo/" suitable for +// strings.HasPrefix matching. Returns "" for patterns without a +// trailing glob — exact-match patterns aren't paid prefixes, so +// fail-closed only applies to the broader "any URI under this path" +// semantic. The trailing slash is preserved so HasPrefix +// distinguishes /services/foo/ from /services/foobar/. +func patternToPrefix(pattern string) string { + if !strings.HasSuffix(pattern, "/*") { + return "" + } + return strings.TrimSuffix(pattern, "*") +} + // mergeAgentExtras adds the agent fields from a RouteRule to the // requirement's Extra map so buyers probing a 402 see which model and // skills are powering the offer. No-op for non-agent rules. 
@@ -446,9 +547,29 @@ func (r *statusRecorder) WriteHeader(status int) { } func prometheusLabels(rule *RouteRule) prometheus.Labels { + // `route` (= rule.Pattern) was dropped in favor of (offer_namespace, + // offer_name) which already uniquely identifies a paid route — the + // pattern was redundant and unbounded by path fragments, which would + // have ballooned series count for sellers running many granular routes. + // + // asset_symbol is included for direct per-token aggregation in PromQL + // (e.g. "what's my OBOL revenue?") without having to join the metric + // against the ServiceOffer CR at query time. Cardinality cost is zero + // because each offer pins exactly one asset — the new dimension is + // functionally constant within the existing (ns, name) group. + asset := rule.AssetSymbol + if asset == "" { + // Defensive: a missing symbol is operationally ugly in PromQL. + // Empty-string labels are legal in Prometheus but render as a + // bare "asset_symbol=" in selectors, which makes dashboard + // filters harder to write. "unknown" is unambiguous and matches + // the convention we use elsewhere for under-populated metadata. + asset = "unknown" + } return prometheus.Labels{ - "route": rule.Pattern, "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, + "chain": rule.Network, + "asset_symbol": asset, } } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 083604a0..b4c5113f 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -10,6 +10,7 @@ import ( "strings" "sync/atomic" "testing" + "time" x402types "github.com/coinbase/x402/go/types" dto "github.com/prometheus/client_model/go" @@ -99,6 +100,8 @@ func testPaymentHeaderFor(t *testing.T, payTo, amount string) string { } // newTestVerifier creates a Verifier backed by the given facilitator URL. 
+// It also marks routes as loaded so /readyz returns 200 immediately, which +// matches what the production wire-up does once the route source warms up. func newTestVerifier(t *testing.T, facilitatorURL string, routes []RouteRule) *Verifier { t.Helper() v, err := NewVerifier(&PricingConfig{ @@ -111,6 +114,7 @@ func newTestVerifier(t *testing.T, facilitatorURL string, routes []RouteRule) *V if err != nil { t.Fatalf("NewVerifier: %v", err) } + v.MarkRoutesLoaded() return v } @@ -487,6 +491,55 @@ func TestVerifier_ReadyzNotReady(t *testing.T) { if w.Code != http.StatusServiceUnavailable { t.Errorf("expected 503 when config is nil, got %d", w.Code) } + if got := w.Body.String(); !strings.Contains(got, "config not loaded") { + t.Errorf("expected body to mention %q, got %q", "config not loaded", got) + } +} + +// TestVerifier_Readyz_BlocksUntilRoutesLoaded asserts the fix for +// CLAUDE.md pitfall #14: /readyz must return 503 between "config loaded" +// and "first route source apply completed" so kubelet keeps the pod out +// of the Service Endpoints during informer warm-up. +func TestVerifier_Readyz_BlocksUntilRoutesLoaded(t *testing.T) { + v, err := NewVerifier(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: "http://example.invalid", + }) + if err != nil { + t.Fatalf("NewVerifier: %v", err) + } + + // Config is loaded by NewVerifier, but routes have NOT been marked + // loaded yet — /readyz must still 503 with a routes-specific message + // so kubectl describe pod surfaces the actual cause. 
+ req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + v.HandleReadyz(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("expected 503 before routes loaded, got %d", w.Code) + } + if got := w.Body.String(); !strings.Contains(got, "routes not loaded") { + t.Errorf("expected body to mention %q, got %q", "routes not loaded", got) + } + + // After the route source signals first apply, /readyz flips to 200. + v.MarkRoutesLoaded() + + w = httptest.NewRecorder() + v.HandleReadyz(w, req) + if w.Code != http.StatusOK { + t.Fatalf("expected 200 after MarkRoutesLoaded, got %d (body=%q)", w.Code, w.Body.String()) + } + + // MarkRoutesLoaded is idempotent — calling it again must not regress. + v.MarkRoutesLoaded() + w = httptest.NewRecorder() + v.HandleReadyz(w, req) + if w.Code != http.StatusOK { + t.Fatalf("expected 200 after second MarkRoutesLoaded, got %d", w.Code) + } } // ── Per-route PayTo / Network override tests ───────────────────────────────── @@ -752,9 +805,10 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { metrics := scrapeVerifierMetrics(t, v) labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", + "asset_symbol": "unknown", } assertVerifierMetricValue(t, metrics["obol_x402_verifier_requests_total"], labels, 1) assertVerifierMetricValue(t, metrics["obol_x402_verifier_payment_required_total"], labels, 1) @@ -765,9 +819,10 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", + "asset_symbol": "unknown", } okFac := newMockFacilitator(t, mockFacilitatorOpts{}) @@ -821,6 +876,281 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { assertVerifierMetricMissing(t, rejectMetrics["obol_x402_verifier_charged_requests_total"], labels) } 
+// TestVerifier_LastPaymentSuccessGauge asserts that the +// obol_x402_verifier_last_payment_success_seconds gauge is stamped to the +// current wall-clock time when a paid request succeeds, and is NOT touched +// when an unpaid request is rejected with 402. +// +// The gauge is labeled identically to the verifier counters; for this rule +// `chain` is the empty string because the test RouteRule has no Network set, +// and `asset_symbol` is "unknown" because AssetSymbol is unset (the defensive +// fallback emitted by prometheusLabels). +func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { + labels := map[string]string{ + "offer_namespace": "llm", + "offer_name": "paid-rpc", + "chain": "", + "asset_symbol": "unknown", + } + + tests := []struct { + name string + setPayment bool + rejectPayment bool + wantStatus int + wantGaugeFresh bool // assert gauge ~= now() + }{ + { + name: "successful paid request stamps gauge", + setPayment: true, + rejectPayment: false, + wantStatus: http.StatusOK, + wantGaugeFresh: true, + }, + { + name: "unpaid 402 leaves gauge untouched", + setPayment: false, + rejectPayment: false, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + { + name: "rejected payment leaves gauge untouched", + setPayment: true, + rejectPayment: true, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{rejectPayment: tt.rejectPayment}) + v := newTestVerifier(t, fac.URL, []RouteRule{{ + Pattern: "/rpc/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "paid-rpc", + }}) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/rpc/mainnet") + req.Header.Set("X-Forwarded-Host", "obol.stack") + if tt.setPayment { + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + } + + before := time.Now().Unix() + rec := httptest.NewRecorder() + 
v.HandleVerify(rec, req) + after := time.Now().Unix() + + if rec.Code != tt.wantStatus { + t.Fatalf("status = %d, want %d", rec.Code, tt.wantStatus) + } + + families := scrapeVerifierMetrics(t, v) + gauge := families["obol_x402_verifier_last_payment_success_seconds"] + + if !tt.wantGaugeFresh { + // Either the family is absent (no series emitted) or no + // series exists for these labels — both are acceptable for + // an untouched gauge. + assertVerifierMetricMissing(t, gauge, labels) + return + } + + if gauge == nil { + t.Fatalf("missing metric family obol_x402_verifier_last_payment_success_seconds") + } + got := findVerifierMetricValue(t, gauge, labels) + // Allow ±5s slack for clock skew / slow CI. + if got < float64(before-5) || got > float64(after+5) { + t.Fatalf("gauge = %v, want within [%d, %d]", got, before-5, after+5) + } + }) + } +} + +// TestVerifier_Reload_PrunesDeletedOfferSeries asserts that when an offer is +// removed from the route set (via Reload, the same path used by both the +// file-config watcher and the kube ServiceOffer informer), its previously +// stamped metric series are dropped from the registry. Without this, deleted +// offers' last_payment_success_seconds gauge would survive forever and keep +// firing/silencing alerts on dead labels. +func TestVerifier_Reload_PrunesDeletedOfferSeries(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + keptRoute := RouteRule{ + Pattern: "/keep/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "keep", + } + removedRoute := RouteRule{ + Pattern: "/gone/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "gone", + } + v := newTestVerifier(t, fac.URL, []RouteRule{keptRoute, removedRoute}) + + // Stamp metrics for both offers with a successful paid request each. 
+ for _, path := range []string{"/keep/x", "/gone/x"} { + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", path) + req.Header.Set("X-Forwarded-Host", "obol.stack") + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("setup paid request to %s: status=%d", path, rec.Code) + } + } + + keptLabels := map[string]string{"offer_namespace": "llm", "offer_name": "keep", "chain": "", "asset_symbol": "unknown"} + goneLabels := map[string]string{"offer_namespace": "llm", "offer_name": "gone", "chain": "", "asset_symbol": "unknown"} + + families := scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + family := families[name] + if family == nil { + t.Fatalf("baseline: missing %s before reload", name) + } + findVerifierMetricValue(t, family, keptLabels) + findVerifierMetricValue(t, family, goneLabels) + } + + // Reload with the second offer dropped — the same path ServiceOffer + // deletion takes through ConfigAccumulator.SetRoutes. + if err := v.Reload(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: fac.URL, + Routes: []RouteRule{keptRoute}, + }); err != nil { + t.Fatalf("Reload: %v", err) + } + + families = scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_requests_total", + "obol_x402_verifier_payment_required_total", + "obol_x402_verifier_payment_verified_total", + "obol_x402_verifier_payment_failed_total", + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + assertVerifierMetricMissing(t, families[name], goneLabels) + } + + // Kept offer's series must survive the prune. 
+ if charged := families["obol_x402_verifier_charged_requests_total"]; charged != nil { + findVerifierMetricValue(t, charged, keptLabels) + } + if gauge := families["obol_x402_verifier_last_payment_success_seconds"]; gauge != nil { + findVerifierMetricValue(t, gauge, keptLabels) + } +} + +// TestVerifier_HandleVerify_FailClosed_ManualPrefixInjection sanity checks +// that an arbitrary prefix in paidPrefixes triggers fail-closed (403) when +// no rule matches. The manual prefix injection simulates the case where the +// verifier KNOWS about a paid prefix (because a route was previously loaded) +// but the matcher rejects the URI — config drift, code bug, etc. +func TestVerifier_HandleVerify_FailClosed_ManualPrefixInjection(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + v := newTestVerifier(t, fac.URL, []RouteRule{ + // No rules; matchRoute will return nil for everything. + }) + + // Manually inject a paid prefix (simulating a stale prefix state). + prefixes := []string{"/services/gated/"} + v.paidPrefixes.Store(&prefixes) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/services/gated/foo") + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + + if rec.Code != http.StatusForbidden { + t.Errorf("expected 403 (fail-closed) for URI under tracked paid prefix, got %d", rec.Code) + } +} + +// TestVerifier_HandleVerify_FreeRoute_OutsidePrefixes asserts that URIs +// outside all tracked paid prefixes still return 200 (legitimate free pass). +// The verifier is mounted on routes that may or may not be paid; only URIs +// under a known paid prefix should fail closed. 
+func TestVerifier_HandleVerify_FreeRoute_OutsidePrefixes(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + v := newTestVerifier(t, fac.URL, []RouteRule{ + {Pattern: "/services/known/*", Price: "0.0001"}, + }) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/health") // Not under any paid prefix. + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("expected 200 for free route outside paid prefixes, got %d", rec.Code) + } +} + +// TestVerifier_HandleVerify_PrefixBoundary_NoFalseMatch verifies that the +// trailing slash on paid prefixes prevents false matches between siblings +// like /services/foo/ and /services/foobar/. Without the trailing slash, +// a request to /services/foobar/x would falsely match /services/foo/*. +func TestVerifier_HandleVerify_PrefixBoundary_NoFalseMatch(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + v := newTestVerifier(t, fac.URL, []RouteRule{ + {Pattern: "/services/foo/*", Price: "0.0001"}, + }) + + // /services/foobar/x is NOT under /services/foo/ — must return 200. + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/services/foobar/x") + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("expected 200 for sibling path not under prefix, got %d", rec.Code) + } +} + +func TestPatternToPrefix(t *testing.T) { + cases := []struct{ pattern, want string }{ + {"/services/foo/*", "/services/foo/"}, + {"/rpc/*", "/rpc/"}, + {"/health", ""}, // No glob, returns empty. + {"/*", "/"}, + {"", ""}, + {"/exact/match", ""}, // Exact pattern, not a prefix. 
+ } + for _, c := range cases { + if got := patternToPrefix(c.pattern); got != c.want { + t.Errorf("patternToPrefix(%q) = %q, want %q", c.pattern, got, c.want) + } + } +} + +// findVerifierMetricValue returns the value of the series in `family` whose +// labels match `wantLabels` exactly, failing the test if no such series exists. +func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 { + t.Helper() + + for _, metric := range family.GetMetric() { + if verifierLabelsMatch(metric, wantLabels) { + return verifierMetricValue(metric) + } + } + t.Fatalf("metric %s missing labels %v", family.GetName(), wantLabels) + return 0 +} + func scrapeVerifierMetrics(t *testing.T, v *Verifier) map[string]*dto.MetricFamily { t.Helper() @@ -895,3 +1225,140 @@ func verifierMetricValue(metric *dto.Metric) float64 { return 0 } } + +// TestVerifier_PrometheusLabels_IncludesAssetSymbol asserts that the +// asset_symbol label is emitted with the value from RouteRule.AssetSymbol +// (which the serviceoffer_source populates from +// offer.Spec.Payment.Asset.Symbol). This is what makes "what's my OBOL +// revenue?" a single PromQL aggregation instead of a metric × CR join. +func TestVerifier_PrometheusLabels_IncludesAssetSymbol(t *testing.T) { + rule := &RouteRule{ + OfferNamespace: "llm", + OfferName: "demo-hello", + Network: "eip155:84532", + AssetSymbol: "USDC", + } + labels := prometheusLabels(rule) + if got := labels["asset_symbol"]; got != "USDC" { + t.Errorf("asset_symbol = %q, want %q (full labels: %v)", got, "USDC", labels) + } + if got := labels["chain"]; got != "eip155:84532" { + t.Errorf("chain = %q, want %q", got, "eip155:84532") + } +} + +// TestVerifier_PrometheusLabels_DefaultsToUnknownIfEmpty asserts the +// defensive fallback: when AssetSymbol is empty (legacy offers, parsing +// hiccup, etc.) 
the label value is "unknown" rather than "" — empty-string +// labels are legal in Prometheus but render as bare selectors that are +// awkward to filter in dashboards. +func TestVerifier_PrometheusLabels_DefaultsToUnknownIfEmpty(t *testing.T) { + rule := &RouteRule{ + OfferNamespace: "llm", + OfferName: "no-asset", + Network: "eip155:84532", + AssetSymbol: "", + } + labels := prometheusLabels(rule) + if got := labels["asset_symbol"]; got != "unknown" { + t.Errorf("asset_symbol = %q, want %q (full labels: %v)", got, "unknown", labels) + } +} + +// TestVerifier_PruneSeriesNotIn_DistinguishesAssetSymbol asserts that +// pruning treats asset_symbol as part of the series key, so an asset-repin +// scenario (USDC route gets dropped, OBOL route for the same offer is +// retained) prunes the dead USDC series without taking the live OBOL one +// with it. Without asset_symbol in the key, both series would map to the +// same (ns, name, chain) tuple and pruning would either drop both or +// neither — leaking a stale per-asset series. +func TestVerifier_PruneSeriesNotIn_DistinguishesAssetSymbol(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + usdcRoute := RouteRule{ + Pattern: "/svc/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "demo", + Network: "base-sepolia", + AssetSymbol: "USDC", + } + obolRoute := RouteRule{ + Pattern: "/svc-obol/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "demo", + Network: "base-sepolia", + AssetSymbol: "OBOL", + } + v := newTestVerifier(t, fac.URL, []RouteRule{usdcRoute, obolRoute}) + + // Stamp a successful paid request through each asset variant so both + // series exist in the registry before pruning. 
+ for _, path := range []string{"/svc/x", "/svc-obol/x"} { + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", path) + req.Header.Set("X-Forwarded-Host", "obol.stack") + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("setup paid request to %s: status=%d", path, rec.Code) + } + } + + usdcLabels := map[string]string{ + "offer_namespace": "llm", + "offer_name": "demo", + "chain": "base-sepolia", + "asset_symbol": "USDC", + } + obolLabels := map[string]string{ + "offer_namespace": "llm", + "offer_name": "demo", + "chain": "base-sepolia", + "asset_symbol": "OBOL", + } + + families := scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + family := families[name] + if family == nil { + t.Fatalf("baseline: missing %s before reload", name) + } + findVerifierMetricValue(t, family, usdcLabels) + findVerifierMetricValue(t, family, obolLabels) + } + + // Drop the USDC route, keep OBOL. If pruneSeriesNotIn ignored + // asset_symbol, both series would key to (llm, demo, base-sepolia) + // and the dead USDC series would survive too (because the OBOL route + // keeps that key in the keep set) — masking the bug. Conversely, if + // the key didn't distinguish at all, both could be wiped. Including + // asset_symbol in the key keeps USDC prunable and OBOL alive. 
+ if err := v.Reload(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: fac.URL, + Routes: []RouteRule{obolRoute}, + }); err != nil { + t.Fatalf("Reload: %v", err) + } + + families = scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_requests_total", + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + assertVerifierMetricMissing(t, families[name], usdcLabels) + } + + if charged := families["obol_x402_verifier_charged_requests_total"]; charged != nil { + findVerifierMetricValue(t, charged, obolLabels) + } else { + t.Errorf("OBOL charged series was pruned along with USDC — asset_symbol was ignored in prune key") + } +} diff --git a/justfile b/justfile index c3cc2996..40d7115c 100644 --- a/justfile +++ b/justfile @@ -81,6 +81,41 @@ dev-frontend-reset: obol kubectl rollout status deployment/obol-frontend-obol-app -n obol-frontend --timeout=120s echo "✓ Frontend reset to released image" +# Regenerate CRD manifests + DeepCopy methods from kubebuilder markers +# in internal/monetizeapi/. The Go types are the single source of truth; +# CI (.github/workflows/lint-test.yaml::generate-check) fails if the +# working tree is dirty after this command runs. See CLAUDE.md for the +# edit-types -> just generate -> commit-both workflow. +generate: + #!/usr/bin/env bash + set -euo pipefail + # DeepCopy methods (zz_generated_deepcopy.go) next to the Go types. + go run sigs.k8s.io/controller-tools/cmd/controller-gen \ + object:headerFile=hack/boilerplate.go.txt \ + paths=./internal/monetizeapi/... + # CRD manifests into the embed dir. controller-gen names files + # obol.org_PLURAL.yaml; rename to the existing SINGULAR-crd.yaml + # naming so embed.FS readers don't need to change. + out=internal/embed/infrastructure/base/templates + go run sigs.k8s.io/controller-tools/cmd/controller-gen \ + crd \ + paths=./internal/monetizeapi/... 
\ + output:crd:dir="$out" + for f in "$out"/obol.org_*.yaml; do + [ -e "$f" ] || continue + plural=$(basename "$f" .yaml | sed 's/^obol\.org_//') + case "$plural" in + agentidentities) target="agentidentity-crd.yaml" ;; + agents) target="agent-crd.yaml" ;; + purchaserequests) target="purchaserequest-crd.yaml" ;; + registrationrequests) target="registrationrequest-crd.yaml" ;; + serviceoffers) target="serviceoffer-crd.yaml" ;; + *) target="${plural%s}-crd.yaml" ;; + esac + mv "$f" "$out/$target" + done + echo "✓ Regenerated CRDs and DeepCopy methods" + # Install pre-commit hooks (run once after cloning) setup: #!/usr/bin/env bash diff --git a/tools/tools.go b/tools/tools.go new file mode 100644 index 00000000..b177a3e4 --- /dev/null +++ b/tools/tools.go @@ -0,0 +1,14 @@ +//go:build tools + +// Package tools tracks build-time dependencies that are not imported by +// production code. controller-gen is the canonical source-of-truth tool +// for generating CRD manifests and DeepCopy methods from kubebuilder +// markers on the Go types in internal/monetizeapi. +// +// See `just generate`. CI fails if generated artifacts drift from the +// markers (see .github/workflows/lint-test.yaml::generate-check). +package tools + +import ( + _ "sigs.k8s.io/controller-tools/cmd/controller-gen" +)