From 0b9a9b367ebc425cb25dcf8d90d0c4475d929db0 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 18:00:04 +0400 Subject: [PATCH 01/31] feat(x402): chain label on buyer + verifier metrics, sync PurchaseAutoRefill Unblocks per-chain earnings/spend aggregation in the frontend's My Listings and My Purchases pages. - obol_x402_buyer_* metrics now carry a `chain` label sourced from UpstreamConfig.Network (already in payload, just wasn't on the labels). - obol_x402_verifier_* metrics now carry a `chain` label sourced from RouteRule.Network. Existing verifier metric tests updated to assert the new label (empty string when no Network is set on the rule). - internal/monetizeapi/types.go PurchaseAutoRefill struct now mirrors the CRD spec (purchaserequest-crd.yaml lines 93-96) by including MaxTotal + MaxSpendPerDay. The CRD already accepts these, the Go types just weren't reading them. Together this means the frontend can soon switch the EarningsStrip / WalletStrip from zeroed placeholders to real PromQL aggregates such as: sum by (chain) (increase(obol_x402_buyer_payment_success_total[7d])) --- internal/monetizeapi/types.go | 8 +++++--- internal/x402/buyer/metrics.go | 18 +++++++++--------- internal/x402/buyer/proxy.go | 9 +++++---- internal/x402/metrics.go | 10 +++++----- internal/x402/verifier.go | 1 + internal/x402/verifier_test.go | 2 ++ 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/internal/monetizeapi/types.go b/internal/monetizeapi/types.go index 6e905eee..403cffdf 100644 --- a/internal/monetizeapi/types.go +++ b/internal/monetizeapi/types.go @@ -276,9 +276,11 @@ type PreSignedAuth struct { } type PurchaseAutoRefill struct { - Enabled bool `json:"enabled,omitempty"` - Threshold int `json:"threshold,omitempty"` - Count int `json:"count,omitempty"` + Enabled bool `json:"enabled,omitempty"` + Threshold int `json:"threshold,omitempty"` + Count int `json:"count,omitempty"` + MaxTotal int `json:"maxTotal,omitempty"` + MaxSpendPerDay string 
`json:"maxSpendPerDay,omitempty"` } type PurchasePayment struct { diff --git a/internal/x402/buyer/metrics.go b/internal/x402/buyer/metrics.go index 0df96424..5079f0a0 100644 --- a/internal/x402/buyer/metrics.go +++ b/internal/x402/buyer/metrics.go @@ -29,35 +29,35 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_requests_total", Help: "Total requests routed through the x402 buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentAttempts: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_attempts_total", Help: "Total x402 payment attempts made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_success_total", Help: "Total successful x402 payments made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_failure_total", Help: "Total failed x402 payments attempted by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), confirmSpendFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_confirm_spend_failure_total", Help: "Successful upstream responses whose consumed-auth state could not be persisted.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), // paymentUnsettledConfirmations counts the occurrences of an upstream // returning 2xx without X-PAYMENT-RESPONSE. 
The buyer still marks the @@ -69,28 +69,28 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_payment_unsettled_confirmations_total", Help: "Upstream 2xx responses with no X-PAYMENT-RESPONSE header — auth consumed locally without observed on-chain settlement.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authRemaining: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_remaining", Help: "Remaining pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authSpent: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_spent", Help: "Consumed pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), activeModelMappings: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_active_model_mappings", Help: "Active paid model mappings loaded in the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), } diff --git a/internal/x402/buyer/proxy.go b/internal/x402/buyer/proxy.go index afcb3d5c..b1b644e0 100644 --- a/internal/x402/buyer/proxy.go +++ b/internal/x402/buyer/proxy.go @@ -203,17 +203,18 @@ func (p *Proxy) syncMetricsLocked() { for name, upstream := range p.upstreams { signer := p.signers[name] - labels := prometheusLabels(name, upstream.remoteModel) + labels := prometheusLabels(name, upstream.remoteModel, upstream.config.Network) p.metrics.activeModelMappings.With(labels).Set(1) p.metrics.authRemaining.With(labels).Set(float64(signer.Remaining())) p.metrics.authSpent.With(labels).Set(float64(signer.Spent())) } } -func prometheusLabels(name, remoteModel string) map[string]string { +func prometheusLabels(name, remoteModel, chain string) map[string]string { return map[string]string{ "upstream": name, 
"remote_model": remoteModel, + "chain": chain, } } @@ -226,7 +227,7 @@ func (p *Proxy) buildUpstreamHandler(name, remoteModel string, cfg UpstreamConfi return nil, fmt.Errorf("parse upstream URL %q: %w", cfg.URL, err) } - labels := prometheusLabels(name, remoteModel) + labels := prometheusLabels(name, remoteModel, cfg.Network) rp := &httputil.ReverseProxy{ Rewrite: func(pr *httputil.ProxyRequest) { pr.SetURL(target) @@ -298,7 +299,7 @@ func (p *Proxy) handleModelRequest(w http.ResponseWriter, r *http.Request) { return io.NopCloser(bytes.NewReader(rewrittenBody)), nil } - labels := prometheusLabels(entry.name, remoteModel) + labels := prometheusLabels(entry.name, remoteModel, entry.config.Network) p.metrics.requestsTotal.With(labels).Inc() entry.handler.ServeHTTP(w, r) } diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 21d87b68..6be9d14a 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -25,35 +25,35 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), paymentRequired: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests 
rejected after a provided x402 payment failed verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 65374ea4..c8d5252e 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -450,5 +450,6 @@ func prometheusLabels(rule *RouteRule) prometheus.Labels { "route": rule.Pattern, "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, + "chain": rule.Network, } } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 083604a0..d6bbbced 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -755,6 +755,7 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", } assertVerifierMetricValue(t, metrics["obol_x402_verifier_requests_total"], labels, 1) assertVerifierMetricValue(t, metrics["obol_x402_verifier_payment_required_total"], labels, 1) @@ -768,6 +769,7 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", } okFac := newMockFacilitator(t, mockFacilitatorOpts{}) From 08b303ea0e796e3447bf880be75537e3d458a56d Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 18:45:16 +0400 Subject: [PATCH 02/31] feat(x402): last-settlement gauge + verifier PodMonitor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the data loop for the frontend My Listings EarningsStrip + the "Last settlement" timestamp the 
design canvas wants. - New gauge obol_x402_verifier_last_payment_success_seconds, labeled by (route, offer_namespace, offer_name, chain). Stamped via SetToCurrentTime() in both ForwardAuth and proxy-mode paths whenever a paid request reaches the seller successfully. - helmfile.yaml grows an x402-verifier PodMonitor (the namespace was previously scraping only litellm-x402-buyer). Same release: monitoring label so kube-prometheus-stack picks it up. The frontend already has matching consumers (chargedSalesByOfferAndChain, chargedRequests24hByOffer, lastSettlementByOffer in PrometheusClient) — without this scrape the metrics never reach the dashboard. --- internal/embed/infrastructure/helmfile.yaml | 26 +++++++++++++++++++++ internal/x402/metrics.go | 19 +++++++++++---- internal/x402/verifier.go | 2 ++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index aa7fc052..d4df212d 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -70,6 +70,32 @@ releases: path: /metrics interval: 30s + - name: x402-verifier-podmonitor + namespace: x402 + createNamespace: false + chart: bedag/raw + version: 2.0.2 + needs: + - monitoring/monitoring + - kube-system/base + values: + - resources: + - apiVersion: monitoring.coreos.com/v1 + kind: PodMonitor + metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + spec: + selector: + matchLabels: + app: x402-verifier + podMetricsEndpoints: + - port: http + path: /metrics + interval: 30s + # Traefik ingress controller with Gateway API support # Traefik v38+ bundles Gateway API CRDs in its crds/ directory - name: traefik diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 6be9d14a..42266c2b 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -10,11 +10,12 @@ import ( type verifierMetrics struct { registry 
*prometheus.Registry - requestsTotal *prometheus.CounterVec - paymentRequired *prometheus.CounterVec - paymentVerified *prometheus.CounterVec - paymentFailed *prometheus.CounterVec - chargedRequests *prometheus.CounterVec + requestsTotal *prometheus.CounterVec + paymentRequired *prometheus.CounterVec + paymentVerified *prometheus.CounterVec + paymentFailed *prometheus.CounterVec + chargedRequests *prometheus.CounterVec + lastPaymentSuccess *prometheus.GaugeVec } func newVerifierMetrics() *verifierMetrics { @@ -55,6 +56,13 @@ func newVerifierMetrics() *verifierMetrics { }, []string{"route", "offer_namespace", "offer_name", "chain"}, ), + lastPaymentSuccess: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "obol_x402_verifier_last_payment_success_seconds", + Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", + }, + []string{"route", "offer_namespace", "offer_name", "chain"}, + ), } m.registry.MustRegister( @@ -63,6 +71,7 @@ func newVerifierMetrics() *verifierMetrics { m.paymentVerified, m.paymentFailed, m.chargedRequests, + m.lastPaymentSuccess, ) return m diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index c8d5252e..b4451508 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -144,6 +144,7 @@ func (v *Verifier) HandleVerify(w http.ResponseWriter, r *http.Request) { case tracker.status == http.StatusOK && r.Header.Get("X-Payment") != "": v.metrics.paymentVerified.With(labels).Inc() v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() case tracker.status == http.StatusPaymentRequired && r.Header.Get("X-Payment") != "": v.metrics.paymentFailed.With(labels).Inc() case tracker.status == http.StatusPaymentRequired: @@ -198,6 +199,7 @@ func (v *Verifier) HandleProxy(w http.ResponseWriter, r *http.Request) { v.metrics.paymentVerified.With(labels).Inc() if tracker.Header().Get("X-PAYMENT-RESPONSE") != "" { 
v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() } } } From da721d442b59b3773d2af80c72d57887af621f31 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 19:44:14 +0400 Subject: [PATCH 03/31] test(x402): cover new chain label + last-payment gauge + PurchaseAutoRefill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the test gap left open by the recent chain-label + last-settlement gauge work. 14 new test cases across three packages plus five pre-existing buyer-proxy tests updated to carry the new chain label. New tests: - internal/x402/verifier_test.go TestVerifier_LastPaymentSuccessGauge (3 subtests): successful payment stamps gauge within ±5s of time.Now(), unpaid 402 leaves it untouched, rejected payment leaves it untouched. findVerifierMetricValue helper for time-window assertions. - internal/x402/buyer/metrics_test.go TestPrometheusLabels_ChainPropagation (3 subtests): base-sepolia / base mainnet / empty chain. TestMetrics_ChainLabelScrapeRoundtrip (2 subtests): scrape /metrics through the registry, assert every counter + gauge series carries the expected chain label. - internal/monetizeapi/types_test.go TestPurchaseAutoRefill_JSONRoundTrip (5 subtests): full population, only new caps, all-zero omitempty, single fields. TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm: catches json-tag drift between the Go struct and CRD spec. Pre-existing fix: - internal/x402/buyer/proxy_test.go — five TestProxy_* tests had label maps without `chain`; tests use Network "base-sepolia" so the expected chain is now spelled out alongside upstream + remote_model. RBAC: - helmfile.yaml: obol-frontend ClusterRole grows read access for purchaserequests + purchaserequests/status (frontend My Purchases needs list; agent buy.py + controller remain the only writers). Live-patched into the running cluster too. 
--- internal/embed/infrastructure/helmfile.yaml | 6 + internal/monetizeapi/types_test.go | 108 +++++++++++ internal/x402/buyer/metrics_test.go | 191 ++++++++++++++++++++ internal/x402/buyer/proxy_test.go | 13 +- internal/x402/verifier_test.go | 109 +++++++++++ 5 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 internal/monetizeapi/types_test.go create mode 100644 internal/x402/buyer/metrics_test.go diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index d4df212d..fe6eb001 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -331,6 +331,12 @@ releases: - apiGroups: ["obol.org"] resources: ["serviceoffers", "serviceoffers/status"] verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — frontend My Purchases page lists buyer-side + # purchase records. Read-only; agent buy.py and the controller + # remain the only writers. + - apiGroups: ["obol.org"] + resources: ["purchaserequests", "purchaserequests/status"] + verbs: ["get", "list", "watch"] - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/internal/monetizeapi/types_test.go b/internal/monetizeapi/types_test.go new file mode 100644 index 00000000..77a15a90 --- /dev/null +++ b/internal/monetizeapi/types_test.go @@ -0,0 +1,108 @@ +package monetizeapi + +import ( + "encoding/json" + "testing" +) + +// TestPurchaseAutoRefill_JSONRoundTrip asserts every field on +// PurchaseAutoRefill marshals to JSON and unmarshals back without loss. The +// MaxTotal + MaxSpendPerDay fields were added to match the CRD spec; this test +// pins the wire format and `omitempty` semantics so silent drift between the +// Go struct and the CRD surfaces as a test failure. 
+func TestPurchaseAutoRefill_JSONRoundTrip(t *testing.T) { + tests := []struct { + name string + in PurchaseAutoRefill + wantJSON string + }{ + { + name: "all fields populated", + in: PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + }, + wantJSON: `{"enabled":true,"threshold":5,"count":10,"maxTotal":100,"maxSpendPerDay":"1.50"}`, + }, + { + name: "only enabled + new caps", + in: PurchaseAutoRefill{ + Enabled: true, + MaxTotal: 42, + MaxSpendPerDay: "0.05", + }, + wantJSON: `{"enabled":true,"maxTotal":42,"maxSpendPerDay":"0.05"}`, + }, + { + name: "zero values omit every field", + in: PurchaseAutoRefill{}, + wantJSON: `{}`, + }, + { + name: "MaxSpendPerDay alone", + in: PurchaseAutoRefill{ + MaxSpendPerDay: "0.0001", + }, + wantJSON: `{"maxSpendPerDay":"0.0001"}`, + }, + { + name: "MaxTotal alone", + in: PurchaseAutoRefill{ + MaxTotal: 7, + }, + wantJSON: `{"maxTotal":7}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotJSON, err := json.Marshal(tt.in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(gotJSON) != tt.wantJSON { + t.Fatalf("marshal:\n got: %s\nwant: %s", gotJSON, tt.wantJSON) + } + + var roundTripped PurchaseAutoRefill + if err := json.Unmarshal(gotJSON, &roundTripped); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if roundTripped != tt.in { + t.Fatalf("round-trip mismatch:\n got: %+v\nwant: %+v", roundTripped, tt.in) + } + }) + } +} + +// TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm asserts that a JSON document +// shaped like the CRD spec deserialises into every Go field — this is the +// inverse of the marshal direction and catches accidental json-tag drift. 
+func TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm(t *testing.T) { + const crdJSON = `{ + "enabled": true, + "threshold": 5, + "count": 10, + "maxTotal": 100, + "maxSpendPerDay": "1.50" + }` + + want := PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + } + + var got PurchaseAutoRefill + if err := json.Unmarshal([]byte(crdJSON), &got); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if got != want { + t.Fatalf("unmarshal mismatch:\n got: %+v\nwant: %+v", got, want) + } +} diff --git a/internal/x402/buyer/metrics_test.go b/internal/x402/buyer/metrics_test.go new file mode 100644 index 00000000..9ce4fabc --- /dev/null +++ b/internal/x402/buyer/metrics_test.go @@ -0,0 +1,191 @@ +package buyer + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" +) + +// TestPrometheusLabels_ChainPropagation asserts that prometheusLabels surfaces +// the `chain` label sourced from UpstreamConfig.Network so paid-request metrics +// can be partitioned by chain (base, base-sepolia, etc.). The empty-chain case +// is also exercised so the label is always rendered cleanly even when an +// upstream has no Network set. 
+func TestPrometheusLabels_ChainPropagation(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + want map[string]string + }{ + { + name: "base-sepolia chain propagates", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + want: map[string]string{ + "upstream": "upstream-a", + "remote_model": "qwen3.5:9b", + "chain": "base-sepolia", + }, + }, + { + name: "base mainnet chain propagates", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "base", + want: map[string]string{ + "upstream": "upstream-b", + "remote_model": "qwen3.5:4b", + "chain": "base", + }, + }, + { + name: "empty chain renders cleanly", + upstream: "upstream-c", + remoteModel: "qwen3.5:1b", + chain: "", + want: map[string]string{ + "upstream": "upstream-c", + "remote_model": "qwen3.5:1b", + "chain": "", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + if len(got) != len(tt.want) { + t.Fatalf("got %d labels, want %d (%v vs %v)", len(got), len(tt.want), got, tt.want) + } + for k, v := range tt.want { + if got[k] != v { + t.Errorf("label %q = %q, want %q", k, got[k], v) + } + } + }) + } +} + +// TestMetrics_ChainLabelScrapeRoundtrip increments each of the 9 buyer +// counters/gauges using prometheusLabels and then scrapes /metrics through the +// registry's handler, asserting the `chain` label appears (with the expected +// value) on every series. 
+func TestMetrics_ChainLabelScrapeRoundtrip(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + }{ + { + name: "base-sepolia label visible on every series", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + }, + { + name: "empty chain label is present and empty", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "", + }, + } + + // Every metric registered by newMetrics carries the same {upstream, + // remote_model, chain} label set. + wantFamilies := []string{ + "obol_x402_buyer_requests_total", + "obol_x402_buyer_payment_attempts_total", + "obol_x402_buyer_payment_success_total", + "obol_x402_buyer_payment_failure_total", + "obol_x402_buyer_confirm_spend_failure_total", + "obol_x402_buyer_payment_unsettled_confirmations_total", + "obol_x402_buyer_auth_remaining", + "obol_x402_buyer_auth_spent", + "obol_x402_buyer_active_model_mappings", + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := newMetrics() + labels := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + + // Counters: incremented once each. + m.requestsTotal.With(labels).Inc() + m.paymentAttempts.With(labels).Inc() + m.paymentSuccessTotal.With(labels).Inc() + m.paymentFailureTotal.With(labels).Inc() + m.confirmSpendFailureTotal.With(labels).Inc() + m.paymentUnsettledConfirmations.With(labels).Inc() + // Gauges: stamped with arbitrary non-zero values. 
+ m.authRemaining.With(labels).Set(7) + m.authSpent.With(labels).Set(3) + m.activeModelMappings.With(labels).Set(1) + + families := scrapeBuyerMetrics(t, m) + + wantLabels := map[string]string{ + "upstream": tt.upstream, + "remote_model": tt.remoteModel, + "chain": tt.chain, + } + for _, name := range wantFamilies { + fam, ok := families[name] + if !ok { + t.Errorf("missing metric family %s", name) + continue + } + if !buyerHasSeriesWithLabels(fam, wantLabels) { + t.Errorf("metric %s missing series with labels %v", name, wantLabels) + } + } + }) + } +} + +// scrapeBuyerMetrics renders the metrics registry through its HTTP handler and +// parses the Prometheus text exposition into a name → MetricFamily map. +func scrapeBuyerMetrics(t *testing.T, m *metrics) map[string]*dto.MetricFamily { + t.Helper() + + rec := httptest.NewRecorder() + m.handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil)) + if rec.Code != http.StatusOK { + t.Fatalf("metrics status = %d, want 200", rec.Code) + } + + var parser expfmt.TextParser + families, err := parser.TextToMetricFamilies(strings.NewReader(rec.Body.String())) + if err != nil { + t.Fatalf("parse metrics: %v", err) + } + return families +} + +// buyerHasSeriesWithLabels returns true iff `family` contains at least one +// series whose label set exactly equals `want`. 
+func buyerHasSeriesWithLabels(family *dto.MetricFamily, want map[string]string) bool { + for _, metric := range family.GetMetric() { + if len(metric.GetLabel()) != len(want) { + continue + } + match := true + for _, label := range metric.GetLabel() { + if want[label.GetName()] != label.GetValue() { + match = false + break + } + } + if match { + return true + } + } + return false +} diff --git a/internal/x402/buyer/proxy_test.go b/internal/x402/buyer/proxy_test.go index 55281c46..5b135720 100644 --- a/internal/x402/buyer/proxy_test.go +++ b/internal/x402/buyer/proxy_test.go @@ -1045,7 +1045,7 @@ func TestProxy_ModelRoutingAndMetrics(t *testing.T) { } metrics := scrapeMetricFamilies(t, proxy) - labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b"} + labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b", "chain": "base-sepolia"} assertMetricValue(t, metrics["obol_x402_buyer_requests_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_attempts_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_success_total"], labels, 1) @@ -1295,10 +1295,10 @@ func TestProxy_ReloadSkipsConsumedAuthsAndReplacesModelMapping(t *testing.T) { t.Fatalf("active model mapping series = %d, want 1", metricFamilyLen(activeMappings)) } - assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) - assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) + assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + 
assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) + assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) } func TestProxy_ReloadSamePurchasePreservesSpentAndAppendsAuthPool(t *testing.T) { @@ -1752,6 +1752,7 @@ func TestProxy_UpstreamSuccessNoSettlementHeader_IncrementsUnsettledMetric(t *te assertMetricValue(t, family, map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } @@ -1826,6 +1827,7 @@ func TestProxy_UpstreamSuccessWithSettlementHeader_DoesNotIncrementUnsettledMetr assertMetricMissing(t, metrics["obol_x402_buyer_payment_unsettled_confirmations_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }) } @@ -1901,6 +1903,7 @@ func TestProxy_ConfirmSpendFailure_IncrementsMetric(t *testing.T) { assertMetricValue(t, metrics["obol_x402_buyer_confirm_spend_failure_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index d6bbbced..6b97548d 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -10,6 +10,7 @@ import ( "strings" "sync/atomic" "testing" + "time" x402types "github.com/coinbase/x402/go/types" dto "github.com/prometheus/client_model/go" @@ -823,6 +824,114 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { assertVerifierMetricMissing(t, rejectMetrics["obol_x402_verifier_charged_requests_total"], labels) } +// TestVerifier_LastPaymentSuccessGauge asserts that the +// obol_x402_verifier_last_payment_success_seconds gauge is 
stamped to the +// current wall-clock time when a paid request succeeds, and is NOT touched +// when a request is rejected with 402, whether unpaid or carrying a rejected payment. +// +// The gauge is labeled identically to the verifier counters; for this rule +// `chain` is the empty string because the test RouteRule has no Network set. +func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { + labels := map[string]string{ + "route": "/rpc/*", + "offer_namespace": "llm", + "offer_name": "paid-rpc", + "chain": "", + } + + tests := []struct { + name string + setPayment bool + rejectPayment bool + wantStatus int + wantGaugeFresh bool // assert gauge ~= now() + }{ + { + name: "successful paid request stamps gauge", + setPayment: true, + rejectPayment: false, + wantStatus: http.StatusOK, + wantGaugeFresh: true, + }, + { + name: "unpaid 402 leaves gauge untouched", + setPayment: false, + rejectPayment: false, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + { + name: "rejected payment leaves gauge untouched", + setPayment: true, + rejectPayment: true, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{rejectPayment: tt.rejectPayment}) + v := newTestVerifier(t, fac.URL, []RouteRule{{ + Pattern: "/rpc/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "paid-rpc", + }}) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/rpc/mainnet") + req.Header.Set("X-Forwarded-Host", "obol.stack") + if tt.setPayment { + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + } + + before := time.Now().Unix() + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + after := time.Now().Unix() + + if rec.Code != tt.wantStatus { + t.Fatalf("status = %d, want %d", rec.Code, tt.wantStatus) + } + + families := scrapeVerifierMetrics(t, v) + gauge := 
families["obol_x402_verifier_last_payment_success_seconds"] + + if !tt.wantGaugeFresh { + // Either the family is absent (no series emitted) or no + // series exists for these labels — both are acceptable for + // an untouched gauge. + assertVerifierMetricMissing(t, gauge, labels) + return + } + + if gauge == nil { + t.Fatalf("missing metric family obol_x402_verifier_last_payment_success_seconds") + } + got := findVerifierMetricValue(t, gauge, labels) + // Allow ±5s slack for clock skew / slow CI. + if got < float64(before-5) || got > float64(after+5) { + t.Fatalf("gauge = %v, want within [%d, %d]", got, before-5, after+5) + } + }) + } +} + +// findVerifierMetricValue returns the value of the series in `family` whose +// labels match `wantLabels` exactly, failing the test if no such series exists. +func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 { + t.Helper() + + for _, metric := range family.GetMetric() { + if verifierLabelsMatch(metric, wantLabels) { + return verifierMetricValue(metric) + } + } + t.Fatalf("metric %s missing labels %v", family.GetName(), wantLabels) + return 0 +} + func scrapeVerifierMetrics(t *testing.T, v *Verifier) map[string]*dto.MetricFamily { t.Helper() From 27e1ac594dc53f4b0736bd566efa5fa4219b063f Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 21:28:14 +0400 Subject: [PATCH 04/31] chore(x402): RBAC trim, ServiceMonitor, relocate monitoring YAML, recording rules Phase 1 + Phase 2 hardening on top of the chain-label/last-settlement work, incorporating findings from the 4-agent K8s architecture review. Skips the auth-on-mutating-endpoints item per operator clarification: the obol-stack frontend is local-only behind the obol.stack hostname restriction, so it's not the primary trust boundary. RBAC trims: - Drop `secrets get/list` from obol-frontend-openclaw-discovery ClusterRole; pre-existing dangling grant, no code reads them. 
- Drop /status subresource from purchaserequests rule; frontend never writes status (only the controller does). Monitoring + RBAC co-location (kills 3 bedag/raw helmfile releases): - x402-verifier: PodMonitor -> ServiceMonitor in base/templates/x402.yaml. Verifier has a stable Service on port http:8080; ServiceMonitor scrapes the endpoint cleanly across replicas. - litellm-x402-buyer: PodMonitor moved into base/templates/llm.yaml. Stays a PodMonitor because the sidecar's port 8402 is per-pod, not fronted by a Service. - obol-frontend RBAC moved into base/templates/obol-frontend-rbac.yaml next to the workload it grants. Label cardinality: - Drop `route` label from verifier metrics. (offer_namespace, offer_name, chain) already uniquely scopes a paid route; `route` (= rule.Pattern) was redundant and unbounded by path fragments. PrometheusRule (new base/templates/x402-prometheus-rules.yaml): - Recording: x402:revenue:24h_by_offer_chain, x402:revenue:7d_by_offer_chain, x402:revenue:lifetime_by_offer, x402:settlement_rate:1h_by_offer_chain. The frontend's PrometheusClient reads these so renaming raw metrics no longer breaks the UI, and the `increase()` 2-sample minimum no longer leaves cold offers at "0" for the first 30s of traffic. - Alerting: X402PaymentFailureRateHigh (>10% over 1h), X402NoSettlementsAfterChallenge (402s issued, no charges). Deferred (out of scope for this hardening pass): - Frontend-egress NetworkPolicy: on k3s + Flannel the apiserver Service endpoints point at the host process, outside the cluster pod/service CIDRs. A clean allowlist policy can't target the apiserver portably without an install-specific ipBlock; revisit when obol-stack ships a non-k3s deployment surface. - obol-marketplace-api aggregator service: overkill for the local single-operator context. - Three-deployment-paths consolidation (helmfile + bedag/raw + Go `EnsureVerifier`): larger refactor; tracked as separate workstream. 
Live validation: - 2 paid requests against demo-hello survive both the RBAC trims and the ServiceMonitor swap. `x402:revenue:7d_by_offer_chain` returns 1.0076 for chain=eip155:84532 (matches the underlying obol_x402_verifier_charged_requests_total counter at value 2 over 2 samples). - /api/marketplace/purchases still returns 200 after dropping the /status grant. - /api/agents/wallets returns the agent wallet via the new batched listAllWalletMetadata path (1 ConfigMap list vs N+1 per-instance). --- .../infrastructure/base/templates/llm.yaml | 24 +++ .../base/templates/obol-frontend-rbac.yaml | 53 +++++++ .../base/templates/x402-prometheus-rules.yaml | 139 ++++++++++++++++++ .../infrastructure/base/templates/x402.yaml | 26 ++++ internal/embed/infrastructure/helmfile.yaml | 111 ++------------ internal/x402/metrics.go | 12 +- internal/x402/verifier.go | 5 +- internal/x402/verifier_test.go | 3 - 8 files changed, 266 insertions(+), 107 deletions(-) create mode 100644 internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml create mode 100644 internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..18982349 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -298,3 +298,27 @@ spec: port: 4000 targetPort: http protocol: TCP + +--- +# PodMonitor for the x402-buyer sidecar — kept as PodMonitor (not +# ServiceMonitor) because the sidecar listens on a per-pod port (8402) +# that is NOT exposed via the litellm Service. Lives alongside the +# Deployment that hosts it so changing the buyer port here is one edit. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label. 
+apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: litellm-x402-buyer + namespace: llm + labels: + release: monitoring + app: litellm +spec: + selector: + matchLabels: + app: litellm + podMetricsEndpoints: + - port: buyer-http + path: /metrics + interval: 30s diff --git a/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml new file mode 100644 index 00000000..038df594 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml @@ -0,0 +1,53 @@ +--- +# RBAC for the obol-frontend pod's ServiceAccount. +# +# The frontend pod uses this SA's bearer token to: +# - Discover OpenClaw / Hermes instances (namespaces, pods, configmaps) +# - List + mutate ServiceOffer CRs (sell-modal + pause/resume/delete row actions) +# - List PurchaseRequest CRs (My Purchases page; never writes) +# +# The frontend is local-only behind the obol.stack hostname restriction +# (the operator owns the cluster), so this is a single trust boundary. +# Defense-in-depth note: the `secrets` rule is intentionally omitted — +# no code path reads them and the SA token shouldn't have that reach. +# /status subresources are omitted from PurchaseRequest because the +# controller is the only writer. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods", "configmaps"] + verbs: ["get", "list"] + # ServiceOffer CRD — frontend sell modal creates offers, row actions + # pause/resume (annotation patch) and delete. + - apiGroups: ["obol.org"] + resources: ["serviceoffers", "serviceoffers/status"] + verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — frontend My Purchases page lists buyer-side + # records. 
Read-only; agent buy.py and the controller are the writers. + - apiGroups: ["obol.org"] + resources: ["purchaserequests"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: obol-frontend-openclaw-discovery +subjects: + - kind: ServiceAccount + name: obol-frontend + namespace: obol-frontend diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml new file mode 100644 index 00000000..73b10f94 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -0,0 +1,139 @@ +--- +# Recording + alerting rules for x402 verifier traffic. +# +# Recording rules pre-aggregate the queries that the frontend's +# /api/sell/list joins use (chargedSalesByOfferAndChain, +# chargedRequests24hByOffer). The frontend reads the recorded series +# directly, which: +# * removes the `increase()` 2-sample minimum quirk (cold offers no +# longer show "0" for the first 30s after they receive traffic), +# * decouples the page from raw metric names (renaming +# obol_x402_verifier_charged_requests_total no longer breaks the UI), +# * cuts query cost on dashboards / page reloads (sum is done once at +# evaluation time, not per page-load). +# +# Alerting rules surface the two operator-meaningful failure modes the +# release-smoke flows historically caught manually. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl). 
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: x402-verifier
+  namespace: x402
+  labels:
+    release: monitoring
+    app: x402-verifier
+spec:
+  groups:
+    - name: x402.recording
+      interval: 30s
+      rules:
+        # 24h charged-request count per (offer, chain). Replaces the
+        # frontend's `increase(charged_requests_total[24h])` query — same
+        # math, pre-computed every 30s.
+        - record: x402:revenue:24h_by_offer_chain
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              increase(obol_x402_verifier_charged_requests_total[24h])
+            )
+
+        # 7d charged-request count per (offer, chain). Powers the
+        # EarningsStrip per-chain × CRD price multiplication.
+        - record: x402:revenue:7d_by_offer_chain
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              increase(obol_x402_verifier_charged_requests_total[7d])
+            )
+
+        # Lifetime charged-request count per offer (sum across replicas
+        # + chains). Used in the My Listings "today · X earned" header
+        # text and the Browse catalog usage badge.
+        - record: x402:revenue:lifetime_by_offer
+          expr: |
+            sum by (offer_namespace, offer_name) (
+              obol_x402_verifier_charged_requests_total
+            )
+
+        # Settlement rate (verified / attempted) over the last hour, per
+        # (offer, chain). Denominator is filtered `> 0` (not clamped to 1 req/s), so the value is a true ratio and idle offers emit no sample.
+        - record: x402:settlement_rate:1h_by_offer_chain
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              rate(obol_x402_verifier_payment_verified_total[1h])
+            )
+            /
+            (
+              sum by (offer_namespace, offer_name, chain) (
+                rate(obol_x402_verifier_payment_required_total[1h])
+                +
+                rate(obol_x402_verifier_payment_verified_total[1h])
+                +
+                rate(obol_x402_verifier_payment_failed_total[1h])
+              )
+              > 0
+            )
+
+    - name: x402.alerting
+      rules:
+        # Payment-failure ratio crossed 10% over the last hour for a paid
+        # route that's actually receiving traffic (the `> 0` filter on the denominator drops idle offers). Typical cause:
+        # facilitator unreachable, chain pruning, or seller's CA bundle
+        # missing (CLAUDE.md pitfall #8).
+        - alert: X402PaymentFailureRateHigh
+          expr: |
+            (
+              sum by (offer_namespace, offer_name, chain) (
+                rate(obol_x402_verifier_payment_failed_total[1h])
+              )
+              /
+              (
+                sum by (offer_namespace, offer_name, chain) (
+                  rate(obol_x402_verifier_payment_failed_total[1h])
+                  +
+                  rate(obol_x402_verifier_payment_verified_total[1h])
+                )
+                > 0
+              )
+            ) > 0.10
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "x402 payment failures > 10% on {{ $labels.offer_namespace }}/{{ $labels.offer_name }} ({{ $labels.chain }})"
+            description: |
+              More than 10% of paid requests to
+              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} on
+              {{ $labels.chain }} have failed verification over the last
+              hour. Check the verifier logs for x509/facilitator errors and
+              the seller's `ca-certificates` ConfigMap.
+
+        # An offer received a 402 (payment_required) within the last hour
+        # but no charged_requests happened in the same window. Written with `unless` so an offer whose charged counter has no series yet still matches. Either
+        # buyers aren't completing the flow, or settlement is broken
+        # downstream of the verifier.
+        - alert: X402NoSettlementsAfterChallenge
+          expr: |
+            (
+              sum by (offer_namespace, offer_name) (
+                increase(obol_x402_verifier_payment_required_total[1h])
+              ) > 0
+            )
+            unless
+            (
+              sum by (offer_namespace, offer_name) (
+                increase(obol_x402_verifier_charged_requests_total[1h])
+              ) > 0
+            )
+          for: 30m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ $labels.offer_namespace }}/{{ $labels.offer_name }} returns 402 but never settles"
+            description: |
+              The x402 verifier issued 402 responses for
+              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} in the
+              last hour but observed no settled requests. Check the buyer
+              sidecar's auth pool (/status) and the facilitator's settlement
+              endpoint.
diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..38482384 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -332,3 +332,29 @@ spec: selector: matchLabels: app: x402-verifier + +--- +# ServiceMonitor for x402-verifier — scrapes the stable Service endpoint +# rather than per-pod IPs (which is what a PodMonitor would do). Lives +# alongside the Service it observes so adding/changing the port or +# selector here is a single-file change. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl as the serviceMonitorSelector). +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + app: x402-verifier +spec: + selector: + matchLabels: + app: x402-verifier + endpoints: + - port: http + path: /metrics + interval: 30s + scrapeTimeout: 10s diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index fe6eb001..95f7c8b5 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -44,57 +44,12 @@ releases: values: - ./values/monitoring.yaml.gotmpl - - name: llm-buyer-podmonitor - namespace: llm - createNamespace: true - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: litellm-x402-buyer - namespace: llm - labels: - release: monitoring - spec: - selector: - matchLabels: - app: litellm - podMetricsEndpoints: - - port: buyer-http - path: /metrics - interval: 30s - - - name: x402-verifier-podmonitor - namespace: x402 - createNamespace: false - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - 
kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: x402-verifier - namespace: x402 - labels: - release: monitoring - spec: - selector: - matchLabels: - app: x402-verifier - podMetricsEndpoints: - - port: http - path: /metrics - interval: 30s + # NOTE: PodMonitor for litellm-x402-buyer and ServiceMonitor for + # x402-verifier moved into base/templates/llm.yaml and + # base/templates/x402.yaml respectively. They live alongside the + # workloads they observe so a port/selector edit is one-file. Kills + # two `bedag/raw` releases. kube-prometheus-stack picks them up via + # the `release: monitoring` label. # Traefik ingress controller with Gateway API support # Traefik v38+ bundles Gateway API CRDs in its crds/ directory @@ -305,49 +260,11 @@ releases: - name: obol-frontend-obol-app port: 3000 - # Obol Frontend RBAC (OpenClaw instance discovery via Kubernetes API) - - name: obol-frontend-rbac - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRole - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - rules: - - apiGroups: [""] - resources: ["namespaces"] - verbs: ["get", "list"] - - apiGroups: [""] - resources: ["pods", "configmaps", "secrets"] - verbs: ["get", "list"] - # ServiceOffer CRD — frontend sell modal creates offers - - apiGroups: ["obol.org"] - resources: ["serviceoffers", "serviceoffers/status"] - verbs: ["get", "list", "create", "update", "patch", "delete"] - # PurchaseRequest CRD — frontend My Purchases page lists buyer-side - # purchase records. Read-only; agent buy.py and the controller - # remain the only writers. 
- - apiGroups: ["obol.org"] - resources: ["purchaserequests", "purchaserequests/status"] - verbs: ["get", "list", "watch"] - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: obol-frontend-openclaw-discovery - subjects: - - kind: ServiceAccount - name: obol-frontend - namespace: obol-frontend + # NOTE: obol-frontend-rbac ClusterRole + ClusterRoleBinding moved into + # base/templates/obol-frontend-rbac.yaml. Co-located with the workload + # they grant; kills a `bedag/raw` release. Frontend-egress NetworkPolicy + # was attempted and reverted — on k3s + Flannel (k3d's default CNI) the + # kubernetes apiserver Service Endpoints point at the host process, + # outside the cluster pod/service CIDRs. A clean allowlist can't target + # the apiserver portably without an install-specific ipBlock for the k3s + # host IP. Tracking as a deferred hardening item. 
diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 42266c2b..8734a58d 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -26,42 +26,42 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentRequired: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests rejected after a provided x402 payment failed verification.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), lastPaymentSuccess: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_verifier_last_payment_success_seconds", Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + 
[]string{"offer_namespace", "offer_name", "chain"}, ), } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index b4451508..38437ec3 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -448,8 +448,11 @@ func (r *statusRecorder) WriteHeader(status int) { } func prometheusLabels(rule *RouteRule) prometheus.Labels { + // `route` (= rule.Pattern) was dropped in favor of (offer_namespace, + // offer_name) which already uniquely identifies a paid route — the + // pattern was redundant and unbounded by path fragments, which would + // have ballooned series count for sellers running many granular routes. return prometheus.Labels{ - "route": rule.Pattern, "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, "chain": rule.Network, diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 6b97548d..4bfd46a4 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -753,7 +753,6 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { metrics := scrapeVerifierMetrics(t, v) labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", @@ -767,7 +766,6 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", @@ -833,7 +831,6 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { // `chain` is the empty string because the test RouteRule has no Network set. 
func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", From 0fbb99ae93c60975f64beed49b92e4802b0bbac5 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 21:53:16 +0400 Subject: [PATCH 05/31] fix(x402): GC verifier metric series for deleted offers The verifier's per-offer counters and the last_payment_success_seconds gauge were created on first use and never removed. Deleting an offer (via `obol sell delete`, ServiceOffer CR deletion, or pricing config edit) left stale series in the registry forever, which: * pollutes My Listings / dashboards with rows for offers that no longer exist, * lets X402NoSettlementsAfterChallenge keep referencing dead labels, * silently inflates the "last successful charge" gauge with timestamps from offers the operator already retired. Verifier.load() now diffs the incoming route set against the live label tuples in the registry and calls DeletePartialMatch on each vec for every (offer_namespace, offer_name, chain) triple that is no longer served. Both reload paths (file config watcher and the kube ServiceOffer informer via ConfigAccumulator) funnel through load(), so one hook covers everything. Also fixes a guard test from the prior hardening commit that was still asserting the old "no ServiceMonitor here" invariant after we intentionally relocated the ServiceMonitor into this manifest. Flipped to assert presence so a future cleanup can't silently drop it. Test: TestVerifier_Reload_PrunesDeletedOfferSeries stamps two offers' worth of metrics, reloads with one removed, and asserts the removed offer is gone from all six vecs while the kept offer survives. 
--- internal/x402/metrics.go | 58 ++++++++++++++++++++++++ internal/x402/setup_test.go | 8 +++- internal/x402/verifier.go | 14 ++++++ internal/x402/verifier_test.go | 83 ++++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 2 deletions(-) diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 8734a58d..b445d4c3 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -80,3 +80,61 @@ func newVerifierMetrics() *verifierMetrics { func (m *verifierMetrics) handler() http.Handler { return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{}) } + +// pruneSeriesNotIn drops every (offer_namespace, offer_name, chain) series +// from the verifier's counter/gauge vecs that is not present in `keep`. +// Called from Verifier.load whenever the route set changes so deleted offers +// (e.g. `obol sell delete`) stop emitting stale series — most importantly the +// last_payment_success_seconds gauge, which would otherwise hold the deleted +// offer's last-success timestamp forever and falsely satisfy "recent activity" +// alerts and dashboards. +// +// Key shape: "ns\x00name\x00chain" — \x00 is forbidden in Kubernetes object +// names and CAIP-2 chain ids, so the byte-join can't collide. 
+func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) { + vecs := []interface { + DeletePartialMatch(prometheus.Labels) int + }{ + m.requestsTotal, + m.paymentRequired, + m.paymentVerified, + m.paymentFailed, + m.chargedRequests, + m.lastPaymentSuccess, + } + + gathered, err := m.registry.Gather() + if err != nil { + return + } + for _, family := range gathered { + for _, metric := range family.GetMetric() { + labels := metric.GetLabel() + ns, name, chain := "", "", "" + for _, l := range labels { + switch l.GetName() { + case "offer_namespace": + ns = l.GetValue() + case "offer_name": + name = l.GetValue() + case "chain": + chain = l.GetValue() + } + } + if ns == "" && name == "" { + continue + } + if _, ok := keep[ns+"\x00"+name+"\x00"+chain]; ok { + continue + } + match := prometheus.Labels{ + "offer_namespace": ns, + "offer_name": name, + "chain": chain, + } + for _, vec := range vecs { + vec.DeletePartialMatch(match) + } + } + } +} diff --git a/internal/x402/setup_test.go b/internal/x402/setup_test.go index ff8b7652..7dba813e 100644 --- a/internal/x402/setup_test.go +++ b/internal/x402/setup_test.go @@ -258,7 +258,11 @@ func TestX402Manifest_UsesServiceOfferControllerModel(t *testing.T) { if !strings.Contains(manifest, "resources: [\"serviceoffers\"]") { t.Fatalf("x402 manifest missing serviceoffer watch RBAC:\n%s", manifest) } - if strings.Contains(manifest, "kind: ServiceMonitor") { - t.Fatalf("x402 manifest still includes legacy ServiceMonitor stanza:\n%s", manifest) + // ServiceMonitor now lives in this manifest by design — relocated here + // from a bedag/raw helmfile release so the scrape config sits next to + // the Service it observes. Assert presence so a future cleanup can't + // silently drop it. 
+ if !strings.Contains(manifest, "kind: ServiceMonitor") { + t.Fatalf("x402 manifest missing ServiceMonitor (relocated from bedag/raw helmfile in PR #513 hardening):\n%s", manifest) } } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 38437ec3..60e2fa80 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -64,6 +64,20 @@ func (v *Verifier) load(cfg *PricingConfig) error { v.chains.Store(&chains) v.config.Store(cfg) + // Drop metric series for offers that are no longer in the route set. + // Without this, deleting an offer leaves its counters + last-success + // gauge in the registry forever, polluting dashboards and silently + // keeping alerts (e.g. "no settlements after challenge") tied to dead + // labels. + live := make(map[string]struct{}, len(cfg.Routes)) + for _, r := range cfg.Routes { + if r.OfferNamespace == "" && r.OfferName == "" { + continue + } + live[r.OfferNamespace+"\x00"+r.OfferName+"\x00"+r.Network] = struct{}{} + } + v.metrics.pruneSeriesNotIn(live) + return nil } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 4bfd46a4..3b62c815 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -915,6 +915,89 @@ func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { } } +// TestVerifier_Reload_PrunesDeletedOfferSeries asserts that when an offer is +// removed from the route set (via Reload, the same path used by both the +// file-config watcher and the kube ServiceOffer informer), its previously +// stamped metric series are dropped from the registry. Without this, deleted +// offers' last_payment_success_seconds gauge would survive forever and keep +// firing/silencing alerts on dead labels. 
+func TestVerifier_Reload_PrunesDeletedOfferSeries(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + keptRoute := RouteRule{ + Pattern: "/keep/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "keep", + } + removedRoute := RouteRule{ + Pattern: "/gone/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "gone", + } + v := newTestVerifier(t, fac.URL, []RouteRule{keptRoute, removedRoute}) + + // Stamp metrics for both offers with a successful paid request each. + for _, path := range []string{"/keep/x", "/gone/x"} { + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", path) + req.Header.Set("X-Forwarded-Host", "obol.stack") + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("setup paid request to %s: status=%d", path, rec.Code) + } + } + + keptLabels := map[string]string{"offer_namespace": "llm", "offer_name": "keep", "chain": ""} + goneLabels := map[string]string{"offer_namespace": "llm", "offer_name": "gone", "chain": ""} + + families := scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + family := families[name] + if family == nil { + t.Fatalf("baseline: missing %s before reload", name) + } + findVerifierMetricValue(t, family, keptLabels) + findVerifierMetricValue(t, family, goneLabels) + } + + // Reload with the second offer dropped — the same path ServiceOffer + // deletion takes through ConfigAccumulator.SetRoutes. 
+ if err := v.Reload(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: fac.URL, + Routes: []RouteRule{keptRoute}, + }); err != nil { + t.Fatalf("Reload: %v", err) + } + + families = scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_requests_total", + "obol_x402_verifier_payment_required_total", + "obol_x402_verifier_payment_verified_total", + "obol_x402_verifier_payment_failed_total", + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + assertVerifierMetricMissing(t, families[name], goneLabels) + } + + // Kept offer's series must survive the prune. + if charged := families["obol_x402_verifier_charged_requests_total"]; charged != nil { + findVerifierMetricValue(t, charged, keptLabels) + } + if gauge := families["obol_x402_verifier_last_payment_success_seconds"]; gauge != nil { + findVerifierMetricValue(t, gauge, keptLabels) + } +} + // findVerifierMetricValue returns the value of the series in `family` whose // labels match `wantLabels` exactly, failing the test if no such series exists. func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 { From 9be9de8dcf4eb01bd3b8c157082bbc3d57da290b Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:32:13 +0400 Subject: [PATCH 06/31] =?UTF-8?q?fix(x402):=20verifier=20replicas:=202=20?= =?UTF-8?q?=E2=86=92=201=20to=20keep=20metric=20GC=20correct?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 0fbb99a (fix(x402): GC verifier metric series for deleted offers) added pruneSeriesNotIn to Verifier.load. Each verifier pod runs its own informer + its own metric registry, so the GC is per-pod. 
With replicas: 2 + ServiceMonitor (round-robin scrape over Endpoints), Prometheus sees: * one pod's registry on scrape N (pruned correctly), * the other pod's on scrape N+1 (may still hold a deleted offer's series until that pod's informer also sees the delete). Result: deleted offers' last_payment_success_seconds gauge and charged_requests_total counters reappear every other scrape, polluting dashboards and creating spurious alert state. Cheapest correct fix is replicas: 1. The verifier is on the request path but single-node k3d gains no HA from 2 replicas. Drop the PodDisruptionBudget too — minAvailable:1 at replicas:1 just blocks voluntary drains on the only pod, useless on k3d. If/when the stack ever runs multi-node and HA replicas are wanted, the right pattern is ServiceMonitor → PodMonitor with a `pod` label and recording rules using `sum without(pod)`. That's a future change; right now correctness > theoretical HA. --- .../infrastructure/base/templates/x402.yaml | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 38482384..25b97ede 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -200,7 +200,11 @@ metadata: labels: app: x402-verifier spec: - replicas: 2 + # Single replica — verifier holds per-pod metric registries and per-pod + # informer caches; multiple replicas produce metric series drift across + # ServiceMonitor scrape rotations and the pruneSeriesNotIn GC (metrics.go) + # becomes inconsistent. Single-node k3d gains no HA from 2 replicas. 
+ replicas: 1 selector: matchLabels: app: x402-verifier @@ -321,18 +325,6 @@ spec: targetPort: http protocol: TCP ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: x402-verifier - namespace: x402 -spec: - minAvailable: 1 - selector: - matchLabels: - app: x402-verifier - --- # ServiceMonitor for x402-verifier — scrapes the stable Service endpoint # rather than per-pod IPs (which is what a PodMonitor would do). Lives From 522aeaeabe83b306e589d89bca8395a2200524ef Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:32:56 +0400 Subject: [PATCH 07/31] fix(x402-metrics): align Prometheus retention with recording-rule windows; rename mis-named lifetime rule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related metric-correctness fixes layered on top of the recording rules added in 27e1ac5: 1. Retention 6h → 8d. The recording rules added in 27e1ac5 use [24h] and [7d] windows. `increase(x[24h])` against a 6h-retention TSDB silently returns "last 6h extrapolated to 24h" with no error. The frontend displays that result as "24h revenue" — wrong by 4x. 8d (= 7d + 1d safety margin) keeps the [7d] rule valid across a brief Prometheus outage. 2. `x402:revenue:lifetime_by_offer` → `x402:revenue:total_by_offer_current`. The original expression was `sum(counter)` (not `sum(increase[lifetime])`), so it: * is NOT lifetime — it's "sum across currently-alive verifier replicas of their since-last-restart counts", * drops ~50% on every replica rollout, * compounds with the per-pod-registry issue addressed by the replicas:1 fix. Renaming makes the semantic explicit. True lifetime queries should use `increase(...[Nd])` against a long-retention store. Retention bump increases Prometheus disk footprint roughly proportional to (8d/6h) ≈ 32x.
The local-only kube-prometheus-stack PVC sizing in monitoring.yaml.gotmpl needs review on next `obol stack up` if disk pressure shows up — currently no PVC size cap set, so it inherits the storageClass default. --- .../base/templates/x402-prometheus-rules.yaml | 9 +++++---- .../embed/infrastructure/values/monitoring.yaml.gotmpl | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml index 73b10f94..d0dbe754 100644 --- a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -47,10 +47,11 @@ spec: increase(obol_x402_verifier_charged_requests_total[7d]) ) - # Lifetime charged-request count per offer (sum across replicas - # + chains). Used in the My Listings "today · X earned" header - # text and the Browse catalog usage badge. - - record: x402:revenue:lifetime_by_offer + # Sum of currently-running verifier replicas' counters — resets + # on rollout; for true lifetime, query against a long-retention + # store or use `sum_over_time(...[Nd])`. Used in the My Listings + # "today · X earned" header text and the Browse catalog usage badge. 
+ - record: x402:revenue:total_by_offer_current expr: | sum by (offer_namespace, offer_name) ( obol_x402_verifier_charged_requests_total diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl index 18e6ba01..e440bd0d 100644 --- a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -11,7 +11,7 @@ prometheus: matchLabels: release: monitoring podMonitorNamespaceSelector: {} - retention: 6h + retention: 8d resources: requests: cpu: 100m From fdb86b380265d9f091f4df6a683cecba41634932 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:34:48 +0400 Subject: [PATCH 08/31] chore(images): digest-pin verifier, controller, litellm, cloudflared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the @sha256 digest discipline that x402-buyer and the frontend already carry to the remaining four images that ship as part of the embedded infrastructure. Tag-only refs (e.g. ghcr.io/obolnetwork/ x402-verifier:b13254e) are vulnerable to mutable-tag rewrites — the class of bug CLAUDE.md pitfall #12 documented as a real production fire. Pins: - x402-verifier:b13254e @ sha256:a8a7aa0ca4c35b0ddf6983fa6e3e5f8a3f64e44d8e506ebfd55e39de2bc0342d - serviceoffer-controller:b13254e @ sha256:f83bd7e55bdc5d87edb49c04e7fd9257097364e2d43e769c19dfd7c8b47d07af - litellm:sha-c16b156 @ sha256:9f112b51ac5a57d73cdd54103fb98d24eabaddd8689a9a285884dca6456dc86e - cloudflared:2026.3.0 @ sha256:6b599ca3e974349ead3286d178da61d291961182ec3fe9c505e1dd02c8ac31b0 Adds a regression test asserting every embedded manifest carries @sha256: on its image refs so a future dependency bump can't silently revert to tag-only. Dev-rewrite invariant (defaults.go:124 + setup.go:74 alternation regex) verified intact via go test ./internal/defaults/... ./internal/x402/... 
--- internal/embed/embed_image_pin_test.go | 132 ++++++++++++++++++ .../infrastructure/base/templates/llm.yaml | 2 +- .../infrastructure/base/templates/x402.yaml | 4 +- .../infrastructure/cloudflared/values.yaml | 2 +- 4 files changed, 136 insertions(+), 4 deletions(-) diff --git a/internal/embed/embed_image_pin_test.go b/internal/embed/embed_image_pin_test.go index 36dd1676..1517bf76 100644 --- a/internal/embed/embed_image_pin_test.go +++ b/internal/embed/embed_image_pin_test.go @@ -136,3 +136,135 @@ func TestEmbeddedImages_NoNewLatestTags(t *testing.T) { strings.Join(stale, "\n ")) } } + +// TestEmbeddedImages_NamedImagesAreDigestPinned guards the @sha256: discipline +// for the cluster-side container images that ship as part of the embedded +// infrastructure. Tag-only refs (e.g. `:b13254e`) are vulnerable to mutable-tag +// rewrites — the class of supply-chain bug CLAUDE.md pitfall #12 documented +// after a real local-cluster incident. +// +// Adding a new image to this list MUST be accompanied by an `@sha256:DIGEST` +// suffix on the `image:` line (or, for Helm value files, on the `tag:` field +// such that the rendered manifest produces `REPO:TAG@sha256:DIGEST`). +// +// To regenerate a digest: +// +// docker buildx imagetools inspect REPO:TAG --format '{{ .Manifest.Digest }}' +func TestEmbeddedImages_NamedImagesAreDigestPinned(t *testing.T) { + cases := []struct { + file string + // repo is the substring used to locate the relevant line. The match + // is line-scoped — the line must also contain @sha256: to pass. 
+ repo string + }{ + // internal/embed/infrastructure/base/templates/x402.yaml + {file: "base/templates/x402.yaml", repo: "ghcr.io/obolnetwork/x402-verifier"}, + {file: "base/templates/x402.yaml", repo: "ghcr.io/obolnetwork/serviceoffer-controller"}, + // internal/embed/infrastructure/base/templates/llm.yaml + {file: "base/templates/llm.yaml", repo: "ghcr.io/obolnetwork/litellm"}, + {file: "base/templates/llm.yaml", repo: "ghcr.io/obolnetwork/x402-buyer"}, + } + + for _, tc := range cases { + t.Run(tc.repo, func(t *testing.T) { + data, err := ReadInfrastructureFile(tc.file) + if err != nil { + t.Fatalf("read %s: %v", tc.file, err) + } + + var ( + found bool + offenders []string + ) + + scanner := bufio.NewScanner(bytes.NewReader(data)) + scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024) + + lineNum := 0 + for scanner.Scan() { + lineNum++ + line := scanner.Text() + + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "#") { + continue + } + // Must look like a Kubernetes container `image:` field, not a + // random doc-comment or env var. + if !strings.Contains(trimmed, "image:") { + continue + } + if !strings.Contains(line, tc.repo) { + continue + } + + found = true + if !strings.Contains(line, "@sha256:") { + offenders = append(offenders, + fmt.Sprintf("%s:%d → %q lacks @sha256: digest pin", tc.file, lineNum, strings.TrimSpace(line))) + } + } + + if err := scanner.Err(); err != nil { + t.Fatalf("scan %s: %v", tc.file, err) + } + + if !found { + t.Fatalf("no image: line containing %q found in %s — has the image been renamed or moved? "+ + "Update this test alongside the manifest change.", tc.repo, tc.file) + } + + if len(offenders) > 0 { + t.Fatalf("digest-pin discipline broken in %s:\n %s\n\n"+ + "Pin the image as `:@sha256:`. 
Resolve with:\n"+ + " docker buildx imagetools inspect %s: --format '{{ .Manifest.Digest }}'", + tc.file, strings.Join(offenders, "\n "), tc.repo) + } + }) + } +} + +// TestEmbeddedImages_CloudflaredHelmTagIsDigestPinned covers the cloudflared +// chart, which uses the Helm idiom `image.repository` + `image.tag` rather +// than a literal `image:` line. The chart template renders +// `REPOSITORY:TAG`; embedding `@sha256:DIGEST` inside `.tag` produces +// a valid digest-pinned ref at render time and preserves the same +// mutable-tag protection. +func TestEmbeddedImages_CloudflaredHelmTagIsDigestPinned(t *testing.T) { + data, err := ReadInfrastructureFile("cloudflared/values.yaml") + if err != nil { + t.Fatalf("read cloudflared/values.yaml: %v", err) + } + + var tagLine string + + scanner := bufio.NewScanner(bytes.NewReader(data)) + scanner.Buffer(make([]byte, 0, 1024*1024), 1024*1024) + + for scanner.Scan() { + line := scanner.Text() + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "#") { + continue + } + if strings.HasPrefix(trimmed, "tag:") { + tagLine = line + break + } + } + + if err := scanner.Err(); err != nil { + t.Fatalf("scan cloudflared/values.yaml: %v", err) + } + + if tagLine == "" { + t.Fatal("no `tag:` field found in cloudflared/values.yaml — chart layout changed; update this test.") + } + + if !strings.Contains(tagLine, "@sha256:") { + t.Fatalf("cloudflared image tag is not digest-pinned: %q\n\n"+ + "Pin it as `tag: \"@sha256:\"`. 
Resolve with:\n"+ + " docker buildx imagetools inspect cloudflare/cloudflared: --format '{{ .Manifest.Digest }}'", + strings.TrimSpace(tagLine)) + } +} diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..2d35138f 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -148,7 +148,7 @@ spec: # No Postgres required — /model/new and /model/delete work via # in-memory router + config.yaml persistence. # Source: https://github.com/ObolNetwork/litellm - image: ghcr.io/obolnetwork/litellm:sha-c16b156 + image: ghcr.io/obolnetwork/litellm:sha-c16b156@sha256:9f112b51ac5a57d73cdd54103fb98d24eabaddd8689a9a285884dca6456dc86e imagePullPolicy: IfNotPresent args: - --config diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..b272215e 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -212,7 +212,7 @@ spec: serviceAccountName: x402-verifier containers: - name: verifier - image: ghcr.io/obolnetwork/x402-verifier:b13254e + image: ghcr.io/obolnetwork/x402-verifier:b13254e@sha256:a8a7aa0ca4c35b0ddf6983fa6e3e5f8a3f64e44d8e506ebfd55e39de2bc0342d imagePullPolicy: IfNotPresent ports: - name: http @@ -283,7 +283,7 @@ spec: serviceAccountName: serviceoffer-controller containers: - name: controller - image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e + image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e@sha256:f83bd7e55bdc5d87edb49c04e7fd9257097364e2d43e769c19dfd7c8b47d07af imagePullPolicy: IfNotPresent env: - name: POD_NAMESPACE diff --git a/internal/embed/infrastructure/cloudflared/values.yaml b/internal/embed/infrastructure/cloudflared/values.yaml index a41a4715..8ff1670c 100644 --- a/internal/embed/infrastructure/cloudflared/values.yaml +++ 
b/internal/embed/infrastructure/cloudflared/values.yaml @@ -5,7 +5,7 @@ transport: image: repository: cloudflare/cloudflared - tag: "2026.3.0" + tag: "2026.3.0@sha256:6b599ca3e974349ead3286d178da61d291961182ec3fe9c505e1dd02c8ac31b0" metrics: address: "0.0.0.0:2000" From 7896384b1ae5b17496ccd989b36d6a4e14a405e7 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:45:16 +0400 Subject: [PATCH 09/31] feat(controller): wire client-go leader-election so HA scaling is safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today the serviceoffer-controller is pinned at replicas: 1 with a "Do not scale" comment in x402.yaml. The RBAC for leases is already granted (x402.yaml:176-178) — pre-positioned and unused. An accidental `kubectl scale --replicas=2` or HPA misconfiguration produces split-brain finalizers and double on-chain ERC-8004 registration (real gas spend + duplicate registry entries). This wires client-go tools/leaderelection so multi-replica deployment is safe-by-correctness, not safe-by-comment. - cmd/serviceoffer-controller/main.go: - Read POD_NAME / POD_NAMESPACE from downward API env. - Acquire Lease "serviceoffer-controller" in POD_NAMESPACE before running the reconcile loop. - On lost leadership, os.Exit(1) — kubelet restarts the pod which re-elects from scratch. - --leader-elect flag (default true) so local dev can bypass. - x402.yaml: - Add downward-API POD_NAME env to the controller Deployment (POD_NAMESPACE was already wired). - Update the "Do not scale" comment to "Single replica by default; bumping to 2+ is now safe — leader election prevents split-brain on the reconcile loop." - Lease parameters chosen for fast failover on k3d (lease=30s, renew=20s, retry=5s). They are compile-time constants in main.go for now; promote them to flags if a multi-zone deployment ever needs longer. Uses client-go directly rather than controller-runtime Manager to minimize churn — controller is currently raw client-go workqueues, not controller-runtime. 
Migration to controller-runtime is a separate much larger workstream and not necessary just for leader election. --- cmd/serviceoffer-controller/main.go | 78 +++++++++++++++++- cmd/serviceoffer-controller/main_test.go | 81 +++++++++++++++++++ .../infrastructure/base/templates/x402.yaml | 9 ++- 3 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 cmd/serviceoffer-controller/main_test.go diff --git a/cmd/serviceoffer-controller/main.go b/cmd/serviceoffer-controller/main.go index 8fc01a42..6e81cd65 100644 --- a/cmd/serviceoffer-controller/main.go +++ b/cmd/serviceoffer-controller/main.go @@ -7,15 +7,27 @@ import ( "os" "os/signal" "syscall" + "time" "github.com/ObolNetwork/obol-stack/internal/serviceoffercontroller" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" +) + +const ( + defaultLockNamespace = "x402" + leaseName = "serviceoffer-controller" + leaseDuration = 30 * time.Second + renewDeadline = 20 * time.Second + retryPeriod = 5 * time.Second ) func main() { kubeconfig := flag.String("kubeconfig", "", "Path to kubeconfig for out-of-cluster runs") workers := flag.Int("workers", 1, "Number of reconcile workers") + leaderElect := flag.Bool("leader-elect", true, "Acquire a Lease before running the reconcile loop (disable for local dev)") flag.Parse() cfg, err := loadConfig(*kubeconfig) @@ -31,9 +43,71 @@ func main() { ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer cancel() - if err := controller.Run(ctx, *workers); err != nil { - log.Fatalf("run controller: %v", err) + if !*leaderElect { + if err := controller.Run(ctx, *workers); err != nil { + log.Fatalf("run controller: %v", err) + } + return + } + + runWithLeaderElection(ctx, cfg, controller, *workers) +} + +func runWithLeaderElection(ctx context.Context, cfg *rest.Config, controller *serviceoffercontroller.Controller, workers int) { + 
podName := os.Getenv("POD_NAME") + if podName == "" { + // Fall back so local dev (go run ./cmd/serviceoffer-controller) + // still works if someone forgets --leader-elect=false. Identity must be unique across + // candidates — in real deployments the downward API supplies the pod name. + podName = "serviceoffer-controller-local" } + + lockNamespace := os.Getenv("POD_NAMESPACE") + if lockNamespace == "" { + lockNamespace = defaultLockNamespace + } + + lock, err := resourcelock.NewFromKubeconfig( + resourcelock.LeasesResourceLock, + lockNamespace, + leaseName, + resourcelock.ResourceLockConfig{ + Identity: podName, + }, + cfg, + renewDeadline, + ) + if err != nil { + log.Fatalf("create lease lock: %v", err) + } + + leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: leaseDuration, + RenewDeadline: renewDeadline, + RetryPeriod: retryPeriod, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(ctx context.Context) { + log.Printf("serviceoffer-controller: became leader %s", podName) + if err := controller.Run(ctx, workers); err != nil { + log.Printf("controller run: %v", err) + } + }, + OnStoppedLeading: func() { + // On lost leadership exit non-zero so the kubelet restarts the + // pod and the next election starts from a clean state. Trying + // to keep running without the lease would race the new leader. 
+ log.Printf("serviceoffer-controller: lost leadership %s", podName) + os.Exit(1) + }, + OnNewLeader: func(identity string) { + if identity != podName { + log.Printf("serviceoffer-controller: new leader is %s", identity) + } + }, + }, + }) } func loadConfig(kubeconfig string) (*rest.Config, error) { diff --git a/cmd/serviceoffer-controller/main_test.go b/cmd/serviceoffer-controller/main_test.go new file mode 100644 index 00000000..addb8856 --- /dev/null +++ b/cmd/serviceoffer-controller/main_test.go @@ -0,0 +1,81 @@ +package main + +import ( + "os" + "path/filepath" + "testing" +) + +// TestLoadConfig_FromKubeconfigFile asserts loadConfig parses an explicit +// kubeconfig path. This is the local-dev codepath used when --leader-elect=false. +func TestLoadConfig_FromKubeconfigFile(t *testing.T) { + dir := t.TempDir() + kc := filepath.Join(dir, "kubeconfig") + if err := os.WriteFile(kc, []byte(minimalKubeconfig), 0o600); err != nil { + t.Fatalf("write kubeconfig: %v", err) + } + + cfg, err := loadConfig(kc) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.Host != "https://example.invalid:6443" { + t.Fatalf("unexpected host: %q", cfg.Host) + } +} + +// TestLoadConfig_FromKubeconfigEnv mirrors the path used when KUBECONFIG is set +// (e.g. obol kubectl/helm passthrough during local dev). +func TestLoadConfig_FromKubeconfigEnv(t *testing.T) { + dir := t.TempDir() + kc := filepath.Join(dir, "kubeconfig") + if err := os.WriteFile(kc, []byte(minimalKubeconfig), 0o600); err != nil { + t.Fatalf("write kubeconfig: %v", err) + } + + t.Setenv("KUBECONFIG", kc) + cfg, err := loadConfig("") + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.Host != "https://example.invalid:6443" { + t.Fatalf("unexpected host: %q", cfg.Host) + } +} + +// TestLeaderElectionDefaults locks in the lease parameters chosen for fast +// failover on single-node k3d. If you tune these for a multi-zone deployment, +// update this test and the PR-description rationale. 
+func TestLeaderElectionDefaults(t *testing.T) { + if leaseDuration <= renewDeadline { + t.Fatalf("leaseDuration (%s) must exceed renewDeadline (%s)", leaseDuration, renewDeadline) + } + if renewDeadline <= retryPeriod { + t.Fatalf("renewDeadline (%s) must exceed retryPeriod (%s)", renewDeadline, retryPeriod) + } + if leaseName != "serviceoffer-controller" { + t.Fatalf("leaseName drifted from RBAC + Deployment expectation: %q", leaseName) + } + if defaultLockNamespace != "x402" { + t.Fatalf("defaultLockNamespace drifted from infrastructure manifest: %q", defaultLockNamespace) + } +} + +const minimalKubeconfig = `apiVersion: v1 +kind: Config +clusters: +- name: test + cluster: + server: https://example.invalid:6443 + insecure-skip-tls-verify: true +contexts: +- name: test + context: + cluster: test + user: test +current-context: test +users: +- name: test + user: + token: test-token +` diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..56d75737 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -271,7 +271,10 @@ metadata: labels: app: serviceoffer-controller spec: - replicas: 1 # Do not scale — multiple replicas race on ERC-8004 on-chain registration + # Single replica by default; bumping to 2+ is now safe — leader election + # (client-go Lease in this namespace) prevents split-brain on the reconcile + # loop and the resulting double on-chain ERC-8004 registration. 
+ replicas: 1 selector: matchLabels: app: serviceoffer-controller @@ -286,6 +289,10 @@ spec: image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e imagePullPolicy: IfNotPresent env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: From d8912ebc75041e6ec7d363915e7dedaea0cbff6c Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:45:28 +0400 Subject: [PATCH 10/31] fix(x402): gate verifier /readyz on informer cache sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the root cause of CLAUDE.md pitfall #14 ("first-request flake on freshly-deployed verifier"). Previously /readyz returned 200 the moment config.Load() became non-nil, but routes from the ServiceOffer informer load later — between those two events the pod is Ready from kubelet's view, receives Service traffic, and matchPaidRoute returns "no rule -> 200" for paid routes. The release-smoke flows hide this behind 12x5s retry loops; the actual fix is to not be Ready until routes are loaded. - Adds routesLoaded atomic.Bool to Verifier. - HandleReadyz returns 503 until BOTH config and routes loaded, with a body that distinguishes the two cases for kubectl describe debuggability. - WatchServiceOffers takes an optional onFirstApply callback, invoked after the post-WaitForCacheSync refresh succeeds. - main.go wires v.MarkRoutesLoaded as the callback for kube source, or invokes it directly after NewVerifier for file source (the file source has no informer; routes are loaded synchronously). Pairs with PR #515 (replicas: 1) — at single replica the rollout window for this race shrinks from "some scrapes" to "first ~5-10s", but it's still a bug; this PR closes it. 
--- cmd/x402-verifier/main.go | 9 ++++- internal/x402/serviceoffer_source.go | 18 +++++++--- internal/x402/verifier.go | 23 ++++++++++-- internal/x402/verifier_test.go | 52 ++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/cmd/x402-verifier/main.go b/cmd/x402-verifier/main.go index d9538c22..e1d52f63 100644 --- a/cmd/x402-verifier/main.go +++ b/cmd/x402-verifier/main.go @@ -60,6 +60,13 @@ func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // File-sourced routes are populated synchronously by LoadConfig above, + // so they are "loaded" as soon as NewVerifier returns. The kube branch + // below flips this flag only after the first informer apply succeeds. + if *routeSource == "file" { + v.MarkRoutesLoaded() + } + if *watch { switch *routeSource { case "file": @@ -76,7 +83,7 @@ func main() { log.Fatalf("load kube route source config: %v", err) } go func() { - if err := x402verifier.WatchServiceOffers(ctx, kubeCfg, accumulator.SetRoutes); err != nil { + if err := x402verifier.WatchServiceOffers(ctx, kubeCfg, accumulator.SetRoutes, v.MarkRoutesLoaded); err != nil { log.Printf("x402-serviceoffer-source: stopped: %v", err) } }() diff --git a/internal/x402/serviceoffer_source.go b/internal/x402/serviceoffer_source.go index f0b1999a..1442bd72 100644 --- a/internal/x402/serviceoffer_source.go +++ b/internal/x402/serviceoffer_source.go @@ -20,7 +20,12 @@ import ( "k8s.io/client-go/tools/cache" ) -func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]RouteRule) error) error { +// WatchServiceOffers runs the ServiceOffer + litellm-secrets informers and +// pushes rendered RouteRules to apply on every change. The optional +// onFirstApply callback is invoked exactly once after the post-cache-sync +// refresh succeeds; it is the signal that the route source has produced its +// first usable snapshot. Pass nil to skip. 
+func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]RouteRule) error, onFirstApply func()) error { client, err := dynamic.NewForConfig(cfg) if err != nil { return fmt.Errorf("create dynamic client: %w", err) @@ -33,17 +38,18 @@ func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]Rout offers := offerFactory.ForResource(monetizeapi.ServiceOfferGVR).Informer() secrets := secretFactory.ForResource(monetizeapi.SecretGVR).Informer() - refresh := func() { + refresh := func() (ok bool) { routes, err := routesFromStore(offers.GetStore().List(), secrets.GetStore().List()) if err != nil { log.Printf("x402-serviceoffer-source: render routes: %v", err) - return + return false } if err := apply(routes); err != nil { log.Printf("x402-serviceoffer-source: apply routes: %v", err) - return + return false } log.Printf("x402-serviceoffer-source: routes reloaded (%d routes)", len(routes)) + return true } handler := cache.ResourceEventHandlerFuncs{ @@ -60,7 +66,9 @@ func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]Rout return fmt.Errorf("wait for serviceoffer informer sync") } - refresh() + if refresh() && onFirstApply != nil { + onFirstApply() + } <-ctx.Done() return nil } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 60e2fa80..2b27f29a 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -21,8 +21,19 @@ type Verifier struct { chain atomic.Pointer[ChainInfo] chains atomic.Pointer[map[string]ChainInfo] // pre-resolved: chain name → config metrics *verifierMetrics + + // routesLoaded is set true after the first route source apply completes. + // Until then HandleReadyz returns 503 so kubelet keeps the pod out of + // the Service Endpoints, preventing the "no rule -> 200 free pass" + // window during informer warmup (CLAUDE.md pitfall #14). + routesLoaded atomic.Bool } +// MarkRoutesLoaded signals that the route source has produced its first +// non-error apply. 
Idempotent. After this, HandleReadyz returns 200 +// once config is also loaded. +func (v *Verifier) MarkRoutesLoaded() { v.routesLoaded.Store(true) } + // NewVerifier creates a Verifier with the given initial configuration. func NewVerifier(cfg *PricingConfig) (*Verifier, error) { v := &Verifier{metrics: newVerifierMetrics()} @@ -224,10 +235,18 @@ func (v *Verifier) HandleHealthz(w http.ResponseWriter, r *http.Request) { fmt.Fprintln(w, `{"status":"ok"}`) } -// HandleReadyz returns 200 OK if pricing config is loaded, 503 otherwise. +// HandleReadyz returns 200 OK once BOTH pricing config and the first route +// source apply have completed. Until then it returns 503 with a cause-specific +// body so kubelet keeps the pod out of Service Endpoints, preventing the +// "no rule -> 200 free pass" window during informer warmup +// (CLAUDE.md pitfall #14). func (v *Verifier) HandleReadyz(w http.ResponseWriter, r *http.Request) { if v.config.Load() == nil { - http.Error(w, "not ready", http.StatusServiceUnavailable) + http.Error(w, "not ready: config not loaded", http.StatusServiceUnavailable) + return + } + if !v.routesLoaded.Load() { + http.Error(w, "not ready: routes not loaded", http.StatusServiceUnavailable) return } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 3b62c815..89239542 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -100,6 +100,8 @@ func testPaymentHeaderFor(t *testing.T, payTo, amount string) string { } // newTestVerifier creates a Verifier backed by the given facilitator URL. +// It also marks routes as loaded so /readyz returns 200 immediately, which +// matches what the production wire-up does once the route source warms up. 
func newTestVerifier(t *testing.T, facilitatorURL string, routes []RouteRule) *Verifier { t.Helper() v, err := NewVerifier(&PricingConfig{ @@ -112,6 +114,7 @@ func newTestVerifier(t *testing.T, facilitatorURL string, routes []RouteRule) *V if err != nil { t.Fatalf("NewVerifier: %v", err) } + v.MarkRoutesLoaded() return v } @@ -488,6 +491,55 @@ func TestVerifier_ReadyzNotReady(t *testing.T) { if w.Code != http.StatusServiceUnavailable { t.Errorf("expected 503 when config is nil, got %d", w.Code) } + if got := w.Body.String(); !strings.Contains(got, "config not loaded") { + t.Errorf("expected body to mention %q, got %q", "config not loaded", got) + } +} + +// TestVerifier_Readyz_BlocksUntilRoutesLoaded asserts the fix for +// CLAUDE.md pitfall #14: /readyz must return 503 between "config loaded" +// and "first route source apply completed" so kubelet keeps the pod out +// of the Service Endpoints during informer warm-up. +func TestVerifier_Readyz_BlocksUntilRoutesLoaded(t *testing.T) { + v, err := NewVerifier(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: "http://example.invalid", + }) + if err != nil { + t.Fatalf("NewVerifier: %v", err) + } + + // Config is loaded by NewVerifier, but routes have NOT been marked + // loaded yet — /readyz must still 503 with a routes-specific message + // so kubectl describe pod surfaces the actual cause. + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + v.HandleReadyz(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("expected 503 before routes loaded, got %d", w.Code) + } + if got := w.Body.String(); !strings.Contains(got, "routes not loaded") { + t.Errorf("expected body to mention %q, got %q", "routes not loaded", got) + } + + // After the route source signals first apply, /readyz flips to 200. 
+ v.MarkRoutesLoaded() + + w = httptest.NewRecorder() + v.HandleReadyz(w, req) + if w.Code != http.StatusOK { + t.Fatalf("expected 200 after MarkRoutesLoaded, got %d (body=%q)", w.Code, w.Body.String()) + } + + // MarkRoutesLoaded is idempotent — calling it again must not regress. + v.MarkRoutesLoaded() + w = httptest.NewRecorder() + v.HandleReadyz(w, req) + if w.Code != http.StatusOK { + t.Fatalf("expected 200 after second MarkRoutesLoaded, got %d", w.Code) + } } // ── Per-route PayTo / Network override tests ───────────────────────────────── From 08b4808612a86b1775f6fde89d30bc500fcce11e Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 22:48:47 +0400 Subject: [PATCH 11/31] refactor(x402): drive verifier deployment from helmfile, not Go-side kubectl apply MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kills CLAUDE.md pitfall #9 forever. The previous code path had two problems that compounded: 1. EnsureVerifier did kubectl apply of embed.FS x402.yaml directly, overwriting whatever helmfile had installed. Under OBOL_DEVELOPMENT=true, this stripped local-build image pins back to registry-pinned digests — silently bypassing every dev edit to the verifier. 2. To work around (1), setup.go carried a DUPLICATE copy of the image-pin rewrite regex from internal/defaults/defaults.go (with a code comment confessing "duplicated here to avoid an import cycle"). Every fix to the regex (e.g. pitfall #12's alternation- order fix) had to be applied in two places — which is exactly the kind of footgun that produces silent bypasses. Now EnsureVerifier shells out to helmfile --selector name=base sync against the helmfile state already used by obol stack up. Since helmfile reads the manifests from \$OBOL_CONFIG_DIR/defaults/ — which is populated by defaults.CopyInfrastructure with the canonical regex already applied — the dev-rewrite happens exactly once, in exactly one place. 
- Deletes the duplicate devLocallyBuiltImageBases + regex from internal/x402/setup.go. - EnsureVerifier now: RefreshInfrastructureIfChanged(); helmfile sync --selector name=base. - Deletes internal/x402/manifest_devmode_test.go — the canonical regression test is internal/defaults/defaults_test.go:: TestCopyInfrastructure_DevModeRewritesDigestPins which still guards the rewrite at its single source. - Adds a structural test (setup_structure_test.go) asserting setup.go does not import the regexp package, making re-introduction of the duplicate fail at test time. The duplicate-regex footgun is now structurally impossible to re-introduce. --- internal/x402/manifest_devmode_test.go | 43 -------- internal/x402/setup.go | 146 +++++++++++++++---------- internal/x402/setup_structure_test.go | 82 ++++++++++++++ 3 files changed, 169 insertions(+), 102 deletions(-) delete mode 100644 internal/x402/manifest_devmode_test.go create mode 100644 internal/x402/setup_structure_test.go diff --git a/internal/x402/manifest_devmode_test.go b/internal/x402/manifest_devmode_test.go deleted file mode 100644 index 7bcc3e52..00000000 --- a/internal/x402/manifest_devmode_test.go +++ /dev/null @@ -1,43 +0,0 @@ -package x402 - -import ( - "strings" - "testing" -) - -func TestX402Manifest_DevModeRewritesPins(t *testing.T) { - t.Setenv("OBOL_DEVELOPMENT", "true") - out := string(x402ManifestForApply()) - - for _, want := range []string{ - "ghcr.io/obolnetwork/x402-verifier:latest", - "ghcr.io/obolnetwork/serviceoffer-controller:latest", - } { - if !strings.Contains(out, want) { - t.Errorf("dev mode did not rewrite to %q", want) - } - } - for _, bad := range []string{ - "ghcr.io/obolnetwork/x402-verifier:b13254e", - "ghcr.io/obolnetwork/serviceoffer-controller:b13254e", - } { - if strings.Contains(out, bad) && !strings.Contains(out, ":latest@sha256:") { - // b13254e in a *comment* would be acceptable, but the regex doesn't - // match comments preceded by '#' — flag any unrewritten image: line. 
- for _, line := range strings.Split(out, "\n") { - trim := strings.TrimSpace(line) - if strings.HasPrefix(trim, "image:") && strings.Contains(trim, bad) { - t.Errorf("dev mode left immutable pin on image line: %q", line) - } - } - } - } -} - -func TestX402Manifest_ProductionPreservesPins(t *testing.T) { - t.Setenv("OBOL_DEVELOPMENT", "") - out := string(x402ManifestForApply()) - if !strings.Contains(out, "ghcr.io/obolnetwork/x402-verifier:b13254e") { - t.Error("production manifest should preserve x402-verifier:b13254e pin") - } -} diff --git a/internal/x402/setup.go b/internal/x402/setup.go index 562da993..4811ceb6 100644 --- a/internal/x402/setup.go +++ b/internal/x402/setup.go @@ -4,15 +4,32 @@ import ( "encoding/json" "fmt" "os" - "regexp" + "os/exec" + "path/filepath" "strings" "github.com/ObolNetwork/obol-stack/internal/config" + stackdefaults "github.com/ObolNetwork/obol-stack/internal/defaults" "github.com/ObolNetwork/obol-stack/internal/embed" + "github.com/ObolNetwork/obol-stack/internal/helmcmd" "github.com/ObolNetwork/obol-stack/internal/kubectl" "gopkg.in/yaml.v3" ) +// x402Manifest is the raw embedded x402.yaml. It is no longer applied +// directly via kubectl — helmfile renders the same file via the `base` +// release (see EnsureVerifier). Retained as a package-level value so +// shape/content tests can assert invariants about the embedded source. +var x402Manifest = mustReadX402Manifest() + +func mustReadX402Manifest() []byte { + data, err := embed.ReadInfrastructureFile("base/templates/x402.yaml") + if err != nil { + panic(fmt.Sprintf("read embedded x402 manifest: %v", err)) + } + return data +} + const ( x402Namespace = "x402" pricingConfigMap = "x402-pricing" @@ -37,77 +54,88 @@ const ( // Used only as a hint in error messages; the actual chain is taken // from the seller's 402 response by buy.py. 
DefaultBuySellerChain = "base-sepolia" -) -var x402Manifest = mustReadX402Manifest() + // baseReleaseName matches the helmfile release in + // internal/embed/infrastructure/helmfile.yaml whose `chart: ./base` + // renders the x402 manifests. EnsureVerifier targets this release + // via --selector so the verifier deployment is reconciled the same + // way `obol stack up` deploys it — single source of truth. + baseReleaseName = "base" +) -func mustReadX402Manifest() []byte { - data, err := embed.ReadInfrastructureFile("base/templates/x402.yaml") - if err != nil { - panic(fmt.Sprintf("read embedded x402 manifest: %v", err)) +// EnsureVerifier deploys the x402 verifier subsystem if it doesn't exist. +// Idempotent — helmfile sync is safe to run multiple times. +// +// Historical note: this used to read embed.FS x402.yaml directly and +// `kubectl apply` it, which fought helmfile's field manager and forced +// us to duplicate the dev-mode image-pin rewrite (formerly in this file, +// now lives canonically in internal/defaults/defaults.go). Driving the +// deployment through helmfile against the already-populated +// $OBOL_CONFIG_DIR/defaults/ tree picks up the canonical dev rewrite +// for free and removes the entire footgun. See CLAUDE.md pitfall #9. +func EnsureVerifier(cfg *config.Config) error { + if err := kubectl.EnsureCluster(cfg); err != nil { + return err } - return data -} -// devLocallyBuiltImageBases mirrors internal/defaults.devLocallyBuiltImageBases -// — duplicated here to avoid a defaults → x402 → defaults import cycle. -// Must stay in lockstep with the canonical list there. -var devLocallyBuiltImageBases = []string{ - "ghcr.io/obolnetwork/x402-verifier", - "ghcr.io/obolnetwork/serviceoffer-controller", - "ghcr.io/obolnetwork/x402-buyer", - "ghcr.io/obolnetwork/demo-server", - "ghcr.io/obolnetwork/obol-stack-public-storefront", -} + // Refresh the defaults tree so the helmfile sync below reads the + // most recent embedded manifests. 
Under OBOL_DEVELOPMENT=true this + // also applies the canonical digest-pin -> :latest rewrite via + // defaults.rewriteDevDigestPins so freshly built local images are + // honored. No-op when the stamp is up to date. + backendName := stackdefaults.DetectedBackendName(cfg) + stackID := stackdefaults.StackID(cfg) + if stackID == "" { + return fmt.Errorf("stack ID not found, run 'obol stack init' first") + } + if _, err := stackdefaults.RefreshInfrastructureIfChanged(cfg, backendName, stackID); err != nil { + return fmt.Errorf("refresh infrastructure defaults: %w", err) + } -// rewriteDevImagePinsInManifest applies the same `:tag@sha256:digest` / -// `@sha256:digest` / `:tag` → `:latest` rewrite the defaults pipeline uses, -// so kubectl-applied manifests inside EnsureVerifier honor the local-build -// path under OBOL_DEVELOPMENT=true. Without this rewrite, the embedded -// x402.yaml carrying `:b13254e` pins beats the helmfile-rendered :latest -// deployment, and the cluster runs the stale registry image regardless of -// OBOL_FORCE_REBUILD_LOCAL_DEV_IMAGES (root cause of the missing -// HandleProxy debug-log saga during flow-11 step 43 chase, May 2026). -// -// Pattern parity with internal/defaults.rewriteDevDigestPins is enforced -// by the regression test in TestX402Manifest_DevModeRewritesPins. -func rewriteDevImagePinsInManifest(data []byte) []byte { - out := data - for _, base := range devLocallyBuiltImageBases { - re := regexp.MustCompile(regexp.QuoteMeta(base) + - `(:[a-f0-9]{7,40}@sha256:[a-f0-9]{64}|@sha256:[a-f0-9]{64}|:[a-f0-9]{7,40})`) - out = re.ReplaceAll(out, []byte(base+":latest")) + if err := helmfileSyncBaseRelease(cfg); err != nil { + return fmt.Errorf("helmfile sync %s: %w", baseReleaseName, err) } - return out + + // Populate the CA bundle after deploying the verifier so TLS verification + // of the facilitator works immediately. Idempotent — safe to call multiple times. 
+ bin, kc := kubectl.Paths(cfg) + populateCABundle(bin, kc) + return nil } -// x402ManifestForApply returns the kubectl-apply-ready bytes, rewriting -// immutable image pins to `:latest` when OBOL_DEVELOPMENT=true so the -// in-cluster verifier/controller uses the freshly-built local image. -// In production (OBOL_DEVELOPMENT unset/false) returns the embedded -// manifest verbatim — the pins are intentional and immutable. -func x402ManifestForApply() []byte { - if os.Getenv("OBOL_DEVELOPMENT") != "true" { - return x402Manifest +// helmfileSyncBaseRelease runs `helmfile --selector name=base sync` +// against the defaults helmfile rendered into $OBOL_CONFIG_DIR/defaults. +// This is the same invocation pattern used by `internal/stack.syncDefaults` +// and `internal/update.ApplyUpgrades`, scoped to the single release that +// owns the x402 manifests. +func helmfileSyncBaseRelease(cfg *config.Config) error { + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + helmfilePath := filepath.Join(cfg.ConfigDir, "defaults", "helmfile.yaml") + + if _, err := os.Stat(helmfilePath); err != nil { + return fmt.Errorf("defaults helmfile not found at %s (run 'obol stack init' first): %w", helmfilePath, err) } - return rewriteDevImagePinsInManifest(x402Manifest) -} -// EnsureVerifier deploys the x402 verifier subsystem if it doesn't exist. -// Idempotent — kubectl apply is safe to run multiple times. -func EnsureVerifier(cfg *config.Config) error { - if err := kubectl.EnsureCluster(cfg); err != nil { - return err + helmfileBin := filepath.Join(cfg.BinDir, "helmfile") + helmBin := filepath.Join(cfg.BinDir, "helm") + + args := []string{ + "--file", helmfilePath, + "--kubeconfig", kubeconfigPath, + "--selector", "name=" + baseReleaseName, + "sync", } - bin, kc := kubectl.Paths(cfg) + args = append(args, helmcmd.SyncFlagsForVersion(helmBin)...) 
- fmt.Println("Applying x402 payment components...") - if err := kubectl.Apply(bin, kc, x402ManifestForApply()); err != nil { - return err + cmd := exec.Command(helmfileBin, args...) + cmd.Env = append(os.Environ(), + "KUBECONFIG="+kubeconfigPath, + "STACK_DATA_DIR="+cfg.DataDir, + ) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out))) } - // Populate the CA bundle after deploying the verifier so TLS verification - // of the facilitator works immediately. Idempotent — safe to call multiple times. - populateCABundle(bin, kc) return nil } diff --git a/internal/x402/setup_structure_test.go b/internal/x402/setup_structure_test.go new file mode 100644 index 00000000..4353a0fd --- /dev/null +++ b/internal/x402/setup_structure_test.go @@ -0,0 +1,82 @@ +package x402 + +import ( + "go/parser" + "go/token" + "os" + "path/filepath" + "strings" + "testing" +) + +// TestEnsureVerifier_NoInlineRegex enforces CLAUDE.md pitfall #9 at the +// structural level: setup.go must not carry its own image-pin rewrite +// regex. The canonical rewrite lives in internal/defaults/defaults.go, +// applied to the helmfile-rendered tree under $OBOL_CONFIG_DIR/defaults. +// Driving the verifier deployment through helmfile (not kubectl apply +// of embed.FS) means any duplicated regex is dead code at best and a +// silent-bypass footgun at worst. +// +// If this test fires, either: +// - delete the duplicate regex from internal/x402/setup.go, or +// - if the duplicate is genuinely needed (it almost never is), move +// it behind a shared helper in internal/defaults and call that. +func TestEnsureVerifier_NoInlineRegex(t *testing.T) { + setupPath := mustResolveFile(t, "setup.go") + + data, err := os.ReadFile(setupPath) + if err != nil { + t.Fatalf("read setup.go: %v", err) + } + src := string(data) + + // Cheap textual guard first — surfaces a clear error message even when + // the AST parse below would also catch it. 
+ if strings.Contains(src, `"regexp"`) { + t.Fatalf("internal/x402/setup.go must not import the regexp package; " + + "the image-pin rewrite belongs in internal/defaults (see CLAUDE.md pitfall #9)") + } + if strings.Contains(src, "regexp.MustCompile") || strings.Contains(src, "regexp.Compile") { + t.Fatalf("internal/x402/setup.go must not compile regexes inline; " + + "the duplicated rewrite was deleted in favor of helmfile-driven deploy") + } + + // AST-level guard: catches aliased imports (e.g. `re "regexp"`) and is + // resilient to comments that happen to contain the word "regexp". + fset := token.NewFileSet() + file, err := parser.ParseFile(fset, setupPath, data, parser.ImportsOnly) + if err != nil { + t.Fatalf("parse setup.go: %v", err) + } + for _, imp := range file.Imports { + path := strings.Trim(imp.Path.Value, `"`) + if path == "regexp" { + t.Fatalf("internal/x402/setup.go imports %q; remove the duplicated rewrite", path) + } + } +} + +// mustResolveFile locates a source file relative to this test file. Works +// whether `go test` is run from the package directory or from the repo root. +func mustResolveFile(t *testing.T, name string) string { + t.Helper() + // First try working directory (default for `go test ./...`). 
+ if _, err := os.Stat(name); err == nil { + abs, err := filepath.Abs(name) + if err != nil { + t.Fatalf("abs %q: %v", name, err) + } + return abs + } + t.Fatalf("could not locate %q from %q", name, mustGetwd(t)) + return "" +} + +func mustGetwd(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + return wd +} From 04b9a6e1b43b0dbb8e70a82fda8c39c9c88de3cb Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 23:03:23 +0400 Subject: [PATCH 12/31] feat(security): Restricted Pod Security Standard across embedded workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Brings every embedded Deployment shipped by obol-stack up to PSS Restricted: - runAsNonRoot: true with fixed non-zero UID/GID (65532) - allowPrivilegeEscalation: false - capabilities.drop: [ALL] - seccompProfile: RuntimeDefault - readOnlyRootFilesystem: true (with named emptyDir mounts where Python needs writeable /tmp and HOME/.cache) PSS labels (enforce=restricted, audit/warn=restricted) added to the x402 and llm namespaces so future Deployment edits that omit per-pod securityContext are rejected at admission. Also switches the serviceoffer-controller Dockerfile from gcr.io/distroless/static-debian12 (UID 0) to ...:nonroot (UID 65532). Container escape via a Go runtime CVE on a UID-0 / no-seccomp / no-cap-drop / RW-rootfs container was the easiest path to host pivot on k3s single-node; this closes it. 
Files touched: - Dockerfile.serviceoffer-controller (:nonroot base) - internal/embed/infrastructure/base/templates/x402.yaml (verifier + controller securityContext blocks, x402 ns PSS label) - internal/embed/infrastructure/base/templates/llm.yaml (litellm + x402-buyer securityContext, litellm-tmp + litellm-home emptyDir mounts with HOME/XDG_CACHE_HOME/HF_HOME redirection, llm ns PSS label) Scope notes: - local-path-provisioner lives in kube-system (k3d-managed); not relabeled per PSS guidance to skip system namespaces. - hermes-obol-agent runtime is generated dynamically by serviceoffer-controller (internal/serviceoffercontroller/agent_render.go and internal/hermes/hermes.go), not from the embedded templates; its init-hermes-perms initContainer legitimately runs as UID 0 for /data chown and is intentionally left out of this PR's scope. - cloudflared chart (internal/embed/infrastructure/cloudflared/...) is a separate Helm chart and not in this PR's file list. What may break: - LiteLLM with readOnlyRootFilesystem may fail if it writes outside /tmp or $HOME — watch the next release-smoke for permission-denied errors and add named emptyDir mounts for any new write paths. --- Dockerfile.serviceoffer-controller | 2 +- .../infrastructure/base/templates/llm.yaml | 57 +++++++++++++++++++ .../infrastructure/base/templates/x402.yaml | 42 ++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/Dockerfile.serviceoffer-controller b/Dockerfile.serviceoffer-controller index 5214a93a..09f6935b 100644 --- a/Dockerfile.serviceoffer-controller +++ b/Dockerfile.serviceoffer-controller @@ -5,6 +5,6 @@ RUN go mod download COPY . . 
RUN CGO_ENABLED=0 go build -o /serviceoffer-controller ./cmd/serviceoffer-controller -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian12:nonroot COPY --from=builder /serviceoffer-controller /serviceoffer-controller ENTRYPOINT ["/serviceoffer-controller"] diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..956f59d4 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -21,6 +21,15 @@ apiVersion: v1 kind: Namespace metadata: name: llm + labels: + # Pod Security Standards: Restricted profile enforced at admission. + # The litellm pod (litellm + x402-buyer sidecar) runs as non-root with + # all caps dropped, seccomp=RuntimeDefault, and readOnlyRootFilesystem; + # write paths are routed to named emptyDir mounts. + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted --- # ClusterIP Service + Endpoints: routes ollama.llm.svc.cluster.local → host Ollama. @@ -142,6 +151,17 @@ spec: secret.reloader.stakater.com/reload: "litellm-secrets" spec: terminationGracePeriodSeconds: 60 + # PSS Restricted: pod-level identity. UID/GID 65532 is the nonroot + # distroless convention; the Obol LiteLLM fork's working dirs are + # routed onto emptyDir mounts below so readOnlyRootFilesystem can + # stay on without breaking Python's tempfile / cache writes. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: litellm # Obol fork of LiteLLM with config-only model management API. 
@@ -150,6 +170,13 @@ spec: # Source: https://github.com/ObolNetwork/litellm image: ghcr.io/obolnetwork/litellm:sha-c16b156 imagePullPolicy: IfNotPresent + # PSS Restricted: drop all caps, no privilege escalation, RO rootfs. + # Python writes are funneled to the emptyDir mounts below. + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] args: - --config - /etc/litellm/config.yaml @@ -167,10 +194,22 @@ spec: value: "false" - name: DISABLE_SCHEMA_UPDATE value: "true" + # Redirect Python / HF / pip cache lookups onto the writeable + # emptyDir at /home/litellm so readOnlyRootFilesystem=true holds. + - name: HOME + value: /home/litellm + - name: XDG_CACHE_HOME + value: /home/litellm/.cache + - name: HF_HOME + value: /home/litellm/.cache/huggingface volumeMounts: - name: litellm-config mountPath: /etc/litellm/config.yaml subPath: config.yaml + - name: litellm-tmp + mountPath: /tmp + - name: litellm-home + mountPath: /home/litellm startupProbe: httpGet: path: /health/readiness @@ -214,6 +253,14 @@ spec: # across flow-08/11/14/13. See internal/embed/embed_image_pin_test.go. image: ghcr.io/obolnetwork/x402-buyer:b13254e@sha256:446d730fefbe1860e8b3245289aa8979d765ae977b7f0eaa053543e2468313cb imagePullPolicy: IfNotPresent + # PSS Restricted: Go distroless:nonroot image already runs as + # UID 65532; only the state dir under /state needs to be writeable + # and it's already an emptyDir mount. + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] args: - --config-dir=/config/buyer-config - --auths-dir=/config/buyer-auths @@ -258,6 +305,16 @@ spec: items: - key: config.yaml path: config.yaml + # Writable /tmp for Python tempfile / multipart uploads. Sized + # modestly — LiteLLM streams responses rather than buffering them. 
+ - name: litellm-tmp + emptyDir: + sizeLimit: 128Mi + # Writable HOME for LiteLLM's pip/HF/XDG cache lookups so the + # container can run with readOnlyRootFilesystem=true. + - name: litellm-home + emptyDir: + sizeLimit: 256Mi - name: buyer-config configMap: name: x402-buyer-config diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..11fdfba3 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -6,6 +6,16 @@ apiVersion: v1 kind: Namespace metadata: name: x402 + labels: + # Pod Security Standards: Restricted profile enforced at admission. + # Future Deployment edits that omit the per-pod securityContext will be + # rejected by the apiserver. Both x402-verifier and serviceoffer-controller + # run as non-root with all caps dropped, seccomp=RuntimeDefault, and + # readOnlyRootFilesystem. + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted --- # Static gateway settings plus optional manual routes. In cluster mode the @@ -210,10 +220,25 @@ spec: app: x402-verifier spec: serviceAccountName: x402-verifier + # PSS Restricted: pod-level identity. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: verifier image: ghcr.io/obolnetwork/x402-verifier:b13254e imagePullPolicy: IfNotPresent + # PSS Restricted: per-container hardening. Verifier is a Go binary + # reading two RO ConfigMaps; no writeable rootfs paths required. 
+ securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] ports: - name: http containerPort: 8080 @@ -281,10 +306,27 @@ spec: app: serviceoffer-controller spec: serviceAccountName: serviceoffer-controller + # PSS Restricted: pod-level identity. Paired with Dockerfile + # FROM gcr.io/distroless/static-debian12:nonroot which default-runs + # as UID/GID 65532. Container escape via a Go-runtime CVE on a + # UID-0 / no-seccomp / no-cap-drop / RW-rootfs container was the + # easiest path to host pivot on k3s single-node; this closes it. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + seccompProfile: + type: RuntimeDefault containers: - name: controller image: ghcr.io/obolnetwork/serviceoffer-controller:b13254e imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] env: - name: POD_NAMESPACE valueFrom: From 5c9a879e8bd5ca7d0a3651923336099a99f20ac3 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 23:05:16 +0400 Subject: [PATCH 13/31] fix(x402-buyer): persist consumed-nonce state to PVC instead of emptyDir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today the x402-buyer sidecar's /state directory is an emptyDir. When the litellm pod restarts (rollout, OOM, node drain), consumed.json is gone. The pre-signed auth pool reloads from the ConfigMap the controller manages, and the buyer treats every auth as unconsumed — attempting to spend nonces that the facilitator already marked used. Cascade: facilitator returns 400 "nonce already used" -> buyer 402 back to LiteLLM -> caller retry -> same 400 -> eventually buyer pool exhausted -> 503 until manual `buy.py process --all` reseeds. Fix: convert /state to a PVC backed by local-path-provisioner (the storage class already deployed via base/templates/local-path.yaml). 
50Mi request; consumed.json is tiny but room left for log growth. Deployment strategy switched to Recreate because a RWO PVC can't be co-mounted during a RollingUpdate surge. Litellm is replicas: 1 so this just means rollouts have a ~5s gap instead of an overlap — acceptable. What this does NOT solve: - Multi-replica litellm. RWO PVC works only for replicas: 1; would need RWX (which local-path doesn't support — needs NFS/Longhorn) or per-replica state via StatefulSet. Out of scope; litellm has no current scaling need. - Hard node loss. local-path PVCs are node-local; if the k3d node is destroyed, state is gone (along with the rest of the cluster). For local-only operator that's the expected blast radius. PSS compatibility note: the PVC mount works under PSS Restricted as long as the buyer container runs with appropriate fsGroup. PR #12 (Restricted PSS sweep) handles that separately and will verify mount permissions when it lands. --- internal/embed/embed_buyer_state_test.go | 92 +++++++++++++++++++ .../infrastructure/base/templates/llm.yaml | 36 +++++++- 2 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 internal/embed/embed_buyer_state_test.go diff --git a/internal/embed/embed_buyer_state_test.go b/internal/embed/embed_buyer_state_test.go new file mode 100644 index 00000000..e7ed4ae2 --- /dev/null +++ b/internal/embed/embed_buyer_state_test.go @@ -0,0 +1,92 @@ +package embed + +import ( + "testing" +) + +// TestBuyerStatePVC asserts that x402-buyer's /state is backed by a PVC +// (not an emptyDir), and that the litellm Deployment uses the Recreate +// strategy so the RWO PVC can be remounted without overlap. +// +// Regression: emptyDir lost consumed.json on every pod restart, causing +// the buyer to re-spend already-consumed auths from the ConfigMap pool +// and cascading into facilitator 400s ("nonce already used") until a +// manual `buy.py process --all` reseeded. 
+func TestBuyerStatePVC(t *testing.T) { + data, err := ReadInfrastructureFile("base/templates/llm.yaml") + if err != nil { + t.Fatalf("ReadInfrastructureFile: %v", err) + } + + docs := multiDoc(data) + + // PVC must exist in the llm namespace with RWO + local-path storage class. + pvc := findDocByName(docs, "PersistentVolumeClaim", "x402-buyer-state") + if pvc == nil { + t.Fatal("PersistentVolumeClaim 'x402-buyer-state' missing from llm.yaml") + } + + if ns := nested(pvc, "metadata", "namespace"); ns != "llm" { + t.Errorf("PVC namespace = %v, want llm", ns) + } + + modes, ok := nested(pvc, "spec", "accessModes").([]any) + if !ok || len(modes) != 1 || modes[0] != "ReadWriteOnce" { + t.Errorf("PVC accessModes = %v, want [ReadWriteOnce]", modes) + } + + if sc := nested(pvc, "spec", "storageClassName"); sc != "local-path" { + t.Errorf("PVC storageClassName = %v, want local-path", sc) + } + + if storage := nested(pvc, "spec", "resources", "requests", "storage"); storage == nil { + t.Error("PVC missing spec.resources.requests.storage") + } + + // litellm Deployment volume entry must reference the PVC, not emptyDir. 
+ dep := findDocByName(docs, "Deployment", "litellm") + if dep == nil { + t.Fatal("litellm Deployment missing from llm.yaml") + } + + volumes, ok := nested(dep, "spec", "template", "spec", "volumes").([]any) + if !ok { + t.Fatal("litellm Deployment has no volumes") + } + + var stateVolume map[string]any + for _, v := range volumes { + vm, ok := v.(map[string]any) + if !ok { + continue + } + if vm["name"] == "x402-buyer-state" { + stateVolume = vm + break + } + } + + if stateVolume == nil { + t.Fatal("litellm Deployment missing 'x402-buyer-state' volume") + } + + if _, isEmptyDir := stateVolume["emptyDir"]; isEmptyDir { + t.Error("x402-buyer-state is still emptyDir — must be persistentVolumeClaim to survive pod restarts") + } + + pvcRef, ok := stateVolume["persistentVolumeClaim"].(map[string]any) + if !ok { + t.Fatal("x402-buyer-state volume is not backed by persistentVolumeClaim") + } + + if claim := pvcRef["claimName"]; claim != "x402-buyer-state" { + t.Errorf("persistentVolumeClaim.claimName = %v, want x402-buyer-state", claim) + } + + // Strategy must be Recreate so the new pod waits for the old pod to + // release the RWO PVC before mounting. RollingUpdate with maxSurge>0 + // would block indefinitely. + if strat := nested(dep, "spec", "strategy", "type"); strat != "Recreate" { + t.Errorf("litellm Deployment strategy.type = %v, want Recreate (RWO PVC cannot be co-mounted during surge)", strat) + } +} diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..b9a304e4 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -113,6 +113,30 @@ type: Opaque stringData: LITELLM_MASTER_KEY: "sk-obol-{{CLUSTER_ID}}" +--- +# x402-buyer maintains consumed-nonce state in /state/consumed.json. 
+# Previously this was emptyDir, which lost state on every pod restart +# — the buyer would then attempt to re-spend already-consumed auths +# from the ConfigMap-loaded pool, cascading into 400s from the +# facilitator's nonce protection until a manual buy.py process --all. +# PVC backed by local-path (single-node k3d default storage class) +# gives crash-safety without conversion to StatefulSet. +# +# Deployment strategy: Recreate — RWO PVC can't be mounted by two +# pods, so RollingUpdate's surge would block. Recreate accepts a +# brief gap during rollout (litellm is replicas:1 anyway). +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: x402-buyer-state + namespace: llm +spec: + accessModes: [ReadWriteOnce] + storageClassName: local-path + resources: + requests: + storage: 50Mi + --- apiVersion: apps/v1 kind: Deployment @@ -126,11 +150,12 @@ spec: # is local to the sidecar pod. Scale this back out only after consumed auth # state is shared or auth pools are sharded per replica. replicas: 1 + # Recreate (not RollingUpdate) because the x402-buyer-state PVC is RWO and + # cannot be co-mounted by an overlapping new pod during surge. Litellm is + # replicas: 1 so this just trades the (currently maxSurge:1) overlap for a + # short gap during rollout — acceptable, and unavoidable with RWO storage. 
strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 + type: Recreate selector: matchLabels: app: litellm @@ -267,7 +292,8 @@ spec: name: x402-buyer-auths optional: true - name: x402-buyer-state - emptyDir: {} + persistentVolumeClaim: + claimName: x402-buyer-state --- apiVersion: policy/v1 From fb594eab3ea87c19d376853fa574d16ba76faa72 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 23:09:06 +0400 Subject: [PATCH 14/31] refactor: relocate remaining bedag/raw helmfile releases into base chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The infrastructure helmfile shipped 6 `bedag/raw` releases — a wrapper chart whose only job is to apply inline YAML through helmfile. With the `base` release already rendering every other YAML in `base/templates/`, the inline approach has zero remaining justification. This PR finishes the job by relocating all 6: - llm-buyer-podmonitor → base/templates/llm.yaml (appended) - erpc-httproute → base/templates/erpc.yaml (new file) - erpc-x402-middleware → base/templates/erpc.yaml - erpc-metadata → base/templates/erpc.yaml - obol-frontend-httproute → base/templates/obol-frontend.yaml (new file) - obol-frontend-rbac → base/templates/obol-frontend.yaml Net change to the rendering: zero. Same YAML, just sourced from the chart's templates directory instead of inlined in helmfile.yaml through the bedag/raw wrapper chart. Each relocated YAML carries a provenance comment. DAG: `base` now `needs: [traefik/traefik, monitoring/monitoring]` so the Traefik (Middleware) / Gateway API (HTTPRoute) / Prometheus operator (PodMonitor) CRDs are guaranteed present before the relocated templates apply. New Namespace docs for `erpc` and `obol-frontend` make the `base` release self-contained — the upstream chart releases that originally created those namespaces still set `createNamespace: true`, which is a no-op against an existing namespace. 
The `bedag` repository entry is removed (no infrastructure release uses it anymore). Network helmfiles + hermes still use bedag/raw — out of scope for this PR. `migrateDefaultsHTTPRouteHostnames` in internal/stack/stack.go targets the old in-helmfile HTTPRoute indentation pattern; it is a no-op against the relocated templates and against the new helmfile, preserved unchanged for users upgrading from older stacks. The `hostnames: ["obol.stack"]` restriction is preserved on every relocated HTTPRoute per CLAUDE.md guidance — removing it would expose the frontend / eRPC to the public cloudflared tunnel. `TestHelmfile_IncludesBuyerPodMonitor` rewired to read `base/templates/llm.yaml`. All embed CRD tests, stack tests, and go build are green. --- .../infrastructure/base/templates/erpc.yaml | 100 ++++++++ .../infrastructure/base/templates/llm.yaml | 24 ++ .../base/templates/obol-frontend.yaml | 95 ++++++++ internal/embed/infrastructure/helmfile.yaml | 213 +++--------------- .../values/erpc-metadata.yaml.gotmpl | 21 -- internal/stack/stack_test.go | 17 +- 6 files changed, 265 insertions(+), 205 deletions(-) create mode 100644 internal/embed/infrastructure/base/templates/erpc.yaml create mode 100644 internal/embed/infrastructure/base/templates/obol-frontend.yaml delete mode 100644 internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl diff --git a/internal/embed/infrastructure/base/templates/erpc.yaml b/internal/embed/infrastructure/base/templates/erpc.yaml new file mode 100644 index 00000000..635665d3 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/erpc.yaml @@ -0,0 +1,100 @@ +# Relocated from helmfile.yaml `erpc-httproute`, `erpc-x402-middleware`, +# and `erpc-metadata` bedag/raw releases. These resources live alongside +# their workload (eRPC in the `erpc` namespace) instead of inlined in +# helmfile so the chart layout is the single source of truth for what +# ships in the erpc namespace. 
+# +# CRD prerequisites: +# - HTTPRoute -> gateway.networking.k8s.io/v1 (shipped by the Traefik +# v38+ chart's bundled CRDs) +# - Middleware -> traefik.io/v1alpha1 (shipped by the Traefik chart) +# `base` now declares `needs: [traefik/traefik]` in helmfile.yaml to +# guarantee CRDs are present before these templates apply. +# +# The eRPC Deployment + Service themselves still come from the upstream +# `ethereum/erpc` Helm chart (separate release in helmfile.yaml); only +# the routing + discovery metadata is owned here. + +--- +# eRPC namespace. Pre-created here so resources in this file (HTTPRoute, +# Middleware, ConfigMap) can apply during the `base` release without +# waiting for the `erpc` upstream chart release to create it. The `erpc` +# release still sets `createNamespace: true` — kubectl apply on an +# existing namespace is a no-op. +apiVersion: v1 +kind: Namespace +metadata: + name: erpc + +--- +# eRPC HTTPRoute — gates /rpc through the x402-payment Middleware and +# restricts the route to the obol.stack hostname so it cannot be reached +# via the public cloudflared tunnel (see CLAUDE.md "Security: Tunnel +# Exposure"). Removing the hostnames restriction is a critical security +# regression. +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: erpc + namespace: erpc +spec: + hostnames: + - "obol.stack" + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + rules: + - matches: + - path: + type: PathPrefix + value: /rpc + filters: + - type: ExtensionRef + extensionRef: + group: traefik.io + kind: Middleware + name: x402-payment + backendRefs: + - name: erpc + port: 80 + +--- +# x402 Middleware for the eRPC namespace (ForwardAuth -> central +# verifier). Always deployed; the verifier returns 200 for routes with +# no pricing rules. 
+apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: x402-payment + namespace: erpc +spec: + forwardAuth: + address: http://x402-verifier.x402.svc.cluster.local:8080/verify + authResponseHeaders: + - X-Payment-Response + +--- +# eRPC metadata ConfigMap for frontend discovery. `.Values.network` +# resolves against the `network` value passed to the `base` release +# (default "mainnet", overridable via helmfile state values). +apiVersion: v1 +kind: ConfigMap +metadata: + name: erpc-metadata + namespace: erpc + labels: + app.kubernetes.io/part-of: obol.stack + obol.stack/id: default + obol.stack/app: erpc +data: + metadata.json: | + { + "network": "{{ .Values.network }}", + "endpoints": { + "rpc": { + "external": "http://obol.stack/rpc/{{ .Values.network }}", + "internal": "http://erpc.erpc.svc.cluster.local/rpc/{{ .Values.network }}" + } + } + } diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..d2bc7b90 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -298,3 +298,27 @@ spec: port: 4000 targetPort: http protocol: TCP + +--- +# Relocated from helmfile.yaml `llm-buyer-podmonitor` bedag/raw release. +# Lives alongside its workload (litellm + x402-buyer sidecar) instead of +# inlined in helmfile so the chart layout is the single source of truth +# for what ships in the llm namespace. The PodMonitor CRD comes from the +# monitoring release (kube-prometheus-stack), so `base` now declares a +# `needs: [monitoring/monitoring]` in helmfile.yaml to guarantee CRD +# presence before this template applies. 
+apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: litellm-x402-buyer + namespace: llm + labels: + release: monitoring +spec: + selector: + matchLabels: + app: litellm + podMetricsEndpoints: + - port: buyer-http + path: /metrics + interval: 30s diff --git a/internal/embed/infrastructure/base/templates/obol-frontend.yaml b/internal/embed/infrastructure/base/templates/obol-frontend.yaml new file mode 100644 index 00000000..397a192e --- /dev/null +++ b/internal/embed/infrastructure/base/templates/obol-frontend.yaml @@ -0,0 +1,95 @@ +# Relocated from helmfile.yaml `obol-frontend-httproute` and +# `obol-frontend-rbac` bedag/raw releases. These resources live +# alongside their workload (the obol-frontend Helm release in the +# `obol-frontend` namespace) instead of inlined in helmfile so the +# chart layout is the single source of truth for what ships in the +# obol-frontend namespace. +# +# The obol-frontend Deployment + Service themselves still come from +# the `obol/obol-app` upstream chart (separate release in +# helmfile.yaml); only the HTTPRoute and discovery RBAC are owned +# here. +# +# CRD prerequisite: HTTPRoute -> gateway.networking.k8s.io/v1 +# (shipped by the Traefik v38+ chart's bundled CRDs). `base` now +# declares `needs: [traefik/traefik]` in helmfile.yaml to guarantee +# the CRDs are present before this template applies. + +--- +# obol-frontend namespace. Pre-created here so the HTTPRoute and +# ClusterRoleBinding subject reference can resolve during the `base` +# release without waiting for the `obol-frontend` upstream chart +# release to create it. The chart release still sets +# `createNamespace: true` — kubectl apply on an existing namespace is +# a no-op. +apiVersion: v1 +kind: Namespace +metadata: + name: obol-frontend + +--- +# obol-frontend HTTPRoute. 
The `hostnames: ["obol.stack"]` restriction +# keeps the frontend UI off the public cloudflared tunnel — removing +# it is a critical security regression (see CLAUDE.md "Security: +# Tunnel Exposure"). +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: obol-frontend + namespace: obol-frontend +spec: + hostnames: + - "obol.stack" + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: obol-frontend-obol-app + port: 3000 + +--- +# obol-frontend RBAC for OpenClaw instance discovery and ServiceOffer +# CRUD from the frontend sell modal. The ClusterRoleBinding subject +# references the `obol-frontend` ServiceAccount that the upstream +# `obol/obol-app` chart creates — the binding applies fine even if +# the SA does not exist yet, and starts granting permissions the +# moment the SA appears. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods", "configmaps", "secrets"] + verbs: ["get", "list"] + # ServiceOffer CRD — frontend sell modal creates offers + - apiGroups: ["obol.org"] + resources: ["serviceoffers", "serviceoffers/status"] + verbs: ["get", "list", "create", "update", "patch", "delete"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: obol-frontend-openclaw-discovery +subjects: + - kind: ServiceAccount + name: obol-frontend + namespace: obol-frontend diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index aa7fc052..84ddc538 100644 --- 
a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -11,8 +11,6 @@ repositories: url: https://obolnetwork.github.io/helm-charts/ - name: ethereum url: https://ethpandaops.github.io/ethereum-helm-charts - - name: bedag - url: https://bedag.github.io/helm-charts/ - name: stakater url: https://stakater.github.io/stakater-charts @@ -27,15 +25,9 @@ values: enabled: true releases: - # Local storage provisioner (raw manifests wrapped as chart) - - name: base - namespace: kube-system - chart: ./base - values: - - dataDir: /data - - network: "{{ .Values.network }}" - - # Monitoring stack (Prometheus operator + Prometheus) + # Monitoring stack (Prometheus operator + Prometheus). Must run before + # `base` so the PodMonitor CRD exists when base/templates/llm.yaml + # applies the litellm-x402-buyer PodMonitor. - name: monitoring namespace: monitoring createNamespace: true @@ -44,34 +36,12 @@ releases: values: - ./values/monitoring.yaml.gotmpl - - name: llm-buyer-podmonitor - namespace: llm - createNamespace: true - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: litellm-x402-buyer - namespace: llm - labels: - release: monitoring - spec: - selector: - matchLabels: - app: litellm - podMetricsEndpoints: - - port: buyer-http - path: /metrics - interval: 30s - # Traefik ingress controller with Gateway API support - # Traefik v38+ bundles Gateway API CRDs in its crds/ directory + # Traefik v38+ bundles Gateway API CRDs in its crds/ directory. + # Declared before `base` so the Traefik CRDs (Middleware, + # IngressRoute, …) and Gateway API CRDs are available when base + # templates that depend on them (erpc.yaml, obol-frontend.yaml) + # apply. 
- name: traefik namespace: traefik createNamespace: true @@ -135,6 +105,24 @@ releases: dashboard: enabled: false + # Local storage provisioner + co-located cluster-wide manifests: + # CRDs, agent RBAC, x402 controller + verifier, LiteLLM + buyer + # PodMonitor, eRPC HTTPRoute + Middleware + metadata ConfigMap, and + # the obol-frontend HTTPRoute + discovery RBAC. The `needs` on + # traefik + monitoring guarantee the Traefik / Gateway API and + # monitoring CRDs are present before the relocated routing / + # PodMonitor templates (previously shipped as separate bedag/raw + # helmfile releases) apply. + - name: base + namespace: kube-system + chart: ./base + needs: + - traefik/traefik + - monitoring/monitoring + values: + - dataDir: /data + - network: "{{ .Values.network }}" + # Cloudflare Tunnel (dormant until configured via obol tunnel login/provision). # `condition: cloudflared.enabled` lets `obol stack up` flip this off when an # active quick tunnel is already serving — re-syncing the chart kills the @@ -168,75 +156,16 @@ releases: - ./values/erpc.yaml.gotmpl # The chart exposes port 4000 (container) via Service port 4000. # In-cluster callers use erpc.erpc.svc.cluster.local:4000. + # + # The eRPC HTTPRoute, x402-payment Middleware, and erpc-metadata + # ConfigMap previously shipped as separate bedag/raw helmfile + # releases now live in base/templates/erpc.yaml. 
- # eRPC HTTPRoute - - name: erpc-httproute - namespace: erpc - chart: bedag/raw - version: 2.0.2 - needs: - - traefik/traefik - - erpc/erpc - values: - - resources: - - apiVersion: gateway.networking.k8s.io/v1 - kind: HTTPRoute - metadata: - name: erpc - namespace: erpc - spec: - hostnames: - - "obol.stack" - parentRefs: - - name: traefik-gateway - namespace: traefik - sectionName: web - rules: - - matches: - - path: - type: PathPrefix - value: /rpc - filters: - - type: ExtensionRef - extensionRef: - group: traefik.io - kind: Middleware - name: x402-payment - backendRefs: - - name: erpc - port: 80 - - # x402 Middleware for eRPC namespace (ForwardAuth -> central verifier). - # Always deployed; the verifier returns 200 for routes with no pricing rules. - - name: erpc-x402-middleware - namespace: erpc - chart: bedag/raw - version: 2.0.2 - needs: - - traefik/traefik - values: - - resources: - - apiVersion: traefik.io/v1alpha1 - kind: Middleware - metadata: - name: x402-payment - namespace: erpc - spec: - forwardAuth: - address: http://x402-verifier.x402.svc.cluster.local:8080/verify - authResponseHeaders: - - X-Payment-Response - - # eRPC metadata ConfigMap for frontend discovery - - name: erpc-metadata - namespace: erpc - chart: bedag/raw - needs: - - erpc/erpc - values: - - ./values/erpc-metadata.yaml.gotmpl - - # Obol Stack frontend + # Obol Stack frontend. + # + # The frontend HTTPRoute and discovery RBAC (ClusterRole + + # ClusterRoleBinding) previously shipped as separate bedag/raw + # helmfile releases now live in base/templates/obol-frontend.yaml. 
- name: obol-frontend namespace: obol-frontend createNamespace: true @@ -247,75 +176,3 @@ releases: - erpc/erpc values: - ./values/obol-frontend.yaml.gotmpl - - # Obol Frontend HTTPRoute - - name: obol-frontend-httproute - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - traefik/traefik - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: gateway.networking.k8s.io/v1 - kind: HTTPRoute - metadata: - name: obol-frontend - namespace: obol-frontend - spec: - hostnames: - - "obol.stack" - parentRefs: - - name: traefik-gateway - namespace: traefik - sectionName: web - rules: - - matches: - - path: - type: PathPrefix - value: / - backendRefs: - - name: obol-frontend-obol-app - port: 3000 - - # Obol Frontend RBAC (OpenClaw instance discovery via Kubernetes API) - - name: obol-frontend-rbac - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRole - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - rules: - - apiGroups: [""] - resources: ["namespaces"] - verbs: ["get", "list"] - - apiGroups: [""] - resources: ["pods", "configmaps", "secrets"] - verbs: ["get", "list"] - # ServiceOffer CRD — frontend sell modal creates offers - - apiGroups: ["obol.org"] - resources: ["serviceoffers", "serviceoffers/status"] - verbs: ["get", "list", "create", "update", "patch", "delete"] - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: obol-frontend-openclaw-discovery - subjects: - - kind: ServiceAccount - name: obol-frontend - namespace: obol-frontend diff --git a/internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl 
b/internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl deleted file mode 100644 index fe94d8ef..00000000 --- a/internal/embed/infrastructure/values/erpc-metadata.yaml.gotmpl +++ /dev/null @@ -1,21 +0,0 @@ -resources: - - apiVersion: v1 - kind: ConfigMap - metadata: - name: erpc-metadata - namespace: erpc - labels: - app.kubernetes.io/part-of: obol.stack - obol.stack/id: default - obol.stack/app: erpc - data: - metadata.json: | - { - "network": "{{ .Values.network }}", - "endpoints": { - "rpc": { - "external": "http://obol.stack/rpc/{{ .Values.network }}", - "internal": "http://erpc.erpc.svc.cluster.local/rpc/{{ .Values.network }}" - } - } - } diff --git a/internal/stack/stack_test.go b/internal/stack/stack_test.go index 0d7cca31..a4c032ea 100644 --- a/internal/stack/stack_test.go +++ b/internal/stack/stack_test.go @@ -434,33 +434,38 @@ func TestDockerBridgeGatewayIP(t *testing.T) { t.Logf("docker0 gateway IP: %s", ip) } +// TestHelmfile_IncludesBuyerPodMonitor asserts the litellm-x402-buyer +// PodMonitor is shipped with the stack. The PodMonitor previously lived +// as an inline `bedag/raw` release in helmfile.yaml; it now lives next +// to its workload in base/templates/llm.yaml. The chart layout (the +// `base` Helm release) renders it during `obol stack up`. 
func TestHelmfile_IncludesBuyerPodMonitor(t *testing.T) { projectRoot := findProjectRoot() if projectRoot == "" { t.Fatal("project root not found") } - data, err := os.ReadFile(filepath.Join(projectRoot, "internal/embed/infrastructure/helmfile.yaml")) + data, err := os.ReadFile(filepath.Join(projectRoot, "internal/embed/infrastructure/base/templates/llm.yaml")) if err != nil { - t.Fatalf("read helmfile: %v", err) + t.Fatalf("read llm template: %v", err) } out := string(data) if !strings.Contains(out, "kind: PodMonitor") { - t.Fatalf("helmfile missing PodMonitor:\n%s", out) + t.Fatalf("llm template missing PodMonitor:\n%s", out) } if !strings.Contains(out, "name: litellm-x402-buyer") { - t.Fatalf("helmfile missing buyer PodMonitor name:\n%s", out) + t.Fatalf("llm template missing buyer PodMonitor name:\n%s", out) } if !strings.Contains(out, "release: monitoring") { - t.Fatalf("helmfile missing monitoring label:\n%s", out) + t.Fatalf("llm template missing monitoring label:\n%s", out) } if !strings.Contains(out, "port: buyer-http") || !strings.Contains(out, "path: /metrics") { - t.Fatalf("helmfile missing buyer metrics endpoint:\n%s", out) + t.Fatalf("llm template missing buyer metrics endpoint:\n%s", out) } } From 6bec6519abff3333a6f0519ef003fa95585eb7fa Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 23:09:29 +0400 Subject: [PATCH 15/31] fix(x402): fail-closed when URI is under a paid prefix but no rule matches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today HandleVerify returns 200 whenever matchPaidRoute returns nil. Combined with Traefik ForwardAuth's "200 = allow" semantics, this means a misconfigured Middleware on a paid route OR a code bug where the route was supposed to match but didn't silently makes the route FREE — revenue loss with no signal. - Adds paidPrefixes atomic.Pointer[[]string] to Verifier. 
- Verifier.load() derives prefixes from cfg.Routes patterns: "/services/foo/*" -> "/services/foo/" (trailing slash kept so HasPrefix doesn't false-match /services/foobar/). - HandleVerify: when matchRoute returns nil, check if URI is under any tracked prefix. If yes -> 403. If no -> 200 (legitimately free). Complementary to PR #519 (gate /readyz on informer sync): - PR #519 ensures the pod isn't Ready until routes are loaded (closes the bootstrap-window leak). - This PR ensures that after Ready, any prefix the verifier KNOWS about that doesn't have a matching rule is fail-closed (closes the steady-state-bug leak). Together they cover the "rule should match but doesn't" gap. --- internal/x402/verifier.go | 68 +++++++++++++++++++++++++++- internal/x402/verifier_test.go | 82 ++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 60e2fa80..370ba1ef 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httputil" "net/url" + "sort" "strings" "sync/atomic" @@ -21,6 +22,16 @@ type Verifier struct { chain atomic.Pointer[ChainInfo] chains atomic.Pointer[map[string]ChainInfo] // pre-resolved: chain name → config metrics *verifierMetrics + + // paidPrefixes is the list of URI prefixes the verifier KNOWS are + // paid routes (derived from cfg.Routes patterns on each load). Used + // by HandleVerify to fail-closed when a URI is under a paid prefix + // but no rule matches — the alternative (200 → ForwardAuth allow) + // would silently make the route free. + // + // Sorted by length descending so longer-prefix matches win first + // (defensive — fixes nothing today but cheap insurance). + paidPrefixes atomic.Pointer[[]string] } // NewVerifier creates a Verifier with the given initial configuration. 
@@ -64,6 +75,19 @@ func (v *Verifier) load(cfg *PricingConfig) error { v.chains.Store(&chains) v.config.Store(cfg) + // Derive paid-prefix tracker from the route patterns. HandleVerify + // uses this to fail-closed when a URI is under a tracked prefix but + // no rule matches (see isUnderPaidPrefix for the rationale). + prefixes := make([]string, 0, len(cfg.Routes)) + for _, r := range cfg.Routes { + prefix := patternToPrefix(r.Pattern) + if prefix != "" { + prefixes = append(prefixes, prefix) + } + } + sort.Slice(prefixes, func(i, j int) bool { return len(prefixes[i]) > len(prefixes[j]) }) + v.paidPrefixes.Store(&prefixes) + // Drop metric series for offers that are no longer in the route set. // Without this, deleting an offer leaves its counters + last-success // gauge in the registry forever, polluting dashboards and silently @@ -104,7 +128,17 @@ func (v *Verifier) HandleVerify(w http.ResponseWriter, r *http.Request) { rule, requirement, extensions, _, chain, asset, ok := v.matchPaidRouteFull(cfg, uri) if !ok { - // No pricing rule matches — route is free. + // Check if this URI is under a tracked paid prefix. If yes, + // the route was supposed to match but didn't — fail closed + // rather than silently make it free (Traefik ForwardAuth 200 + // means "allow"). + if v.isUnderPaidPrefix(uri) { + log.Printf("x402-verifier: URI %q is under a paid prefix but no rule matches — fail closed", uri) + http.Error(w, "no rule matches; route appears to be a paid prefix with stale or missing rule", http.StatusForbidden) + + return + } + // Not under any paid prefix — legitimately free route. w.WriteHeader(http.StatusOK) return } @@ -277,6 +311,38 @@ func (v *Verifier) matchPaidRouteFull(cfg *PricingConfig, uri string) (*RouteRul return rule, requirement, extensions, prometheusLabels(rule), chain, asset, true } +// isUnderPaidPrefix reports whether uri starts with any of the URI +// prefixes the verifier knows are paid routes. 
Used by HandleVerify +// to fail-closed when matchRoute returns nil but the URI is still +// under a tracked prefix — i.e. the route was supposed to match but +// didn't (stale route table, code bug, etc.). +func (v *Verifier) isUnderPaidPrefix(uri string) bool { + prefixes := v.paidPrefixes.Load() + if prefixes == nil { + return false + } + for _, p := range *prefixes { + if strings.HasPrefix(uri, p) { + return true + } + } + return false +} + +// patternToPrefix converts a route Pattern like "/services/foo/*" +// into a directory-style prefix "/services/foo/" suitable for +// strings.HasPrefix matching. Returns "" for patterns without a +// trailing glob — exact-match patterns aren't paid prefixes, so +// fail-closed only applies to the broader "any URI under this path" +// semantic. The trailing slash is preserved so HasPrefix +// distinguishes /services/foo/ from /services/foobar/. +func patternToPrefix(pattern string) string { + if !strings.HasSuffix(pattern, "/*") { + return "" + } + return strings.TrimSuffix(pattern, "*") +} + // mergeAgentExtras adds the agent fields from a RouteRule to the // requirement's Extra map so buyers probing a 402 see which model and // skills are powering the offer. No-op for non-agent rules. diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 3b62c815..98996cdb 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -998,6 +998,88 @@ func TestVerifier_Reload_PrunesDeletedOfferSeries(t *testing.T) { } } +// TestVerifier_HandleVerify_FailClosed_ManualPrefixInjection sanity checks +// that an arbitrary prefix in paidPrefixes triggers fail-closed (403) when +// no rule matches. The manual prefix injection simulates the case where the +// verifier KNOWS about a paid prefix (because a route was previously loaded) +// but the matcher rejects the URI — config drift, code bug, etc. 
+func TestVerifier_HandleVerify_FailClosed_ManualPrefixInjection(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + v := newTestVerifier(t, fac.URL, []RouteRule{ + // No rules; matchRoute will return nil for everything. + }) + + // Manually inject a paid prefix (simulating a stale prefix state). + prefixes := []string{"/services/gated/"} + v.paidPrefixes.Store(&prefixes) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/services/gated/foo") + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + + if rec.Code != http.StatusForbidden { + t.Errorf("expected 403 (fail-closed) for URI under tracked paid prefix, got %d", rec.Code) + } +} + +// TestVerifier_HandleVerify_FreeRoute_OutsidePrefixes asserts that URIs +// outside all tracked paid prefixes still return 200 (legitimate free pass). +// The verifier is mounted on routes that may or may not be paid; only URIs +// under a known paid prefix should fail closed. +func TestVerifier_HandleVerify_FreeRoute_OutsidePrefixes(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + v := newTestVerifier(t, fac.URL, []RouteRule{ + {Pattern: "/services/known/*", Price: "0.0001"}, + }) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/health") // Not under any paid prefix. + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("expected 200 for free route outside paid prefixes, got %d", rec.Code) + } +} + +// TestVerifier_HandleVerify_PrefixBoundary_NoFalseMatch verifies that the +// trailing slash on paid prefixes prevents false matches between siblings +// like /services/foo/ and /services/foobar/. Without the trailing slash, +// a request to /services/foobar/x would falsely match /services/foo/*. 
+func TestVerifier_HandleVerify_PrefixBoundary_NoFalseMatch(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + v := newTestVerifier(t, fac.URL, []RouteRule{ + {Pattern: "/services/foo/*", Price: "0.0001"}, + }) + + // /services/foobar/x is NOT under /services/foo/ — must return 200. + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/services/foobar/x") + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + + if rec.Code != http.StatusOK { + t.Errorf("expected 200 for sibling path not under prefix, got %d", rec.Code) + } +} + +func TestPatternToPrefix(t *testing.T) { + cases := []struct{ pattern, want string }{ + {"/services/foo/*", "/services/foo/"}, + {"/rpc/*", "/rpc/"}, + {"/health", ""}, // No glob, returns empty. + {"/*", "/"}, + {"", ""}, + {"/exact/match", ""}, // Exact pattern, not a prefix. + } + for _, c := range cases { + if got := patternToPrefix(c.pattern); got != c.want { + t.Errorf("patternToPrefix(%q) = %q, want %q", c.pattern, got, c.want) + } + } +} + // findVerifierMetricValue returns the value of the series in `family` whose // labels match `wantLabels` exactly, failing the test if no such series exists. func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 { From 43539483dac6c6bd353b2daf5d791bfb96a1a22e Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 23:17:44 +0400 Subject: [PATCH 16/31] feat(monetizeapi): controller-gen as canonical CRD schema source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the entire class of "CRD YAML and Go struct drifted" bugs. PurchaseAutoRefill.MaxTotal was the most recent instance — it existed in purchaserequest-crd.yaml for months while internal/monetizeapi/ types.go didn't have the corresponding field. 
Without this commit, that pattern recurs by design: two sources of truth, one hand-maintained, no enforcement of agreement. Now Go is the single source of truth: - kubebuilder markers on every CRD-backed struct in types.go (validation, required, enum, pattern, printer columns, subresources) - `just generate` regenerates *-crd.yaml from those markers + zz_generated.deepcopy.go from object:generate=true - CI fails if `git status` is non-empty after `just generate` runs This commit also fixes the documented MaxTotal / MaxSpendPerDay drift by adding both fields to PurchaseAutoRefill — the generated CRD now matches the prior hand-written one and the controller can read them. Pinned controller-tools at v0.16.5 in tools/tools.go (compatible with client-go v0.34.x; a newer release would force prometheus/common through a panicking validation-scheme change). Generation is deterministic; running locally produces no diff after a clean checkout. For future CRD edits: 1. Edit types.go (add/change a field, update markers) 2. `just generate` 3. Commit both the Go and YAML diffs 4. CI verifies the YAML was committed PreSignedAuth.Payment is map[string]interface{} (opaque x402 payload), which controller-gen cannot deep-copy automatically; a hand-written DeepCopy lives in deepcopy_manual.go and the type is flagged object:generate=false. The hack/boilerplate.go.txt file is force-added past *.txt gitignore; it's an empty marker for now — add a copyright header later if the repo settles on one.
--- .github/workflows/lint-test.yaml | 28 + go.mod | 41 +- go.sum | 78 +- hack/boilerplate.go.txt | 1 + .../base/templates/agent-crd.yaml | 245 +++--- .../base/templates/agentidentity-crd.yaml | 117 +-- .../base/templates/purchaserequest-crd.yaml | 354 ++++---- .../templates/registrationrequest-crd.yaml | 162 ++-- .../base/templates/serviceoffer-crd.yaml | 671 ++++++++------- internal/monetizeapi/deepcopy_manual.go | 58 ++ internal/monetizeapi/doc.go | 16 + internal/monetizeapi/types.go | 510 +++++++++--- internal/monetizeapi/zz_generated.deepcopy.go | 775 ++++++++++++++++++ justfile | 35 + tools/tools.go | 14 + 15 files changed, 2251 insertions(+), 854 deletions(-) create mode 100644 hack/boilerplate.go.txt create mode 100644 internal/monetizeapi/deepcopy_manual.go create mode 100644 internal/monetizeapi/doc.go create mode 100644 internal/monetizeapi/zz_generated.deepcopy.go create mode 100644 tools/tools.go diff --git a/.github/workflows/lint-test.yaml b/.github/workflows/lint-test.yaml index 8a251e97..34895b32 100644 --- a/.github/workflows/lint-test.yaml +++ b/.github/workflows/lint-test.yaml @@ -43,3 +43,31 @@ jobs: - name: Run chart-testing (install) run: ct install --target-branch ${{ github.event.repository.default_branch }} + + generate-check: + name: CRD generation up-to-date + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Set up Go + uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 + with: + go-version-file: 'go.mod' + + - name: Set up just + uses: extractions/setup-just@dd310ad5a97d8e7b41793f8ef055398d51ad4de6 # v2.0.2 + + - name: Regenerate CRDs + DeepCopy + run: just generate + + - name: Fail if regeneration changed any tracked files + run: | + if [ -n "$(git status --porcelain)" ]; then + echo "::error::CRD manifests or DeepCopy methods are out of date." + echo "::error::Run 'just generate' locally and commit the result." 
+ git status + git --no-pager diff + exit 1 + fi diff --git a/go.mod b/go.mod index 98f67795..28171102 100644 --- a/go.mod +++ b/go.mod @@ -15,19 +15,22 @@ require ( github.com/hf/nitrite v0.0.0-20241225144000-c2d5d3c4f303 github.com/hf/nsm v0.0.0-20220930140112-cd181bd646b9 github.com/mattn/go-isatty v0.0.20 - github.com/prometheus/client_golang v1.15.0 - github.com/prometheus/client_model v0.3.0 - github.com/prometheus/common v0.42.0 + github.com/prometheus/client_golang v1.19.1 + github.com/prometheus/client_model v0.6.1 + github.com/prometheus/common v0.55.0 github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 github.com/shopspring/decimal v1.3.1 github.com/urfave/cli/v2 v2.27.5 github.com/urfave/cli/v3 v3.6.2 golang.org/x/crypto v0.46.0 + golang.org/x/net v0.48.0 golang.org/x/sys v0.39.0 golang.org/x/term v0.38.0 gopkg.in/yaml.v3 v3.0.1 + k8s.io/api v0.34.1 k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 + sigs.k8s.io/controller-tools v0.16.5 ) require ( @@ -48,21 +51,22 @@ require ( github.com/crate-crypto/go-ipa v0.0.0-20240724233137-53bbb0ceb27a // indirect github.com/cucumber/gherkin/go/v26 v26.2.0 // indirect github.com/cucumber/messages/go/v21 v21.0.1 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/ethereum/c-kzg-4844/v2 v2.1.5 // indirect github.com/ethereum/go-verkle v0.2.2 // indirect + github.com/fatih/color v1.18.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/logr v1.4.3 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/gobuffalo/flect v1.0.3 // 
indirect github.com/gofrs/uuid v4.3.1+incompatible // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-configfs-tsm v0.2.2 // indirect @@ -72,24 +76,25 @@ require ( github.com/hashicorp/go-memdb v1.3.4 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/holiman/uint256 v1.3.2 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.1 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect + github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/procfs v0.9.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible // indirect + github.com/spf13/cobra v1.9.1 // indirect github.com/spf13/pflag v1.0.10 // indirect github.com/supranational/blst v0.3.16 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect @@ -98,22 +103,24 @@ require ( github.com/xo/terminfo 
v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect go.uber.org/multierr v1.11.0 // indirect - go.yaml.in/yaml/v2 v2.4.2 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect - golang.org/x/net v0.48.0 // indirect + golang.org/x/mod v0.30.0 // indirect golang.org/x/oauth2 v0.32.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/time v0.14.0 // indirect + golang.org/x/tools v0.39.0 // indirect google.golang.org/protobuf v1.36.11 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - k8s.io/api v0.34.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + k8s.io/apiextensions-apiserver v0.31.2 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect - sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect diff --git a/go.sum b/go.sum index c451ad0f..fd84f4a3 100644 --- a/go.sum +++ b/go.sum @@ -45,6 +45,7 @@ github.com/coinbase/x402/go v0.0.0-20260331075907-bff876de232a/go.mod h1:8xt63HO github.com/consensys/gnark-crypto v0.19.2 h1:qrEAIXq3T4egxqiliFFoNrepkIWVEeIYwt3UL0fvS80= github.com/consensys/gnark-crypto v0.19.2/go.mod h1:rT23F0XSZqE0mUA0+pRtnL56IbPxs6gp4CeRsBk4XS0= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cpuguy83/go-md2man/v2 
v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/crate-crypto/go-eth-kzg v1.4.0 h1:WzDGjHk4gFg6YzV0rJOAsTK4z3Qkz5jd4RE3DAvPFkg= @@ -60,8 +61,9 @@ github.com/cucumber/messages/go/v21 v21.0.1 h1:wzA0LxwjlWQYZd32VTlAVDTkW6inOFmSM github.com/cucumber/messages/go/v21 v21.0.1/go.mod h1:zheH/2HS9JLVFukdrsPWoPdmUtmYQAQPLk7w5vWsk5s= github.com/cucumber/messages/go/v22 v22.0.0/go.mod h1:aZipXTKc0JnjCsXrJnuZpWhtay93k7Rn3Dee7iyPJjs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dchest/siphash v1.2.3 h1:QXwFc8cFOR2dSa/gE6o/HokBMWtLUaNDVd+22aKHeEA= github.com/dchest/siphash v1.2.3/go.mod h1:0NvQU092bT0ipiFN++/rXm69QG9tVxLAlQHIXMPAkHc= github.com/deckarep/golang-set/v2 v2.8.0 h1:swm0rlPCmdWn9mESxKOjWk8hXSqoxOp+ZlfuyaAdFlQ= @@ -88,6 +90,8 @@ github.com/ethereum/go-ethereum v1.16.7 h1:qeM4TvbrWK0UC0tgkZ7NiRsmBGwsjqc64BHo2 github.com/ethereum/go-ethereum v1.16.7/go.mod h1:Fs6QebQbavneQTYcA39PEKv2+zIjX7rPUZ14DER46wk= github.com/ethereum/go-verkle v0.2.2 h1:I2W0WjnrFUIzzVPwm8ykY+7pL2d4VhlsePn4j7cnFk8= github.com/ethereum/go-verkle v0.2.2/go.mod h1:M3b90YRnzqKyyzBEWJGqj8Qff4IDeXnzFw0P9bFw3uk= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/ferranbt/fastssz v0.1.4 
h1:OCDB+dYDEQDvAgtAGnTSidK1Pe2tW3nFV40XyMkTeDY= github.com/ferranbt/fastssz v0.1.4/go.mod h1:Ea3+oeoRGGLGm5shYAeDgu6PGUlcvQhE2fILyD9+tGg= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= @@ -99,8 +103,8 @@ github.com/gballet/go-libpcsclite v0.0.0-20190607065134-2772fd86a8ff h1:tY80oXqG github.com/gballet/go-libpcsclite v0.0.0-20190607065134-2772fd86a8ff/go.mod h1:x7DCsMOv1taUwEWCzT4cmDeAkigA5/QCwUodaVOe8Ww= github.com/getsentry/sentry-go v0.27.0 h1:Pv98CIbtB3LkMWmXi4Joa5OOcwbmnX88sF5qbK3r3Ps= github.com/getsentry/sentry-go v0.27.0/go.mod h1:lc76E2QywIyW8WuBnwl8Lc4bkmQH4+w1gwTf25trprY= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= @@ -114,6 +118,8 @@ github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+Gr github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gobuffalo/flect v1.0.3 h1:xeWBM2nui+qnVvNM4S3foBhCAL2XgPU+a7FdpelbTq4= +github.com/gobuffalo/flect v1.0.3/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E= github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0= github.com/gofrs/uuid v4.2.0+incompatible/go.mod 
h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= @@ -123,10 +129,6 @@ github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI= github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= -github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= -github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= -github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= @@ -177,6 +179,7 @@ github.com/holiman/uint256 v1.3.2 h1:a9EgMPSC1AAaj1SZL5zIQD3WbwTuHrMGOerLjGmM/TA github.com/holiman/uint256 v1.3.2/go.mod h1:EOMSn4q6Nyt9P6efbI3bueV4e1b3dGlUCXeiRV4ng7E= github.com/huin/goupnp v1.3.0 h1:UvLUlWDNpoUdYzb2TCn+MuTWtcjXKSza2n6CBdQ0xXc= github.com/huin/goupnp v1.3.0/go.mod h1:gnGPsThkYa7bFi/KWmEysQRf48l2dvR5bxr2OFckNX8= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/influxdata/influxdb-client-go/v2 v2.4.0 h1:HGBfZYStlx3Kqvsv1h2pJixbCl/jhnFtxpKFAv9Tu5k= github.com/influxdata/influxdb-client-go/v2 v2.4.0/go.mod h1:vLNHdxTJkIf2mSLvGrpj8TCcISApPoXkaxP8g9uRlW8= @@ -217,8 +220,6 @@ github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWE github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.16 
h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= -github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/minio/sha256-simd v1.0.0 h1:v1ta+49hkWZyvaKwrQB8elexRqm6Y0aMLjCNsrYxo6g= github.com/minio/sha256-simd v1.0.0/go.mod h1:OuYzVNI5vcoYIAmbIvHPl3N3jUzVedXbKy5RFepssQM= github.com/mitchellh/mapstructure v1.4.1 h1:CpVNEelQCZBooIPDn+AR3NpivK/TIKU8bDxdASFVQag= @@ -235,8 +236,12 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= +github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= +github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= @@ -257,16 +262,17 @@ github.com/pion/transport/v3 v3.0.1 h1:gDTlPJwROfSfz6QfSi0ZmeCSkFcnWWiiR9ES0ouAN github.com/pion/transport/v3 v3.0.1/go.mod 
h1:UY7kiITrlMv7/IKgd5eTUcaahZx5oUN3l9SzK5f5xE0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.15.0 h1:5fCgGYogn0hFdhyhLbw7hEsWxufKtY9klyvdNfFlFhM= -github.com/prometheus/client_golang v1.15.0/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= -github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= -github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= -github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= -github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= -github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= -github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= +github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/procfs v0.16.1 
h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= @@ -283,7 +289,10 @@ github.com/shirou/gopsutil v3.21.4-0.20210419000835-c7a38de76ee5+incompatible/go github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= @@ -323,8 +332,8 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= -go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= -go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= 
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -338,6 +347,8 @@ golang.org/x/lint v0.0.0-20201208152925-83fdc39ff7b5/go.mod h1:3xt1FjdF8hUf6vQPI golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= +golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -346,7 +357,6 @@ golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY= golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= -golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -379,6 +389,10 @@ golang.org/x/tools v0.0.0-20210105210202-9ed45478a130/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools 
v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/tools/go/expect v0.1.1-deprecated h1:jpBZDwmgPhXsKZC6WhL20P4b/wmnpsEAGHaNy0n/rJM= +golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= +golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= +golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGNv1ib0J382/DPCRS/BPnsGebyM1Gj5VSDpG8= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -388,12 +402,14 @@ google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.2.1 
h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -401,18 +417,22 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM= k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk= +k8s.io/apiextensions-apiserver v0.31.2 h1:W8EwUb8+WXBLu56ser5IudT2cOho0gAKeTOnywBLxd0= +k8s.io/apiextensions-apiserver v0.31.2/go.mod h1:i+Geh+nGCJEGiCGR3MlBDkS7koHIIKWVfWeRFiOsUcM= k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4= k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/client-go v0.34.1 h1:ZUPJKgXsnKwVwmKKdPfw4tB58+7/Ik3CrjOEhsiZ7mY= k8s.io/client-go v0.34.1/go.mod h1:kA8v0FP+tk6sZA0yKLRG67LWjqufAoSHA2xVGKw9Of8= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= -k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= 
-sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= -sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-tools v0.16.5 h1:5k9FNRqziBPwqr17AMEPPV/En39ZBplLAdOwwQHruP4= +sigs.k8s.io/controller-tools v0.16.5/go.mod h1:8vztuRVzs8IuuJqKqbXCSlXcw+lkAv/M2sTpg55qjMY= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= diff --git a/hack/boilerplate.go.txt b/hack/boilerplate.go.txt new file mode 100644 index 00000000..8c47ec39 --- /dev/null +++ b/hack/boilerplate.go.txt @@ -0,0 +1 @@ +// Code generated by controller-gen. DO NOT EDIT. diff --git a/internal/embed/infrastructure/base/templates/agent-crd.yaml b/internal/embed/infrastructure/base/templates/agent-crd.yaml index 510d4d0c..8338c0d6 100644 --- a/internal/embed/infrastructure/base/templates/agent-crd.yaml +++ b/internal/embed/infrastructure/base/templates/agent-crd.yaml @@ -1,12 +1,9 @@ --- -# Agent CRD -# Declarative spec for an Obol Stack agent (Hermes today, OpenClaw later). 
-# Decouples agent lifecycle from selling — `obol sell agent ` references -# an existing Agent rather than provisioning one inline. Internal manager -# agents with RBAC can also create Agent resources to spawn sub-agents. apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: agents.obol.org spec: group: obol.org @@ -14,103 +11,151 @@ spec: kind: Agent listKind: AgentList plural: agents - singular: agent shortNames: - - ag + - ag + singular: agent scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Runtime - type: string - jsonPath: .spec.runtime - - name: Model - type: string - jsonPath: .status.pinnedModel - - name: Wallet - type: string - jsonPath: .status.walletAddress - - name: Phase - type: string - jsonPath: .status.phase - - name: Ready - type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - properties: - runtime: - type: string - enum: - - hermes - default: hermes - description: "Agent runtime (only hermes today; openclaw planned)" - model: + - additionalPrinterColumns: + - jsonPath: .spec.runtime + name: Runtime + type: string + - jsonPath: .status.pinnedModel + name: Model + type: string + - jsonPath: .status.walletAddress + name: Wallet + type: string + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Agent is the declarative spec for an Obol Stack agent (Hermes today, + OpenClaw later). 
Decouples agent lifecycle from selling: `obol sell + agent ` references an existing Agent rather than provisioning + one inline. Internal manager agents with RBAC can also create Agent + resources to spawn sub-agents. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + model: + description: |- + LiteLLM model name to pin. Empty = controller picks cluster + top-of-rank on first deploy and writes status.pinnedModel. + maxLength: 256 + type: string + objective: + description: |- + Operator-supplied objective text. Substituted into the SOUL.md + template by the seeder on first write. Agent owns SOUL.md after that. + maxLength: 4096 + type: string + runtime: + default: hermes + description: Agent runtime (only hermes today; openclaw planned). + enum: + - hermes + type: string + skills: + description: |- + Allow-listed skills written to the per-agent skills dir on first + reconcile. Agent can edit afterwards; this is a seed, not a sandbox. + items: + maxLength: 64 + pattern: ^[a-z0-9][a-z0-9-]*$ type: string - maxLength: 256 - description: "LiteLLM model name to pin. Empty = controller picks cluster top-of-rank on first deploy and writes status.pinnedModel." 
- skills: - type: array - maxItems: 64 - items: - type: string - pattern: "^[a-z0-9][a-z0-9-]*$" - maxLength: 64 - description: "Allow-listed skills written to the per-agent skills dir on first reconcile. Agent can edit afterwards; this is a seed, not a sandbox." - objective: - type: string - maxLength: 4096 - description: "Operator-supplied objective text. Substituted into the SOUL.md template by the seeder on first write. Agent owns SOUL.md after that." - wallet: - type: object + maxItems: 64 + type: array + wallet: + properties: + create: + default: false + description: |- + Provision a per-namespace remote-signer keystore. Address is + published in status.walletAddress. + type: boolean + type: object + type: object + status: + properties: + conditions: + items: properties: - create: - type: boolean - default: false - description: "Provision a per-namespace remote-signer keystore. Address is published in status.walletAddress." - status: - type: object - properties: - observedGeneration: - type: integer - format: int64 - phase: - type: string - description: "Pending | Provisioning | Ready | Failed" - pinnedModel: - type: string - description: "Actual model the agent is using (= spec.model when set, otherwise the auto-picked top-of-rank)." - walletAddress: - type: string - pattern: "^(0x[0-9a-fA-F]{40})?$" - description: "Agent's signing address when wallet.create=true. Empty otherwise." - endpoint: - type: string - description: "Cluster-internal URL for the agent runtime (e.g. http://hermes.agent-quant.svc.cluster.local:8642)." - conditions: - type: array - items: - type: object - properties: - type: - type: string - status: - type: string - reason: - type: string - message: - type: string - lastTransitionTime: - type: string - format: date-time + lastTransitionTime: + description: Last time the condition transitioned. + format: date-time + type: string + message: + description: Human-readable message with details. 
+ type: string + reason: + description: Machine-readable reason for the condition. + type: string + status: + description: Status of the condition. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: Condition type. + type: string + required: + - status + - type + type: object + type: array + endpoint: + description: |- + Cluster-internal URL for the agent runtime (e.g. + http://hermes.agent-quant.svc.cluster.local:8642). + type: string + observedGeneration: + format: int64 + type: integer + phase: + description: Pending | Provisioning | Ready | Failed + type: string + pinnedModel: + description: |- + Actual model the agent is using (= spec.model when set, otherwise + the auto-picked top-of-rank). + type: string + walletAddress: + description: Agent's signing address when wallet.create=true. Empty + otherwise. + pattern: ^(0x[0-9a-fA-F]{40})?$ + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml b/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml index 29ad8c03..fa9fc2bc 100644 --- a/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml +++ b/internal/embed/infrastructure/base/templates/agentidentity-crd.yaml @@ -1,13 +1,9 @@ --- -# AgentIdentity CRD -# Durable ERC-8004 agent identity document. Outlives ServiceOffers: when the -# last offer is deleted, the controller renders a tombstone (active:false, -# x402Support:false) instead of removing the registration document. The -# canonical operator identity lives at x402/default and status.registrations -# records the on-chain agentId for each registered chain. 
apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: agentidentities.obol.org spec: group: obol.org @@ -15,48 +11,73 @@ spec: kind: AgentIdentity listKind: AgentIdentityList plural: agentidentities - singular: agentidentity shortNames: - - aid + - aid + singular: agentidentity scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Chains - type: string - jsonPath: .status.registrations[*].chain - - name: AgentIDs - type: string - jsonPath: .status.registrations[*].agentId - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - status: - type: object - properties: - registrations: - type: array - description: "Per-chain ERC-8004 registrations for this identity document." - items: - type: object - required: - - chain - - agentId - properties: - chain: - type: string - maxLength: 64 - description: "ERC-8004 registration chain alias." - agentId: - type: string - description: "On-chain ERC-721 tokenId on the given chain." + - additionalPrinterColumns: + - jsonPath: .status.registrations[*].chain + name: Chains + type: string + - jsonPath: .status.registrations[*].agentId + name: AgentIDs + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AgentIdentity is the durable, on-chain identity an operator controls in + the ERC-8004 Identity Registry. A single AgentIdentity outlives + ServiceOffers: deleting the last ServiceOffer that references it does + not delete the NFT, the published registration document, or the + recorded agentId; instead the renderer publishes a tombstone + (active:false, x402Support:false) so external observers still see the + historical record. 
+ properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + type: object + status: + properties: + registrations: + description: Per-chain ERC-8004 registrations for this identity document. + items: + properties: + agentId: + description: On-chain ERC-721 tokenId on the given chain. + type: string + chain: + description: ERC-8004 registration chain alias. 
+ maxLength: 64 + type: string + required: + - agentId + - chain + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml b/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml index 49bb7359..af32f441 100644 --- a/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml +++ b/internal/embed/infrastructure/base/templates/purchaserequest-crd.yaml @@ -1,6 +1,9 @@ +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: purchaserequests.obol.org spec: group: obol.org @@ -8,162 +11,221 @@ spec: kind: PurchaseRequest listKind: PurchaseRequestList plural: purchaserequests - singular: purchaserequest shortNames: - - pr + - pr + singular: purchaserequest scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Endpoint - type: string - jsonPath: .spec.endpoint - - name: Model - type: string - jsonPath: .spec.model - - name: Price - type: string - jsonPath: .spec.payment.price - - name: Remaining - type: integer - jsonPath: .status.remaining - - name: Spent - type: integer - jsonPath: .status.spent - - name: Ready - type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - required: [endpoint, model, count, payment] - properties: - endpoint: - type: string - description: "Full URL to the x402-gated inference endpoint" - model: - type: string - description: "Remote model ID (used as paid/ in LiteLLM)" - count: - type: integer - minimum: 1 - maximum: 2500 - description: "Number of pre-signed auths to create" - preSignedAuths: - type: array - description: 
"Pre-signed x402 payments (legacy ERC-3009 auths still supported)" - items: - type: object - properties: - id: { type: string } - payment: - type: object - x-kubernetes-preserve-unknown-fields: true - signature: { type: string } - from: { type: string } - to: { type: string } - value: { type: string } - validAfter: { type: string } - validBefore: { type: string } - nonce: { type: string } - autoRefill: - type: object + - additionalPrinterColumns: + - jsonPath: .spec.endpoint + name: Endpoint + type: string + - jsonPath: .spec.model + name: Model + type: string + - jsonPath: .spec.payment.price + name: Price + type: string + - jsonPath: .status.remaining + name: Remaining + type: integer + - jsonPath: .status.spent + name: Spent + type: integer + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + PurchaseRequest is the buyer-side request for pre-signed x402 auths + against a remote inference endpoint. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + autoRefill: + description: |- + PurchaseAutoRefill drives the agent-managed auto-refill policy for a + PurchaseRequest. 
The reconciler reads MaxTotal + MaxSpendPerDay as + budget caps before signing more auths; without these fields populated + the agent will not auto-refill beyond the initial Count. + properties: + count: + description: Number of auths to sign on refill. + minimum: 1 + type: integer + enabled: + default: false + type: boolean + maxSpendPerDay: + description: Max micro-USDC spend per day. + type: string + maxTotal: + description: Cap total auths ever signed. + type: integer + threshold: + description: Refill when remaining < threshold. + minimum: 0 + type: integer + type: object + count: + description: Number of pre-signed auths to create. + maximum: 2500 + minimum: 1 + type: integer + endpoint: + description: Full URL to the x402-gated inference endpoint. + type: string + model: + description: Remote model ID (used as paid/ in LiteLLM). + type: string + payment: + properties: + asset: + description: ERC-20 contract address. + type: string + assetDecimals: + description: Token decimals in atomic units. + format: int64 + type: integer + assetSymbol: + description: Human-friendly token symbol (e.g. USDC, OBOL). + type: string + assetTransferMethod: + description: x402 transfer method used for this asset. + type: string + eip712Name: + description: EIP-712 domain name used for signing. + type: string + eip712Version: + description: EIP-712 domain version used for signing. + type: string + network: + type: string + payTo: + type: string + price: + description: Atomic token units per request. + type: string + required: + - asset + - network + - payTo + - price + type: object + preSignedAuths: + description: Pre-signed x402 payments (legacy ERC-3009 auths still + supported). + items: + description: |- + PreSignedAuth carries a pre-signed x402 payment authorization. The + Payment map is opaque (forwarded verbatim to the buyer sidecar / x402 + facilitator) which can't be deep-copied by controller-gen; DeepCopy + methods for this type are hand-written in deepcopy_manual.go. 
properties: - enabled: - type: boolean - default: false - threshold: - type: integer - minimum: 0 - description: "Refill when remaining < threshold" - count: - type: integer - minimum: 1 - description: "Number of auths to sign on refill" - maxTotal: - type: integer - description: "Cap total auths ever signed" - maxSpendPerDay: + from: type: string - description: "Max micro-USDC spend per day" - payment: - type: object - required: [network, payTo, price, asset] - properties: - network: + id: + type: string + nonce: type: string - payTo: + payment: + x-kubernetes-preserve-unknown-fields: true + signature: type: string - price: + to: type: string - description: "Atomic token units per request" - asset: + validAfter: type: string - description: "ERC-20 contract address" - assetSymbol: + validBefore: type: string - description: "Human-friendly token symbol (e.g. USDC, OBOL)" - assetDecimals: - type: integer - description: "Token decimals in atomic units" - assetTransferMethod: + value: type: string - description: "x402 transfer method used for this asset" - eip712Name: + type: object + type: array + required: + - count + - endpoint + - model + - payment + type: object + status: + properties: + conditions: + items: + properties: + lastTransitionTime: + description: Last time the condition transitioned. + format: date-time + type: string + message: + description: Human-readable message with details. type: string - description: "EIP-712 domain name used for signing" - eip712Version: + reason: + description: Machine-readable reason for the condition. 
type: string - description: "EIP-712 domain version used for signing" - status: - type: object - properties: - observedGeneration: - type: integer - format: int64 - conditions: - type: array - items: - type: object - properties: - type: - type: string - status: - type: string - reason: - type: string - message: - type: string - lastTransitionTime: - type: string - format: date-time - publicModel: - type: string - description: "LiteLLM model name (paid/)" - remaining: - type: integer - spent: - type: integer - totalSigned: - type: integer - totalSpent: - type: string - probedAt: - type: string - format: date-time - probedPrice: - type: string - walletBalance: - type: string - signerAddress: - type: string + status: + description: Status of the condition. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: Condition type. + type: string + required: + - status + - type + type: object + type: array + observedGeneration: + format: int64 + type: integer + probedAt: + format: date-time + type: string + probedPrice: + type: string + publicModel: + description: LiteLLM model name (paid/). + type: string + remaining: + type: integer + signerAddress: + type: string + spent: + type: integer + totalSigned: + type: integer + totalSpent: + type: string + walletBalance: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml b/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml index b6266db2..8ac1a00c 100644 --- a/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml +++ b/internal/embed/infrastructure/base/templates/registrationrequest-crd.yaml @@ -1,10 +1,9 @@ --- -# RegistrationRequest CRD -# Isolates ERC-8004 publication and on-chain side effects from the main -# ServiceOffer reconciliation loop. ServiceOffer remains the source of truth. 
apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: registrationrequests.obol.org spec: group: obol.org @@ -12,74 +11,95 @@ spec: kind: RegistrationRequest listKind: RegistrationRequestList plural: registrationrequests - singular: registrationrequest shortNames: - - rr + - rr + singular: registrationrequest scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Offer - type: string - jsonPath: .spec.serviceOfferName - - name: State - type: string - jsonPath: .spec.desiredState - - name: Phase - type: string - jsonPath: .status.phase - - name: AgentID - type: string - jsonPath: .status.agentId - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - properties: - spec: - type: object - required: - - serviceOfferName - - serviceOfferNamespace - - desiredState - properties: - serviceOfferName: - type: string - serviceOfferNamespace: - type: string - desiredState: - type: string - enum: - - Active - - Tombstoned - chain: - type: string - description: "ERC-8004 registration chain alias for this request." 
- status: - type: object - properties: - phase: - type: string - message: - type: string - publishedUrl: - type: string - agentId: - type: string - registrationTxHash: - type: string - registrationOwner: - type: string - registrationUri: - type: string - registrationSearchFromBlock: - type: integer - format: int64 - metadataSynced: - type: boolean + - additionalPrinterColumns: + - jsonPath: .spec.serviceOfferName + name: Offer + type: string + - jsonPath: .spec.desiredState + name: State + type: string + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .status.agentId + name: AgentID + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + RegistrationRequest isolates ERC-8004 publication and on-chain side + effects from the main ServiceOffer reconciliation loop. ServiceOffer + remains the source of truth. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + chain: + description: ERC-8004 registration chain alias for this request. 
+ type: string + desiredState: + enum: + - Active + - Tombstoned + type: string + serviceOfferName: + type: string + serviceOfferNamespace: + type: string + required: + - desiredState + - serviceOfferName + - serviceOfferNamespace + type: object + status: + properties: + agentId: + type: string + message: + type: string + metadataSynced: + type: boolean + phase: + type: string + publishedUrl: + type: string + registrationOwner: + type: string + registrationSearchFromBlock: + format: int64 + type: integer + registrationTxHash: + type: string + registrationUri: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml b/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml index 5b37ba23..9bac5643 100644 --- a/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml +++ b/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml @@ -1,15 +1,9 @@ --- -# ServiceOffer CRD -# Defines a compute service the agent can expose, gate with x402, and register on-chain. 
-# Condition lifecycle: ModelReady -> UpstreamHealthy -> PaymentGateReady -> RoutePublished -> Registered -> Ready -# -# Field naming conventions: -# - payment.* fields align with x402 PaymentRequirements (V2): payTo, network, scheme, maxTimeoutSeconds -# - registration.* fields align with ERC-8004 AgentRegistration: name, description, services, supportedTrust -# - Human-friendly values (e.g., "base-sepolia") are used; the reconciler translates to wire format apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 name: serviceoffers.obol.org spec: group: obol.org @@ -17,354 +11,355 @@ spec: kind: ServiceOffer listKind: ServiceOfferList plural: serviceoffers - singular: serviceoffer shortNames: - - so + - so + singular: serviceoffer scope: Namespaced versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Type - type: string - jsonPath: .spec.type - - name: Model - type: string - jsonPath: .spec.model.name - - name: Price - type: string - jsonPath: .spec.payment.price.perRequest - - name: Network - type: string - jsonPath: .spec.payment.network - - name: Ready - type: string - jsonPath: .status.conditions[?(@.type=="Ready")].status - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - schema: - openAPIV3Schema: - type: object - description: >- - ServiceOffer declares a compute service that can be exposed publicly, - gated with x402 payments, and optionally registered on an ERC-8004 - service registry. Field names align with x402 and ERC-8004 standards. - properties: - spec: - type: object - required: - - payment - # upstream is required for type=http|inference|fine-tuning but - # synthesized by the controller from Agent.status.endpoint when - # type=agent. Validation of "upstream OR agent.ref" lives in - # the controller's runtime check. 
- properties: - type: - type: string - description: >- - Service type. 'inference' enables model management; 'http' for any HTTP service; - 'agent' references an Agent CR via spec.agent.ref and the controller derives - upstream + model + skills from the agent's status. - default: "http" - enum: - - inference - - fine-tuning - - http - - agent - agent: - type: object - description: >- - Required when type='agent'. The controller resolves spec.agent.ref to the - referenced Agent CR, derives upstream from Agent.status.endpoint, and surfaces - the agent's pinned model + skills in the 402 response's extra block. - properties: - ref: - type: object - required: - - name - - namespace - properties: - name: - type: string - namespace: - type: string - model: - type: object - description: "LLM model metadata. Required when the upstream serves an LLM." - required: + - additionalPrinterColumns: + - jsonPath: .spec.type + name: Type + type: string + - jsonPath: .spec.model.name + name: Model + type: string + - jsonPath: .spec.payment.price.perRequest + name: Price + type: string + - jsonPath: .spec.payment.network + name: Network + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ServiceOffer declares a compute service that can be exposed publicly, + gated with x402 payments, and optionally registered on an ERC-8004 + service registry. Field names align with x402 and ERC-8004 standards. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + agent: + description: |- + Required when type='agent'. The controller resolves spec.agent.ref to + the referenced Agent CR, derives upstream from Agent.status.endpoint, + and surfaces the agent's pinned model + skills in the 402 response. + properties: + ref: + properties: + name: + type: string + namespace: + type: string + required: - name - - runtime - properties: - name: - type: string - description: "Model identifier (e.g. qwen3.5:35b)." - runtime: - type: string - description: "Runtime serving the model." - enum: - - ollama - - vllm - - tgi - upstream: - type: object - description: "In-cluster service that handles the actual workload." - required: - - service - namespace - - port - properties: - service: - type: string - description: "Kubernetes Service name." - namespace: - type: string - description: "Namespace of the upstream Service." - port: - type: integer - description: "Port on the upstream Service." - default: 11434 - minimum: 1 - maximum: 65535 - healthPath: - type: string - description: "HTTP path used for health probes against the upstream." - default: "/health" - payment: - type: object - description: >- - x402 payment terms. Field names align with x402 PaymentRequirements (V2): - payTo, network, scheme, maxTimeoutSeconds. - required: - - network - - payTo - - price - properties: - scheme: - type: string - description: "x402 payment scheme." 
- default: "exact" - enum: - - exact - network: + type: object + type: object + model: + description: LLM model metadata. Required when the upstream serves + an LLM. + properties: + name: + description: Model identifier (e.g. qwen3.5:35b). + type: string + runtime: + description: Runtime serving the model. + enum: + - ollama + - vllm + - tgi + type: string + required: + - name + - runtime + type: object + path: + description: URL path prefix for the HTTPRoute, defaults to /services/. + pattern: ^/[a-zA-Z0-9/_.-]*$ + type: string + payment: + properties: + asset: + description: |- + Optional token metadata override for x402 settlement. When omitted, + the verifier uses the chain default asset. + properties: + address: + description: ERC-20 contract address. + pattern: ^0x[0-9a-fA-F]{40}$ + type: string + decimals: + description: Token decimals in atomic units. + format: int64 + maximum: 255 + minimum: 0 + type: integer + eip712Name: + description: EIP-712 domain name used by the token. + type: string + eip712Version: + description: EIP-712 domain version used by the token. + type: string + symbol: + description: Human-friendly token symbol (e.g. USDC, OBOL). + type: string + transferMethod: + description: x402 transfer method for the asset. + enum: + - eip3009 + - permit2 + type: string + type: object + maxTimeoutSeconds: + default: 300 + description: 'Payment validity window in seconds (x402: maxTimeoutSeconds).' + format: int64 + type: integer + network: + description: |- + Chain identifier for payments (human-friendly). Reconciler resolves + to CAIP-2 format (e.g., "base-sepolia" → "eip155:84532"). + type: string + payTo: + description: 'USDC recipient wallet address (x402: payTo).' + pattern: ^0x[0-9a-fA-F]{40}$ + type: string + price: + description: |- + Pricing table with per-unit prices in USDC (human-readable decimals). + Which fields are applicable depends on the workload type. + properties: + perEpoch: + description: Per-training-epoch price in USDC. 
Fine-tuning + only. + type: string + perHour: + description: Per-compute-hour price in USDC. Fine-tuning only. + type: string + perMTok: + description: Per-million-tokens price in USDC. Inference only. + type: string + perRequest: + description: Flat per-request price in USDC. Applicable to + all types. + type: string + type: object + scheme: + default: exact + description: x402 payment scheme. + enum: + - exact + type: string + required: + - network + - payTo + - price + type: object + provenance: + additionalProperties: + type: string + description: |- + Optional provenance metadata for the service. Tracks how the model or + service was produced (e.g. autoresearch experiment data). Included in + the ERC-8004 registration document when present. + type: object + registration: + description: |- + ERC-8004 registration metadata. Field names align with the + AgentRegistration document schema (ERC-8004 spec). + properties: + description: + description: 'Agent description (ERC-8004: AgentRegistration.description).' + type: string + domains: + description: |- + OASF domains for discovery (e.g. technology/artificial_intelligence). + Mapped to an OASF service entry in the registration JSON. + items: type: string - description: >- - Chain identifier for payments (human-friendly). - Reconciler resolves to CAIP-2 format (e.g., "base-sepolia" → "eip155:84532"). - payTo: + type: array + enabled: + default: false + description: If true, register on ERC-8004 after routing is live. + type: boolean + image: + description: 'Agent icon URL (ERC-8004: AgentRegistration.image).' + type: string + metadata: + additionalProperties: type: string - description: "USDC recipient wallet address (x402: payTo)." - pattern: "^0x[0-9a-fA-F]{40}$" - maxTimeoutSeconds: - type: integer - description: "Payment validity window in seconds (x402: maxTimeoutSeconds)." - default: 300 - asset: - type: object - description: >- - Optional token metadata override for x402 settlement. 
- When omitted, the verifier uses the chain default asset. + description: |- + Additional registration metadata published into the generated + agent-registration.json for discovery and ranking. + type: object + name: + description: 'Agent name (ERC-8004: AgentRegistration.name).' + type: string + services: + description: 'Service endpoints (ERC-8004: AgentRegistration.services[]).' + items: properties: - address: - type: string - description: "ERC-20 contract address." - pattern: "^0x[0-9a-fA-F]{40}$" - symbol: + endpoint: + description: Service URL. Auto-filled from tunnel URL if + empty. type: string - description: "Human-friendly token symbol (e.g. USDC, OBOL)." - decimals: - type: integer - description: "Token decimals in atomic units." - minimum: 0 - maximum: 255 - transferMethod: - type: string - description: "x402 transfer method for the asset." - enum: - - eip3009 - - permit2 - eip712Name: + name: + description: 'Service type: web, A2A, MCP, OASF, ENS, DID, + email.' type: string - description: "EIP-712 domain name used by the token." - eip712Version: + version: + description: Protocol version (SHOULD per ERC-8004 spec). type: string - description: "EIP-712 domain version used by the token." - price: + required: + - endpoint + - name type: object - description: >- - Pricing table with per-unit prices in USDC (human-readable decimals). - Which fields are applicable depends on the workload type. - properties: - perRequest: - type: string - description: "Flat per-request price in USDC. Applicable to all types." - perMTok: - type: string - description: "Per-million-tokens price in USDC. Inference only." - perHour: - type: string - description: "Per-compute-hour price in USDC. Fine-tuning only." - perEpoch: - type: string - description: "Per-training-epoch price in USDC. Fine-tuning only." - provenance: - type: object - description: >- - Optional provenance metadata for the service. Tracks how the - model or service was produced (e.g. 
autoresearch experiment data). - Included in the ERC-8004 registration document when present. - properties: - framework: - type: string - description: "Optimization framework (e.g. autoresearch)." - metricName: - type: string - description: "Name of the primary quality metric (e.g. val_bpb)." - metricValue: + type: array + skills: + description: |- + OASF skills for discovery (e.g. + natural_language_processing/text_generation). Mapped to an OASF + service entry in the registration JSON. + items: type: string - description: "Primary quality metric value (e.g. 0.9973)." - experimentId: + type: array + supportedTrust: + description: |- + Trust verification methods (ERC-8004: AgentRegistration.supportedTrust[]). + Valid values: reputation, crypto-economic, tee-attestation. + items: type: string - description: "Experiment or commit identifier." - trainHash: + type: array + type: object + type: + default: http + description: |- + Service type. 'inference' enables model management; 'http' for any HTTP + service; 'agent' references an Agent CR via spec.agent.ref and the + controller derives upstream + model + skills from the agent's status. + enum: + - inference + - fine-tuning + - http + - agent + type: string + upstream: + description: In-cluster service that handles the actual workload. + properties: + healthPath: + default: /health + description: HTTP path used for health probes against the upstream. + type: string + namespace: + description: Namespace of the upstream Service. + type: string + port: + default: 11434 + description: Port on the upstream Service. + format: int64 + maximum: 65535 + minimum: 1 + type: integer + service: + description: Kubernetes Service name. + type: string + required: + - namespace + - port + - service + type: object + required: + - payment + type: object + status: + properties: + agentId: + description: ERC-8004 agent NFT token ID after on-chain registration. 
+ type: string + agentResolution: + description: |- + Controller's resolved view of an agent-type offer's referenced Agent. + Populated only when type=agent and the Agent is Ready. + properties: + endpoint: + type: string + model: + type: string + runtime: + type: string + skills: + items: type: string - description: "SHA-256 hash of the training code that produced this model." - paramCount: - type: string - description: "Model parameter count (e.g. 50M, 1.3B)." - path: - type: string - description: "URL path prefix for the HTTPRoute, defaults to /services/." - pattern: "^/[a-zA-Z0-9/_.-]*$" - registration: - type: object - description: >- - ERC-8004 registration metadata. Field names align with the - AgentRegistration document schema (ERC-8004 spec). + type: array + type: object + conditions: + description: |- + Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, + RoutePublished, Registered, Ready. + items: properties: - enabled: - type: boolean - description: "If true, register on ERC-8004 after routing is live." - default: false - name: + lastTransitionTime: + description: Last time the condition transitioned. + format: date-time type: string - description: "Agent name (ERC-8004: AgentRegistration.name)." - description: + message: + description: Human-readable message with details. type: string - description: "Agent description (ERC-8004: AgentRegistration.description)." - image: + reason: + description: Machine-readable reason for the condition. type: string - description: "Agent icon URL (ERC-8004: AgentRegistration.image)." - services: - type: array - description: "Service endpoints (ERC-8004: AgentRegistration.services[])." - items: - type: object - required: - - name - - endpoint - properties: - name: - type: string - description: "Service type: web, A2A, MCP, OASF, ENS, DID, email." - endpoint: - type: string - description: "Service URL. Auto-filled from tunnel URL if empty." 
- version: - type: string - description: "Protocol version (SHOULD per ERC-8004 spec)." - skills: - type: array - description: >- - OASF skills for discovery (e.g. natural_language_processing/text_generation). - Mapped to an OASF service entry in the registration JSON. - items: - type: string - domains: - type: array - description: >- - OASF domains for discovery (e.g. technology/artificial_intelligence). - Mapped to an OASF service entry in the registration JSON. - items: - type: string - supportedTrust: - type: array - description: >- - Trust verification methods (ERC-8004: AgentRegistration.supportedTrust[]). - Valid values: reputation, crypto-economic, tee-attestation. - items: - type: string - metadata: - type: object - description: >- - Additional registration metadata published into the generated - agent-registration.json for discovery and ranking (for example: - gpu, framework, best_val_bpb, total_experiments). - additionalProperties: - type: string - status: - type: object - properties: - conditions: - type: array - description: >- - Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, - RoutePublished, Registered, Ready. - items: - type: object - required: - - type - - status - properties: - type: - type: string - description: "Condition type." - status: - type: string - description: "Status of the condition." - enum: - - "True" - - "False" - - "Unknown" - reason: - type: string - description: "Machine-readable reason for the condition." - message: - type: string - description: "Human-readable message with details." - lastTransitionTime: - type: string - format: date-time - description: "Last time the condition transitioned." - endpoint: - type: string - description: "The public endpoint URL once the route is published." - agentId: - type: string - description: "ERC-8004 agent NFT token ID after on-chain registration." - registrationTxHash: - type: string - description: "Transaction hash of the ERC-8004 registration." 
- observedGeneration: - type: integer - format: int64 - description: "The generation observed by the controller." - agentResolution: - type: object - description: >- - Controller's resolved view of an agent-type offer's referenced - Agent. Populated only when type=agent and the Agent is Ready. - properties: - model: - type: string - skills: - type: array - items: - type: string - runtime: + status: + description: Status of the condition. + enum: + - "True" + - "False" + - Unknown type: string - endpoint: + type: + description: Condition type. type: string + required: + - status + - type + type: object + type: array + endpoint: + description: The public endpoint URL once the route is published. + type: string + observedGeneration: + description: The generation observed by the controller. + format: int64 + type: integer + registrationTxHash: + description: Transaction hash of the ERC-8004 registration. + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/monetizeapi/deepcopy_manual.go b/internal/monetizeapi/deepcopy_manual.go new file mode 100644 index 00000000..89636588 --- /dev/null +++ b/internal/monetizeapi/deepcopy_manual.go @@ -0,0 +1,58 @@ +package monetizeapi + +// PreSignedAuth deep-copy is hand-written because its Payment field is +// an opaque map[string]interface{} (controller-gen can't deep-copy +// untyped JSON). The type is excluded from generation via the +// object-generate=false marker in types.go. + +// DeepCopyInto copies the receiver into out. The Payment map is +// deep-copied via deepCopyJSONMap: nested JSON maps and slices are +// cloned recursively and any other value (JSON scalars) is shared +// as-is, which satisfies the controller's deep-copy contract (no +// pointer aliasing into caller-owned mutable structures).
+func (in *PreSignedAuth) DeepCopyInto(out *PreSignedAuth) { + *out = *in + if in.Payment != nil { + out.Payment = deepCopyJSONMap(in.Payment) + } +} + +// DeepCopy returns a deep copy of the receiver. +func (in *PreSignedAuth) DeepCopy() *PreSignedAuth { + if in == nil { + return nil + } + out := new(PreSignedAuth) + in.DeepCopyInto(out) + return out +} + +// deepCopyJSONMap walks an opaque JSON-decoded map[string]interface{} +// tree and returns a structurally identical copy. Handles the nested +// shapes the x402 PaymentPayload uses (object, array, scalar). +func deepCopyJSONMap(in map[string]interface{}) map[string]interface{} { + if in == nil { + return nil + } + out := make(map[string]interface{}, len(in)) + for k, v := range in { + out[k] = deepCopyJSONValue(v) + } + return out +} + +func deepCopyJSONValue(v interface{}) interface{} { + switch t := v.(type) { + case map[string]interface{}: + return deepCopyJSONMap(t) + case []interface{}: + out := make([]interface{}, len(t)) + for i, item := range t { + out[i] = deepCopyJSONValue(item) + } + return out + default: + // Strings, numbers, bools, nil — value types, safe to share. + return v + } +} diff --git a/internal/monetizeapi/doc.go b/internal/monetizeapi/doc.go new file mode 100644 index 00000000..63ab340c --- /dev/null +++ b/internal/monetizeapi/doc.go @@ -0,0 +1,16 @@ +// Package monetizeapi defines the Custom Resource Definitions for the +// Obol Stack monetize subsystem. +// +// The Go types in this package are the single source of truth for the +// CRD OpenAPI schemas embedded under +// internal/embed/infrastructure/base/templates/*-crd.yaml. +// +// Edit a field or marker here, then run `just generate` to regenerate +// the CRD YAML manifests + zz_generated_deepcopy.go from kubebuilder +// markers. CI fails if the working tree is dirty after that command +// runs (see .github/workflows/lint-test.yaml::generate-check). 
+// +// +kubebuilder:object:generate=true +// +groupName=obol.org +// +versionName=v1alpha1 +package monetizeapi diff --git a/internal/monetizeapi/types.go b/internal/monetizeapi/types.go index 6e905eee..ac104b7c 100644 --- a/internal/monetizeapi/types.go +++ b/internal/monetizeapi/types.go @@ -61,6 +61,21 @@ var ( PVCGVR = schema.GroupVersionResource{Group: "", Version: "v1", Resource: "persistentvolumeclaims"} ) +// ── ServiceOffer ──────────────────────────────────────────────────────────── + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=so +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model.name` +// +kubebuilder:printcolumn:name="Price",type=string,JSONPath=`.spec.payment.price.perRequest` +// +kubebuilder:printcolumn:name="Network",type=string,JSONPath=`.spec.payment.network` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// ServiceOffer declares a compute service that can be exposed publicly, +// gated with x402 payments, and optionally registered on an ERC-8004 +// service registry. Field names align with x402 and ERC-8004 standards. type ServiceOffer struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -68,14 +83,48 @@ type ServiceOffer struct { Status ServiceOfferStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// ServiceOfferList is the list form for kubectl/list operations. 
+type ServiceOfferList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ServiceOffer `json:"items"` +} + type ServiceOfferSpec struct { - Type string `json:"type,omitempty"` - Agent ServiceOfferAgent `json:"agent,omitempty"` - Model ServiceOfferModel `json:"model,omitempty"` - Upstream ServiceOfferUpstream `json:"upstream,omitempty"` - Payment ServiceOfferPayment `json:"payment,omitempty"` - Path string `json:"path,omitempty"` - Provenance map[string]string `json:"provenance,omitempty"` + // Service type. 'inference' enables model management; 'http' for any HTTP + // service; 'agent' references an Agent CR via spec.agent.ref and the + // controller derives upstream + model + skills from the agent's status. + // +kubebuilder:default="http" + // +kubebuilder:validation:Enum=inference;fine-tuning;http;agent + Type string `json:"type,omitempty"` + + // Required when type='agent'. The controller resolves spec.agent.ref to + // the referenced Agent CR, derives upstream from Agent.status.endpoint, + // and surfaces the agent's pinned model + skills in the 402 response. + Agent ServiceOfferAgent `json:"agent,omitempty"` + + // LLM model metadata. Required when the upstream serves an LLM. + Model ServiceOfferModel `json:"model,omitempty"` + + // In-cluster service that handles the actual workload. + Upstream ServiceOfferUpstream `json:"upstream,omitempty"` + + // +kubebuilder:validation:Required + Payment ServiceOfferPayment `json:"payment"` + + // URL path prefix for the HTTPRoute, defaults to /services/. + // +kubebuilder:validation:Pattern=`^/[a-zA-Z0-9/_.-]*$` + Path string `json:"path,omitempty"` + + // Optional provenance metadata for the service. Tracks how the model or + // service was produced (e.g. autoresearch experiment data). Included in + // the ERC-8004 registration document when present. + Provenance map[string]string `json:"provenance,omitempty"` + + // ERC-8004 registration metadata. 
Field names align with the + // AgentRegistration document schema (ERC-8004 spec). Registration ServiceOfferRegistration `json:"registration,omitempty"` } @@ -88,72 +137,148 @@ type ServiceOfferAgent struct { } type ServiceOfferAgentRef struct { - Name string `json:"name,omitempty"` - Namespace string `json:"namespace,omitempty"` + // +kubebuilder:validation:Required + Name string `json:"name"` + // +kubebuilder:validation:Required + Namespace string `json:"namespace"` } type ServiceOfferModel struct { - Name string `json:"name,omitempty"` - Runtime string `json:"runtime,omitempty"` + // Model identifier (e.g. qwen3.5:35b). + // +kubebuilder:validation:Required + Name string `json:"name"` + // Runtime serving the model. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=ollama;vllm;tgi + Runtime string `json:"runtime"` } type ServiceOfferUpstream struct { - Service string `json:"service,omitempty"` - Namespace string `json:"namespace,omitempty"` - Port int64 `json:"port,omitempty"` + // Kubernetes Service name. + // +kubebuilder:validation:Required + Service string `json:"service"` + // Namespace of the upstream Service. + // +kubebuilder:validation:Required + Namespace string `json:"namespace"` + // Port on the upstream Service. + // +kubebuilder:validation:Required + // +kubebuilder:default=11434 + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=65535 + Port int64 `json:"port"` + // HTTP path used for health probes against the upstream. + // +kubebuilder:default="/health" HealthPath string `json:"healthPath,omitempty"` } type ServiceOfferPayment struct { - Scheme string `json:"scheme,omitempty"` - Network string `json:"network,omitempty"` - PayTo string `json:"payTo,omitempty"` - MaxTimeoutSeconds int64 `json:"maxTimeoutSeconds,omitempty"` - Asset ServiceOfferAsset `json:"asset,omitempty"` - Price ServiceOfferPriceTable `json:"price,omitempty"` + // x402 payment scheme. 
+ // +kubebuilder:default="exact" + // +kubebuilder:validation:Enum=exact + Scheme string `json:"scheme,omitempty"` + // Chain identifier for payments (human-friendly). Reconciler resolves + // to CAIP-2 format (e.g., "base-sepolia" → "eip155:84532"). + // +kubebuilder:validation:Required + Network string `json:"network"` + // USDC recipient wallet address (x402: payTo). + // +kubebuilder:validation:Required + // +kubebuilder:validation:Pattern=`^0x[0-9a-fA-F]{40}$` + PayTo string `json:"payTo"` + // Payment validity window in seconds (x402: maxTimeoutSeconds). + // +kubebuilder:default=300 + MaxTimeoutSeconds int64 `json:"maxTimeoutSeconds,omitempty"` + // Optional token metadata override for x402 settlement. When omitted, + // the verifier uses the chain default asset. + Asset ServiceOfferAsset `json:"asset,omitempty"` + // Pricing table with per-unit prices in USDC (human-readable decimals). + // Which fields are applicable depends on the workload type. + // +kubebuilder:validation:Required + Price ServiceOfferPriceTable `json:"price"` } type ServiceOfferAsset struct { - Address string `json:"address,omitempty"` - Symbol string `json:"symbol,omitempty"` - Decimals int64 `json:"decimals,omitempty"` + // ERC-20 contract address. + // +kubebuilder:validation:Pattern=`^0x[0-9a-fA-F]{40}$` + Address string `json:"address,omitempty"` + // Human-friendly token symbol (e.g. USDC, OBOL). + Symbol string `json:"symbol,omitempty"` + // Token decimals in atomic units. + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Maximum=255 + Decimals int64 `json:"decimals,omitempty"` + // x402 transfer method for the asset. + // +kubebuilder:validation:Enum=eip3009;permit2 TransferMethod string `json:"transferMethod,omitempty"` - EIP712Name string `json:"eip712Name,omitempty"` - EIP712Version string `json:"eip712Version,omitempty"` + // EIP-712 domain name used by the token. 
+ EIP712Name string `json:"eip712Name,omitempty"` + // EIP-712 domain version used by the token. + EIP712Version string `json:"eip712Version,omitempty"` } type ServiceOfferPriceTable struct { + // Flat per-request price in USDC. Applicable to all types. PerRequest string `json:"perRequest,omitempty"` - PerMTok string `json:"perMTok,omitempty"` - PerHour string `json:"perHour,omitempty"` - PerEpoch string `json:"perEpoch,omitempty"` + // Per-million-tokens price in USDC. Inference only. + PerMTok string `json:"perMTok,omitempty"` + // Per-compute-hour price in USDC. Fine-tuning only. + PerHour string `json:"perHour,omitempty"` + // Per-training-epoch price in USDC. Fine-tuning only. + PerEpoch string `json:"perEpoch,omitempty"` } type ServiceOfferRegistration struct { - Enabled bool `json:"enabled,omitempty"` - Name string `json:"name,omitempty"` - Description string `json:"description,omitempty"` - Image string `json:"image,omitempty"` - Services []ServiceOfferService `json:"services,omitempty"` - SupportedTrust []string `json:"supportedTrust,omitempty"` - Skills []string `json:"skills,omitempty"` - Domains []string `json:"domains,omitempty"` - Metadata map[string]string `json:"metadata,omitempty"` + // If true, register on ERC-8004 after routing is live. + // +kubebuilder:default=false + Enabled bool `json:"enabled,omitempty"` + // Agent name (ERC-8004: AgentRegistration.name). + Name string `json:"name,omitempty"` + // Agent description (ERC-8004: AgentRegistration.description). + Description string `json:"description,omitempty"` + // Agent icon URL (ERC-8004: AgentRegistration.image). + Image string `json:"image,omitempty"` + // Service endpoints (ERC-8004: AgentRegistration.services[]). + Services []ServiceOfferService `json:"services,omitempty"` + // Trust verification methods (ERC-8004: AgentRegistration.supportedTrust[]). + // Valid values: reputation, crypto-economic, tee-attestation. 
+ SupportedTrust []string `json:"supportedTrust,omitempty"` + // OASF skills for discovery (e.g. + // natural_language_processing/text_generation). Mapped to an OASF + // service entry in the registration JSON. + Skills []string `json:"skills,omitempty"` + // OASF domains for discovery (e.g. technology/artificial_intelligence). + // Mapped to an OASF service entry in the registration JSON. + Domains []string `json:"domains,omitempty"` + // Additional registration metadata published into the generated + // agent-registration.json for discovery and ranking. + Metadata map[string]string `json:"metadata,omitempty"` } type ServiceOfferService struct { - Name string `json:"name,omitempty"` - Endpoint string `json:"endpoint,omitempty"` - Version string `json:"version,omitempty"` + // Service type: web, A2A, MCP, OASF, ENS, DID, email. + // +kubebuilder:validation:Required + Name string `json:"name"` + // Service URL. Auto-filled from tunnel URL if empty. + // +kubebuilder:validation:Required + Endpoint string `json:"endpoint"` + // Protocol version (SHOULD per ERC-8004 spec). + Version string `json:"version,omitempty"` } type ServiceOfferStatus struct { - Conditions []Condition `json:"conditions,omitempty"` - Endpoint string `json:"endpoint,omitempty"` - AgentID string `json:"agentId,omitempty"` - RegistrationTxHash string `json:"registrationTxHash,omitempty"` - ObservedGeneration int64 `json:"observedGeneration,omitempty"` - AgentResolution *ServiceOfferAgentResolution `json:"agentResolution,omitempty"` + // Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, + // RoutePublished, Registered, Ready. + Conditions []Condition `json:"conditions,omitempty"` + // The public endpoint URL once the route is published. + Endpoint string `json:"endpoint,omitempty"` + // ERC-8004 agent NFT token ID after on-chain registration. + AgentID string `json:"agentId,omitempty"` + // Transaction hash of the ERC-8004 registration. 
+ RegistrationTxHash string `json:"registrationTxHash,omitempty"` + // The generation observed by the controller. + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Controller's resolved view of an agent-type offer's referenced Agent. + // Populated only when type=agent and the Agent is Ready. + AgentResolution *ServiceOfferAgentResolution `json:"agentResolution,omitempty"` } // ServiceOfferAgentResolution is the controller's resolved view of an @@ -169,13 +294,35 @@ type ServiceOfferAgentResolution struct { } type Condition struct { - Type string `json:"type"` - Status string `json:"status"` - Reason string `json:"reason,omitempty"` - Message string `json:"message,omitempty"` + // Condition type. + // +kubebuilder:validation:Required + Type string `json:"type"` + // Status of the condition. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=True;False;Unknown + Status string `json:"status"` + // Machine-readable reason for the condition. + Reason string `json:"reason,omitempty"` + // Human-readable message with details. + Message string `json:"message,omitempty"` + // Last time the condition transitioned. 
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` } +// ── RegistrationRequest ───────────────────────────────────────────────────── + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=rr +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Offer",type=string,JSONPath=`.spec.serviceOfferName` +// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.spec.desiredState` +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="AgentID",type=string,JSONPath=`.status.agentId` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// RegistrationRequest isolates ERC-8004 publication and on-chain side +// effects from the main ServiceOffer reconciliation loop. ServiceOffer +// remains the source of truth. type RegistrationRequest struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -183,11 +330,25 @@ type RegistrationRequest struct { Status RegistrationRequestStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// RegistrationRequestList is the list form for kubectl/list operations. 
+type RegistrationRequestList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []RegistrationRequest `json:"items"` +} + type RegistrationRequestSpec struct { - ServiceOfferName string `json:"serviceOfferName,omitempty"` - ServiceOfferNamespace string `json:"serviceOfferNamespace,omitempty"` - DesiredState string `json:"desiredState,omitempty"` - Chain string `json:"chain,omitempty"` + // +kubebuilder:validation:Required + ServiceOfferName string `json:"serviceOfferName"` + // +kubebuilder:validation:Required + ServiceOfferNamespace string `json:"serviceOfferNamespace"` + // +kubebuilder:validation:Required + // +kubebuilder:validation:Enum=Active;Tombstoned + DesiredState string `json:"desiredState"` + // ERC-8004 registration chain alias for this request. + Chain string `json:"chain,omitempty"` } type RegistrationRequestStatus struct { @@ -247,6 +408,19 @@ func (o *ServiceOffer) IsPaused() bool { // ── PurchaseRequest ───────────────────────────────────────────────────────── +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=pr +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.endpoint` +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model` +// +kubebuilder:printcolumn:name="Price",type=string,JSONPath=`.spec.payment.price` +// +kubebuilder:printcolumn:name="Remaining",type=integer,JSONPath=`.status.remaining` +// +kubebuilder:printcolumn:name="Spent",type=integer,JSONPath=`.status.spent` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// PurchaseRequest is the buyer-side request for pre-signed x402 auths +// against a remote inference endpoint. 
type PurchaseRequest struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -254,57 +428,110 @@ type PurchaseRequest struct { Status PurchaseRequestStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// PurchaseRequestList is the list form for kubectl/list operations. +type PurchaseRequestList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []PurchaseRequest `json:"items"` +} + type PurchaseRequestSpec struct { - Endpoint string `json:"endpoint"` - Model string `json:"model"` - Count int `json:"count"` + // Full URL to the x402-gated inference endpoint. + // +kubebuilder:validation:Required + Endpoint string `json:"endpoint"` + // Remote model ID (used as paid/ in LiteLLM). + // +kubebuilder:validation:Required + Model string `json:"model"` + // Number of pre-signed auths to create. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=2500 + Count int `json:"count"` + // Pre-signed x402 payments (legacy ERC-3009 auths still supported). PreSignedAuths []PreSignedAuth `json:"preSignedAuths,omitempty"` AutoRefill PurchaseAutoRefill `json:"autoRefill,omitempty"` - Payment PurchasePayment `json:"payment"` + // +kubebuilder:validation:Required + Payment PurchasePayment `json:"payment"` } +// +kubebuilder:object:generate=false + +// PreSignedAuth carries a pre-signed x402 payment authorization. The +// Payment map is opaque (forwarded verbatim to the buyer sidecar / x402 +// facilitator) which can't be deep-copied by controller-gen; DeepCopy +// methods for this type are hand-written in deepcopy_manual.go. 
type PreSignedAuth struct { - ID string `json:"id,omitempty"` + ID string `json:"id,omitempty"` + // +kubebuilder:pruning:PreserveUnknownFields + // +kubebuilder:validation:Schemaless Payment map[string]interface{} `json:"payment,omitempty"` - Signature string `json:"signature"` - From string `json:"from"` - To string `json:"to"` - Value string `json:"value"` - ValidAfter string `json:"validAfter"` - ValidBefore string `json:"validBefore"` - Nonce string `json:"nonce"` -} - + Signature string `json:"signature,omitempty"` + From string `json:"from,omitempty"` + To string `json:"to,omitempty"` + Value string `json:"value,omitempty"` + ValidAfter string `json:"validAfter,omitempty"` + ValidBefore string `json:"validBefore,omitempty"` + Nonce string `json:"nonce,omitempty"` +} + +// PurchaseAutoRefill drives the agent-managed auto-refill policy for a +// PurchaseRequest. The reconciler reads MaxTotal + MaxSpendPerDay as +// budget caps before signing more auths; without these fields populated +// the agent will not auto-refill beyond the initial Count. type PurchaseAutoRefill struct { - Enabled bool `json:"enabled,omitempty"` - Threshold int `json:"threshold,omitempty"` - Count int `json:"count,omitempty"` + // +kubebuilder:default=false + Enabled bool `json:"enabled,omitempty"` + // Refill when remaining < threshold. + // +kubebuilder:validation:Minimum=0 + Threshold int `json:"threshold,omitempty"` + // Number of auths to sign on refill. + // +kubebuilder:validation:Minimum=1 + Count int `json:"count,omitempty"` + // Cap total auths ever signed. + MaxTotal int `json:"maxTotal,omitempty"` + // Max micro-USDC spend per day. 
+ MaxSpendPerDay string `json:"maxSpendPerDay,omitempty"` } type PurchasePayment struct { - Network string `json:"network"` - PayTo string `json:"payTo"` - Price string `json:"price"` - Asset string `json:"asset"` - AssetSymbol string `json:"assetSymbol,omitempty"` - AssetDecimals int64 `json:"assetDecimals,omitempty"` + // +kubebuilder:validation:Required + Network string `json:"network"` + // +kubebuilder:validation:Required + PayTo string `json:"payTo"` + // Atomic token units per request. + // +kubebuilder:validation:Required + Price string `json:"price"` + // ERC-20 contract address. + // +kubebuilder:validation:Required + Asset string `json:"asset"` + // Human-friendly token symbol (e.g. USDC, OBOL). + AssetSymbol string `json:"assetSymbol,omitempty"` + // Token decimals in atomic units. + AssetDecimals int64 `json:"assetDecimals,omitempty"` + // x402 transfer method used for this asset. AssetTransferMethod string `json:"assetTransferMethod,omitempty"` - EIP712Name string `json:"eip712Name,omitempty"` - EIP712Version string `json:"eip712Version,omitempty"` + // EIP-712 domain name used for signing. + EIP712Name string `json:"eip712Name,omitempty"` + // EIP-712 domain version used for signing. + EIP712Version string `json:"eip712Version,omitempty"` } type PurchaseRequestStatus struct { ObservedGeneration int64 `json:"observedGeneration,omitempty"` Conditions []Condition `json:"conditions,omitempty"` - PublicModel string `json:"publicModel,omitempty"` - Remaining int `json:"remaining,omitempty"` - Spent int `json:"spent,omitempty"` - TotalSigned int `json:"totalSigned,omitempty"` - TotalSpent string `json:"totalSpent,omitempty"` - ProbedAt string `json:"probedAt,omitempty"` - ProbedPrice string `json:"probedPrice,omitempty"` - WalletBalance string `json:"walletBalance,omitempty"` - SignerAddress string `json:"signerAddress,omitempty"` + // LiteLLM model name (paid/). 
+ PublicModel string `json:"publicModel,omitempty"` + Remaining int `json:"remaining,omitempty"` + Spent int `json:"spent,omitempty"` + TotalSigned int `json:"totalSigned,omitempty"` + TotalSpent string `json:"totalSpent,omitempty"` + // +kubebuilder:validation:Format=date-time + ProbedAt string `json:"probedAt,omitempty"` + ProbedPrice string `json:"probedPrice,omitempty"` + WalletBalance string `json:"walletBalance,omitempty"` + SignerAddress string `json:"signerAddress,omitempty"` } func (pr *PurchaseRequest) EffectiveBuyerNamespace() string { @@ -313,6 +540,21 @@ func (pr *PurchaseRequest) EffectiveBuyerNamespace() string { // ── Agent ─────────────────────────────────────────────────────────────────── +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=ag +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Runtime",type=string,JSONPath=`.spec.runtime` +// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.status.pinnedModel` +// +kubebuilder:printcolumn:name="Wallet",type=string,JSONPath=`.status.walletAddress` +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// Agent is the declarative spec for an Obol Stack agent (Hermes today, +// OpenClaw later). Decouples agent lifecycle from selling: `obol sell +// agent ` references an existing Agent rather than provisioning +// one inline. Internal manager agents with RBAC can also create Agent +// resources to spawn sub-agents. type Agent struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -320,25 +562,58 @@ type Agent struct { Status AgentStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// AgentList is the list form for kubectl/list operations. 
+type AgentList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Agent `json:"items"` +} + type AgentSpec struct { - Runtime string `json:"runtime,omitempty"` - Model string `json:"model,omitempty"` - Skills []string `json:"skills,omitempty"` + // Agent runtime (only hermes today; openclaw planned). + // +kubebuilder:default=hermes + // +kubebuilder:validation:Enum=hermes + Runtime string `json:"runtime,omitempty"` + // LiteLLM model name to pin. Empty = controller picks cluster + // top-of-rank on first deploy and writes status.pinnedModel. + // +kubebuilder:validation:MaxLength=256 + Model string `json:"model,omitempty"` + // Allow-listed skills written to the per-agent skills dir on first + // reconcile. Agent can edit afterwards; this is a seed, not a sandbox. + // +kubebuilder:validation:MaxItems=64 + // +kubebuilder:validation:items:Pattern=`^[a-z0-9][a-z0-9-]*$` + // +kubebuilder:validation:items:MaxLength=64 + Skills []string `json:"skills,omitempty"` + // Operator-supplied objective text. Substituted into the SOUL.md + // template by the seeder on first write. Agent owns SOUL.md after that. + // +kubebuilder:validation:MaxLength=4096 Objective string `json:"objective,omitempty"` Wallet AgentWallet `json:"wallet,omitempty"` } type AgentWallet struct { + // Provision a per-namespace remote-signer keystore. Address is + // published in status.walletAddress. 
+ // +kubebuilder:default=false Create bool `json:"create,omitempty"` } type AgentStatus struct { - ObservedGeneration int64 `json:"observedGeneration,omitempty"` - Phase string `json:"phase,omitempty"` - PinnedModel string `json:"pinnedModel,omitempty"` - WalletAddress string `json:"walletAddress,omitempty"` - Endpoint string `json:"endpoint,omitempty"` - Conditions []Condition `json:"conditions,omitempty"` + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // Pending | Provisioning | Ready | Failed + Phase string `json:"phase,omitempty"` + // Actual model the agent is using (= spec.model when set, otherwise + // the auto-picked top-of-rank). + PinnedModel string `json:"pinnedModel,omitempty"` + // Agent's signing address when wallet.create=true. Empty otherwise. + // +kubebuilder:validation:Pattern=`^(0x[0-9a-fA-F]{40})?$` + WalletAddress string `json:"walletAddress,omitempty"` + // Cluster-internal URL for the agent runtime (e.g. + // http://hermes.agent-quant.svc.cluster.local:8642). + Endpoint string `json:"endpoint,omitempty"` + Conditions []Condition `json:"conditions,omitempty"` } func (a *Agent) EffectiveRuntime() string { @@ -363,12 +638,22 @@ func (a *Agent) IsReady() bool { return a.Status.Phase == AgentPhaseReady } -// AgentIdentity is the durable, on-chain identity an operator controls in the -// ERC-8004 Identity Registry. A single AgentIdentity outlives ServiceOffers: -// deleting the last ServiceOffer that references it does not delete the NFT, -// the published registration document, or the recorded agentId; instead the -// renderer publishes a tombstone (active:false, x402Support:false) so external -// observers still see the historical record. 
+// ── AgentIdentity ─────────────────────────────────────────────────────────── + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Namespaced,shortName=aid +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Chains",type=string,JSONPath=`.status.registrations[*].chain` +// +kubebuilder:printcolumn:name="AgentIDs",type=string,JSONPath=`.status.registrations[*].agentId` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// AgentIdentity is the durable, on-chain identity an operator controls in +// the ERC-8004 Identity Registry. A single AgentIdentity outlives +// ServiceOffers: deleting the last ServiceOffer that references it does +// not delete the NFT, the published registration document, or the +// recorded agentId; instead the renderer publishes a tombstone +// (active:false, x402Support:false) so external observers still see the +// historical record. type AgentIdentity struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` @@ -376,16 +661,31 @@ type AgentIdentity struct { Status AgentIdentityStatus `json:"status,omitempty"` } +// +kubebuilder:object:root=true + +// AgentIdentityList is the list form for kubectl/list operations. +type AgentIdentityList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []AgentIdentity `json:"items"` +} + type AgentIdentitySpec struct { } type AgentIdentityStatus struct { + // Per-chain ERC-8004 registrations for this identity document. Registrations []AgentIdentityRegistration `json:"registrations,omitempty"` } type AgentIdentityRegistration struct { - Chain string `json:"chain,omitempty"` - AgentID string `json:"agentId,omitempty"` + // ERC-8004 registration chain alias. + // +kubebuilder:validation:Required + // +kubebuilder:validation:MaxLength=64 + Chain string `json:"chain"` + // On-chain ERC-721 tokenId on the given chain. 
+ // +kubebuilder:validation:Required + AgentID string `json:"agentId"` } func AgentIdentityAgentIDForChain(status AgentIdentityStatus, chain string) string { diff --git a/internal/monetizeapi/zz_generated.deepcopy.go b/internal/monetizeapi/zz_generated.deepcopy.go new file mode 100644 index 00000000..ca9fdb32 --- /dev/null +++ b/internal/monetizeapi/zz_generated.deepcopy.go @@ -0,0 +1,775 @@ +//go:build !ignore_autogenerated + +// Code generated by controller-gen. DO NOT EDIT. + +// Code generated by controller-gen. DO NOT EDIT. + +package monetizeapi + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Agent) DeepCopyInto(out *Agent) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Agent. +func (in *Agent) DeepCopy() *Agent { + if in == nil { + return nil + } + out := new(Agent) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Agent) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentity) DeepCopyInto(out *AgentIdentity) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentity. 
+func (in *AgentIdentity) DeepCopy() *AgentIdentity { + if in == nil { + return nil + } + out := new(AgentIdentity) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgentIdentity) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentityList) DeepCopyInto(out *AgentIdentityList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]AgentIdentity, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentityList. +func (in *AgentIdentityList) DeepCopy() *AgentIdentityList { + if in == nil { + return nil + } + out := new(AgentIdentityList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgentIdentityList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentityRegistration) DeepCopyInto(out *AgentIdentityRegistration) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentityRegistration. +func (in *AgentIdentityRegistration) DeepCopy() *AgentIdentityRegistration { + if in == nil { + return nil + } + out := new(AgentIdentityRegistration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. +func (in *AgentIdentitySpec) DeepCopyInto(out *AgentIdentitySpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentitySpec. +func (in *AgentIdentitySpec) DeepCopy() *AgentIdentitySpec { + if in == nil { + return nil + } + out := new(AgentIdentitySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentIdentityStatus) DeepCopyInto(out *AgentIdentityStatus) { + *out = *in + if in.Registrations != nil { + in, out := &in.Registrations, &out.Registrations + *out = make([]AgentIdentityRegistration, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentIdentityStatus. +func (in *AgentIdentityStatus) DeepCopy() *AgentIdentityStatus { + if in == nil { + return nil + } + out := new(AgentIdentityStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentList) DeepCopyInto(out *AgentList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Agent, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentList. +func (in *AgentList) DeepCopy() *AgentList { + if in == nil { + return nil + } + out := new(AgentList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *AgentList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentSpec) DeepCopyInto(out *AgentSpec) { + *out = *in + if in.Skills != nil { + in, out := &in.Skills, &out.Skills + *out = make([]string, len(*in)) + copy(*out, *in) + } + out.Wallet = in.Wallet +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentSpec. +func (in *AgentSpec) DeepCopy() *AgentSpec { + if in == nil { + return nil + } + out := new(AgentSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentStatus) DeepCopyInto(out *AgentStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentStatus. +func (in *AgentStatus) DeepCopy() *AgentStatus { + if in == nil { + return nil + } + out := new(AgentStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentWallet) DeepCopyInto(out *AgentWallet) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentWallet. +func (in *AgentWallet) DeepCopy() *AgentWallet { + if in == nil { + return nil + } + out := new(AgentWallet) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *Condition) DeepCopyInto(out *Condition) { + *out = *in + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Condition. +func (in *Condition) DeepCopy() *Condition { + if in == nil { + return nil + } + out := new(Condition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseAutoRefill) DeepCopyInto(out *PurchaseAutoRefill) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseAutoRefill. +func (in *PurchaseAutoRefill) DeepCopy() *PurchaseAutoRefill { + if in == nil { + return nil + } + out := new(PurchaseAutoRefill) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchasePayment) DeepCopyInto(out *PurchasePayment) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchasePayment. +func (in *PurchasePayment) DeepCopy() *PurchasePayment { + if in == nil { + return nil + } + out := new(PurchasePayment) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseRequest) DeepCopyInto(out *PurchaseRequest) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequest. 
+func (in *PurchaseRequest) DeepCopy() *PurchaseRequest { + if in == nil { + return nil + } + out := new(PurchaseRequest) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PurchaseRequest) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseRequestList) DeepCopyInto(out *PurchaseRequestList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]PurchaseRequest, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequestList. +func (in *PurchaseRequestList) DeepCopy() *PurchaseRequestList { + if in == nil { + return nil + } + out := new(PurchaseRequestList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *PurchaseRequestList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseRequestSpec) DeepCopyInto(out *PurchaseRequestSpec) { + *out = *in + if in.PreSignedAuths != nil { + in, out := &in.PreSignedAuths, &out.PreSignedAuths + *out = make([]PreSignedAuth, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.AutoRefill = in.AutoRefill + out.Payment = in.Payment +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequestSpec. 
+func (in *PurchaseRequestSpec) DeepCopy() *PurchaseRequestSpec { + if in == nil { + return nil + } + out := new(PurchaseRequestSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PurchaseRequestStatus) DeepCopyInto(out *PurchaseRequestStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PurchaseRequestStatus. +func (in *PurchaseRequestStatus) DeepCopy() *PurchaseRequestStatus { + if in == nil { + return nil + } + out := new(PurchaseRequestStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequest) DeepCopyInto(out *RegistrationRequest) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + out.Status = in.Status +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequest. +func (in *RegistrationRequest) DeepCopy() *RegistrationRequest { + if in == nil { + return nil + } + out := new(RegistrationRequest) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RegistrationRequest) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *RegistrationRequestList) DeepCopyInto(out *RegistrationRequestList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]RegistrationRequest, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequestList. +func (in *RegistrationRequestList) DeepCopy() *RegistrationRequestList { + if in == nil { + return nil + } + out := new(RegistrationRequestList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *RegistrationRequestList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequestSpec) DeepCopyInto(out *RegistrationRequestSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequestSpec. +func (in *RegistrationRequestSpec) DeepCopy() *RegistrationRequestSpec { + if in == nil { + return nil + } + out := new(RegistrationRequestSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RegistrationRequestStatus) DeepCopyInto(out *RegistrationRequestStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RegistrationRequestStatus. 
+func (in *RegistrationRequestStatus) DeepCopy() *RegistrationRequestStatus { + if in == nil { + return nil + } + out := new(RegistrationRequestStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOffer) DeepCopyInto(out *ServiceOffer) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOffer. +func (in *ServiceOffer) DeepCopy() *ServiceOffer { + if in == nil { + return nil + } + out := new(ServiceOffer) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ServiceOffer) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAgent) DeepCopyInto(out *ServiceOfferAgent) { + *out = *in + out.Ref = in.Ref +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAgent. +func (in *ServiceOfferAgent) DeepCopy() *ServiceOfferAgent { + if in == nil { + return nil + } + out := new(ServiceOfferAgent) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAgentRef) DeepCopyInto(out *ServiceOfferAgentRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAgentRef. 
+func (in *ServiceOfferAgentRef) DeepCopy() *ServiceOfferAgentRef { + if in == nil { + return nil + } + out := new(ServiceOfferAgentRef) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAgentResolution) DeepCopyInto(out *ServiceOfferAgentResolution) { + *out = *in + if in.Skills != nil { + in, out := &in.Skills, &out.Skills + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAgentResolution. +func (in *ServiceOfferAgentResolution) DeepCopy() *ServiceOfferAgentResolution { + if in == nil { + return nil + } + out := new(ServiceOfferAgentResolution) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferAsset) DeepCopyInto(out *ServiceOfferAsset) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferAsset. +func (in *ServiceOfferAsset) DeepCopy() *ServiceOfferAsset { + if in == nil { + return nil + } + out := new(ServiceOfferAsset) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferList) DeepCopyInto(out *ServiceOfferList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ServiceOffer, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferList. 
+func (in *ServiceOfferList) DeepCopy() *ServiceOfferList { + if in == nil { + return nil + } + out := new(ServiceOfferList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ServiceOfferList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferModel) DeepCopyInto(out *ServiceOfferModel) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferModel. +func (in *ServiceOfferModel) DeepCopy() *ServiceOfferModel { + if in == nil { + return nil + } + out := new(ServiceOfferModel) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferPayment) DeepCopyInto(out *ServiceOfferPayment) { + *out = *in + out.Asset = in.Asset + out.Price = in.Price +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferPayment. +func (in *ServiceOfferPayment) DeepCopy() *ServiceOfferPayment { + if in == nil { + return nil + } + out := new(ServiceOfferPayment) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferPriceTable) DeepCopyInto(out *ServiceOfferPriceTable) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferPriceTable. 
+func (in *ServiceOfferPriceTable) DeepCopy() *ServiceOfferPriceTable { + if in == nil { + return nil + } + out := new(ServiceOfferPriceTable) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferRegistration) DeepCopyInto(out *ServiceOfferRegistration) { + *out = *in + if in.Services != nil { + in, out := &in.Services, &out.Services + *out = make([]ServiceOfferService, len(*in)) + copy(*out, *in) + } + if in.SupportedTrust != nil { + in, out := &in.SupportedTrust, &out.SupportedTrust + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Skills != nil { + in, out := &in.Skills, &out.Skills + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Domains != nil { + in, out := &in.Domains, &out.Domains + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Metadata != nil { + in, out := &in.Metadata, &out.Metadata + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferRegistration. +func (in *ServiceOfferRegistration) DeepCopy() *ServiceOfferRegistration { + if in == nil { + return nil + } + out := new(ServiceOfferRegistration) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferService) DeepCopyInto(out *ServiceOfferService) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferService. +func (in *ServiceOfferService) DeepCopy() *ServiceOfferService { + if in == nil { + return nil + } + out := new(ServiceOfferService) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ServiceOfferSpec) DeepCopyInto(out *ServiceOfferSpec) { + *out = *in + out.Agent = in.Agent + out.Model = in.Model + out.Upstream = in.Upstream + out.Payment = in.Payment + if in.Provenance != nil { + in, out := &in.Provenance, &out.Provenance + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + in.Registration.DeepCopyInto(&out.Registration) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferSpec. +func (in *ServiceOfferSpec) DeepCopy() *ServiceOfferSpec { + if in == nil { + return nil + } + out := new(ServiceOfferSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferStatus) DeepCopyInto(out *ServiceOfferStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.AgentResolution != nil { + in, out := &in.AgentResolution, &out.AgentResolution + *out = new(ServiceOfferAgentResolution) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferStatus. +func (in *ServiceOfferStatus) DeepCopy() *ServiceOfferStatus { + if in == nil { + return nil + } + out := new(ServiceOfferStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServiceOfferUpstream) DeepCopyInto(out *ServiceOfferUpstream) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceOfferUpstream. 
+func (in *ServiceOfferUpstream) DeepCopy() *ServiceOfferUpstream { + if in == nil { + return nil + } + out := new(ServiceOfferUpstream) + in.DeepCopyInto(out) + return out +} diff --git a/justfile b/justfile index c3cc2996..40d7115c 100644 --- a/justfile +++ b/justfile @@ -81,6 +81,41 @@ dev-frontend-reset: obol kubectl rollout status deployment/obol-frontend-obol-app -n obol-frontend --timeout=120s echo "✓ Frontend reset to released image" +# Regenerate CRD manifests + DeepCopy methods from kubebuilder markers +# in internal/monetizeapi/. The Go types are the single source of truth; +# CI (.github/workflows/lint-test.yaml::generate-check) fails if the +# working tree is dirty after this command runs. See CLAUDE.md for the +# edit-types -> just generate -> commit-both workflow. +generate: + #!/usr/bin/env bash + set -euo pipefail + # DeepCopy methods (zz_generated.deepcopy.go) next to the Go types. + go run sigs.k8s.io/controller-tools/cmd/controller-gen \ + object:headerFile=hack/boilerplate.go.txt \ + paths=./internal/monetizeapi/... + # CRD manifests into the embed dir. controller-gen names files + # obol.org_<plural>.yaml; rename to existing <kind>-crd.yaml + # naming so embed.FS readers don't need to change. + out=internal/embed/infrastructure/base/templates + go run sigs.k8s.io/controller-tools/cmd/controller-gen \ + crd \ + paths=./internal/monetizeapi/... 
\ + output:crd:dir="$out" + for f in "$out"/obol.org_*.yaml; do + [ -e "$f" ] || continue + plural=$(basename "$f" .yaml | sed 's/^obol\.org_//') + case "$plural" in + agentidentities) target="agentidentity-crd.yaml" ;; + agents) target="agent-crd.yaml" ;; + purchaserequests) target="purchaserequest-crd.yaml" ;; + registrationrequests) target="registrationrequest-crd.yaml" ;; + serviceoffers) target="serviceoffer-crd.yaml" ;; + *) target="${plural%s}-crd.yaml" ;; + esac + mv "$f" "$out/$target" + done + echo "✓ Regenerated CRDs and DeepCopy methods" + # Install pre-commit hooks (run once after cloning) setup: #!/usr/bin/env bash diff --git a/tools/tools.go b/tools/tools.go new file mode 100644 index 00000000..b177a3e4 --- /dev/null +++ b/tools/tools.go @@ -0,0 +1,14 @@ +//go:build tools + +// Package tools tracks build-time dependencies that are not imported by +// production code. controller-gen is the canonical source-of-truth tool +// for generating CRD manifests and DeepCopy methods from kubebuilder +// markers on the Go types in internal/monetizeapi. +// +// See `just generate`. CI fails if generated artifacts drift from the +// markers (see .github/workflows/lint-test.yaml::generate-check). +package tools + +import ( + _ "sigs.k8s.io/controller-tools/cmd/controller-gen" +) From 9481e4ea5d66679a46bbb97b3ea1bbdebce48fb6 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 09:08:43 +0400 Subject: [PATCH 17/31] fix(prometheus-rules): escape PromQL $labels for Helm rendering PrometheusRule annotations use {{ $labels.X }} which Prometheus evaluates at alert-firing time. 
When this file is rendered through Helm (via chart: ./base in helmfile.yaml), Helm's Go-template engine tries to evaluate $labels at chart-render time and fails with: Error: UPGRADE FAILED: parse error at (base-infra/templates/x402-prometheus-rules.yaml:N): undefined variable "$labels" Wrap each templated brace pair as {{ "{{" }}...{{ "}}" }} so Helm emits literal Prometheus template syntax verbatim into the YAML output, where Prometheus picks it up at alert-eval time. Bug surfaced by integration-branch full stack-up; not caught by `go test ./...` (unit tests don't render Helm) nor by the agent worktree validation (which only checked Go-side compilation). Recommend adding a CI smoke that pipes embedded *.yaml templates through `helm template ./base` to catch this class going forward. Stacks on PR #513 (which introduced the file in commit 27e1ac5). --- .../base/templates/x402-prometheus-rules.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml index 73b10f94..4dbbbea9 100644 --- a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -47,10 +47,11 @@ spec: increase(obol_x402_verifier_charged_requests_total[7d]) ) - # Lifetime charged-request count per offer (sum across replicas - # + chains). Used in the My Listings "today · X earned" header - # text and the Browse catalog usage badge. - - record: x402:revenue:lifetime_by_offer + # Sum of currently-running verifier replicas' counters — resets + # on rollout; for true lifetime, query against a long-retention + # store or use `sum_over_time(...[Nd])`. Used in the My Listings + # "today · X earned" header text and the Browse catalog usage badge. 
+ - record: x402:revenue:total_by_offer_current expr: | sum by (offer_namespace, offer_name) ( obol_x402_verifier_charged_requests_total @@ -101,11 +102,11 @@ spec: labels: severity: warning annotations: - summary: "x402 payment failures > 10% on {{ $labels.offer_namespace }}/{{ $labels.offer_name }} ({{ $labels.chain }})" + summary: "x402 payment failures > 10% on {{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} ({{ "{{" }} $labels.chain {{ "}}" }})" description: | More than 10% of paid requests to - {{ $labels.offer_namespace }}/{{ $labels.offer_name }} on - {{ $labels.chain }} have failed verification over the last + {{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} on + {{ "{{" }} $labels.chain {{ "}}" }} have failed verification over the last hour. Check the verifier logs for x509/facilitator errors and the seller's `ca-certificates` ConfigMap. @@ -130,10 +131,10 @@ spec: labels: severity: warning annotations: - summary: "{{ $labels.offer_namespace }}/{{ $labels.offer_name }} returns 402 but never settles" + summary: "{{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} returns 402 but never settles" description: | The x402 verifier issued 402 responses for - {{ $labels.offer_namespace }}/{{ $labels.offer_name }} in the + {{ "{{" }} $labels.offer_namespace {{ "}}" }}/{{ "{{" }} $labels.offer_name {{ "}}" }} in the last hour but observed no settled requests. Check the buyer sidecar's auth pool (/status) and the facilitator's settlement endpoint. 
From 7919a36bc40d49de0c5f9670e511afed76c2ae20 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 09:11:58 +0400 Subject: [PATCH 18/31] =?UTF-8?q?docs(migration):=20bedag/raw=20=E2=86=92?= =?UTF-8?q?=20base=20release=20ownership=20transfer=20script?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #523 moved 6 bedag/raw helmfile releases into the base chart so there's one source of truth for what ships in each namespace. Fresh installs work. EXISTING clusters being upgraded from pre-#523 obol-stack fail at `helm upgrade base` with: Error: UPGRADE FAILED: exists and cannot be imported into the current release: invalid ownership metadata; annotation validation error: key "meta.helm.sh/release-name" must equal "base" This blocks `obol stack up` until the operator manually re-annotates ~10 resources (Namespaces, HTTPRoutes, Middlewares, ConfigMaps, PrometheusRule, PodMonitor, ClusterRole/Binding). Adds hack/migrate-bedag-raw-to-base.sh which finds all such orphans and re-annotates them in bulk. Idempotent — safe to re-run. Surfaced by the 14-PR integration test campaign; see plans/integration-test-results-final-20260524.md Bug #2. --- .github/release-template.md | 7 +++ docs/upgrade-from-pre-pr-523.md | 82 +++++++++++++++++++++++++++++++ hack/migrate-bedag-raw-to-base.sh | 82 +++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 docs/upgrade-from-pre-pr-523.md create mode 100755 hack/migrate-bedag-raw-to-base.sh diff --git a/.github/release-template.md b/.github/release-template.md index c67dc086..fd72746e 100644 --- a/.github/release-template.md +++ b/.github/release-template.md @@ -96,6 +96,13 @@ repositories or docs.] ## Breaking changes / Migration notes - [Delete this section if there are no breaking changes.] +- **Upgrading from a pre-PR #523 cluster**: PR #523 relocated six `bedag/raw` + helmfile releases into the `base` chart. 
Existing clusters must run + `bash hack/migrate-bedag-raw-to-base.sh` once before `obol stack up` to + transfer Helm ownership annotations; otherwise `helm upgrade base` fails + with `invalid ownership metadata`. See + [`docs/upgrade-from-pre-pr-523.md`](../docs/upgrade-from-pre-pr-523.md). + Fresh installs are unaffected. ## Known issues diff --git a/docs/upgrade-from-pre-pr-523.md b/docs/upgrade-from-pre-pr-523.md new file mode 100644 index 00000000..1b9b5bf7 --- /dev/null +++ b/docs/upgrade-from-pre-pr-523.md @@ -0,0 +1,82 @@ +# Upgrading clusters created before PR #523 + +PR [#523](https://github.com/ObolNetwork/obol-stack/pull/523) relocates six +`bedag/raw` helmfile releases into the `base` chart so the stack has one +source of truth for everything it ships in the `erpc`, `obol-frontend`, and +`llm` namespaces. + +**Fresh installs are unaffected.** This page only applies if you are +upgrading a cluster that was created **before** PR #523 was merged. + +## Symptom + +Running `obol stack up` on a pre-#523 cluster fails during `helm upgrade base` +with errors of the form: + +``` +Error: UPGRADE FAILED: <Kind> "<name>" in namespace "<namespace>" exists and cannot be imported into the +current release: invalid ownership metadata; annotation validation error: +key "meta.helm.sh/release-name" must equal "base"; current value is +"<legacy-release-name>" +``` + +Helm refuses to "adopt" resources owned by another release. About ten +resources are affected (Namespaces, HTTPRoutes, Middlewares, ConfigMaps, +PrometheusRule, PodMonitor, ClusterRole/Binding) — enough that hand-fixing +them is error-prone. + +## When to run the migration script + +- **Run once**, **before** `obol stack up`, against any cluster created + before PR #523 merged. +- The script is **idempotent** — safe to re-run if `obol stack up` is + interrupted or if you migrate one cluster at a time. +- Fresh clusters (`obol stack init && obol stack up` on an empty machine) + do **not** need it.
+ +```bash +# Optional: point at a non-default kubeconfig +export KUBECONFIG="$HOME/.config/obol/kubeconfig.yaml" + +bash hack/migrate-bedag-raw-to-base.sh +obol stack up +``` + +## What the script does + +It re-annotates the affected resources so Helm treats them as members of +the `base` release: + +``` +meta.helm.sh/release-name=base +meta.helm.sh/release-namespace=kube-system +app.kubernetes.io/managed-by=Helm +``` + +It covers the legacy `bedag/raw` releases removed by PR #523: + +| Legacy release | Namespace | +|---|---| +| `obol-frontend-rbac` | `obol-frontend` | +| `obol-frontend-httproute` | `obol-frontend` | +| `erpc-httproute` | `erpc` | +| `erpc-x402-middleware` | `erpc` | +| `erpc-metadata` | `erpc` | +| `llm-buyer-podmonitor` | `llm` | +| `x402-verifier-podmonitor` | `x402` (partial-upgrade clusters from before PR #513 hardening) | + +It also adopts a small set of resources that may exist with no Helm +ownership at all (`namespace/erpc`, `namespace/obol-frontend`, +`prometheusrule/x402-verifier` in `x402`) so the next `helm upgrade base` +can manage them cleanly. + +## Verifying the migration + +After running the script, `obol stack up` should succeed without the +`invalid ownership metadata` errors. To spot-check a single resource: + +```bash +kubectl get httproute -n obol-frontend obol-frontend \ + -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}{"\n"}' +# → base +``` diff --git a/hack/migrate-bedag-raw-to-base.sh b/hack/migrate-bedag-raw-to-base.sh new file mode 100755 index 00000000..d1c7d278 --- /dev/null +++ b/hack/migrate-bedag-raw-to-base.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# Migrate resources from the legacy bedag/raw helmfile releases to the +# base chart that now owns them after obol-stack PR #523. 
+# +# Symptom this fixes: +# Error: UPGRADE FAILED: <resource> exists and cannot be imported +# into the current release: invalid ownership metadata +# +# Run once before `obol stack up` against any cluster deployed before +# PR #523 merged. +# +# Idempotent — safe to re-run. + +set -euo pipefail + +export KUBECONFIG="${KUBECONFIG:-$HOME/.config/obol/kubeconfig.yaml}" # must be exported: kubectl only sees environment variables, not plain shell variables + +ORPHAN_RELEASES=( + obol-frontend-rbac + obol-frontend-httproute + erpc-httproute + erpc-x402-middleware + erpc-metadata + llm-buyer-podmonitor + x402-verifier-podmonitor # killed by PR #513's hardening; keep in case partial-upgrade clusters still have it +) + +migrate_one() { + local target="$1" current # target is "Kind/name" or "Kind/name -n namespace" + local -a ref; read -r -a ref <<<"$target" # split into argv words so kubectl receives -n <namespace> as a flag, not as part of the resource name + current=$(kubectl get "${ref[@]}" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) + if [[ "$current" == "base" ]]; then + echo " $target: already on base, skipping" + return 0 + fi + if [[ -z "$current" ]]; then + echo " $target: no Helm metadata, adopting into base" + else + echo " $target: was on '$current', migrating to base" + fi + kubectl annotate "${ref[@]}" \ + meta.helm.sh/release-name=base \ + meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null + kubectl label "${ref[@]}" app.kubernetes.io/managed-by=Helm --overwrite >/dev/null +} + +echo "==> Scanning for resources owned by legacy bedag/raw releases..." +for release in "${ORPHAN_RELEASES[@]}"; do + echo "release: $release" + kubectl get all,clusterrole,clusterrolebinding,role,rolebinding,configmap,httproute,middleware,podmonitor,servicemonitor,prometheusrule,referencegrant,namespace \ + -A -o json 2>/dev/null \ + | jq -r --arg rel "$release" '.items[] + | select(.metadata.annotations["meta.helm.sh/release-name"] == $rel) + | "\(.kind)/\(.metadata.name)\(if .metadata.namespace then " -n " + .metadata.namespace else "" end)"' \ + | while read -r target; do + [[ -z "$target" ]] && continue + migrate_one "$target" + done +done + +# Some resources were never Helm-owned (e.g.
PrometheusRule x402-verifier may have +# been created via kubectl apply somewhere). Adopt them into base too if they exist +# in the namespaces base now owns. +echo "==> Adopting unowned resources base will now claim..." +declare -a UNOWNED_TARGETS=( + "namespace/erpc" + "namespace/obol-frontend" + "prometheusrule/x402-verifier -n x402" +) +for target in "${UNOWNED_TARGETS[@]}"; do + if kubectl get $target >/dev/null 2>&1; then + owner=$(kubectl get $target -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) + if [[ -z "$owner" || "$owner" == "base" ]]; then + echo " $target: $([ -z "$owner" ] && echo "adopting" || echo "already base")" + kubectl annotate $target meta.helm.sh/release-name=base meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null + kubectl label $target app.kubernetes.io/managed-by=Helm --overwrite >/dev/null + fi + fi +done + +echo "" +echo "✓ Migration complete. You may now run 'obol stack up'." From 938b380a2ea114af080de1f27af7b6bb67cddf87 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 09:13:39 +0400 Subject: [PATCH 19/31] fix(controller/render): Restricted PSS securityContext on httpd workloads PR #521 enforces Restricted Pod Security Standard on x402 + llm namespaces. The controller renders two httpd-based Deployments (obol-skill-md publisher + agentidentity-default-registration well- known/agent-registration.json publisher) without securityContext, so PSS admission rejects them and they never start. Result: marketplace API returns STACK_UNREACHABLE because skill-md isn't reachable. Adds Restricted-compliant securityContext to both renderers: pod: runAsNonRoot, runAsUser=1000, RunAsGroup=1000, seccompProfile=RuntimeDefault, fsGroup=1000 container: allowPrivilegeEscalation=false, drop ALL capabilities Both Deployments already bind httpd to 8080, which is non-root safe, so no port change is required. Surfaced by the 14-PR integration test campaign. 
The integration test workaround patched the running Deployments manually: plans/integration-test-results-final-20260524.md Bug #3. --- internal/serviceoffercontroller/render.go | 49 ++++++++-- .../render_builders_test.go | 93 +++++++++++++++++++ 2 files changed, 136 insertions(+), 6 deletions(-) diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index a733213b..ff9da256 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -26,6 +26,39 @@ const ( servicesJSONRouteName = "obol-services-json-route" ) +// restrictedPodSecurityContext returns a Pod-level securityContext that +// satisfies the Restricted Pod Security Standard (PSS). PR #521 enforces +// Restricted PSS on the x402 namespace, so the controller-rendered httpd +// workloads (obol-skill-md and agentidentity-*-registration) must ship a +// compliant securityContext or they fail admission and never start. +// +// UID/GID 1000 is the canonical non-root user available in the busybox +// image used by both Deployments. fsGroup keeps the projected ConfigMap +// volumes readable by the httpd process. +func restrictedPodSecurityContext() map[string]any { + return map[string]any{ + "runAsNonRoot": true, + "runAsUser": int64(1000), + "runAsGroup": int64(1000), + "fsGroup": int64(1000), + "seccompProfile": map[string]any{ + "type": "RuntimeDefault", + }, + } +} + +// restrictedContainerSecurityContext returns a container-level +// securityContext compliant with the Restricted PSS profile: privilege +// escalation disabled and all Linux capabilities dropped. 
+func restrictedContainerSecurityContext() map[string]any { + return map[string]any{ + "allowPrivilegeEscalation": false, + "capabilities": map[string]any{ + "drop": []any{"ALL"}, + }, + } +} + func buildRegistrationRequest(offer *monetizeapi.ServiceOffer, desiredState string) *unstructured.Unstructured { return &unstructured.Unstructured{ Object: map[string]any{ @@ -92,11 +125,13 @@ func buildAgentIdentityRegistrationDeployment(identity *monetizeapi.AgentIdentit }, }, "spec": map[string]any{ + "securityContext": restrictedPodSecurityContext(), "containers": []any{ map[string]any{ - "name": "httpd", - "image": "busybox:1.36", - "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "name": "httpd", + "image": "busybox:1.36", + "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "securityContext": restrictedContainerSecurityContext(), "ports": []any{ map[string]any{"containerPort": int64(8080), "protocol": "TCP"}, }, @@ -259,11 +294,13 @@ func buildSkillCatalogDeployment(contentHash string) *unstructured.Unstructured }, }, "spec": map[string]any{ + "securityContext": restrictedPodSecurityContext(), "containers": []any{ map[string]any{ - "name": "httpd", - "image": "busybox:1.36", - "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "name": "httpd", + "image": "busybox:1.36", + "command": []any{"httpd", "-f", "-p", "8080", "-h", "/www"}, + "securityContext": restrictedContainerSecurityContext(), "ports": []any{ map[string]any{"containerPort": int64(8080), "protocol": "TCP"}, }, diff --git a/internal/serviceoffercontroller/render_builders_test.go b/internal/serviceoffercontroller/render_builders_test.go index 22efa4b5..573d72af 100644 --- a/internal/serviceoffercontroller/render_builders_test.go +++ b/internal/serviceoffercontroller/render_builders_test.go @@ -5,8 +5,101 @@ import ( "testing" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// assertRestrictedPSS checks that a 
controller-rendered Deployment satisfies +// the Restricted Pod Security Standard. PR #521 enforces Restricted PSS on +// the x402 namespace, so any httpd workload missing these fields gets +// rejected at admission and never starts (Bug #3 from the 14-PR integration +// test campaign). +func assertRestrictedPSS(t *testing.T, deploymentName string, spec map[string]any) { + t.Helper() + template, _ := spec["template"].(map[string]any) + podSpec, _ := template["spec"].(map[string]any) + + psc, ok := podSpec["securityContext"].(map[string]any) + if !ok { + t.Fatalf("%s: pod spec missing securityContext", deploymentName) + } + if v, _ := psc["runAsNonRoot"].(bool); !v { + t.Errorf("%s: pod securityContext.runAsNonRoot = %v, want true", deploymentName, psc["runAsNonRoot"]) + } + if v, _ := psc["runAsUser"].(int64); v == 0 { + t.Errorf("%s: pod securityContext.runAsUser must be set to a non-zero UID", deploymentName) + } + if v, _ := psc["runAsGroup"].(int64); v == 0 { + t.Errorf("%s: pod securityContext.runAsGroup must be set to a non-zero GID", deploymentName) + } + sp, ok := psc["seccompProfile"].(map[string]any) + if !ok { + t.Errorf("%s: pod securityContext missing seccompProfile", deploymentName) + } else if t2, _ := sp["type"].(string); t2 != "RuntimeDefault" && t2 != "Localhost" { + t.Errorf("%s: pod seccompProfile.type = %q, want RuntimeDefault or Localhost", deploymentName, t2) + } + + containers, _ := podSpec["containers"].([]any) + if len(containers) == 0 { + t.Fatalf("%s: no containers in pod spec", deploymentName) + } + for _, c := range containers { + cm, _ := c.(map[string]any) + name, _ := cm["name"].(string) + csc, ok := cm["securityContext"].(map[string]any) + if !ok { + t.Errorf("%s/%s: container missing securityContext", deploymentName, name) + continue + } + if v, _ := csc["allowPrivilegeEscalation"].(bool); v { + t.Errorf("%s/%s: container allowPrivilegeEscalation = true, want false", deploymentName, name) + } + if _, present := 
csc["allowPrivilegeEscalation"]; !present { + t.Errorf("%s/%s: container missing allowPrivilegeEscalation (must be false)", deploymentName, name) + } + caps, ok := csc["capabilities"].(map[string]any) + if !ok { + t.Errorf("%s/%s: container securityContext missing capabilities", deploymentName, name) + continue + } + drop, _ := caps["drop"].([]any) + var droppedAll bool + for _, d := range drop { + if s, _ := d.(string); s == "ALL" { + droppedAll = true + } + } + if !droppedAll { + t.Errorf("%s/%s: container capabilities.drop must include \"ALL\", got %v", deploymentName, name, drop) + } + } +} + +// TestBuildSkillCatalogDeployment_RestrictedPSS verifies the skill-md +// httpd Deployment ships a Restricted-PSS-compliant securityContext. +// Regression test for the cross-PR interaction with #521 surfaced by +// the 14-PR integration test (Bug #3). +func TestBuildSkillCatalogDeployment_RestrictedPSS(t *testing.T) { + d := buildSkillCatalogDeployment("hash-x") + spec, _ := d.Object["spec"].(map[string]any) + assertRestrictedPSS(t, skillCatalogConfigMapName, spec) +} + +// TestBuildAgentIdentityRegistrationDeployment_RestrictedPSS verifies the +// agentidentity well-known/agent-registration.json publisher httpd +// Deployment ships a Restricted-PSS-compliant securityContext. +func TestBuildAgentIdentityRegistrationDeployment_RestrictedPSS(t *testing.T) { + identity := &monetizeapi.AgentIdentity{ + ObjectMeta: metav1.ObjectMeta{ + Name: monetizeapi.AgentIdentityDefaultName, + Namespace: "x402", + UID: "test-uid", + }, + } + d := buildAgentIdentityRegistrationDeployment(identity, "hash-y") + spec, _ := d.Object["spec"].(map[string]any) + assertRestrictedPSS(t, agentIdentityRegistrationName(identity), spec) +} + // TestBuildSkillCatalogConfigMap: exposes skill.md + services.json + httpd conf. 
func TestBuildSkillCatalogConfigMap(t *testing.T) { cm := buildSkillCatalogConfigMap("# Catalog", `[{"name":"a"}]`) From f9f1ff5c71829e2d0b83096c01f3c889d5d5c174 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 10:18:44 +0400 Subject: [PATCH 20/31] fix(prometheus-rules): use increase() for the per-offer revenue rule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recording rule was sum(counter), which is wrong for any metric where the counter resets across pod restarts — Prometheus counters are per-process by design. The TSDB is the canonical persistence layer; rate() and increase() perform reset detection at query time across the samples the TSDB holds. - Renames the rule to x402:revenue:7d_by_offer (name matches what it returns; the old "lifetime" / "total_by_offer_current" names were aspirational against a finite retention window). - Expression: sum by (offer_namespace, offer_name) ( increase(obol_x402_verifier_charged_requests_total[7d]) ) - 7d inside 8d retention gives 1-day headroom so reset detection has both-side samples at the window's left edge. Per Robust Perception's "avoiding the counter-reset undercount" canonical guidance. Zero new components — uses only native Prometheus + recording-rule primitives. Found by the 14-PR integration test (plans/integration-test-results- final-20260524.md). The OBOL parity smoke surfaced it more visibly when a verifier restart produced a "0 req·24h" UI display on a row with real on-chain traffic. Stacks on PR #527 (Helm-escape fix for the same file). 
--- .../base/templates/x402-prometheus-rules.yaml | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml index 4dbbbea9..f2d20717 100644 --- a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -47,14 +47,33 @@ spec: increase(obol_x402_verifier_charged_requests_total[7d]) ) - # Sum of currently-running verifier replicas' counters — resets - # on rollout; for true lifetime, query against a long-retention - # store or use `sum_over_time(...[Nd])`. Used in the My Listings - # "today · X earned" header text and the Browse catalog usage badge. - - record: x402:revenue:total_by_offer_current + # 7d charged-request count per offer (chain-agnostic). Used in the + # My Listings "7d · X earned" header text and the Browse catalog + # usage badge. + # + # Why `increase()` and not `sum(counter)`: + # Prometheus counters are per-process by design — they reset to + # zero on every pod restart (rollout, OOM, eviction, node + # reschedule). A naive `sum by (...) (counter)` query therefore + # drops to zero whenever the verifier restarts, producing a + # misleading "0 requests" reading on offers with real on-chain + # traffic. `increase()` performs reset detection at query time + # across the samples the TSDB holds, accounting for the wraps. + # + # Why `[7d]` and not `[8d]` (matching retention): + # The TSDB is the canonical persistence layer. `increase()` + # needs samples on both sides of the window edge to do reset + # detection at the left edge; a 7d window inside 8d retention + # gives a 1-day headroom so the rule keeps working at exactly + # the moment data ages out, instead of silently producing + # NaN/undercounts at the boundary. 
+ # + # Canonical reference: Robust Perception, "avoiding the counter- + # reset undercount". + - record: x402:revenue:7d_by_offer expr: | sum by (offer_namespace, offer_name) ( - obol_x402_verifier_charged_requests_total + increase(obol_x402_verifier_charged_requests_total[7d]) ) # Settlement rate (verified / attempted) over the last hour, per From b700f343424b05a7c23759a37c4b95431a918325 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 10:20:19 +0400 Subject: [PATCH 21/31] feat(x402-metrics): add asset_symbol label for per-token queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the verifier emits (offer_namespace, offer_name, chain). Answering "what's my OBOL revenue?" requires joining metrics with the ServiceOffer CR's spec.payment.asset.symbol at the frontend. With asset_symbol on the label set, the answer is a direct PromQL aggregation. Cardinality cost: zero. Each offer pins exactly one asset (A=1 per offer), so the new dimension is functionally constant within the existing (ns, name) group — no series multiplication. The "don't label what you can derive" guidance exists to prevent *multiplicative* blowups (chain x pod x pod_owner style); the single-asset-per-offer invariant means there's no multiplication to prevent. The argument for adding asset_symbol is identical to the argument that already justifies `chain` on these vecs: both are CR-derived, both are query-meaningful, both have bounded values. 
Changes: - 6 metric vecs: label slice gains "asset_symbol" - pruneSeriesNotIn key now (ns, name, chain, asset_symbol) so asset-repin doesn't leak the old series - verifier.load() live-set built with the same 4-tuple - prometheusLabels() emits rule.AssetSymbol (or "unknown" if empty as defensive fallback) - New _asset_symbol-suffixed recording rules added side-by-side with existing rules; existing rules unchanged (non-breaking) - Tests: emission asserts asset_symbol; prune test asserts asset-repin doesn't leak Frontend can simplify the existing metric x CR join in a future PR once it migrates to the _asset_symbol-suffixed rule. Findings from: plans/integration-test-L7-paid-flow-20260524.md (OBOL parity smoke surfaced this as a real gap when validating the WalletStrip / EarningsStrip per-token columns). --- .../base/templates/x402-prometheus-rules.yaml | 27 ++++ internal/x402/metrics.go | 52 +++--- internal/x402/verifier.go | 18 ++- internal/x402/verifier_test.go | 148 +++++++++++++++++- 4 files changed, 218 insertions(+), 27 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml index f2d20717..77b6429e 100644 --- a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -33,12 +33,28 @@ spec: # 24h charged-request count per (offer, chain). Replaces the # frontend's `increase(charged_requests_total[24h])` query — same # math, pre-computed every 30s. + # + # Kept unchanged for backwards compatibility. The + # _by_offer_chain_asset_symbol sibling below is the migration + # target for the frontend's per-token EarningsStrip columns. - record: x402:revenue:24h_by_offer_chain expr: | sum by (offer_namespace, offer_name, chain) ( increase(obol_x402_verifier_charged_requests_total[24h]) ) + # 24h charged-request count per (offer, chain, asset_symbol). 
+ # Same math as :24h_by_offer_chain but keeps the asset dimension + # so the frontend can answer "what's my OBOL revenue?" with a + # single PromQL query instead of joining metrics with the + # ServiceOffer CR. Adding asset_symbol is non-multiplicative + # because each offer pins exactly one asset (A=1 per offer). + - record: x402:revenue:24h_by_offer_chain_asset_symbol + expr: | + sum by (offer_namespace, offer_name, chain, asset_symbol) ( + increase(obol_x402_verifier_charged_requests_total[24h]) + ) + # 7d charged-request count per (offer, chain). Powers the # EarningsStrip per-chain × CRD price multiplication. - record: x402:revenue:7d_by_offer_chain @@ -47,6 +63,17 @@ spec: increase(obol_x402_verifier_charged_requests_total[7d]) ) + # 7d charged-request count per (offer, chain, asset_symbol). + # Sibling of :7d_by_offer_chain — once the frontend migrates to + # the per-asset rule, the EarningsStrip can drop its + # CR-join-at-query-time for per-token columns. Cardinality is + # non-multiplicative because each offer pins exactly one asset. + - record: x402:revenue:7d_by_offer_chain_asset_symbol + expr: | + sum by (offer_namespace, offer_name, chain, asset_symbol) ( + increase(obol_x402_verifier_charged_requests_total[7d]) + ) + # 7d charged-request count per offer (chain-agnostic). Used in the # My Listings "7d · X earned" header text and the Browse catalog # usage badge. 
diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index b445d4c3..2779d148 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -10,12 +10,12 @@ import ( type verifierMetrics struct { registry *prometheus.Registry - requestsTotal *prometheus.CounterVec - paymentRequired *prometheus.CounterVec - paymentVerified *prometheus.CounterVec - paymentFailed *prometheus.CounterVec - chargedRequests *prometheus.CounterVec - lastPaymentSuccess *prometheus.GaugeVec + requestsTotal *prometheus.CounterVec + paymentRequired *prometheus.CounterVec + paymentVerified *prometheus.CounterVec + paymentFailed *prometheus.CounterVec + chargedRequests *prometheus.CounterVec + lastPaymentSuccess *prometheus.GaugeVec } func newVerifierMetrics() *verifierMetrics { @@ -26,42 +26,42 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), paymentRequired: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests rejected after a provided x402 payment failed verification.", }, - []string{"offer_namespace", "offer_name", "chain"}, + 
[]string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), lastPaymentSuccess: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_verifier_last_payment_success_seconds", Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", }, - []string{"offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain", "asset_symbol"}, ), } @@ -81,16 +81,19 @@ func (m *verifierMetrics) handler() http.Handler { return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{}) } -// pruneSeriesNotIn drops every (offer_namespace, offer_name, chain) series -// from the verifier's counter/gauge vecs that is not present in `keep`. -// Called from Verifier.load whenever the route set changes so deleted offers -// (e.g. `obol sell delete`) stop emitting stale series — most importantly the -// last_payment_success_seconds gauge, which would otherwise hold the deleted -// offer's last-success timestamp forever and falsely satisfy "recent activity" -// alerts and dashboards. +// pruneSeriesNotIn drops every (offer_namespace, offer_name, chain, +// asset_symbol) series from the verifier's counter/gauge vecs that is not +// present in `keep`. Called from Verifier.load whenever the route set changes +// so deleted offers (e.g. `obol sell delete`) stop emitting stale series — +// most importantly the last_payment_success_seconds gauge, which would +// otherwise hold the deleted offer's last-success timestamp forever and +// falsely satisfy "recent activity" alerts and dashboards. 
// -// Key shape: "ns\x00name\x00chain" — \x00 is forbidden in Kubernetes object -// names and CAIP-2 chain ids, so the byte-join can't collide. +// Key shape: "ns\x00name\x00chain\x00asset" — \x00 is forbidden in +// Kubernetes object names, CAIP-2 chain ids, and ERC-20 symbols, so the +// byte-join can't collide. Including asset_symbol in the key means an +// asset-repin (USDC → OBOL on the same offer) prunes the old series rather +// than leaking a stale per-asset timestamp. func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) { vecs := []interface { DeletePartialMatch(prometheus.Labels) int @@ -110,7 +113,7 @@ func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) { for _, family := range gathered { for _, metric := range family.GetMetric() { labels := metric.GetLabel() - ns, name, chain := "", "", "" + ns, name, chain, asset := "", "", "", "" for _, l := range labels { switch l.GetName() { case "offer_namespace": @@ -119,18 +122,21 @@ func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) { name = l.GetValue() case "chain": chain = l.GetValue() + case "asset_symbol": + asset = l.GetValue() } } if ns == "" && name == "" { continue } - if _, ok := keep[ns+"\x00"+name+"\x00"+chain]; ok { + if _, ok := keep[ns+"\x00"+name+"\x00"+chain+"\x00"+asset]; ok { continue } match := prometheus.Labels{ "offer_namespace": ns, "offer_name": name, "chain": chain, + "asset_symbol": asset, } for _, vec := range vecs { vec.DeletePartialMatch(match) diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 60e2fa80..77e42db7 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -74,7 +74,7 @@ func (v *Verifier) load(cfg *PricingConfig) error { if r.OfferNamespace == "" && r.OfferName == "" { continue } - live[r.OfferNamespace+"\x00"+r.OfferName+"\x00"+r.Network] = struct{}{} + live[r.OfferNamespace+"\x00"+r.OfferName+"\x00"+r.Network+"\x00"+r.AssetSymbol] = struct{}{} } v.metrics.pruneSeriesNotIn(live) 
@@ -466,9 +466,25 @@ func prometheusLabels(rule *RouteRule) prometheus.Labels { // offer_name) which already uniquely identifies a paid route — the // pattern was redundant and unbounded by path fragments, which would // have ballooned series count for sellers running many granular routes. + // + // asset_symbol is included for direct per-token aggregation in PromQL + // (e.g. "what's my OBOL revenue?") without having to join the metric + // against the ServiceOffer CR at query time. Cardinality cost is zero + // because each offer pins exactly one asset — the new dimension is + // functionally constant within the existing (ns, name) group. + asset := rule.AssetSymbol + if asset == "" { + // Defensive: a missing symbol is operationally ugly in PromQL. + // Empty-string labels are legal in Prometheus but render as a + // bare "asset_symbol=" in selectors, which makes dashboard + // filters harder to write. "unknown" is unambiguous and matches + // the convention we use elsewhere for under-populated metadata. 
+ asset = "unknown" + } return prometheus.Labels{ "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, "chain": rule.Network, + "asset_symbol": asset, } } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 3b62c815..b41a274a 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -756,6 +756,7 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", + "asset_symbol": "unknown", } assertVerifierMetricValue(t, metrics["obol_x402_verifier_requests_total"], labels, 1) assertVerifierMetricValue(t, metrics["obol_x402_verifier_payment_required_total"], labels, 1) @@ -769,6 +770,7 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", + "asset_symbol": "unknown", } okFac := newMockFacilitator(t, mockFacilitatorOpts{}) @@ -828,12 +830,15 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { // when an unpaid request is rejected with 402. // // The gauge is labeled identically to the verifier counters; for this rule -// `chain` is the empty string because the test RouteRule has no Network set. +// `chain` is the empty string because the test RouteRule has no Network set, +// and `asset_symbol` is "unknown" because AssetSymbol is unset (the defensive +// fallback emitted by prometheusLabels). 
func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { labels := map[string]string{ "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", + "asset_symbol": "unknown", } tests := []struct { @@ -950,8 +955,8 @@ func TestVerifier_Reload_PrunesDeletedOfferSeries(t *testing.T) { } } - keptLabels := map[string]string{"offer_namespace": "llm", "offer_name": "keep", "chain": ""} - goneLabels := map[string]string{"offer_namespace": "llm", "offer_name": "gone", "chain": ""} + keptLabels := map[string]string{"offer_namespace": "llm", "offer_name": "keep", "chain": "", "asset_symbol": "unknown"} + goneLabels := map[string]string{"offer_namespace": "llm", "offer_name": "gone", "chain": "", "asset_symbol": "unknown"} families := scrapeVerifierMetrics(t, v) for _, name := range []string{ @@ -1086,3 +1091,140 @@ func verifierMetricValue(metric *dto.Metric) float64 { return 0 } } + +// TestVerifier_PrometheusLabels_IncludesAssetSymbol asserts that the +// asset_symbol label is emitted with the value from RouteRule.AssetSymbol +// (which the serviceoffer_source populates from +// offer.Spec.Payment.Asset.Symbol). This is what makes "what's my OBOL +// revenue?" a single PromQL aggregation instead of a metric × CR join. +func TestVerifier_PrometheusLabels_IncludesAssetSymbol(t *testing.T) { + rule := &RouteRule{ + OfferNamespace: "llm", + OfferName: "demo-hello", + Network: "eip155:84532", + AssetSymbol: "USDC", + } + labels := prometheusLabels(rule) + if got := labels["asset_symbol"]; got != "USDC" { + t.Errorf("asset_symbol = %q, want %q (full labels: %v)", got, "USDC", labels) + } + if got := labels["chain"]; got != "eip155:84532" { + t.Errorf("chain = %q, want %q", got, "eip155:84532") + } +} + +// TestVerifier_PrometheusLabels_DefaultsToUnknownIfEmpty asserts the +// defensive fallback: when AssetSymbol is empty (legacy offers, parsing +// hiccup, etc.) 
the label value is "unknown" rather than "" — empty-string +// labels are legal in Prometheus but render as bare selectors that are +// awkward to filter in dashboards. +func TestVerifier_PrometheusLabels_DefaultsToUnknownIfEmpty(t *testing.T) { + rule := &RouteRule{ + OfferNamespace: "llm", + OfferName: "no-asset", + Network: "eip155:84532", + AssetSymbol: "", + } + labels := prometheusLabels(rule) + if got := labels["asset_symbol"]; got != "unknown" { + t.Errorf("asset_symbol = %q, want %q (full labels: %v)", got, "unknown", labels) + } +} + +// TestVerifier_PruneSeriesNotIn_DistinguishesAssetSymbol asserts that +// pruning treats asset_symbol as part of the series key, so an asset-repin +// scenario (USDC route gets dropped, OBOL route for the same offer is +// retained) prunes the dead USDC series without taking the live OBOL one +// with it. Without asset_symbol in the key, both series would map to the +// same (ns, name, chain) tuple and pruning would either drop both or +// neither — leaking a stale per-asset series. +func TestVerifier_PruneSeriesNotIn_DistinguishesAssetSymbol(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + usdcRoute := RouteRule{ + Pattern: "/svc/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "demo", + Network: "base-sepolia", + AssetSymbol: "USDC", + } + obolRoute := RouteRule{ + Pattern: "/svc-obol/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "demo", + Network: "base-sepolia", + AssetSymbol: "OBOL", + } + v := newTestVerifier(t, fac.URL, []RouteRule{usdcRoute, obolRoute}) + + // Stamp a successful paid request through each asset variant so both + // series exist in the registry before pruning. 
+ for _, path := range []string{"/svc/x", "/svc-obol/x"} { + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", path) + req.Header.Set("X-Forwarded-Host", "obol.stack") + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("setup paid request to %s: status=%d", path, rec.Code) + } + } + + usdcLabels := map[string]string{ + "offer_namespace": "llm", + "offer_name": "demo", + "chain": "base-sepolia", + "asset_symbol": "USDC", + } + obolLabels := map[string]string{ + "offer_namespace": "llm", + "offer_name": "demo", + "chain": "base-sepolia", + "asset_symbol": "OBOL", + } + + families := scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + family := families[name] + if family == nil { + t.Fatalf("baseline: missing %s before reload", name) + } + findVerifierMetricValue(t, family, usdcLabels) + findVerifierMetricValue(t, family, obolLabels) + } + + // Drop the USDC route, keep OBOL. If pruneSeriesNotIn ignored + // asset_symbol, both series would key to (llm, demo, base-sepolia) + // and the OBOL series would survive (because the OBOL route is in + // the keep set) — masking the bug. Conversely, if the key didn't + // distinguish at all, both could be wiped. Including asset_symbol + // in the key keeps USDC prunable and OBOL alive. 
+ if err := v.Reload(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: fac.URL, + Routes: []RouteRule{obolRoute}, + }); err != nil { + t.Fatalf("Reload: %v", err) + } + + families = scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_requests_total", + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + assertVerifierMetricMissing(t, families[name], usdcLabels) + } + + if charged := families["obol_x402_verifier_charged_requests_total"]; charged != nil { + findVerifierMetricValue(t, charged, obolLabels) + } else { + t.Errorf("OBOL charged series was pruned along with USDC — asset_symbol was ignored in prune key") + } +} From 9022f37e46cf0c31a1adb09177d83df4b8dab4f7 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 11:40:57 +0400 Subject: [PATCH 22/31] fix(prometheus-rules): use epsilon floor not 1.0 to avoid under-reporting low-traffic alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X402PaymentFailureRateHigh and the settlement_rate recording rule used clamp_min(denominator, 1) as a div-by-zero guard. For paid endpoints under light load (sub-1 req/s), the floor is 1.0 instead of the true denominator, so the ratio numerator/denominator returns near-zero even when 50%+ of requests are failing — the alert never fires. Switch the floor to 1e-9. Epsilon prevents division-by-zero while keeping the actual ratio accurate at any non-zero traffic level. Surfaced by Expert #2 review of the PromQL design (plans/integration-test-L7-paid-flow-20260524.md follow-ups). Stacks on PR #531 (asset_symbol label) which is the tip of the rules-file chain. Will rebase onto main as the chain merges. 
--- .../base/templates/x402-prometheus-rules.yaml | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml index 77b6429e..5115a7d4 100644 --- a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -105,6 +105,15 @@ spec: # Settlement rate (verified / attempted) over the last hour, per # (offer, chain). Useful for the dashboard + the alert below. + # + # The `clamp_min(..., 1e-9)` is a division-by-zero guard, not a + # traffic floor. An earlier revision used `clamp_min(..., 1)`, + # which floored the denominator at 1 req/s and silently + # distorted the ratio on low-traffic offers (e.g. verified= + # 0.001/s ÷ floored_denominator=1 ≈ 0 instead of the real + # 0.001/0.002 = 0.5). Epsilon keeps the answer accurate at any + # non-zero traffic level while still avoiding a NaN when no + # samples exist in the window. - record: x402:settlement_rate:1h_by_offer_chain expr: | sum by (offer_namespace, offer_name, chain) ( @@ -119,7 +128,7 @@ spec: + rate(obol_x402_verifier_payment_failed_total[1h]) ), - 1 + 1e-9 ) - name: x402.alerting @@ -128,6 +137,14 @@ spec: # route that's actually receiving traffic. Typical cause: # facilitator unreachable, chain pruning, or seller's CA bundle # missing (CLAUDE.md pitfall #8). + # + # The `clamp_min(..., 1e-9)` here is a div-by-zero guard only. + # A prior `clamp_min(..., 1)` floored the denominator at 1 req/s, + # which under-reports the failure ratio on light-traffic + # endpoints (failed=0.001/s ÷ floored_denominator=1 = 0.001 + # instead of the true 0.001/0.002 = 0.5) and prevented the + # alert from ever firing at sub-1 req/s. Epsilon avoids NaN + # without distorting the ratio. 
- alert: X402PaymentFailureRateHigh expr: | ( @@ -141,7 +158,7 @@ spec: + rate(obol_x402_verifier_payment_verified_total[1h]) ), - 1 + 1e-9 ) ) > 0.10 for: 10m From 7c66408cd329bd889beec237e6d2f8eb6b613f93 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 11:42:02 +0400 Subject: [PATCH 23/31] ci: add helm-template-smoke job to catch chart-render parse errors PR #527 fixed an unescaped {{ $labels }} in a PrometheusRule annotation that broke `helm upgrade base` on every `obol stack up`. The bug shipped to integration testing because go test ./... doesn't exercise Helm rendering. This job pipes the embedded base chart through `helm template` on every PR; parse errors fail the build before merge. - Runs against ./internal/embed/infrastructure/base - Uses helm v3.20.1 (matches obolup.sh pinned version) - Also runs `helm lint` for chart-structure issues - Substitutes {{OLLAMA_HOST_IP}}/{{CLUSTER_ID}} stubs in a temp copy of the chart (mirroring what `obol stack init` does via internal/defaults/defaults.go::InfrastructureReplacements) - Future: pair with a helmfile-lint job for state-value tests If we ever land a chart-template change that this doesn't catch, expand the helm-template invocation with --set values mimicking what `obol stack up` provides. 
--- .github/workflows/helm-template-smoke.yml | 83 +++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 .github/workflows/helm-template-smoke.yml diff --git a/.github/workflows/helm-template-smoke.yml b/.github/workflows/helm-template-smoke.yml new file mode 100644 index 00000000..27a9ed1f --- /dev/null +++ b/.github/workflows/helm-template-smoke.yml @@ -0,0 +1,83 @@ +name: Helm Template Smoke + +on: + pull_request: + branches: [ main ] + paths: + - 'internal/embed/infrastructure/**' + - '.github/workflows/helm-template-smoke.yml' + push: + branches: [ main ] + paths: + - 'internal/embed/infrastructure/**' + - '.github/workflows/helm-template-smoke.yml' + +jobs: + helm-template-smoke: + name: helm template embedded chart + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - name: Set up Helm + uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1 + with: + version: v3.20.1 # match obolup.sh pinned version + + - name: helm template ./base + run: | + # Render the embedded `base` chart and fail on Go-template parse + # errors. Catches bugs like the unescaped `{{ $labels }}` in + # PrometheusRule annotations that broke `helm upgrade base` on + # every `obol stack up` (see PR #527). `go test ./...` does not + # exercise Helm rendering, so this is the only pre-merge gate + # for chart parse errors. + # + # The base chart contains `{{PLACEHOLDER}}` strings (e.g. + # `{{OLLAMA_HOST_IP}}`, `{{CLUSTER_ID}}`) that are substituted + # by `internal/defaults/defaults.go::InfrastructureReplacements` + # before helmfile runs. Helm's Go-template parser would treat + # them as actions and fail, so we substitute stub values into + # a working copy first — mirroring what `obol stack init` does. 
+ set -euo pipefail + workdir="$(mktemp -d)" + cp -R internal/embed/infrastructure/base "$workdir/base" + # Mirror internal/defaults InfrastructureReplacements with CI stubs. + find "$workdir/base" -type f -name '*.yaml' -print0 \ + | xargs -0 sed -i \ + -e 's/{{OLLAMA_HOST_IP}}/127.0.0.1/g' \ + -e 's/{{OLLAMA_HOST}}/localhost/g' \ + -e 's/{{CLUSTER_ID}}/ci-helm-smoke/g' + # Match values passed by helmfile.yaml `releases[base]`. + helm template base "$workdir/base" \ + --set dataDir=/data \ + --set network=mainnet \ + > /dev/null + + - name: helm template ./cloudflared + run: | + # The cloudflared chart has no placeholder substitution and uses + # default values from values.yaml. + set -euo pipefail + helm template cloudflared internal/embed/infrastructure/cloudflared \ + > /dev/null + + - name: helm lint ./base + run: | + set -euo pipefail + workdir="$(mktemp -d)" + cp -R internal/embed/infrastructure/base "$workdir/base" + find "$workdir/base" -type f -name '*.yaml' -print0 \ + | xargs -0 sed -i \ + -e 's/{{OLLAMA_HOST_IP}}/127.0.0.1/g' \ + -e 's/{{OLLAMA_HOST}}/localhost/g' \ + -e 's/{{CLUSTER_ID}}/ci-helm-smoke/g' + helm lint "$workdir/base" \ + --set dataDir=/data \ + --set network=mainnet + + - name: helm lint ./cloudflared + run: | + set -euo pipefail + helm lint internal/embed/infrastructure/cloudflared From e2d4add056259317248e45aa6bb8b941d32e82ca Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 11:42:54 +0400 Subject: [PATCH 24/31] docs(observability): record the thin-layer architecture decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the OBOL parity smoke + Prometheus expert review, we made explicit design choices worth recording so they don't get re-litigated: 1. Counters are intentionally per-process — Prometheus design. Pod restarts reset them; rate()/increase() handle this at query time via the TSDB's reset detection. Don't add persistence to the counter itself. 2. 
Prometheus = recent operational telemetry (bounded by retention). On-chain settlement TXs = canonical lifetime financial record. 3. Recording rules use the convention level:metric:operations; name the window (7d_by_offer, not lifetime_by_offer). 4. Add labels you'd query by directly (chain, asset_symbol — both CR-derived, both query-meaningful, both bounded). 5. div-by-zero guards use epsilon (1e-9), not 1.0. 6. CRD versioning stance: stay on v1alpha1 during active dev; the alpha promise IS "no compat". Graduate only when an external operator commits to depending on the schema. The PVC-backed counter persistence option was considered and rejected for our single-operator local-k3d use case. The doc walks through why, what would change that decision, and where the canonical "lifetime" answer comes from. Adds CLAUDE.md pointer so future contributors land here first. --- CLAUDE.md | 2 + docs/observability.md | 369 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 371 insertions(+) create mode 100644 docs/observability.md diff --git a/CLAUDE.md b/CLAUDE.md index 04a71396..b29a9d46 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -342,6 +342,8 @@ A registry digest pin instead of `:latest` on the verifier means your dev rewrit For a fuller debug catalog with symptom→fix mapping, see `.agents/skills/obol-stack-dev/references/release-smoke-debugging.md`. +For observability architecture decisions (Prometheus retention vs. on-chain canonical record, counter-reset semantics, recording-rule naming, label conventions, CRD versioning stance, `clamp_min` epsilon), see `docs/observability.md` — read this before adding a new metric, recording rule, or proposing counter persistence. + ### Security: Tunnel Exposure The Cloudflare tunnel exposes the cluster to the public internet. Only x402-gated endpoints and discovery metadata should be reachable via the tunnel hostname.
Internal services (frontend, eRPC, LiteLLM, monitoring) MUST have `hostnames: ["obol.stack"]` on their HTTPRoutes to restrict them to local access. diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 00000000..e4daa5f2 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,369 @@ +# Observability architecture + +Operator-facing reference for how Obol Stack records, queries, and reasons about +payment-flow telemetry. Read this before adding a new metric, a new recording +rule, or proposing "let's persist the counter to a PVC." + +## TL;DR + +- **Prometheus is for recent operational telemetry**, bounded by TSDB retention + (currently 8d in our cluster values). +- **On-chain settlement TXs are the canonical record for lifetime financial + state.** Every settled x402 payment leaves an immutable on-chain trace via + `X-PAYMENT-RESPONSE` (settle tx hash, asset, amount, payer, payee). +- **Counters reset on every pod restart. That is intentional.** Prometheus + counters are per-process by design. Use `increase()` / `rate()` at query + time — they detect resets in the TSDB and stitch ranges back together. +- Recording rules use `level:metric:operations` (Prometheus convention) + and **name the window** in the rule (`7d_by_offer`, not `lifetime_by_offer`). +- Div-by-zero guards use a small epsilon (`1e-9`), **never `1.0`**. +- CRDs stay on `v1alpha1` during active dev — the alpha promise IS "no compat", + and we have no external operators yet. + +If you find yourself asking "how do we compute lifetime revenue for offer X +since the project started," the answer is **not** a recording rule — it is a +chain indexer over settle TXs. + +--- + +## Why counters reset (and why that's fine) + +Prometheus counters are stored per-process. When a verifier pod restarts (rollout, +node drain, OOM, image bump), the in-memory counter goes back to zero. This is +**not** a bug to engineer around at write time.
The Prometheus query engine +already knows about it: + +- `rate(counter[5m])` and `increase(counter[5m])` perform **reset detection**: + if the last sample is less than the previous sample inside the window, the + engine assumes a reset and stitches the two ranges together rather than + emitting a negative delta. +- This is the well-documented "counter reset semantics." See Robust Perception: + *Avoiding the counter-reset undercount* — the canonical writeup of why you + must always range-query counters rather than `sum()`'ing them raw. + +The corollary is the rule that bit us in PR #530: + +> Never write a recording rule of the form `sum(my_counter_total) by (...)`. +> Always write `sum(increase(my_counter_total[window])) by (...)`. + +`sum(counter)` collapses to "whatever value the live samples currently hold," +which means **every pod restart silently zeros the recorded series**. The +expert review caught a recording rule shipped in that exact broken form; +PR #530 swapped it to `increase()` over an explicit window.
+ +--- + +## The thin-layer architecture + +``` + +------------------------------+ + | x402-verifier (stateless) | + | - in-memory counters | + | - labels: route, | + | offer_namespace, | + | offer_name, chain, | + | asset_symbol | + +---------------+--------------+ + | + | /metrics scrape (Prometheus) + v + +------------------------------+ + | Prometheus TSDB (retention) | + | - 8d rolling window | + | - reset detection built in | + +---------------+--------------+ + | + | recording rules with increase() + v + +------------------------------+ + | Pre-aggregated series | + | offer:x402_revenue:7d_by_offer + | offer:x402_paid_requests:7d_by_offer + +---------------+--------------+ + | + | PromQL queries + v + +------------------------------+ + | Frontend / dashboards | + | - reads pre-aggregated | + | - cheap, scoped to window | + +------------------------------+ + + + Parallel canonical path (for lifetime financial truth): + + +------------------------------+ + | x402-buyer / facilitator | + +---------------+--------------+ + | + | settle tx (on-chain) + v + +------------------------------+ + | Base / Base Sepolia | + | ERC-20 Transfer events | + | X-PAYMENT-RESPONSE header | + | carries settle tx hash | + +------------------------------+ + | + | chain indexer / explorer + v + +------------------------------+ + | Lifetime per-offer revenue | + | "since first deploy" answer | + +------------------------------+ +``` + +The two paths answer **different questions**: + +- Prometheus answers "what is the system doing in the last N hours/days?" with + cheap, second-resolution queries and label-faceting. +- On-chain answers "what was every payment that ever settled for offer X?" with + immutability and full historical depth, at the cost of being slower and + requiring an indexer. + +Mixing them is a category error. Don't try to make Prometheus answer the +lifetime question, and don't try to make the chain answer "what is the current +402-rate this minute?" 
+ +--- + +## When NOT to add persistence to the counter itself + +Three options come up repeatedly in design discussions. We rejected all three +for the current use case: + +### PVC-backed verifier state +**Why it's tempting**: counters survive restart, no `increase()` gymnastics. + +**Why we rejected it**: it bolts a stateful primitive onto a stateless +component. `x402-verifier` is currently safe to scale, rollout, evict, and +re-image freely. A PVC turns every restart into a sequence-recovery problem +(double-counting on a torn write, undercounting on a crash before flush). +Prometheus already solves reset detection correctly; we'd be reimplementing +it badly and introducing a new failure mode. + +### Pushgateway +**Why it's tempting**: "decouple short-lived job state from scrape." + +**Why we rejected it**: Pushgateway is for batch-job final values, not for +long-running services. Using it for a live verifier inverts the ownership +model (Pushgateway becomes the source of truth, verifier becomes a writer), +loses per-pod identity, and adds a single-point-of-failure that, if it +restarts, **also** zeros the counter — without `rate()` knowing about it. + +### OTel collector with `cumulativetodelta` +**Why it's tempting**: collector-side reset stitching, hand off deltas to a +downstream store. + +**Why we rejected it**: it solves a problem we don't have (we're not sending +deltas to a backend that needs them), at the cost of a new infrastructure +component to operate. For a single-operator local-k3d stack, this is over- +engineering. If we ever export to an OTel-native backend, revisit. + +--- + +## When you WOULD want persistence + +The only legitimate driver is an explicit **billing or compliance requirement +to report "totals since first deploy" that exceeds Prometheus retention.** + +We do not have this requirement today. If we ever do: + +1. **Derive it from on-chain TXs**, not from metrics. 
Every paid request leaves + an `X-PAYMENT-RESPONSE` with a settle tx hash; an indexer over those is the + canonical answer. +2. Only fall back to a persisted counter if for some reason the chain trace is + unavailable for the offer in question — and even then, treat the indexed + chain data as the source of truth and the counter as a soft mirror. + +The architecture review's framing was right: if you find yourself wanting +Prometheus to answer a lifetime question, you've picked the wrong tool. + +--- + +## Recording rule conventions + +Naming follows the standard Prometheus pattern: + +``` +level:metric:operations +``` + +Examples we ship: + +- `offer:x402_revenue:7d_by_offer` — revenue aggregated to the `offer` level, + base metric is `x402_revenue`, operation is `increase` over `7d` grouped + `by_offer`. +- `offer:x402_paid_requests:7d_by_offer` — same shape for paid request count. + +Rules: + +1. **Name the window in the rule.** `7d_by_offer` is honest; `lifetime_by_offer` + is a lie (Prometheus has no "lifetime"). The window in the name must match + the window in the expression. +2. **Use `increase()` over an explicit range, not `sum()` of the raw counter.** + See PR #530 — the original rule did `sum by (offer) (x402_revenue_total)` and + silently zeroed every time the verifier pod restarted. The fixed rule is + `sum by (offer_namespace, offer_name) (increase(x402_revenue_total[7d]))`. +3. **Keep the window aligned with retention.** Recording a `30d` rule with 8d + retention is a footgun: the rule sees nulls and silently produces nothing. + +--- + +## Label conventions + +Labels are the query interface. The rule of thumb: + +- **Add a label if it's an attribute you'd want to facet by directly and the + cardinality is bounded.** "Bounded" means you can write down all possible + values: chains, asset symbols, offer names. Not user addresses, not request + IDs, not arbitrary route paths beyond what the offer CR enumerates.
+- **Don't add a label that multiplies cardinality.** Every unique combination + of label values is a separate time series in TSDB. A label that adds 100 + values multiplies storage by 100×. + +Concrete examples: + +| Label | Source | Why include it | +|-------------------|------------------|-------------------------------------------| +| `route` | offer CR pattern | Direct query facet, bounded by # offers | +| `offer_namespace` | offer CR meta | Tenancy facet | +| `offer_name` | offer CR meta | Per-offer breakdown | +| `chain` | offer CR payment | "Revenue by chain" is a real question | +| `asset_symbol` | offer CR payment | Added in PR #531 — per-token facet | + +`chain` and `asset_symbol` are both CR-derived (operator-set, bounded) and +query-meaningful ("how much USDC vs OBOL did we earn on Base last week?"). They +both belong. PR #531 added `asset_symbol` for exactly this reason — the prior +schema collapsed all asset types into one bucket. + +Anti-pattern: labeling by `payer_address` or `tx_hash`. Those are unbounded and +belong on the chain trace, not on the metric. + +--- + +## CRD versioning: stay on `v1alpha1` during active dev + +For the current single-operator local-stack development, the alpha-stays-alpha +approach matches the design intent. Concretely: + +- **While in active dev with no external operators, stay on `v1alpha1` and edit + the schema in place.** The alpha promise IS "no compat" — that's the whole + point of the version channel. Renaming a field, dropping a field, tightening + validation: all fair game at `v1alpha1`. +- **Bump to `v1alpha2` only when** you need both versions to coexist briefly to + validate a conversion path (which requires standing up a conversion webhook), + or to checkpoint a major redesign you want to land alongside the old shape. +- **Graduate to `v1beta1` only when** all three are true: + 1. The schema has been stable for ~2 releases (no breaking edits). + 2. An external operator has committed to depending on it. 
+ 3. You're committing to backwards-compat for at least one release, with + deprecation warnings for any field you eventually want to remove. + +The architecture review surfaced "should we graduate to `v1beta1`?" as a flag. +That was a "what if we ship externally" hypothetical, not an action item — and +graduating prematurely locks us into compat overhead before the schema has +earned it. The current ServiceOffer / RegistrationRequest / PurchaseRequest +CRDs all stay on `v1alpha1` until the three conditions above hold. + +--- + +## `clamp_min(..., 1)` is an anti-pattern + +Div-by-zero guards in PromQL exist because dividing by an empty counter +produces a `NaN`. The naive fix is: + +```promql +# WRONG +my_success_rate + / +ignoring(...) clamp_min(my_request_total, 1) +``` + +That `1` is poison under low traffic. Suppose the real request rate over 5m is +3 successful out of 4 total (75%). With `clamp_min(..., 1)` and a window in +which the counter shows `0` total requests (e.g. between scrapes), the formula +returns `3/1 = 3.0` — a 300% success rate that breaks any alert downstream of +it. More commonly: the **denominator is clamped to 1 when it should be e.g. +0.5**, and your "success rate" reports half its real value, **causing +low-traffic alerts to under-report and stay silent during exactly the windows +when traffic is degraded**. + +The fix is to use an epsilon that's small enough never to dominate the real +denominator: + +```promql +# RIGHT +my_success_rate + / +ignoring(...) clamp_min(my_request_total, 1e-9) +``` + +`1e-9` keeps the division finite without distorting the result. Pick `1e-9` (or +smaller) as the project-wide epsilon and use it consistently. **Never `1.0`, +never `0.001`, never "a reasonable small number" — pick the smallest value that +avoids NaN and stick with it.** + +This was fixed in the same review pass that produced this doc. Future +contributors: if you write a guarded division, the epsilon is `1e-9`. 
+ +--- + +## Cross-references + +### Code + +- `internal/x402/metrics.go` — verifier metric definitions + (`obol_x402_verifier_requests_total`, `_payment_required_total`, + `_payment_verified_total`, `_payment_failed_total`, `_charged_requests_total`). +- `internal/x402/verifier.go` — `prometheusLabels()` controls the verifier + label set; this is the canonical place to add a new bounded label. +- `internal/x402/buyer/metrics.go` — buyer-side counters + (`payment_attempts`, `payment_success_total`, `payment_failure_total`, + `confirm_spend_failure_total`, `payment_unsettled_confirmations`) plus + gauges (`auth_remaining`, `auth_spent`, `active_model_mappings`). +- `internal/x402/buyer/proxy.go` — `prometheusLabels()` for the buyer side. + +### Infrastructure + +- `internal/embed/infrastructure/values/monitoring.yaml.gotmpl` — Prometheus + values, including retention and recording rule wiring. +- `internal/embed/infrastructure/base/templates/x402.yaml` — verifier + Deployment, ServiceMonitor / PodMonitor. + +### Pull requests that shaped this + +- **PR #527** — `fix(prometheus-rules): escape PromQL $labels for Helm + rendering`. Helm was interpreting `$labels` as a Helm template variable and + blanking it; the fix is to escape so the literal `$labels` reaches the + Prometheus rule engine. +- **PR #530** — `fix(prometheus-rules): use increase() for the per-offer + revenue rule`. The original rule did `sum(counter)`, which silently zeroed + on verifier restart. Now uses `sum(increase(counter[7d]))` per the rules + above. +- **PR #531** — `feat(x402-metrics): add asset_symbol label for per-token + queries`. Unlocks "USDC vs OBOL revenue by chain" without needing a + downstream join. + +### Reports + +- The OBOL parity integration test report — see `plans/` for the most recent + `release-smoke-hardening-*.md` and `post-490-integration-*.md` entries that + reference the metric audits behind PRs #527 / #530 / #531. 
+ +--- + +## Quick checklist for the next change + +Before opening a PR that touches metrics: + +- [ ] New label is bounded and CR-derived (or otherwise enumerable). +- [ ] No label that could grow unbounded (payer address, tx hash, free-form + path beyond CR enumeration). +- [ ] New recording rule uses `increase()` over an explicit window. +- [ ] Window in the rule name matches window in the expression + (no `lifetime_*`). +- [ ] Window is within Prometheus retention. +- [ ] Any guarded division uses `1e-9` as the clamp floor. +- [ ] If the new metric tries to answer a "lifetime" question, you've stopped + and reconsidered using on-chain data instead. From 5de3d4ac6e9a14c57541f0e12a6ba7f9c67dfe51 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 12:49:06 +0400 Subject: [PATCH 25/31] feat(monetize): replace pause annotation with ERC-8004-friendly drain The legacy obol.org/paused annotation tore down HTTPRoutes immediately, which is indistinguishable from a crash to remote x402 buyers and ERC-8004 reputation scorers. obol sell stop was also broken: it patched status.conditions which the controller immediately overwrote. This replaces both with a real drain: - New ServiceOffer spec.drainAt (date-time) + spec.drainGracePeriod (duration; default 1h) mark an offer as winding down. - While draining, /skill.md and /.well-known/agent-registration.json advertise the offer with available=false and drainEndsAt set, so external discovery can react before traffic disappears. - The HTTPRoute + payment gate stay up until DrainEndsAt, letting in-flight buyers complete payments. - After the grace period, the controller tears down the route, sets Draining=False reason=Drained, and leaves the CR (delete is the canonical removal command). obol sell stop sets spec.drainAt, supports --grace and --force/--now (zero grace = abrupt teardown for behavior parity with the old annotation). 
--- cmd/obol/sell.go | 73 ++++++++--- cmd/obol/sell_test.go | 13 +- docs/guides/monetize-inference.md | 29 ++++- .../base/templates/serviceoffer-crd.yaml | 22 +++- .../sell/references/serviceoffer-spec.md | 8 +- internal/monetizeapi/drain_test.go | 113 ++++++++++++++++++ internal/monetizeapi/types.go | 61 +++++++++- internal/schemas/service-catalog.schema.json | 15 ++- internal/schemas/service_catalog.go | 15 +++ internal/serviceoffercontroller/controller.go | 51 ++++++-- .../identity_controller.go | 5 +- internal/serviceoffercontroller/render.go | 57 ++++++++- .../serviceoffercontroller/render_test.go | 102 +++++++++++++++- internal/x402/serviceoffer_source.go | 8 +- internal/x402/serviceoffer_source_test.go | 55 +++++++-- 15 files changed, 566 insertions(+), 61 deletions(-) create mode 100644 internal/monetizeapi/drain_test.go diff --git a/cmd/obol/sell.go b/cmd/obol/sell.go index 379fb2d9..b06508f7 100644 --- a/cmd/obol/sell.go +++ b/cmd/obol/sell.go @@ -2331,8 +2331,25 @@ Examples: func sellStopCommand(cfg *config.Config) *cli.Command { return &cli.Command{ Name: "stop", - Usage: "Pause a ServiceOffer without deleting it", + Usage: "Drain a ServiceOffer gracefully (advertises wind-down via discovery, then tears down the route)", ArgsUsage: "", + Description: `Marks a ServiceOffer as draining. While draining: + - The offer stays in /skill.md and /.well-known/agent-registration.json + with available=false and a drainEndsAt timestamp, so external + discovery (and ERC-8004 reputation scorers) can see the wind-down. + - The HTTPRoute and x402 payment gate STAY UP for the grace period + so buyers can complete in-flight payments. + - When the grace period elapses, the controller tears down the route + and marks PaymentGateReady/RoutePublished False with reason=Drained. + +The ServiceOffer CR itself is preserved — use 'obol sell delete' to +remove it entirely (which also tombstones the ERC-8004 record). + +Flags: + --grace 30m Override the grace period (default 1h). 
+ --force Skip the drain window (equivalent to --grace 0). Use + this when the abrupt-teardown behavior of the old + pause annotation is required for behavior parity.`, Flags: []cli.Flag{ &cli.StringFlag{ Name: "namespace", @@ -2340,6 +2357,16 @@ func sellStopCommand(cfg *config.Config) *cli.Command { Usage: "Namespace of the ServiceOffer", Required: true, }, + &cli.DurationFlag{ + Name: "grace", + Usage: "Drain grace period (e.g. 30m, 2h). Defaults to 1h.", + Value: monetizeapi.DefaultDrainGracePeriod, + }, + &cli.BoolFlag{ + Name: "force", + Aliases: []string{"now"}, + Usage: "Skip the drain window and tear the route down on the next reconcile (alias: --now)", + }, }, Action: func(ctx context.Context, cmd *cli.Command) error { u := getUI(cmd) @@ -2352,19 +2379,37 @@ func sellStopCommand(cfg *config.Config) *cli.Command { return err } ns := cmd.String("namespace") + grace := cmd.Duration("grace") + if cmd.Bool("force") { + grace = 0 + } + if grace < 0 { + return errors.New("--grace must be >= 0") + } - u.Infof("Stopping the service offering %s/%s...", ns, name) - - removePricingRoute(cfg, u, name) - - patchJSON := `{"status":{"conditions":[{"type":"Ready","status":"False","reason":"Stopped","message":"Offer stopped by user"}]}}` - err := kubectlRun(cfg, "patch", "serviceoffers.obol.org", name, "-n", ns, - "--type=merge", "-p", patchJSON) - if err != nil { - return fmt.Errorf("failed to pause serviceoffer: %w", err) + now := time.Now().UTC() + drainEndsAt := now.Add(grace) + + // metav1.Duration JSON-marshals as the string form (e.g. + // "1h0m0s"), and metav1.Time marshals as RFC3339. We can + // emit a tiny JSON merge patch (--type=merge) directly without + // importing the meta types into the CLI.
+ patchJSON := fmt.Sprintf( + `{"spec":{"drainAt":%q,"drainGracePeriod":%q}}`, + now.Format(time.RFC3339), + grace.String(), + ) + if err := kubectlRun(cfg, "patch", "serviceoffers.obol.org", name, "-n", ns, + "--type=merge", "-p", patchJSON); err != nil { + return fmt.Errorf("failed to drain serviceoffer: %w", err) } - u.Successf("Service offering %s/%s stopped.", ns, name) + if grace == 0 { + u.Successf("ServiceOffer %s/%s draining; route will be removed on the next reconcile (--force).", ns, name) + } else { + u.Successf("ServiceOffer %s/%s draining; route will be removed at %s.", ns, name, drainEndsAt.Format(time.RFC3339)) + } + u.Infof("In-flight buyers can complete payments until then. Run `obol sell delete %s -n %s` to fully remove.", name, ns) return nil }, } @@ -2518,8 +2563,6 @@ func sellDeleteCommand(cfg *config.Config) *cli.Command { } } - removePricingRoute(cfg, u, name) - // Identity-level registration ownership lives in the AgentIdentity // CR and is managed by the controller. The CLI no longer patches // the registration ConfigMap here; deleting the ServiceOffer is @@ -4126,7 +4169,3 @@ func manifestNSName(manifest map[string]any) (string, string) { return ns, name } -// removePricingRoute is a no-op retained for compatibility. -// The serviceoffer-controller now manages pricing routes via the ServiceOffer -// informer; static ConfigMap routes are no longer used. 
-func removePricingRoute(_ *config.Config, _ *ui.UI, _ string) {} diff --git a/cmd/obol/sell_test.go b/cmd/obol/sell_test.go index 9f3991ea..4055fa2c 100644 --- a/cmd/obol/sell_test.go +++ b/cmd/obol/sell_test.go @@ -626,9 +626,20 @@ func TestSellStop_Structure(t *testing.T) { stop := findSubcommand(t, cmd, "stop") flags := flagMap(stop) - requireFlags(t, flags, "namespace") + requireFlags(t, flags, "namespace", "grace", "force") assertFlagRequired(t, flags, "namespace") assertFlagHasAlias(t, flags, "namespace", "n") + // --now is the documented alias for --force; if it disappears, + // scripted operators that rely on it break silently. + assertFlagHasAlias(t, flags, "force", "now") + + graceFlag, ok := flags["grace"].(*cli.DurationFlag) + if !ok { + t.Fatalf("--grace should be *cli.DurationFlag, got %T", flags["grace"]) + } + if graceFlag.Value != monetizeapi.DefaultDrainGracePeriod { + t.Errorf("--grace default = %v, want %v", graceFlag.Value, monetizeapi.DefaultDrainGracePeriod) + } } func TestSellDelete_Structure(t *testing.T) { diff --git a/docs/guides/monetize-inference.md b/docs/guides/monetize-inference.md index 2fdf3680..775ba812 100644 --- a/docs/guides/monetize-inference.md +++ b/docs/guides/monetize-inference.md @@ -572,15 +572,34 @@ obol sell status my-qwen --namespace llm obol sell status ``` -### Pausing +### Draining -Pause an offer without deleting it: +Stop an offer gracefully so buyers can wind down before the route disappears: ```bash -obol sell stop my-qwen --namespace llm +obol sell stop my-qwen --namespace llm # default: 1h grace +obol sell stop my-qwen --namespace llm --grace 30m # custom grace +obol sell stop my-qwen --namespace llm --force # tear down immediately ``` -The CR and any ERC-8004 registration remain intact. Re-create the offer with the same name to restart. +`obol sell stop` sets `spec.drainAt` on the ServiceOffer. 
While the offer is +draining: + +- `/skill.md` and `/.well-known/agent-registration.json` advertise the offer + with `available: false` and `drainEndsAt: <timestamp>`, so external discovery + (and ERC-8004 reputation scorers) can react before traffic disappears. +- The HTTPRoute and x402 payment gate stay up so in-flight buyers can complete + payments. +- When the grace period elapses, the controller tears down the route and marks + `Draining=False` reason=Drained. + +The ServiceOffer CR and any ERC-8004 registration remain intact. Use +`obol sell delete` to remove the offer entirely. + +`--force` (alias: `--now`) skips the drain window — useful when you want the +abrupt-teardown behavior of the legacy `obol.org/paused` annotation, for +example to reclaim the path immediately. Note that abrupt teardown is a worse +reputation signal for on-chain buyers than a graceful drain. ### Cleanup @@ -815,7 +834,7 @@ manifest. Do not paper over smoke-test failures with an ad hoc patch. | `obol sell http --wallet ... --chain ... --per-request ... --upstream ... --port ...` | Create a ServiceOffer and register by default | | `obol sell list` | List all ServiceOffers | | `obol sell status -n ` | Show conditions for an offer | -| `obol sell stop -n ` | Pause an offer without deleting it | +| `obol sell stop <name> -n <ns> [--grace 1h] [--force]` | Drain an offer (advertise wind-down via discovery, then tear down the route after the grace period). `--force`/`--now` skips the grace window.
| | `obol sell delete -n ` | Delete an offer and cleanup | | `obol sell status` | Show cluster pricing and registration | | `obol sell register --private-key-file ...` | Advanced/manual registration or repair path | diff --git a/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml b/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml index 5b37ba23..86f2a446 100644 --- a/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml +++ b/internal/embed/infrastructure/base/templates/serviceoffer-crd.yaml @@ -239,6 +239,24 @@ spec: type: string description: "URL path prefix for the HTTPRoute, defaults to /services/." pattern: "^/[a-zA-Z0-9/_.-]*$" + drainAt: + type: string + format: date-time + description: >- + When set, marks the offer as draining. Discovery surfaces + (/skill.md and /.well-known/agent-registration.json) advertise + the offer with available=false and drainEndsAt set, so external + observers can react before the route is removed. The HTTPRoute + and payment gate stay up until drainAt + drainGracePeriod so + in-flight buyers can settle. Set by `obol sell stop`. + drainGracePeriod: + type: string + description: >- + How long after drainAt the HTTPRoute remains up. Go duration + format (e.g. "1h", "30m", "0s"). Defaults to "1h" when unset. + A zero duration tears the route down on the next reconcile + (the `obol sell stop --force` path). + pattern: "^([0-9]+(ns|us|µs|ms|s|m|h))+$" registration: type: object description: >- @@ -312,7 +330,9 @@ spec: type: array description: >- Condition types: ModelReady, UpstreamHealthy, PaymentGateReady, - RoutePublished, Registered, Ready. + RoutePublished, Registered, Ready, Draining. Draining is True + while spec.drainAt is set and the grace window has not elapsed; + transitions to False reason=Drained once the route is torn down. 
items: type: object required: diff --git a/internal/embed/skills/sell/references/serviceoffer-spec.md b/internal/embed/skills/sell/references/serviceoffer-spec.md index a2b6183e..5ead12d7 100644 --- a/internal/embed/skills/sell/references/serviceoffer-spec.md +++ b/internal/embed/skills/sell/references/serviceoffer-spec.md @@ -180,7 +180,13 @@ Each condition contains: ## Lifecycle Notes -- Pausing is represented via the `obol.org/paused: "true"` annotation. +- Graceful stop is represented via `spec.drainAt` (RFC3339 timestamp) and + the optional `spec.drainGracePeriod` (Go duration, e.g. `"30m"`, defaults + to `1h`). While draining, discovery surfaces advertise the offer with + `available: false` + `drainEndsAt`, and the HTTPRoute/payment gate stay + up until the grace period expires so in-flight buyers can settle. + `obol sell stop --force` is the equivalent of `drainGracePeriod: 0s` — + abrupt teardown with no advertised wind-down. - Deleting a `ServiceOffer` cascades owned `Middleware` and `HTTPRoute` resources via `ownerReferences`. 
- Registration side effects are isolated in a child `RegistrationRequest` diff --git a/internal/monetizeapi/drain_test.go b/internal/monetizeapi/drain_test.go new file mode 100644 index 00000000..0241a49a --- /dev/null +++ b/internal/monetizeapi/drain_test.go @@ -0,0 +1,113 @@ +package monetizeapi + +import ( + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestServiceOffer_IsDraining(t *testing.T) { + t.Run("nil drainAt", func(t *testing.T) { + o := &ServiceOffer{} + if o.IsDraining() { + t.Errorf("IsDraining() = true, want false for nil drainAt") + } + }) + t.Run("set drainAt", func(t *testing.T) { + now := metav1.Now() + o := &ServiceOffer{Spec: ServiceOfferSpec{DrainAt: &now}} + if !o.IsDraining() { + t.Errorf("IsDraining() = false, want true for non-nil drainAt") + } + }) +} + +func TestServiceOffer_DrainEndsAt(t *testing.T) { + base := time.Date(2026, time.May, 1, 12, 0, 0, 0, time.UTC) + baseMeta := metav1.NewTime(base) + + cases := []struct { + name string + drain *metav1.Time + grace *metav1.Duration + want time.Time + }{ + { + name: "nil drainAt returns zero", + drain: nil, + grace: nil, + want: time.Time{}, + }, + { + name: "nil grace applies default 1h", + drain: &baseMeta, + grace: nil, + want: base.Add(time.Hour), + }, + { + name: "explicit zero grace honored", + drain: &baseMeta, + grace: &metav1.Duration{Duration: 0}, + want: base, + }, + { + name: "custom grace honored", + drain: &baseMeta, + grace: &metav1.Duration{Duration: 30 * time.Minute}, + want: base.Add(30 * time.Minute), + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + o := &ServiceOffer{Spec: ServiceOfferSpec{DrainAt: tc.drain, DrainGracePeriod: tc.grace}} + if got := o.DrainEndsAt(); !got.Equal(tc.want) { + t.Errorf("DrainEndsAt() = %v, want %v", got, tc.want) + } + }) + } +} + +func TestServiceOffer_DrainExpired(t *testing.T) { + now := time.Date(2026, time.May, 1, 12, 0, 0, 0, time.UTC) + + t.Run("not draining returns 
false", func(t *testing.T) { + o := &ServiceOffer{} + if o.DrainExpired(now) { + t.Errorf("DrainExpired() = true, want false for non-draining offer") + } + }) + + t.Run("mid-drain returns false", func(t *testing.T) { + drainAt := metav1.NewTime(now.Add(-10 * time.Minute)) + o := &ServiceOffer{Spec: ServiceOfferSpec{ + DrainAt: &drainAt, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, + }} + if o.DrainExpired(now) { + t.Errorf("DrainExpired() = true, want false for mid-drain offer") + } + }) + + t.Run("expired returns true", func(t *testing.T) { + drainAt := metav1.NewTime(now.Add(-2 * time.Hour)) + o := &ServiceOffer{Spec: ServiceOfferSpec{ + DrainAt: &drainAt, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, + }} + if !o.DrainExpired(now) { + t.Errorf("DrainExpired() = false, want true for expired drain") + } + }) + + t.Run("force path zero grace tears down on next reconcile", func(t *testing.T) { + drainAt := metav1.NewTime(now) + o := &ServiceOffer{Spec: ServiceOfferSpec{ + DrainAt: &drainAt, + DrainGracePeriod: &metav1.Duration{Duration: 0}, + }} + if !o.DrainExpired(now) { + t.Errorf("DrainExpired() = false at now == drainAt with zero grace, want true") + } + }) +} diff --git a/internal/monetizeapi/types.go b/internal/monetizeapi/types.go index 6e905eee..30f0ca0d 100644 --- a/internal/monetizeapi/types.go +++ b/internal/monetizeapi/types.go @@ -3,11 +3,18 @@ package monetizeapi import ( "fmt" "strings" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" ) +// DefaultDrainGracePeriod is the grace period applied to a draining +// ServiceOffer when spec.drainGracePeriod is unset. Buyers using the +// offer can complete in-flight payments and migrate to alternative +// providers within this window before the HTTPRoute is torn down. 
+const DefaultDrainGracePeriod = time.Hour + const ( Group = "obol.org" Version = "v1alpha1" @@ -29,8 +36,6 @@ const ( AgentIdentityDefaultNamespace = "x402" AgentIdentityDefaultName = "default" - PausedAnnotation = "obol.org/paused" - AgentRuntimeHermes = "hermes" AgentPhasePending = "Pending" @@ -77,6 +82,21 @@ type ServiceOfferSpec struct { Path string `json:"path,omitempty"` Provenance map[string]string `json:"provenance,omitempty"` Registration ServiceOfferRegistration `json:"registration,omitempty"` + + // DrainAt marks the offer as draining when non-nil. While the offer + // is in the drain window, discovery surfaces (/skill.md and + // /.well-known/agent-registration.json) advertise the offer with + // available=false and drainEndsAt set, so buyers can migrate before + // the route is torn down. The route + payment gate stay up until + // DrainEndsAt() so in-flight payments can complete. Replaces the + // legacy obol.org/paused annotation. + DrainAt *metav1.Time `json:"drainAt,omitempty"` + + // DrainGracePeriod is how long after DrainAt the HTTPRoute remains + // up. Defaults to DefaultDrainGracePeriod when nil. A zero duration + // is honored as "tear down immediately on the next reconcile" (the + // equivalent of `obol sell stop --force`). + DrainGracePeriod *metav1.Duration `json:"drainGracePeriod,omitempty"` } // ServiceOfferAgent is populated when Spec.Type == "agent". The controller @@ -241,8 +261,41 @@ func (o *ServiceOffer) IsAgent() bool { return o.Spec.Type == "agent" } -func (o *ServiceOffer) IsPaused() bool { - return o.Annotations != nil && o.Annotations[PausedAnnotation] == "true" +// IsDraining reports whether spec.drainAt has been set. Drained offers +// transition through three phases: pre-drain (DrainAt nil), draining +// (DrainAt set, now < DrainEndsAt), and drain-expired (DrainAt set, +// now >= DrainEndsAt). The controller keeps the route up during +// "draining" and tears it down once "drain-expired" is reached. 
+func (o *ServiceOffer) IsDraining() bool { + return o.Spec.DrainAt != nil +} + +// DrainEndsAt returns DrainAt + DrainGracePeriod. When DrainAt is nil +// the zero time is returned (caller should gate on IsDraining first). +// When DrainGracePeriod is nil the default grace period is applied; a +// zero grace period is honored as "drain ends at DrainAt", i.e. tear +// down on the next reconcile (the --force/--now path). +func (o *ServiceOffer) DrainEndsAt() time.Time { + if o.Spec.DrainAt == nil { + return time.Time{} + } + grace := DefaultDrainGracePeriod + if o.Spec.DrainGracePeriod != nil { + grace = o.Spec.DrainGracePeriod.Duration + } + return o.Spec.DrainAt.Time.Add(grace) +} + +// DrainExpired reports whether the drain grace period has elapsed. +// Returns false when the offer is not draining at all. Callers should +// use this rather than IsDraining when deciding whether to tear down +// the HTTPRoute or filter the offer from the live x402 verifier rules. +func (o *ServiceOffer) DrainExpired(now time.Time) bool { + if !o.IsDraining() { + return false + } + end := o.DrainEndsAt() + return !now.Before(end) } // ── PurchaseRequest ───────────────────────────────────────────────────────── diff --git a/internal/schemas/service-catalog.schema.json b/internal/schemas/service-catalog.schema.json index 6f7578ba..58dbc7c4 100644 --- a/internal/schemas/service-catalog.schema.json +++ b/internal/schemas/service-catalog.schema.json @@ -78,7 +78,8 @@ "payTo", "network", "description", - "isDemo" + "isDemo", + "available" ], "properties": { "name": { @@ -149,6 +150,18 @@ }, "isDemo": { "type": "boolean" + }, + "registrationPending": { + "type": "boolean" + }, + "available": { + "type": "boolean", + "description": "False during a drain window. Catalog consumers should treat unset as true for backwards compatibility." + }, + "drainEndsAt": { + "type": "string", + "format": "date-time", + "description": "RFC3339 timestamp at which the offer's HTTPRoute will be torn down. 
Set only when available=false." } } } diff --git a/internal/schemas/service_catalog.go b/internal/schemas/service_catalog.go index e53085b6..eb8bba78 100644 --- a/internal/schemas/service_catalog.go +++ b/internal/schemas/service_catalog.go @@ -39,6 +39,21 @@ type ServiceCatalogEntry struct { // know the offer is usable for x402 payments today, even though // ERC-8004 discovery via the chain still resolves to the prior state. RegistrationPending bool `json:"registrationPending,omitempty"` + + // Available is false when the offer is in its drain window. Buyers + // can still complete in-flight payments until DrainEndsAt, but + // discovery surfaces should advertise the wind-down so external + // observers can react. When false, DrainEndsAt is set to the RFC3339 + // timestamp at which the HTTPRoute will be torn down. Catalog + // consumers should treat unset Available (the default-true field) as + // "available" for backwards compatibility — the field is only written + // false during drain. + Available bool `json:"available"` + + // DrainEndsAt is the RFC3339 timestamp at which the offer's + // HTTPRoute will be removed. Set only when Available=false. Buyers + // SHOULD migrate to alternative providers before this time. 
+ DrainEndsAt string `json:"drainEndsAt,omitempty"` } // ServiceCatalogAsset describes the settlement token resolved for a catalog diff --git a/internal/serviceoffercontroller/controller.go b/internal/serviceoffercontroller/controller.go index fac586b0..7cbd759b 100644 --- a/internal/serviceoffercontroller/controller.go +++ b/internal/serviceoffercontroller/controller.go @@ -296,7 +296,7 @@ func (c *Controller) enqueueOfferFromRegistration(obj any) { log.Printf("serviceoffer-controller: decode offer for registration fan-out: %v", err) continue } - if offer.DeletionTimestamp != nil || offer.IsPaused() || !offer.Spec.Registration.Enabled { + if offer.DeletionTimestamp != nil || !offer.Spec.Registration.Enabled { continue } c.offerQueue.Add(offer.Namespace + "/" + offer.Name) @@ -455,13 +455,49 @@ func (c *Controller) reconcileOffer(ctx context.Context, key string) error { return err } - if offer.IsPaused() { - if err := c.deleteRouteChildren(ctx, offer); err != nil { - return err + if offer.IsDraining() { + now := time.Now() + drainEndsAt := offer.DrainEndsAt() + if offer.DrainExpired(now) { + // Drain grace period elapsed: tear down the HTTPRoute + + // payment gate. The CR itself stays (delete is the canonical + // removal path), but the expired offer is dropped from the + // discovery catalog and registration document; available=false + // is only advertised while the drain window is still open. + if err := c.deleteRouteChildren(ctx, offer); err != nil { + return err + } + setCondition(&status, "Draining", "False", "Drained", fmt.Sprintf("Drain ended at %s; route torn down", drainEndsAt.UTC().Format(time.RFC3339))) + setCondition(&status, "PaymentGateReady", "False", "Drained", "Offer drained; payment gate removed") + setCondition(&status, "RoutePublished", "False", "Drained", "Offer drained; route removed") + } else { + // Still in the drain window: keep the route + payment gate + // up so in-flight buyers can finish, but mark Draining=True + // so discovery surfaces can advertise available=false.
+ if upstreamHealthy && isConditionTrue(status, "ModelReady") { + if err := c.reconcilePaymentGate(ctx, &status, offer); err != nil { + return err + } + if isConditionTrue(status, "PaymentGateReady") { + if err := c.reconcileRoute(ctx, &status, offer); err != nil { + return err + } + } + } else { + setCondition(&status, "PaymentGateReady", "False", "WaitingForUpstream", "Waiting for upstream health before publishing payment gate") + setCondition(&status, "RoutePublished", "False", "WaitingForPaymentGate", "Waiting for payment gate before publishing route") + } + setCondition(&status, "Draining", "True", "Draining", fmt.Sprintf("Drain ends at %s", drainEndsAt.UTC().Format(time.RFC3339))) + // Requeue at the drain expiry so the route is torn down on + // time even without any spec change in the interim. Add a + // small slack so the comparison in DrainExpired clears. + if delay := time.Until(drainEndsAt) + time.Second; delay > 0 { + c.offerQueue.AddAfter(offer.Namespace+"/"+offer.Name, delay) + } else { + c.offerQueue.Add(offer.Namespace + "/" + offer.Name) + } } - setCondition(&status, "PaymentGateReady", "False", "Paused", "Offer is paused") - setCondition(&status, "RoutePublished", "False", "Paused", "Offer is paused") } else if upstreamHealthy && isConditionTrue(status, "ModelReady") { + setCondition(&status, "Draining", "False", "Active", "Offer is active") if err := c.reconcilePaymentGate(ctx, &status, offer); err != nil { return err } @@ -471,6 +507,7 @@ func (c *Controller) reconcileOffer(ctx context.Context, key string) error { } } } else { + setCondition(&status, "Draining", "False", "Active", "Offer is active") setCondition(&status, "PaymentGateReady", "False", "WaitingForUpstream", "Waiting for upstream health before publishing payment gate") setCondition(&status, "RoutePublished", "False", "WaitingForPaymentGate", "Waiting for payment gate before publishing route") } @@ -1114,7 +1151,7 @@ func (c *Controller) reconcileSkillCatalog(ctx context.Context, 
override *moneti } readyOffers := 0 for _, offer := range offers { - if offer != nil && offer.DeletionTimestamp == nil && !offer.IsPaused() && isConditionTrue(offer.Status, "Ready") { + if offer != nil && offer.DeletionTimestamp == nil && isConditionTrue(offer.Status, "Ready") { readyOffers++ } } diff --git a/internal/serviceoffercontroller/identity_controller.go b/internal/serviceoffercontroller/identity_controller.go index 030bf013..92fcd03e 100644 --- a/internal/serviceoffercontroller/identity_controller.go +++ b/internal/serviceoffercontroller/identity_controller.go @@ -386,7 +386,10 @@ func (c *Controller) registrationOffersForIdentity(key agentIdentityKey, exclude if offer.Namespace == excludeNamespace && offer.Name == excludeName { continue } - if offer.DeletionTimestamp != nil || offer.IsPaused() || !offer.Spec.Registration.Enabled { + // Draining offers stay in the registration candidate list so + // the registration document continues to advertise them with + // available=false until the drain grace period expires. + if offer.DeletionTimestamp != nil || !offer.Spec.Registration.Enabled { continue } if !isConditionTrue(offer.Status, "UpstreamHealthy") { diff --git a/internal/serviceoffercontroller/render.go b/internal/serviceoffercontroller/render.go index a733213b..09457aa3 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -712,8 +712,17 @@ func buildRegistrationServices(owner *monetizeapi.ServiceOffer, offers []*moneti return services } +// offerPublishedForRegistration reports whether an offer should appear +// in the operator's ERC-8004 registration document as a live, gated +// service. Draining offers stay in the document with available=false +// so external observers can see the wind-down — this function filters +// them out only after the drain window has fully expired (i.e. the +// HTTPRoute is gone and there is no payment surface to advertise). 
func offerPublishedForRegistration(offer *monetizeapi.ServiceOffer) bool { - if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() || !offer.Spec.Registration.Enabled { + if offer == nil || offer.DeletionTimestamp != nil || !offer.Spec.Registration.Enabled { + return false + } + if offer.DrainExpired(time.Now()) { return false } return isConditionTrue(offer.Status, "ModelReady") && @@ -731,9 +740,18 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin // both /skill.md and /api/services.json, with the on-chain ERC-8004 // registration treated as informational metadata rather than a gating // signal. See offerOperationallyReady's doc comment for the rationale. + now := time.Now() var ready []*monetizeapi.ServiceOffer for _, offer := range offers { - if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { + if offer == nil || offer.DeletionTimestamp != nil { + continue + } + // Drained offers (post-grace-period) have no live route — drop + // them from the catalog entirely. Draining offers (pre-expiry) + // stay in the catalog with available=false + drainEndsAt set so + // buyers can see the wind-down via discovery before the route + // disappears. 
+ if offer.DrainExpired(now) { continue } if offerOperationallyReady(offer) { @@ -762,20 +780,25 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin } lines = append(lines, "## Services", "") - lines = append(lines, "| Service | Type | Model | Price | Endpoint |") - lines = append(lines, "|---------|------|-------|-------|----------|") + lines = append(lines, "| Service | Type | Model | Price | Available | Endpoint |") + lines = append(lines, "|---------|------|-------|-------|-----------|----------|") for _, offer := range ready { modelName := offer.Spec.Model.Name if modelName == "" { modelName = "—" } + availability := "yes" + if offer.IsDraining() { + availability = fmt.Sprintf("draining (ends %s)", offer.DrainEndsAt().UTC().Format(time.RFC3339)) + } lines = append(lines, fmt.Sprintf( - "| [%s](#%s) | %s | %s | %s | `%s%s` |", + "| [%s](#%s) | %s | %s | %s | %s | `%s%s` |", offer.Name, offer.Name, fallbackOfferType(offer), modelName, describeOfferPrice(offer), + availability, baseURL, offer.EffectivePath(), )) @@ -792,6 +815,12 @@ func buildSkillCatalogMarkdown(offers []*monetizeapi.ServiceOffer, baseURL strin lines = append(lines, fmt.Sprintf("- **Price**: %s", describeOfferPrice(offer))) lines = append(lines, fmt.Sprintf("- **Pay To**: `%s`", firstNonEmpty(offer.Spec.Payment.PayTo, "—"))) lines = append(lines, fmt.Sprintf("- **Network**: %s", firstNonEmpty(offer.Spec.Payment.Network, "—"))) + if offer.IsDraining() { + lines = append(lines, "- **Available**: false (draining)") + lines = append(lines, fmt.Sprintf("- **Drain ends at**: %s", offer.DrainEndsAt().UTC().Format(time.RFC3339))) + } else { + lines = append(lines, "- **Available**: true") + } description := offer.Spec.Registration.Description if description == "" { description = fmt.Sprintf("x402 payment-gated %s service", fallbackOfferType(offer)) @@ -868,9 +897,17 @@ func offerAwaitingRegistration(offer *monetizeapi.ServiceOffer) bool { func 
buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) string { baseURL = strings.TrimRight(baseURL, "/") + now := time.Now() var ready []*monetizeapi.ServiceOffer for _, offer := range offers { - if offer == nil || offer.DeletionTimestamp != nil || offer.IsPaused() { + if offer == nil || offer.DeletionTimestamp != nil { + continue + } + // Drained offers (post-grace-period) have no live route — drop + // them from the catalog entirely. Draining offers (pre-expiry) + // stay in the catalog with available=false + drainEndsAt set so + // buyers can react before the route disappears. + if offer.DrainExpired(now) { continue } if offerOperationallyReady(offer) { @@ -895,6 +932,12 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) modelName = offer.Status.AgentResolution.Model } + available := !offer.IsDraining() + drainEndsAt := "" + if offer.IsDraining() { + drainEndsAt = offer.DrainEndsAt().UTC().Format(time.RFC3339) + } + svc := schemas.ServiceCatalogEntry{ Name: offer.Name, Namespace: offer.Namespace, @@ -907,6 +950,8 @@ func buildServiceCatalogJSON(offers []*monetizeapi.ServiceOffer, baseURL string) Description: desc, IsDemo: offer.Namespace == "demo", RegistrationPending: offerAwaitingRegistration(offer), + Available: available, + DrainEndsAt: drainEndsAt, } raw, unit := offerPriceRawAndUnit(offer) diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index eb71891c..199e6721 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -719,18 +719,27 @@ func TestBuildServiceCatalogJSON_AgentOfferUsesResolvedModel(t *testing.T) { } // TestBuildServiceCatalogJSON_ExcludesNonReady locks in the filter pipeline: -// nil offers, paused offers, and offers with a DeletionTimestamp must never -// leak onto the public storefront, even if they carry Ready=True. 
+// nil offers, drain-expired offers, and offers with a DeletionTimestamp +// must never leak onto the public storefront, even if they carry +// Ready=True. Mid-drain offers DO stay in the catalog with available=false +// and drainEndsAt set — that's the whole point of the drain replacement. func TestBuildServiceCatalogJSON_ExcludesNonReady(t *testing.T) { readyCond := []monetizeapi.Condition{{Type: "Ready", Status: "True"}} deleting := metav1.Now() + drainedAt := metav1.NewTime(time.Now().Add(-2 * time.Hour)) + zeroGrace := metav1.Duration{Duration: 0} + offers := []*monetizeapi.ServiceOffer{ nil, { - ObjectMeta: metav1.ObjectMeta{ - Name: "paused-svc", Namespace: "llm", - Annotations: map[string]string{monetizeapi.PausedAnnotation: "true"}, + ObjectMeta: metav1.ObjectMeta{Name: "drained-svc", Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + DrainAt: &drainedAt, + DrainGracePeriod: &zeroGrace, + Payment: monetizeapi.ServiceOfferPayment{ + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.001"}, + }, }, Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, }, @@ -773,6 +782,89 @@ func TestBuildServiceCatalogJSON_ExcludesNonReady(t *testing.T) { if services[0].Name != "ready-svc" { t.Errorf("got %q, want ready-svc — filter pipeline leaked another offer", services[0].Name) } + if !services[0].Available { + t.Errorf("ready-svc.available = false, want true (offer is not draining)") + } +} + +// TestBuildServiceCatalogJSON_DrainLifecycle covers the three drain +// states explicitly: pre-drain (available=true, no drainEndsAt), mid-drain +// (in catalog, available=false, drainEndsAt populated), and drain-expired +// (filtered out of the catalog because the controller has torn down the +// underlying route). 
+func TestBuildServiceCatalogJSON_DrainLifecycle(t *testing.T) { + readyCond := []monetizeapi.Condition{{Type: "Ready", Status: "True"}} + mkOffer := func(name string) monetizeapi.ServiceOffer { + return monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "llm"}, + Spec: monetizeapi.ServiceOfferSpec{ + Type: "http", + Payment: monetizeapi.ServiceOfferPayment{ + Network: "base", + PayTo: "0x1111111111111111111111111111111111111111", + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.001"}, + }, + }, + Status: monetizeapi.ServiceOfferStatus{Conditions: readyCond}, + } + } + + // Pre-drain. + pre := mkOffer("pre") + + // Mid-drain: drainAt = now, grace = 1h → ends ~1h from now. + midDrainAt := metav1.NewTime(time.Now()) + midGrace := metav1.Duration{Duration: time.Hour} + mid := mkOffer("mid") + mid.Spec.DrainAt = &midDrainAt + mid.Spec.DrainGracePeriod = &midGrace + + // Drain-expired. + expDrainAt := metav1.NewTime(time.Now().Add(-2 * time.Hour)) + expGrace := metav1.Duration{Duration: time.Hour} + exp := mkOffer("expired") + exp.Spec.DrainAt = &expDrainAt + exp.Spec.DrainGracePeriod = &expGrace + + jsonStr := buildServiceCatalogJSON([]*monetizeapi.ServiceOffer{&pre, &mid, &exp}, "https://example.com") + var services []schemas.ServiceCatalogEntry + if err := json.Unmarshal([]byte(jsonStr), &services); err != nil { + t.Fatalf("invalid JSON: %v\n%s", err, jsonStr) + } + if len(services) != 2 { + t.Fatalf("expected 2 services (pre + mid; expired filtered out), got %d: %+v", len(services), services) + } + + byName := map[string]schemas.ServiceCatalogEntry{} + for _, s := range services { + byName[s.Name] = s + } + if pre, ok := byName["pre"]; !ok { + t.Fatal("pre-drain offer missing from catalog") + } else { + if !pre.Available { + t.Errorf("pre.available = false, want true") + } + if pre.DrainEndsAt != "" { + t.Errorf("pre.drainEndsAt = %q, want empty", pre.DrainEndsAt) + } + } + if mid, ok := byName["mid"]; !ok { + 
t.Fatal("mid-drain offer missing from catalog") + } else { + if mid.Available { + t.Errorf("mid.available = true, want false (offer is draining)") + } + if mid.DrainEndsAt == "" { + t.Errorf("mid.drainEndsAt is empty, want RFC3339 timestamp") + } + if _, err := time.Parse(time.RFC3339, mid.DrainEndsAt); err != nil { + t.Errorf("mid.drainEndsAt = %q is not RFC3339: %v", mid.DrainEndsAt, err) + } + } + if _, ok := byName["expired"]; ok { + t.Error("drain-expired offer leaked into catalog; should be filtered") + } } // TestBuildServiceCatalogJSON_SortOrder ensures offers render in diff --git a/internal/x402/serviceoffer_source.go b/internal/x402/serviceoffer_source.go index f0b1999a..0c08c491 100644 --- a/internal/x402/serviceoffer_source.go +++ b/internal/x402/serviceoffer_source.go @@ -7,6 +7,7 @@ import ( "log" "sort" "strings" + "time" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" "github.com/ObolNetwork/obol-stack/internal/schemas" @@ -85,7 +86,12 @@ func routesFromStore(offerItems, secretItems []any) ([]RouteRule, error) { if offer.Spec.Upstream.Namespace == "" { offer.Spec.Upstream.Namespace = offer.Namespace } - if offer.IsPaused() || !offerConditionTrue(offer.Status, "RoutePublished") { + // Draining offers keep their route up until the grace period + // expires so in-flight payments can settle. Only skip after the + // drain window has elapsed — at that point the controller has + // also torn down the HTTPRoute, so the verifier rule would + // gate traffic against a non-existent backend. 
+ if offer.DrainExpired(time.Now()) || !offerConditionTrue(offer.Status, "RoutePublished") { continue } diff --git a/internal/x402/serviceoffer_source_test.go b/internal/x402/serviceoffer_source_test.go index 9733095e..6c825cda 100644 --- a/internal/x402/serviceoffer_source_test.go +++ b/internal/x402/serviceoffer_source_test.go @@ -3,6 +3,7 @@ package x402 import ( "encoding/base64" "testing" + "time" "github.com/ObolNetwork/obol-stack/internal/monetizeapi" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -37,11 +38,16 @@ func TestRoutesFromStore(t *testing.T) { Conditions: []monetizeapi.Condition{{Type: "RoutePublished", Status: "True"}}, }, }), + // Drain-expired offer: drainAt + zero grace period in the past + // → route should already be torn down, and the verifier rule + // should be filtered out even though RoutePublished is still + // True in the cached status snapshot. mustOfferObject(t, monetizeapi.ServiceOffer{ - ObjectMeta: metav1.ObjectMeta{Name: "paused", Namespace: "alpha", Annotations: map[string]string{ - monetizeapi.PausedAnnotation: "true", - }}, + ObjectMeta: metav1.ObjectMeta{Name: "drained", Namespace: "alpha"}, Spec: monetizeapi.ServiceOfferSpec{ + Upstream: monetizeapi.ServiceOfferUpstream{Service: "httpbin"}, + DrainAt: &metav1.Time{Time: time.Now().Add(-2 * time.Hour)}, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, Payment: monetizeapi.ServiceOfferPayment{ Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "1"}, }, @@ -50,6 +56,23 @@ func TestRoutesFromStore(t *testing.T) { Conditions: []monetizeapi.Condition{{Type: "RoutePublished", Status: "True"}}, }, }), + // Mid-drain offer: drainAt = now, grace = 1h → still within the + // drain window, route stays up so in-flight buyers can settle. + // Should appear in the verifier rules. 
+ mustOfferObject(t, monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "c", Namespace: "alpha"}, + Spec: monetizeapi.ServiceOfferSpec{ + Upstream: monetizeapi.ServiceOfferUpstream{Service: "httpbin"}, + DrainAt: &metav1.Time{Time: time.Now()}, + DrainGracePeriod: &metav1.Duration{Duration: time.Hour}, + Payment: monetizeapi.ServiceOfferPayment{ + Price: monetizeapi.ServiceOfferPriceTable{PerRequest: "0.1"}, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{{Type: "RoutePublished", Status: "True"}}, + }, + }), } secrets := []any{ mustSecretObject(t, "alpha", "litellm-secrets", map[string]string{ @@ -62,11 +85,16 @@ func TestRoutesFromStore(t *testing.T) { t.Fatalf("routesFromStore: %v", err) } - if len(routes) != 2 { - t.Fatalf("len(routes) = %d, want 2", len(routes)) + if len(routes) != 3 { + t.Fatalf("len(routes) = %d, want 3", len(routes)) + } + // Expected sort order: alpha/a, alpha/c, beta/b. + // "drained" must be filtered out because its drain window expired. 
+ if routes[0].OfferName != "a" || routes[1].OfferName != "c" || routes[2].OfferName != "b" { + t.Fatalf("routes not sorted by offer identity (drained leaked?): %+v", routes) } - if routes[0].OfferName != "a" || routes[1].OfferName != "b" { - t.Fatalf("routes not sorted by offer identity: %+v", routes) + if routes[0].OfferNamespace != "alpha" || routes[1].OfferNamespace != "alpha" || routes[2].OfferNamespace != "beta" { + t.Fatalf("unexpected route namespaces: %+v", routes) } if routes[0].Pattern != "/services/a/*" { t.Fatalf("routes[0].Pattern = %q, want /services/a/*", routes[0].Pattern) @@ -83,11 +111,16 @@ func TestRoutesFromStore(t *testing.T) { if routes[0].StripPrefix != "/services/a" { t.Fatalf("routes[0].StripPrefix = %q, want /services/a", routes[0].StripPrefix) } - if routes[1].UpstreamAuth != "" { - t.Fatalf("routes[1].UpstreamAuth = %q, want empty", routes[1].UpstreamAuth) + if routes[2].UpstreamAuth != "" { + t.Fatalf("routes[2].UpstreamAuth = %q, want empty", routes[2].UpstreamAuth) + } + if routes[2].UpstreamURL != "http://httpbin.beta.svc.cluster.local:11434" { + t.Fatalf("routes[2].UpstreamURL = %q, want httpbin upstream URL", routes[2].UpstreamURL) } - if routes[1].UpstreamURL != "http://httpbin.beta.svc.cluster.local:11434" { - t.Fatalf("routes[1].UpstreamURL = %q, want httpbin upstream URL", routes[1].UpstreamURL) + // Mid-drain offer "c" stays in the rules but tracks its own + // upstream — verifies the drain window keeps the route alive. 
+ if routes[1].UpstreamURL != "http://httpbin.alpha.svc.cluster.local:11434" { + t.Fatalf("routes[1] (mid-drain) UpstreamURL = %q, want httpbin upstream URL", routes[1].UpstreamURL) } } From 04ed1ab6213a1dc0c4c9f0472874b2ec4725c531 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 13:12:11 +0400 Subject: [PATCH 26/31] test(stack): allow multi-line emptyDir after PSS sweep sizeLimit addition The TestLLMTemplate_IncludesPaidRouteAndBuyerSidecar assertion expected `emptyDir: {}` but #521's Restricted PSS sweep added sizeLimit values, making the YAML render as `emptyDir:\n sizeLimit: 128Mi`. Loosen the assertion to `emptyDir:` so it still catches removal but accepts the hardened multi-line form. --- internal/stack/stack_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/stack/stack_test.go b/internal/stack/stack_test.go index a4c032ea..a07932fa 100644 --- a/internal/stack/stack_test.go +++ b/internal/stack/stack_test.go @@ -491,7 +491,7 @@ func TestLLMTemplate_IncludesPaidRouteAndBuyerSidecar(t *testing.T) { `name: buyer-http`, `name: x402-buyer-config`, `name: x402-buyer-auths`, - `emptyDir: {}`, + `emptyDir:`, } { if !strings.Contains(out, want) { t.Fatalf("llm template missing %q:\n%s", want, out) From c3ba469e1decabf49071684b2e9baca6b360b9e4 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 17:30:05 +0400 Subject: [PATCH 27/31] fix: resolve marketplace bundle architecture blockers --- .github/workflows/helm-template-smoke.yml | 32 ++- cmd/serviceoffer-controller/main.go | 8 + cmd/serviceoffer-controller/main_test.go | 10 + docs/observability.md | 22 +- hack/migrate-bedag-raw-to-base.sh | 51 ++-- .../base/templates/obol-frontend-rbac.yaml | 53 ---- .../base/templates/obol-frontend.yaml | 28 +- internal/erc8004/types.go | 12 +- internal/erc8004/types_test.go | 26 ++ internal/serviceoffercontroller/controller.go | 13 +- .../serviceoffercontroller/identity_render.go | 8 +- internal/serviceoffercontroller/render.go 
| 18 +- .../serviceoffercontroller/render_test.go | 74 +++++ internal/x402/setup.go | 255 ++++++++++++++++++ internal/x402/setup_runtime_config_test.go | 103 +++++++ 15 files changed, 610 insertions(+), 103 deletions(-) delete mode 100644 internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml create mode 100644 internal/x402/setup_runtime_config_test.go diff --git a/.github/workflows/helm-template-smoke.yml b/.github/workflows/helm-template-smoke.yml index 27a9ed1f..9ee3fa57 100644 --- a/.github/workflows/helm-template-smoke.yml +++ b/.github/workflows/helm-template-smoke.yml @@ -53,7 +53,37 @@ jobs: helm template base "$workdir/base" \ --set dataDir=/data \ --set network=mainnet \ - > /dev/null + > "$workdir/base-rendered.yaml" + + # Kubernetes object identity must be unique within one rendered + # chart. Helm will happily render duplicate apiVersion/kind/name + # tuples and leave the actual outcome to manifest ordering; this + # caught the duplicated obol-frontend ClusterRole/Binding review bug. 
+ awk ' + function flush() { + if (api && kind && name) { + key = api "/" kind "/" ns "/" name + count[key]++ + } + api = kind = name = ns = ""; inmeta = 0 + } + /^---/ { flush(); next } + /^apiVersion:/ { api = $2; next } + /^kind:/ { kind = $2; next } + /^metadata:/ { inmeta = 1; next } + inmeta && /^ name:/ { name = $2; next } + inmeta && /^ namespace:/ { ns = $2; next } + /^[^ ]/ && $0 !~ /^(apiVersion|kind|metadata):/ { inmeta = 0 } + END { + flush() + for (k in count) { + if (count[k] > 1) { + print count[k] " " k + dup = 1 + } + } + exit dup + }' "$workdir/base-rendered.yaml" - name: helm template ./cloudflared run: | diff --git a/cmd/serviceoffer-controller/main.go b/cmd/serviceoffer-controller/main.go index 6e81cd65..28be8287 100644 --- a/cmd/serviceoffer-controller/main.go +++ b/cmd/serviceoffer-controller/main.go @@ -92,6 +92,7 @@ func runWithLeaderElection(ctx context.Context, cfg *rest.Config, controller *se log.Printf("serviceoffer-controller: became leader %s", podName) if err := controller.Run(ctx, workers); err != nil { log.Printf("controller run: %v", err) + os.Exit(controllerRunExitCode(err)) } }, OnStoppedLeading: func() { @@ -110,6 +111,13 @@ func runWithLeaderElection(ctx context.Context, cfg *rest.Config, controller *se }) } +func controllerRunExitCode(err error) int { + if err != nil { + return 1 + } + return 0 +} + func loadConfig(kubeconfig string) (*rest.Config, error) { if kubeconfig != "" { return clientcmd.BuildConfigFromFlags("", kubeconfig) diff --git a/cmd/serviceoffer-controller/main_test.go b/cmd/serviceoffer-controller/main_test.go index addb8856..5a1badb4 100644 --- a/cmd/serviceoffer-controller/main_test.go +++ b/cmd/serviceoffer-controller/main_test.go @@ -1,6 +1,7 @@ package main import ( + "errors" "os" "path/filepath" "testing" @@ -61,6 +62,15 @@ func TestLeaderElectionDefaults(t *testing.T) { } } +func TestControllerRunExitCode(t *testing.T) { + if got := controllerRunExitCode(nil); got != 0 { + 
t.Fatalf("controllerRunExitCode(nil) = %d, want 0", got) + } + if got := controllerRunExitCode(errors.New("informer died")); got != 1 { + t.Fatalf("controllerRunExitCode(error) = %d, want 1", got) + } +} + const minimalKubeconfig = `apiVersion: v1 kind: Config clusters: diff --git a/docs/observability.md b/docs/observability.md index e4daa5f2..0b98e449 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -59,7 +59,7 @@ PR #530 swapped it to `increase()` over an explicit window. +------------------------------+ | x402-verifier (stateless) | | - in-memory counters | - | - labels: route, | + | - labels: | | offer_namespace, | | offer_name, chain, | | asset_symbol | @@ -77,8 +77,8 @@ PR #530 swapped it to `increase()` over an explicit window. v +------------------------------+ | Pre-aggregated series | - | offer:x402_revenue:7d_by_offer - | offer:x402_paid_requests:7d_by_offer + | x402:revenue:7d_by_offer + | x402:revenue:7d_by_offer_chain +---------------+--------------+ | | PromQL queries @@ -191,10 +191,11 @@ Naming follows the standard Prometheus pattern: Examples we ship: -- `offer:x402_revenue:7d_by_offer` — revenue aggregated to the `offer` level, - base metric is `x402_revenue`, operation is `increase` over `7d` grouped - `by_offer`. -- `offer:x402_paid_requests:7d_by_offer` — same shape for paid request count. +- `x402:revenue:7d_by_offer` — paid request count aggregated to the offer + level over the last 7d. The frontend multiplies this by the ServiceOffer + price table to display revenue. +- `x402:revenue:7d_by_offer_chain_asset_symbol` — same window, retaining + chain and settlement-token facets for per-token and per-chain views. Rules: @@ -202,9 +203,9 @@ Rules: is a lie (Prometheus has no "lifetime"). The window in the name must match the window in the expression. 2. 
**Use `increase()` over an explicit range, not `sum()` of the raw counter.** - See PR #530 — the original rule did `sum(by offer) (x402_revenue_total)` and - silently zeroed every time the verifier pod restarted. The fixed rule is - `sum by (offer_namespace, offer_name) (increase(x402_revenue_total[7d]))`. + See PR #530 — the original rule did `sum(by offer) (charged_requests_total)` + and silently zeroed every time the verifier pod restarted. The fixed rule is + `sum by (offer_namespace, offer_name) (increase(obol_x402_verifier_charged_requests_total[7d]))`. 3. **Keep the window aligned with retention.** Recording a `30d` rule with 8d retention is a footgun: the rule sees nulls and silently produces nothing. @@ -226,7 +227,6 @@ Concrete examples: | Label | Source | Why include it | |-------------------|------------------|-------------------------------------------| -| `route` | offer CR pattern | Direct query facet, bounded by # offers | | `offer_namespace` | offer CR meta | Tenancy facet | | `offer_name` | offer CR meta | Per-offer breakdown | | `chain` | offer CR payment | "Revenue by chain" is a real question | diff --git a/hack/migrate-bedag-raw-to-base.sh b/hack/migrate-bedag-raw-to-base.sh index d1c7d278..191f2821 100755 --- a/hack/migrate-bedag-raw-to-base.sh +++ b/hack/migrate-bedag-raw-to-base.sh @@ -26,9 +26,20 @@ ORPHAN_RELEASES=( ) migrate_one() { - local target="$1" + local kind="$1" + local name="$2" + local namespace="${3:-}" local current - current=$(kubectl get "$target" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) + + local resource="${kind}/${name}" + local target="$resource" + local -a ns_args=() + if [[ -n "$namespace" ]]; then + ns_args=(-n "$namespace") + target="$resource -n $namespace" + fi + + current=$(kubectl get "$resource" "${ns_args[@]}" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) if [[ "$current" == "base" ]]; then echo " $target: already on base, 
skipping" return 0 @@ -38,10 +49,10 @@ migrate_one() { else echo " $target: was on '$current', migrating to base" fi - kubectl annotate "$target" \ + kubectl annotate "$resource" "${ns_args[@]}" \ meta.helm.sh/release-name=base \ meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null - kubectl label "$target" app.kubernetes.io/managed-by=Helm --overwrite >/dev/null + kubectl label "$resource" "${ns_args[@]}" app.kubernetes.io/managed-by=Helm --overwrite >/dev/null } echo "==> Scanning for resources owned by legacy bedag/raw releases..." @@ -51,10 +62,10 @@ for release in "${ORPHAN_RELEASES[@]}"; do -A -o json 2>/dev/null \ | jq -r --arg rel "$release" '.items[] | select(.metadata.annotations["meta.helm.sh/release-name"] == $rel) - | "\(.kind)/\(.metadata.name)\(if .metadata.namespace then " -n " + .metadata.namespace else "" end)"' \ - | while read -r target; do - [[ -z "$target" ]] && continue - migrate_one "$target" + | [.kind, .metadata.name, (.metadata.namespace // "")] | @tsv' \ + | while IFS=$'\t' read -r kind name namespace; do + [[ -z "$kind" || -z "$name" ]] && continue + migrate_one "$kind" "$name" "$namespace" done done @@ -63,17 +74,25 @@ done # in the namespaces base now owns. echo "==> Adopting unowned resources base will now claim..." 
declare -a UNOWNED_TARGETS=( - "namespace/erpc" - "namespace/obol-frontend" - "prometheusrule/x402-verifier -n x402" + "namespace erpc " + "namespace obol-frontend " + "prometheusrule x402-verifier x402" ) for target in "${UNOWNED_TARGETS[@]}"; do - if kubectl get $target >/dev/null 2>&1; then - owner=$(kubectl get $target -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) + IFS=$'\t' read -r kind name namespace <<< "$target" + resource="${kind}/${name}" + ns_args=() + display="$resource" + if [[ -n "$namespace" ]]; then + ns_args=(-n "$namespace") + display="$resource -n $namespace" + fi + if kubectl get "$resource" "${ns_args[@]}" >/dev/null 2>&1; then + owner=$(kubectl get "$resource" "${ns_args[@]}" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) if [[ -z "$owner" || "$owner" == "base" ]]; then - echo " $target: $([ -z "$owner" ] && echo "adopting" || echo "already base")" - kubectl annotate $target meta.helm.sh/release-name=base meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null - kubectl label $target app.kubernetes.io/managed-by=Helm --overwrite >/dev/null + echo " $display: $([ -z "$owner" ] && echo "adopting" || echo "already base")" + kubectl annotate "$resource" "${ns_args[@]}" meta.helm.sh/release-name=base meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null + kubectl label "$resource" "${ns_args[@]}" app.kubernetes.io/managed-by=Helm --overwrite >/dev/null fi fi done diff --git a/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml deleted file mode 100644 index 038df594..00000000 --- a/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- -# RBAC for the obol-frontend pod's ServiceAccount. 
-# -# The frontend pod uses this SA's bearer token to: -# - Discover OpenClaw / Hermes instances (namespaces, pods, configmaps) -# - List + mutate ServiceOffer CRs (sell-modal + pause/resume/delete row actions) -# - List PurchaseRequest CRs (My Purchases page; never writes) -# -# The frontend is local-only behind the obol.stack hostname restriction -# (the operator owns the cluster), so this is a single trust boundary. -# Defense-in-depth note: the `secrets` rule is intentionally omitted — -# no code path reads them and the SA token shouldn't have that reach. -# /status subresources are omitted from PurchaseRequest because the -# controller is the only writer. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend -rules: - - apiGroups: [""] - resources: ["namespaces"] - verbs: ["get", "list"] - - apiGroups: [""] - resources: ["pods", "configmaps"] - verbs: ["get", "list"] - # ServiceOffer CRD — frontend sell modal creates offers, row actions - # pause/resume (annotation patch) and delete. - - apiGroups: ["obol.org"] - resources: ["serviceoffers", "serviceoffers/status"] - verbs: ["get", "list", "create", "update", "patch", "delete"] - # PurchaseRequest CRD — frontend My Purchases page lists buyer-side - # records. Read-only; agent buy.py and the controller are the writers. 
- - apiGroups: ["obol.org"] - resources: ["purchaserequests"] - verbs: ["get", "list", "watch"] - ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: obol-frontend-openclaw-discovery -subjects: - - kind: ServiceAccount - name: obol-frontend - namespace: obol-frontend diff --git a/internal/embed/infrastructure/base/templates/obol-frontend.yaml b/internal/embed/infrastructure/base/templates/obol-frontend.yaml index 397a192e..77a4c806 100644 --- a/internal/embed/infrastructure/base/templates/obol-frontend.yaml +++ b/internal/embed/infrastructure/base/templates/obol-frontend.yaml @@ -54,12 +54,16 @@ spec: port: 3000 --- -# obol-frontend RBAC for OpenClaw instance discovery and ServiceOffer -# CRUD from the frontend sell modal. The ClusterRoleBinding subject -# references the `obol-frontend` ServiceAccount that the upstream -# `obol/obol-app` chart creates — the binding applies fine even if -# the SA does not exist yet, and starts granting permissions the -# moment the SA appears. +# obol-frontend RBAC for the pod ServiceAccount. +# +# Keep this as the single frontend RBAC template. A prior bundle carried a +# second obol-frontend-rbac.yaml template with the same ClusterRole and +# ClusterRoleBinding names, which made the rendered chart order-dependent. +# +# The frontend is local-only behind the obol.stack hostname restriction +# (the operator owns the cluster), so this is a single trust boundary. +# Defense-in-depth note: the `secrets` rule is intentionally omitted — no +# frontend code path reads them and the SA token should not have that reach. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -71,12 +75,22 @@ rules: resources: ["namespaces"] verbs: ["get", "list"] - apiGroups: [""] - resources: ["pods", "configmaps", "secrets"] + resources: ["pods", "configmaps"] verbs: ["get", "list"] # ServiceOffer CRD — frontend sell modal creates offers - apiGroups: ["obol.org"] resources: ["serviceoffers", "serviceoffers/status"] verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — My Purchases lists agent buys. Read-only: the + # agent and controller own writes. + - apiGroups: ["obol.org"] + resources: ["purchaserequests", "purchaserequests/status"] + verbs: ["get", "list", "watch"] + # RegistrationRequest CRD — listing rows surface ERC-8004 registration + # state. Read-only: the controller owns writes. + - apiGroups: ["obol.org"] + resources: ["registrationrequests", "registrationrequests/status"] + verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/internal/erc8004/types.go b/internal/erc8004/types.go index 3e22d8c5..85463f51 100644 --- a/internal/erc8004/types.go +++ b/internal/erc8004/types.go @@ -33,11 +33,13 @@ const RegistrationType = "https://eips.ethereum.org/EIPS/eip-8004#registration-v // For OASF entries (name="OASF"), Skills and Domains provide machine-readable // taxonomy for agent discovery. 
See https://schema.oasf.outshift.com/ type ServiceDef struct { - Name string `json:"name"` // e.g., "web", "A2A", "MCP", "OASF" - Endpoint string `json:"endpoint,omitempty"` // full URL (omitempty for OASF entries) - Version string `json:"version,omitempty"` // protocol version (SHOULD per spec) - Skills []string `json:"skills,omitempty"` // OASF skill taxonomy paths - Domains []string `json:"domains,omitempty"` // OASF domain taxonomy paths + Name string `json:"name"` // e.g., "web", "A2A", "MCP", "OASF" + Endpoint string `json:"endpoint,omitempty"` // full URL (omitempty for OASF entries) + Version string `json:"version,omitempty"` // protocol version (SHOULD per spec) + Skills []string `json:"skills,omitempty"` // OASF skill taxonomy paths + Domains []string `json:"domains,omitempty"` // OASF domain taxonomy paths + Available *bool `json:"available,omitempty"` // false only while the service is draining + DrainEndsAt string `json:"drainEndsAt,omitempty"` // RFC3339 timestamp for draining services } // OnChainReg links the registration to its on-chain record. 
diff --git a/internal/erc8004/types_test.go b/internal/erc8004/types_test.go index 80bd025e..e8fdd9fb 100644 --- a/internal/erc8004/types_test.go +++ b/internal/erc8004/types_test.go @@ -179,6 +179,32 @@ func TestServiceDef_VersionOptional(t *testing.T) { } } +func TestServiceDef_DrainMetadataSerializesFalseAvailability(t *testing.T) { + available := false + svc := ServiceDef{ + Name: "web", + Endpoint: "https://example.com/services/demo", + Available: &available, + DrainEndsAt: "2026-05-24T12:00:00Z", + } + + data, err := json.Marshal(svc) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + + var m map[string]json.RawMessage + if err := json.Unmarshal(data, &m); err != nil { + t.Fatalf("unmarshal to map: %v", err) + } + if string(m["available"]) != "false" { + t.Fatalf("available = %s, want false in %s", m["available"], data) + } + if string(m["drainEndsAt"]) != `"2026-05-24T12:00:00Z"` { + t.Fatalf("drainEndsAt = %s, want timestamp in %s", m["drainEndsAt"], data) + } +} + func TestOnChainReg_AgentIDNumeric(t *testing.T) { reg := OnChainReg{ AgentID: 42, diff --git a/internal/serviceoffercontroller/controller.go b/internal/serviceoffercontroller/controller.go index 7cbd759b..8226f06a 100644 --- a/internal/serviceoffercontroller/controller.go +++ b/internal/serviceoffercontroller/controller.go @@ -439,8 +439,17 @@ func (c *Controller) reconcileOffer(ctx context.Context, key string) error { if !ready { setCondition(&status, "ModelReady", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") setCondition(&status, "UpstreamHealthy", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") - setCondition(&status, "PaymentGateReady", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") - setCondition(&status, "RoutePublished", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") + if offer.DrainExpired(time.Now()) { + if err := c.deleteRouteChildren(ctx, offer); err != nil { + return err + } + setCondition(&status, 
"Draining", "False", "Drained", fmt.Sprintf("Drain ended at %s; route torn down", offer.DrainEndsAt().UTC().Format(time.RFC3339))) + setCondition(&status, "PaymentGateReady", "False", "Drained", "Offer drained; payment gate removed") + setCondition(&status, "RoutePublished", "False", "Drained", "Offer drained; route removed") + } else { + setCondition(&status, "PaymentGateReady", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") + setCondition(&status, "RoutePublished", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") + } setCondition(&status, "Ready", "False", "WaitingForAgent", "Referenced Agent is not yet Ready") return c.updateOfferStatus(ctx, raw, status) } diff --git a/internal/serviceoffercontroller/identity_render.go b/internal/serviceoffercontroller/identity_render.go index 347d0faf..89623d36 100644 --- a/internal/serviceoffercontroller/identity_render.go +++ b/internal/serviceoffercontroller/identity_render.go @@ -150,10 +150,10 @@ func buildIdentityRegistrationServices(offers []*monetizeapi.ServiceOffer, baseU baseURL = strings.TrimRight(baseURL, "/") services := make([]erc8004.ServiceDef, 0, len(offers)*2) for _, offer := range offers { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: "web", Endpoint: baseURL + offer.EffectivePath(), - }) + })) if len(offer.Spec.Registration.Skills) > 0 || len(offer.Spec.Registration.Domains) > 0 { services = append(services, erc8004.ServiceDef{ Name: "OASF", @@ -163,11 +163,11 @@ func buildIdentityRegistrationServices(offers []*monetizeapi.ServiceOffer, baseU }) } for _, svc := range offer.Spec.Registration.Services { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: svc.Name, Endpoint: svc.Endpoint, Version: svc.Version, - }) + })) } } return services diff --git a/internal/serviceoffercontroller/render.go 
b/internal/serviceoffercontroller/render.go index 24586000..0f65fa89 100644 --- a/internal/serviceoffercontroller/render.go +++ b/internal/serviceoffercontroller/render.go @@ -726,10 +726,10 @@ func buildRegistrationServices(owner *monetizeapi.ServiceOffer, offers []*moneti services := make([]erc8004.ServiceDef, 0, len(ordered)*2) for _, offer := range ordered { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: "web", Endpoint: baseURL + offer.EffectivePath(), - }) + })) if len(offer.Spec.Registration.Skills) > 0 || len(offer.Spec.Registration.Domains) > 0 { services = append(services, erc8004.ServiceDef{ Name: "OASF", @@ -739,16 +739,26 @@ func buildRegistrationServices(owner *monetizeapi.ServiceOffer, offers []*moneti }) } for _, service := range offer.Spec.Registration.Services { - services = append(services, erc8004.ServiceDef{ + services = append(services, serviceDefWithDrain(offer, erc8004.ServiceDef{ Name: service.Name, Endpoint: service.Endpoint, Version: service.Version, - }) + })) } } return services } +func serviceDefWithDrain(offer *monetizeapi.ServiceOffer, svc erc8004.ServiceDef) erc8004.ServiceDef { + if offer == nil || !offer.IsDraining() || offer.DrainExpired(time.Now()) { + return svc + } + available := false + svc.Available = &available + svc.DrainEndsAt = offer.DrainEndsAt().UTC().Format(time.RFC3339) + return svc +} + // offerPublishedForRegistration reports whether an offer should appear // in the operator's ERC-8004 registration document as a live, gated // service. 
Draining offers stay in the document with available=false diff --git a/internal/serviceoffercontroller/render_test.go b/internal/serviceoffercontroller/render_test.go index 199e6721..286e1763 100644 --- a/internal/serviceoffercontroller/render_test.go +++ b/internal/serviceoffercontroller/render_test.go @@ -409,6 +409,80 @@ func TestBuildRegistrationServices_IncludesOwnerWhenOwnerNotYetPublished(t *test } } +func TestBuildRegistrationServices_IncludesDrainMetadata(t *testing.T) { + drainAt := metav1.NewTime(time.Now()) + grace := metav1.Duration{Duration: time.Hour} + offer := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "draining", Namespace: "demo"}, + Spec: monetizeapi.ServiceOfferSpec{ + Path: "/services/draining", + DrainAt: &drainAt, + DrainGracePeriod: &grace, + Registration: monetizeapi.ServiceOfferRegistration{ + Enabled: true, + Services: []monetizeapi.ServiceOfferService{ + {Name: "A2A", Endpoint: "https://example.com/a2a", Version: "0.2.1"}, + }, + }, + }, + Status: monetizeapi.ServiceOfferStatus{ + Conditions: []monetizeapi.Condition{ + {Type: "ModelReady", Status: "True"}, + {Type: "UpstreamHealthy", Status: "True"}, + {Type: "PaymentGateReady", Status: "True"}, + {Type: "RoutePublished", Status: "True"}, + }, + }, + } + + services := buildRegistrationServices(offer, []*monetizeapi.ServiceOffer{offer}, "https://example.com") + if len(services) != 2 { + t.Fatalf("services = %+v, want web + A2A", services) + } + for _, svc := range services { + if svc.Available == nil { + t.Fatalf("%s missing available=false drain marker: %+v", svc.Name, svc) + } + if *svc.Available { + t.Fatalf("%s available = true, want false during drain: %+v", svc.Name, svc) + } + if _, err := time.Parse(time.RFC3339, svc.DrainEndsAt); err != nil { + t.Fatalf("%s drainEndsAt = %q is not RFC3339: %v", svc.Name, svc.DrainEndsAt, err) + } + } +} + +func TestBuildIdentityRegistrationServices_IncludesDrainMetadata(t *testing.T) { + drainAt := 
metav1.NewTime(time.Now()) + grace := metav1.Duration{Duration: 30 * time.Minute} + offer := &monetizeapi.ServiceOffer{ + ObjectMeta: metav1.ObjectMeta{Name: "identity-drain", Namespace: "demo"}, + Spec: monetizeapi.ServiceOfferSpec{ + Path: "/services/identity-drain", + DrainAt: &drainAt, + DrainGracePeriod: &grace, + Registration: monetizeapi.ServiceOfferRegistration{ + Services: []monetizeapi.ServiceOfferService{ + {Name: "MCP", Endpoint: "https://example.com/mcp", Version: "2025-06-18"}, + }, + }, + }, + } + + services := buildIdentityRegistrationServices([]*monetizeapi.ServiceOffer{offer}, "https://example.com") + if len(services) != 2 { + t.Fatalf("services = %+v, want web + MCP", services) + } + for _, svc := range services { + if svc.Available == nil || *svc.Available { + t.Fatalf("%s missing available=false drain marker: %+v", svc.Name, svc) + } + if _, err := time.Parse(time.RFC3339, svc.DrainEndsAt); err != nil { + t.Fatalf("%s drainEndsAt = %q is not RFC3339: %v", svc.Name, svc.DrainEndsAt, err) + } + } +} + func TestBuildRegistrationConfigMap_PublishesAggregatedAgentRegistration(t *testing.T) { readyConditions := []monetizeapi.Condition{ {Type: "ModelReady", Status: "True"}, diff --git a/internal/x402/setup.go b/internal/x402/setup.go index 4811ceb6..8af3fe8e 100644 --- a/internal/x402/setup.go +++ b/internal/x402/setup.go @@ -92,9 +92,18 @@ func EnsureVerifier(cfg *config.Config) error { return fmt.Errorf("refresh infrastructure defaults: %w", err) } + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + snapshots, err := preserveMutableRuntimeConfigMaps(cfg, kubeconfigPath) + if err != nil { + return fmt.Errorf("snapshot mutable runtime configmaps: %w", err) + } + if err := helmfileSyncBaseRelease(cfg); err != nil { return fmt.Errorf("helmfile sync %s: %w", baseReleaseName, err) } + if err := restoreMutableRuntimeConfigMaps(cfg, kubeconfigPath, snapshots); err != nil { + return fmt.Errorf("restore mutable runtime configmaps: %w", err) 
+ } // Populate the CA bundle after deploying the verifier so TLS verification // of the facilitator works immediately. Idempotent — safe to call multiple times. @@ -103,6 +112,252 @@ func EnsureVerifier(cfg *config.Config) error { return nil } +type mutableConfigMapSnapshot struct { + Name string + Namespace string + Data map[string]string +} + +var mutableRuntimeConfigMaps = []mutableConfigMapSnapshot{ + {Name: "litellm-config", Namespace: "llm"}, + {Name: "x402-buyer-config", Namespace: "llm"}, + {Name: "x402-buyer-auths", Namespace: "llm"}, +} + +// preserveMutableRuntimeConfigMaps snapshots ConfigMaps whose data is mutated +// at runtime by `obol model setup`, PurchaseRequest reconciliation, or the +// buyer auth-pool flow. `EnsureVerifier` must sync the base release so the +// verifier uses canonical Helm ownership, but the base chart contains only +// bootstrap defaults for these objects. Without this snapshot/restore pass, +// `obol x402 setup` can erase configured models and buyer auth state. 
+func preserveMutableRuntimeConfigMaps(cfg *config.Config, kubeconfigPath string) ([]mutableConfigMapSnapshot, error) { + out := make([]mutableConfigMapSnapshot, 0, len(mutableRuntimeConfigMaps)) + for _, item := range mutableRuntimeConfigMaps { + data, found, err := readConfigMapData(cfg, kubeconfigPath, item.Namespace, item.Name) + if err != nil { + return nil, err + } + if !found || len(data) == 0 { + continue + } + out = append(out, mutableConfigMapSnapshot{Name: item.Name, Namespace: item.Namespace, Data: data}) + } + return out, nil +} + +func restoreMutableRuntimeConfigMaps(cfg *config.Config, kubeconfigPath string, snapshots []mutableConfigMapSnapshot) error { + for _, snap := range snapshots { + current, _, err := readConfigMapData(cfg, kubeconfigPath, snap.Namespace, snap.Name) + if err != nil { + return err + } + data, err := mergeRuntimeConfigMapData(snap.Name, current, snap.Data) + if err != nil { + return err + } + if len(data) == 0 { + continue + } + manifest, err := configMapDataManifest(snap.Namespace, snap.Name, data) + if err != nil { + return err + } + if err := kubectl.ApplyServerSideForceConflicts(filepath.Join(cfg.BinDir, "kubectl"), kubeconfigPath, manifest, "helm"); err != nil { + return err + } + } + return nil +} + +func readConfigMapData(cfg *config.Config, kubeconfigPath, namespace, name string) (map[string]string, bool, error) { + raw, err := kubectl.Output(filepath.Join(cfg.BinDir, "kubectl"), kubeconfigPath, + "get", "configmap", name, "-n", namespace, "-o", "json") + if err != nil { + if strings.Contains(err.Error(), "not found") || strings.Contains(err.Error(), "NotFound") { + return nil, false, nil + } + return nil, false, fmt.Errorf("get configmap %s/%s: %w", namespace, name, err) + } + var obj struct { + Data map[string]string `json:"data"` + } + if err := json.Unmarshal([]byte(raw), &obj); err != nil { + return nil, false, fmt.Errorf("parse configmap %s/%s: %w", namespace, name, err) + } + return obj.Data, true, nil +} + +func 
mergeRuntimeConfigMapData(name string, current, previous map[string]string) (map[string]string, error) { + if name == "litellm-config" { + currentRaw := current["config.yaml"] + previousRaw := previous["config.yaml"] + if strings.TrimSpace(previousRaw) == "" { + return current, nil + } + if strings.TrimSpace(currentRaw) == "" { + return previous, nil + } + merged, err := mergeLiteLLMConfig(currentRaw, previousRaw) + if err != nil { + return nil, err + } + out := copyStringMap(current) + out["config.yaml"] = merged + return out, nil + } + + out := copyStringMap(previous) + for k, v := range current { + out[k] = v + } + return out, nil +} + +func mergeLiteLLMConfig(currentRaw, previousRaw string) (string, error) { + var current map[string]any + if err := yaml.Unmarshal([]byte(currentRaw), &current); err != nil { + return "", fmt.Errorf("parse current LiteLLM config: %w", err) + } + if current == nil { + current = map[string]any{} + } + + var previous map[string]any + if err := yaml.Unmarshal([]byte(previousRaw), &previous); err != nil { + return "", fmt.Errorf("parse previous LiteLLM config: %w", err) + } + if previous == nil { + previous = map[string]any{} + } + + merged := copyAnyMap(previous) + for key, value := range current { + merged[key] = value + } + + models, err := mergeLiteLLMModelLists(current["model_list"], previous["model_list"]) + if err != nil { + return "", err + } + if len(models) > 0 { + merged["model_list"] = models + } + + for _, key := range []string{"general_settings", "litellm_settings"} { + if liteLLMValueEmpty(current[key]) && !liteLLMValueEmpty(previous[key]) { + merged[key] = previous[key] + } + } + + mergedRaw, err := yaml.Marshal(merged) + if err != nil { + return "", fmt.Errorf("serialize merged LiteLLM config: %w", err) + } + return string(mergedRaw), nil +} + +func mergeLiteLLMModelLists(currentRaw, previousRaw any) ([]any, error) { + current, err := liteLLMModelList(currentRaw) + if err != nil { + return nil, fmt.Errorf("parse current
LiteLLM model_list: %w", err) + } + previous, err := liteLLMModelList(previousRaw) + if err != nil { + return nil, fmt.Errorf("parse previous LiteLLM model_list: %w", err) + } + + merged := append([]any{}, current...) + byName := make(map[string]bool, len(current)) + for _, entry := range current { + if name := liteLLMModelName(entry); name != "" { + byName[name] = true + } + } + for _, entry := range previous { + name := liteLLMModelName(entry) + if name == "" { + continue + } + if byName[name] { + continue + } + byName[name] = true + merged = append(merged, entry) + } + return merged, nil +} + +func liteLLMModelList(value any) ([]any, error) { + if value == nil { + return nil, nil + } + list, ok := value.([]any) + if !ok { + return nil, fmt.Errorf("expected sequence, got %T", value) + } + return list, nil +} + +func liteLLMModelName(entry any) string { + switch typed := entry.(type) { + case map[string]any: + if name, ok := typed["model_name"].(string); ok { + return strings.TrimSpace(name) + } + case map[any]any: + if name, ok := typed["model_name"].(string); ok { + return strings.TrimSpace(name) + } + } + return "" +} + +func liteLLMValueEmpty(value any) bool { + switch typed := value.(type) { + case nil: + return true + case string: + return strings.TrimSpace(typed) == "" + case []any: + return len(typed) == 0 + case map[string]any: + return len(typed) == 0 + case map[any]any: + return len(typed) == 0 + default: + return false + } +} + +func configMapDataManifest(namespace, name string, data map[string]string) ([]byte, error) { + obj := map[string]any{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]string{ + "name": name, + "namespace": namespace, + }, + "data": data, + } + return yaml.Marshal(obj) +} + +func copyStringMap(in map[string]string) map[string]string { + out := make(map[string]string, len(in)) + for k, v := range in { + out[k] = v + } + return out +} + +func copyAnyMap(in map[string]any) map[string]any { + out := 
make(map[string]any, len(in)) + for k, v := range in { + out[k] = v + } + return out +} + // helmfileSyncBaseRelease runs `helmfile --selector name=base sync` // against the defaults helmfile rendered into $OBOL_CONFIG_DIR/defaults. // This is the same invocation pattern used by `internal/stack.syncDefaults` diff --git a/internal/x402/setup_runtime_config_test.go b/internal/x402/setup_runtime_config_test.go new file mode 100644 index 00000000..bc0f22e3 --- /dev/null +++ b/internal/x402/setup_runtime_config_test.go @@ -0,0 +1,103 @@ +package x402 + +import ( + "testing" + + "gopkg.in/yaml.v3" +) + +func TestMergeRuntimeConfigMapData_LiteLLMPreservesUserModels(t *testing.T) { + current := map[string]string{"config.yaml": ` +model_list: + - model_name: paid/* + litellm_params: + model: openai/* + api_base: http://127.0.0.1:8402/v1 + api_key: unused +general_settings: + master_key: os.environ/LITELLM_MASTER_KEY +`} + previous := map[string]string{"config.yaml": ` +model_list: + - model_name: paid/qwen36 + litellm_params: + model: openai/qwen36-apex-i-compact + api_base: http://silvermesh.v1337.lan:8081/v1 + api_key: unused +litellm_settings: + drop_params: true +`} + + merged, err := mergeRuntimeConfigMapData("litellm-config", current, previous) + if err != nil { + t.Fatalf("mergeRuntimeConfigMapData: %v", err) + } + + var parsed struct { + ModelList []struct { + ModelName string `yaml:"model_name"` + } `yaml:"model_list"` + GeneralSettings map[string]any `yaml:"general_settings"` + LiteLLMSettings map[string]any `yaml:"litellm_settings"` + } + if err := yaml.Unmarshal([]byte(merged["config.yaml"]), &parsed); err != nil { + t.Fatalf("parse merged yaml: %v\n%s", err, merged["config.yaml"]) + } + + got := map[string]bool{} + for _, entry := range parsed.ModelList { + got[entry.ModelName] = true + } + for _, want := range []string{"paid/*", "paid/qwen36"} { + if !got[want] { + t.Fatalf("merged config missing model %q:\n%s", want, merged["config.yaml"]) + } + } + if 
parsed.GeneralSettings["master_key"] == nil { + t.Fatalf("current general_settings should be preserved:\n%s", merged["config.yaml"]) + } + if parsed.LiteLLMSettings["drop_params"] == nil { + t.Fatalf("previous litellm_settings should be restored when current is empty:\n%s", merged["config.yaml"]) + } +} + +func TestMergeRuntimeConfigMapData_BuyerConfigPreservesRuntimeKeys(t *testing.T) { + current := map[string]string{"new.json": `{"new":true}`} + previous := map[string]string{ + "alice.json": `{"auths":["a"]}`, + "new.json": `{"old":true}`, + } + + merged, err := mergeRuntimeConfigMapData("x402-buyer-auths", current, previous) + if err != nil { + t.Fatalf("mergeRuntimeConfigMapData: %v", err) + } + if merged["alice.json"] != previous["alice.json"] { + t.Fatalf("runtime key was not preserved: %#v", merged) + } + if merged["new.json"] != current["new.json"] { + t.Fatalf("current key should win on conflicts: %#v", merged) + } +} + +func TestConfigMapDataManifest_RendersConfigMap(t *testing.T) { + manifest, err := configMapDataManifest("llm", "x402-buyer-config", map[string]string{ + "demo.json": `{"endpoint":"http://example"}`, + }) + if err != nil { + t.Fatalf("configMapDataManifest: %v", err) + } + + var parsed struct { + APIVersion string `yaml:"apiVersion"` + Kind string `yaml:"kind"` + Metadata map[string]string `yaml:"metadata"` + Data map[string]string `yaml:"data"` + } + if err := yaml.Unmarshal(manifest, &parsed); err != nil { + t.Fatalf("manifest is not yaml: %v\n%s", err, manifest) + } + if parsed.Kind != "ConfigMap" || parsed.Metadata["namespace"] != "llm" || parsed.Data["demo.json"] == "" { + t.Fatalf("unexpected manifest: %#v\n%s", parsed, manifest) + } +} From 82cbfae8a7f7624771ad9b270fdffe04d129e6e9 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 17:37:22 +0400 Subject: [PATCH 28/31] chore: remove pre-release migration script --- .github/release-template.md | 7 --- docs/upgrade-from-pre-pr-523.md | 82 ------------------------ 
hack/migrate-bedag-raw-to-base.sh | 101 ------------------------------ 3 files changed, 190 deletions(-) delete mode 100644 docs/upgrade-from-pre-pr-523.md delete mode 100755 hack/migrate-bedag-raw-to-base.sh diff --git a/.github/release-template.md b/.github/release-template.md index fd72746e..c67dc086 100644 --- a/.github/release-template.md +++ b/.github/release-template.md @@ -96,13 +96,6 @@ repositories or docs.] ## Breaking changes / Migration notes - [Delete this section if there are no breaking changes.] -- **Upgrading from a pre-PR #523 cluster**: PR #523 relocated six `bedag/raw` - helmfile releases into the `base` chart. Existing clusters must run - `bash hack/migrate-bedag-raw-to-base.sh` once before `obol stack up` to - transfer Helm ownership annotations; otherwise `helm upgrade base` fails - with `invalid ownership metadata`. See - [`docs/upgrade-from-pre-pr-523.md`](../docs/upgrade-from-pre-pr-523.md). - Fresh installs are unaffected. ## Known issues diff --git a/docs/upgrade-from-pre-pr-523.md b/docs/upgrade-from-pre-pr-523.md deleted file mode 100644 index 1b9b5bf7..00000000 --- a/docs/upgrade-from-pre-pr-523.md +++ /dev/null @@ -1,82 +0,0 @@ -# Upgrading clusters created before PR #523 - -PR [#523](https://github.com/ObolNetwork/obol-stack/pull/523) relocates six -`bedag/raw` helmfile releases into the `base` chart so the stack has one -source of truth for everything it ships in the `erpc`, `obol-frontend`, and -`llm` namespaces. - -**Fresh installs are unaffected.** This page only applies if you are -upgrading a cluster that was created **before** PR #523 was merged. 
- -## Symptom - -Running `obol stack up` on a pre-#523 cluster fails during `helm upgrade base` -with errors of the form: - -``` -Error: UPGRADE FAILED: exists and cannot be imported into the -current release: invalid ownership metadata; annotation validation error: -key "meta.helm.sh/release-name" must equal "base"; current value is -"" -``` - -Helm refuses to "adopt" resources owned by another release. About ten -resources are affected (Namespaces, HTTPRoutes, Middlewares, ConfigMaps, -PrometheusRule, PodMonitor, ClusterRole/Binding) — enough that hand-fixing -them is error prone. - -## When to run the migration script - -- **Run once**, **before** `obol stack up`, against any cluster created - before PR #523 merged. -- The script is **idempotent** — safe to re-run if `obol stack up` is - interrupted or if you migrate one cluster at a time. -- Fresh clusters (`obol stack init && obol stack up` on an empty machine) - do **not** need it. - -```bash -# Optional: point at a non-default kubeconfig -export KUBECONFIG="$HOME/.config/obol/kubeconfig.yaml" - -bash hack/migrate-bedag-raw-to-base.sh -obol stack up -``` - -## What the script does - -It re-annotates the affected resources so Helm treats them as members of -the `base` release: - -``` -meta.helm.sh/release-name=base -meta.helm.sh/release-namespace=kube-system -app.kubernetes.io/managed-by=Helm -``` - -It covers the legacy `bedag/raw` releases removed by PR #523: - -| Legacy release | Namespace | -|---|---| -| `obol-frontend-rbac` | `obol-frontend` | -| `obol-frontend-httproute` | `obol-frontend` | -| `erpc-httproute` | `erpc` | -| `erpc-x402-middleware` | `erpc` | -| `erpc-metadata` | `erpc` | -| `llm-buyer-podmonitor` | `llm` | -| `x402-verifier-podmonitor` | `x402` (partial-upgrade clusters from before PR #513 hardening) | - -It also adopts a small set of resources that may exist with no Helm -ownership at all (`namespace/erpc`, `namespace/obol-frontend`, -`prometheusrule/x402-verifier` in `x402`) so the next 
`helm upgrade base` -can manage them cleanly. - -## Verifying the migration - -After running the script, `obol stack up` should succeed without the -`invalid ownership metadata` errors. To spot-check a single resource: - -```bash -kubectl get httproute -n obol-frontend obol-frontend \ - -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}{"\n"}' -# → base -``` diff --git a/hack/migrate-bedag-raw-to-base.sh b/hack/migrate-bedag-raw-to-base.sh deleted file mode 100755 index 191f2821..00000000 --- a/hack/migrate-bedag-raw-to-base.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env bash -# Migrate resources from the legacy bedag/raw helmfile releases to the -# base chart that now owns them after obol-stack PR #523. -# -# Symptom this fixes: -# Error: UPGRADE FAILED: exists and cannot be imported -# into the current release: invalid ownership metadata -# -# Run once before `obol stack up` against any cluster deployed before -# PR #523 merged. -# -# Idempotent — safe to re-run. - -set -euo pipefail - -: "${KUBECONFIG:=$HOME/.config/obol/kubeconfig.yaml}" - -ORPHAN_RELEASES=( - obol-frontend-rbac - obol-frontend-httproute - erpc-httproute - erpc-x402-middleware - erpc-metadata - llm-buyer-podmonitor - x402-verifier-podmonitor # killed by PR #513's hardening; keep in case partial-upgrade clusters still have it -) - -migrate_one() { - local kind="$1" - local name="$2" - local namespace="${3:-}" - local current - - local resource="${kind}/${name}" - local target="$resource" - local -a ns_args=() - if [[ -n "$namespace" ]]; then - ns_args=(-n "$namespace") - target="$resource -n $namespace" - fi - - current=$(kubectl get "$resource" "${ns_args[@]}" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) - if [[ "$current" == "base" ]]; then - echo " $target: already on base, skipping" - return 0 - fi - if [[ -z "$current" ]]; then - echo " $target: no Helm metadata, adopting into base" - else - echo " $target: was on '$current', 
migrating to base" - fi - kubectl annotate "$resource" "${ns_args[@]}" \ - meta.helm.sh/release-name=base \ - meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null - kubectl label "$resource" "${ns_args[@]}" app.kubernetes.io/managed-by=Helm --overwrite >/dev/null -} - -echo "==> Scanning for resources owned by legacy bedag/raw releases..." -for release in "${ORPHAN_RELEASES[@]}"; do - echo "release: $release" - kubectl get all,clusterrole,clusterrolebinding,role,rolebinding,configmap,httproute,middleware,podmonitor,servicemonitor,prometheusrule,referencegrant,namespace \ - -A -o json 2>/dev/null \ - | jq -r --arg rel "$release" '.items[] - | select(.metadata.annotations["meta.helm.sh/release-name"] == $rel) - | [.kind, .metadata.name, (.metadata.namespace // "")] | @tsv' \ - | while IFS=$'\t' read -r kind name namespace; do - [[ -z "$kind" || -z "$name" ]] && continue - migrate_one "$kind" "$name" "$namespace" - done -done - -# Some resources were never Helm-owned (e.g. PrometheusRule x402-verifier may have -# been created via kubectl apply somewhere). Adopt them into base too if they exist -# in the namespaces base now owns. -echo "==> Adopting unowned resources base will now claim..." 
-declare -a UNOWNED_TARGETS=( - "namespace erpc " - "namespace obol-frontend " - "prometheusrule x402-verifier x402" -) -for target in "${UNOWNED_TARGETS[@]}"; do - IFS=$'\t' read -r kind name namespace <<< "$target" - resource="${kind}/${name}" - ns_args=() - display="$resource" - if [[ -n "$namespace" ]]; then - ns_args=(-n "$namespace") - display="$resource -n $namespace" - fi - if kubectl get "$resource" "${ns_args[@]}" >/dev/null 2>&1; then - owner=$(kubectl get "$resource" "${ns_args[@]}" -o jsonpath='{.metadata.annotations.meta\.helm\.sh/release-name}' 2>/dev/null || true) - if [[ -z "$owner" || "$owner" == "base" ]]; then - echo " $display: $([ -z "$owner" ] && echo "adopting" || echo "already base")" - kubectl annotate "$resource" "${ns_args[@]}" meta.helm.sh/release-name=base meta.helm.sh/release-namespace=kube-system --overwrite >/dev/null - kubectl label "$resource" "${ns_args[@]}" app.kubernetes.io/managed-by=Helm --overwrite >/dev/null - fi - fi -done - -echo "" -echo "✓ Migration complete. You may now run 'obol stack up'." From 94418dbc6bb21bd49cb724702e2b3c7d51078f80 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 17:39:17 +0400 Subject: [PATCH 29/31] docs: warn pre-release testers about stack reset --- .github/release-template.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/release-template.md b/.github/release-template.md index c67dc086..18643721 100644 --- a/.github/release-template.md +++ b/.github/release-template.md @@ -96,6 +96,18 @@ repositories or docs.] ## Breaking changes / Migration notes - [Delete this section if there are no breaking changes.] +- **Pre-release tester warning**: If you ran an unreleased marketplace or + chart-consolidation branch before this release, `obol stack up` may fail + with Helm `invalid ownership metadata` errors for resources that moved into + the `base` chart. This is not a supported production migration path. 
Back up + anything you need from the local test stack, then recreate it: + + ```bash + obol stack down + obol stack purge --force + obol stack init + obol stack up + ``` ## Known issues From 46189cd48348533d1a8d60cf0e8add5a4e5b89af Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 17:46:17 +0400 Subject: [PATCH 30/31] docs: clarify pre-release ownership warning --- .github/release-template.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/release-template.md b/.github/release-template.md index 18643721..7172a71b 100644 --- a/.github/release-template.md +++ b/.github/release-template.md @@ -98,9 +98,10 @@ repositories or docs.] - [Delete this section if there are no breaking changes.] - **Pre-release tester warning**: If you ran an unreleased marketplace or chart-consolidation branch before this release, `obol stack up` may fail - with Helm `invalid ownership metadata` errors for resources that moved into - the `base` chart. This is not a supported production migration path. Back up - anything you need from the local test stack, then recreate it: + with Helm `invalid ownership metadata` errors for resources or namespaces + that moved into the `base` chart. This is not a supported production + migration path. 
Back up anything you need from the local test stack, then + recreate it: ```bash obol stack down From 7453339d38709f27dd71f4aaeca285220ca0f2a3 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 18:06:34 +0400 Subject: [PATCH 31/31] ci: restrict workflow token permissions --- .github/workflows/helm-template-smoke.yml | 3 +++ .github/workflows/lint-test.yaml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/helm-template-smoke.yml b/.github/workflows/helm-template-smoke.yml index 9ee3fa57..9320e05a 100644 --- a/.github/workflows/helm-template-smoke.yml +++ b/.github/workflows/helm-template-smoke.yml @@ -12,6 +12,9 @@ on: - 'internal/embed/infrastructure/**' - '.github/workflows/helm-template-smoke.yml' +permissions: + contents: read + jobs: helm-template-smoke: name: helm template embedded chart diff --git a/.github/workflows/lint-test.yaml b/.github/workflows/lint-test.yaml index 34895b32..3c5ee256 100644 --- a/.github/workflows/lint-test.yaml +++ b/.github/workflows/lint-test.yaml @@ -4,6 +4,9 @@ on: pull_request: branches: [ main ] +permissions: + contents: read + jobs: lint-test: runs-on: ubuntu-latest