From 0b9a9b367ebc425cb25dcf8d90d0c4475d929db0 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 18:00:04 +0400 Subject: [PATCH 1/5] feat(x402): chain label on buyer + verifier metrics, sync PurchaseAutoRefill Unblocks per-chain earnings/spend aggregation in the frontend's My Listings and My Purchases pages. - obol_x402_buyer_* metrics now carry a `chain` label sourced from UpstreamConfig.Network (already in payload, just wasn't on the labels). - obol_x402_verifier_* metrics now carry a `chain` label sourced from RouteRule.Network. Existing verifier metric tests updated to assert the new label (empty string when no Network is set on the rule). - internal/monetizeapi/types.go PurchaseAutoRefill struct now mirrors the CRD spec (purchaserequest-crd.yaml lines 93-96) by including MaxTotal + MaxSpendPerDay. The CRD already accepts these, the Go types just weren't reading them. Together this means the frontend can soon switch the EarningsStrip / WalletStrip from zeroed placeholders to real PromQL aggregates such as: sum by (chain) (increase(obol_x402_buyer_payment_success_total[7d])) --- internal/monetizeapi/types.go | 8 +++++--- internal/x402/buyer/metrics.go | 18 +++++++++--------- internal/x402/buyer/proxy.go | 9 +++++---- internal/x402/metrics.go | 10 +++++----- internal/x402/verifier.go | 1 + internal/x402/verifier_test.go | 2 ++ 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/internal/monetizeapi/types.go b/internal/monetizeapi/types.go index 6e905eee..403cffdf 100644 --- a/internal/monetizeapi/types.go +++ b/internal/monetizeapi/types.go @@ -276,9 +276,11 @@ type PreSignedAuth struct { } type PurchaseAutoRefill struct { - Enabled bool `json:"enabled,omitempty"` - Threshold int `json:"threshold,omitempty"` - Count int `json:"count,omitempty"` + Enabled bool `json:"enabled,omitempty"` + Threshold int `json:"threshold,omitempty"` + Count int `json:"count,omitempty"` + MaxTotal int `json:"maxTotal,omitempty"` + MaxSpendPerDay string 
`json:"maxSpendPerDay,omitempty"` } type PurchasePayment struct { diff --git a/internal/x402/buyer/metrics.go b/internal/x402/buyer/metrics.go index 0df96424..5079f0a0 100644 --- a/internal/x402/buyer/metrics.go +++ b/internal/x402/buyer/metrics.go @@ -29,35 +29,35 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_requests_total", Help: "Total requests routed through the x402 buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentAttempts: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_attempts_total", Help: "Total x402 payment attempts made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_success_total", Help: "Total successful x402 payments made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_failure_total", Help: "Total failed x402 payments attempted by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), confirmSpendFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_confirm_spend_failure_total", Help: "Successful upstream responses whose consumed-auth state could not be persisted.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), // paymentUnsettledConfirmations counts the occurrences of an upstream // returning 2xx without X-PAYMENT-RESPONSE. 
The buyer still marks the @@ -69,28 +69,28 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_payment_unsettled_confirmations_total", Help: "Upstream 2xx responses with no X-PAYMENT-RESPONSE header — auth consumed locally without observed on-chain settlement.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authRemaining: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_remaining", Help: "Remaining pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authSpent: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_spent", Help: "Consumed pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), activeModelMappings: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_active_model_mappings", Help: "Active paid model mappings loaded in the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), } diff --git a/internal/x402/buyer/proxy.go b/internal/x402/buyer/proxy.go index afcb3d5c..b1b644e0 100644 --- a/internal/x402/buyer/proxy.go +++ b/internal/x402/buyer/proxy.go @@ -203,17 +203,18 @@ func (p *Proxy) syncMetricsLocked() { for name, upstream := range p.upstreams { signer := p.signers[name] - labels := prometheusLabels(name, upstream.remoteModel) + labels := prometheusLabels(name, upstream.remoteModel, upstream.config.Network) p.metrics.activeModelMappings.With(labels).Set(1) p.metrics.authRemaining.With(labels).Set(float64(signer.Remaining())) p.metrics.authSpent.With(labels).Set(float64(signer.Spent())) } } -func prometheusLabels(name, remoteModel string) map[string]string { +func prometheusLabels(name, remoteModel, chain string) map[string]string { return map[string]string{ "upstream": name, 
"remote_model": remoteModel, + "chain": chain, } } @@ -226,7 +227,7 @@ func (p *Proxy) buildUpstreamHandler(name, remoteModel string, cfg UpstreamConfi return nil, fmt.Errorf("parse upstream URL %q: %w", cfg.URL, err) } - labels := prometheusLabels(name, remoteModel) + labels := prometheusLabels(name, remoteModel, cfg.Network) rp := &httputil.ReverseProxy{ Rewrite: func(pr *httputil.ProxyRequest) { pr.SetURL(target) @@ -298,7 +299,7 @@ func (p *Proxy) handleModelRequest(w http.ResponseWriter, r *http.Request) { return io.NopCloser(bytes.NewReader(rewrittenBody)), nil } - labels := prometheusLabels(entry.name, remoteModel) + labels := prometheusLabels(entry.name, remoteModel, entry.config.Network) p.metrics.requestsTotal.With(labels).Inc() entry.handler.ServeHTTP(w, r) } diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 21d87b68..6be9d14a 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -25,35 +25,35 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), paymentRequired: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests 
rejected after a provided x402 payment failed verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"route", "offer_namespace", "offer_name", "chain"}, ), } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 65374ea4..c8d5252e 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -450,5 +450,6 @@ func prometheusLabels(rule *RouteRule) prometheus.Labels { "route": rule.Pattern, "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, + "chain": rule.Network, } } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 083604a0..d6bbbced 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -755,6 +755,7 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", } assertVerifierMetricValue(t, metrics["obol_x402_verifier_requests_total"], labels, 1) assertVerifierMetricValue(t, metrics["obol_x402_verifier_payment_required_total"], labels, 1) @@ -768,6 +769,7 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", } okFac := newMockFacilitator(t, mockFacilitatorOpts{}) From 08b303ea0e796e3447bf880be75537e3d458a56d Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 18:45:16 +0400 Subject: [PATCH 2/5] feat(x402): last-settlement gauge + verifier PodMonitor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the data loop for the frontend My Listings EarningsStrip + the "Last settlement" timestamp the 
design canvas wants. - New gauge obol_x402_verifier_last_payment_success_seconds, labeled by (route, offer_namespace, offer_name, chain). Stamped via SetToCurrentTime() in both ForwardAuth and proxy-mode paths whenever a paid request reaches the seller successfully. - helmfile.yaml grows an x402-verifier PodMonitor (the namespace was previously scraping only litellm-x402-buyer). Same release: monitoring label so kube-prometheus-stack picks it up. The frontend already has matching consumers (chargedSalesByOfferAndChain, chargedRequests24hByOffer, lastSettlementByOffer in PrometheusClient) — without this scrape the metrics never reach the dashboard. --- internal/embed/infrastructure/helmfile.yaml | 26 +++++++++++++++++++++ internal/x402/metrics.go | 19 +++++++++++---- internal/x402/verifier.go | 2 ++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index aa7fc052..d4df212d 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -70,6 +70,32 @@ releases: path: /metrics interval: 30s + - name: x402-verifier-podmonitor + namespace: x402 + createNamespace: false + chart: bedag/raw + version: 2.0.2 + needs: + - monitoring/monitoring + - kube-system/base + values: + - resources: + - apiVersion: monitoring.coreos.com/v1 + kind: PodMonitor + metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + spec: + selector: + matchLabels: + app: x402-verifier + podMetricsEndpoints: + - port: http + path: /metrics + interval: 30s + # Traefik ingress controller with Gateway API support # Traefik v38+ bundles Gateway API CRDs in its crds/ directory - name: traefik diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 6be9d14a..42266c2b 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -10,11 +10,12 @@ import ( type verifierMetrics struct { registry 
*prometheus.Registry - requestsTotal *prometheus.CounterVec - paymentRequired *prometheus.CounterVec - paymentVerified *prometheus.CounterVec - paymentFailed *prometheus.CounterVec - chargedRequests *prometheus.CounterVec + requestsTotal *prometheus.CounterVec + paymentRequired *prometheus.CounterVec + paymentVerified *prometheus.CounterVec + paymentFailed *prometheus.CounterVec + chargedRequests *prometheus.CounterVec + lastPaymentSuccess *prometheus.GaugeVec } func newVerifierMetrics() *verifierMetrics { @@ -55,6 +56,13 @@ func newVerifierMetrics() *verifierMetrics { }, []string{"route", "offer_namespace", "offer_name", "chain"}, ), + lastPaymentSuccess: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "obol_x402_verifier_last_payment_success_seconds", + Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", + }, + []string{"route", "offer_namespace", "offer_name", "chain"}, + ), } m.registry.MustRegister( @@ -63,6 +71,7 @@ func newVerifierMetrics() *verifierMetrics { m.paymentVerified, m.paymentFailed, m.chargedRequests, + m.lastPaymentSuccess, ) return m diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index c8d5252e..b4451508 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -144,6 +144,7 @@ func (v *Verifier) HandleVerify(w http.ResponseWriter, r *http.Request) { case tracker.status == http.StatusOK && r.Header.Get("X-Payment") != "": v.metrics.paymentVerified.With(labels).Inc() v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() case tracker.status == http.StatusPaymentRequired && r.Header.Get("X-Payment") != "": v.metrics.paymentFailed.With(labels).Inc() case tracker.status == http.StatusPaymentRequired: @@ -198,6 +199,7 @@ func (v *Verifier) HandleProxy(w http.ResponseWriter, r *http.Request) { v.metrics.paymentVerified.With(labels).Inc() if tracker.Header().Get("X-PAYMENT-RESPONSE") != "" { 
v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() } } } From da721d442b59b3773d2af80c72d57887af621f31 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 19:44:14 +0400 Subject: [PATCH 3/5] test(x402): cover new chain label + last-payment gauge + PurchaseAutoRefill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the test gap left open by the recent chain-label + last-settlement gauge work. 14 new subtests across three packages plus four pre-existing buyer-proxy assertions updated to carry the new chain label. New tests: - internal/x402/verifier_test.go TestVerifier_LastPaymentSuccessGauge (3 subtests): successful payment stamps gauge within ±5s of time.Now(), unpaid 402 leaves it untouched, rejected payment leaves it untouched. findVerifierMetricValue helper for time-window assertions. - internal/x402/buyer/metrics_test.go TestPrometheusLabels_ChainPropagation (3 subtests): base-sepolia / base mainnet / empty chain. TestMetrics_ChainLabelScrapeRoundtrip (2 subtests): scrape /metrics through the registry, assert every counter + gauge series carries the expected chain label. - internal/monetizeapi/types_test.go TestPurchaseAutoRefill_JSONRoundTrip (5 subtests): full population, only new caps, all-zero omitempty, single fields. TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm: catches json-tag drift between the Go struct and CRD spec. Pre-existing fix: - internal/x402/buyer/proxy_test.go — four TestProxy_* assertions had label maps without `chain`; tests use Network "base-sepolia" so the expected chain is now spelled out alongside upstream + remote_model. RBAC: - helmfile.yaml: obol-frontend ClusterRole grows read access for purchaserequests + purchaserequests/status (frontend My Purchases needs list; agent buy.py + controller remain the only writers). Live-patched into the running cluster too. 
--- internal/embed/infrastructure/helmfile.yaml | 6 + internal/monetizeapi/types_test.go | 108 +++++++++++ internal/x402/buyer/metrics_test.go | 191 ++++++++++++++++++++ internal/x402/buyer/proxy_test.go | 13 +- internal/x402/verifier_test.go | 109 +++++++++++ 5 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 internal/monetizeapi/types_test.go create mode 100644 internal/x402/buyer/metrics_test.go diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index d4df212d..fe6eb001 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -331,6 +331,12 @@ releases: - apiGroups: ["obol.org"] resources: ["serviceoffers", "serviceoffers/status"] verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — frontend My Purchases page lists buyer-side + # purchase records. Read-only; agent buy.py and the controller + # remain the only writers. + - apiGroups: ["obol.org"] + resources: ["purchaserequests", "purchaserequests/status"] + verbs: ["get", "list", "watch"] - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/internal/monetizeapi/types_test.go b/internal/monetizeapi/types_test.go new file mode 100644 index 00000000..77a15a90 --- /dev/null +++ b/internal/monetizeapi/types_test.go @@ -0,0 +1,108 @@ +package monetizeapi + +import ( + "encoding/json" + "testing" +) + +// TestPurchaseAutoRefill_JSONRoundTrip asserts every field on +// PurchaseAutoRefill marshals to JSON and unmarshals back without loss. The +// MaxTotal + MaxSpendPerDay fields were added to match the CRD spec; this test +// pins the wire format and `omitempty` semantics so silent drift between the +// Go struct and the CRD surfaces as a test failure. 
+func TestPurchaseAutoRefill_JSONRoundTrip(t *testing.T) { + tests := []struct { + name string + in PurchaseAutoRefill + wantJSON string + }{ + { + name: "all fields populated", + in: PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + }, + wantJSON: `{"enabled":true,"threshold":5,"count":10,"maxTotal":100,"maxSpendPerDay":"1.50"}`, + }, + { + name: "only enabled + new caps", + in: PurchaseAutoRefill{ + Enabled: true, + MaxTotal: 42, + MaxSpendPerDay: "0.05", + }, + wantJSON: `{"enabled":true,"maxTotal":42,"maxSpendPerDay":"0.05"}`, + }, + { + name: "zero values omit every field", + in: PurchaseAutoRefill{}, + wantJSON: `{}`, + }, + { + name: "MaxSpendPerDay alone", + in: PurchaseAutoRefill{ + MaxSpendPerDay: "0.0001", + }, + wantJSON: `{"maxSpendPerDay":"0.0001"}`, + }, + { + name: "MaxTotal alone", + in: PurchaseAutoRefill{ + MaxTotal: 7, + }, + wantJSON: `{"maxTotal":7}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotJSON, err := json.Marshal(tt.in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(gotJSON) != tt.wantJSON { + t.Fatalf("marshal:\n got: %s\nwant: %s", gotJSON, tt.wantJSON) + } + + var roundTripped PurchaseAutoRefill + if err := json.Unmarshal(gotJSON, &roundTripped); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if roundTripped != tt.in { + t.Fatalf("round-trip mismatch:\n got: %+v\nwant: %+v", roundTripped, tt.in) + } + }) + } +} + +// TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm asserts that a JSON document +// shaped like the CRD spec deserialises into every Go field — this is the +// inverse of the marshal direction and catches accidental json-tag drift. 
+func TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm(t *testing.T) { + const crdJSON = `{ + "enabled": true, + "threshold": 5, + "count": 10, + "maxTotal": 100, + "maxSpendPerDay": "1.50" + }` + + want := PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + } + + var got PurchaseAutoRefill + if err := json.Unmarshal([]byte(crdJSON), &got); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if got != want { + t.Fatalf("unmarshal mismatch:\n got: %+v\nwant: %+v", got, want) + } +} diff --git a/internal/x402/buyer/metrics_test.go b/internal/x402/buyer/metrics_test.go new file mode 100644 index 00000000..9ce4fabc --- /dev/null +++ b/internal/x402/buyer/metrics_test.go @@ -0,0 +1,191 @@ +package buyer + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" +) + +// TestPrometheusLabels_ChainPropagation asserts that prometheusLabels surfaces +// the `chain` label sourced from UpstreamConfig.Network so paid-request metrics +// can be partitioned by chain (base, base-sepolia, etc.). The empty-chain case +// is also exercised so the label is always rendered cleanly even when an +// upstream has no Network set. 
+func TestPrometheusLabels_ChainPropagation(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + want map[string]string + }{ + { + name: "base-sepolia chain propagates", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + want: map[string]string{ + "upstream": "upstream-a", + "remote_model": "qwen3.5:9b", + "chain": "base-sepolia", + }, + }, + { + name: "base mainnet chain propagates", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "base", + want: map[string]string{ + "upstream": "upstream-b", + "remote_model": "qwen3.5:4b", + "chain": "base", + }, + }, + { + name: "empty chain renders cleanly", + upstream: "upstream-c", + remoteModel: "qwen3.5:1b", + chain: "", + want: map[string]string{ + "upstream": "upstream-c", + "remote_model": "qwen3.5:1b", + "chain": "", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + if len(got) != len(tt.want) { + t.Fatalf("got %d labels, want %d (%v vs %v)", len(got), len(tt.want), got, tt.want) + } + for k, v := range tt.want { + if got[k] != v { + t.Errorf("label %q = %q, want %q", k, got[k], v) + } + } + }) + } +} + +// TestMetrics_ChainLabelScrapeRoundtrip increments each of the 9 buyer +// counters/gauges using prometheusLabels and then scrapes /metrics through the +// registry's handler, asserting the `chain` label appears (with the expected +// value) on every series. 
+func TestMetrics_ChainLabelScrapeRoundtrip(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + }{ + { + name: "base-sepolia label visible on every series", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + }, + { + name: "empty chain label is present and empty", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "", + }, + } + + // Every metric registered by newMetrics carries the same {upstream, + // remote_model, chain} label set. + wantFamilies := []string{ + "obol_x402_buyer_requests_total", + "obol_x402_buyer_payment_attempts_total", + "obol_x402_buyer_payment_success_total", + "obol_x402_buyer_payment_failure_total", + "obol_x402_buyer_confirm_spend_failure_total", + "obol_x402_buyer_payment_unsettled_confirmations_total", + "obol_x402_buyer_auth_remaining", + "obol_x402_buyer_auth_spent", + "obol_x402_buyer_active_model_mappings", + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := newMetrics() + labels := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + + // Counters: incremented once each. + m.requestsTotal.With(labels).Inc() + m.paymentAttempts.With(labels).Inc() + m.paymentSuccessTotal.With(labels).Inc() + m.paymentFailureTotal.With(labels).Inc() + m.confirmSpendFailureTotal.With(labels).Inc() + m.paymentUnsettledConfirmations.With(labels).Inc() + // Gauges: stamped with arbitrary non-zero values. 
+ m.authRemaining.With(labels).Set(7) + m.authSpent.With(labels).Set(3) + m.activeModelMappings.With(labels).Set(1) + + families := scrapeBuyerMetrics(t, m) + + wantLabels := map[string]string{ + "upstream": tt.upstream, + "remote_model": tt.remoteModel, + "chain": tt.chain, + } + for _, name := range wantFamilies { + fam, ok := families[name] + if !ok { + t.Errorf("missing metric family %s", name) + continue + } + if !buyerHasSeriesWithLabels(fam, wantLabels) { + t.Errorf("metric %s missing series with labels %v", name, wantLabels) + } + } + }) + } +} + +// scrapeBuyerMetrics renders the metrics registry through its HTTP handler and +// parses the Prometheus text exposition into a name → MetricFamily map. +func scrapeBuyerMetrics(t *testing.T, m *metrics) map[string]*dto.MetricFamily { + t.Helper() + + rec := httptest.NewRecorder() + m.handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil)) + if rec.Code != http.StatusOK { + t.Fatalf("metrics status = %d, want 200", rec.Code) + } + + var parser expfmt.TextParser + families, err := parser.TextToMetricFamilies(strings.NewReader(rec.Body.String())) + if err != nil { + t.Fatalf("parse metrics: %v", err) + } + return families +} + +// buyerHasSeriesWithLabels returns true iff `family` contains at least one +// series whose label set exactly equals `want`. 
+func buyerHasSeriesWithLabels(family *dto.MetricFamily, want map[string]string) bool { + for _, metric := range family.GetMetric() { + if len(metric.GetLabel()) != len(want) { + continue + } + match := true + for _, label := range metric.GetLabel() { + if want[label.GetName()] != label.GetValue() { + match = false + break + } + } + if match { + return true + } + } + return false +} diff --git a/internal/x402/buyer/proxy_test.go b/internal/x402/buyer/proxy_test.go index 55281c46..5b135720 100644 --- a/internal/x402/buyer/proxy_test.go +++ b/internal/x402/buyer/proxy_test.go @@ -1045,7 +1045,7 @@ func TestProxy_ModelRoutingAndMetrics(t *testing.T) { } metrics := scrapeMetricFamilies(t, proxy) - labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b"} + labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b", "chain": "base-sepolia"} assertMetricValue(t, metrics["obol_x402_buyer_requests_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_attempts_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_success_total"], labels, 1) @@ -1295,10 +1295,10 @@ func TestProxy_ReloadSkipsConsumedAuthsAndReplacesModelMapping(t *testing.T) { t.Fatalf("active model mapping series = %d, want 1", metricFamilyLen(activeMappings)) } - assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) - assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) + assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + 
assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) + assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) } func TestProxy_ReloadSamePurchasePreservesSpentAndAppendsAuthPool(t *testing.T) { @@ -1752,6 +1752,7 @@ func TestProxy_UpstreamSuccessNoSettlementHeader_IncrementsUnsettledMetric(t *te assertMetricValue(t, family, map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } @@ -1826,6 +1827,7 @@ func TestProxy_UpstreamSuccessWithSettlementHeader_DoesNotIncrementUnsettledMetr assertMetricMissing(t, metrics["obol_x402_buyer_payment_unsettled_confirmations_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }) } @@ -1901,6 +1903,7 @@ func TestProxy_ConfirmSpendFailure_IncrementsMetric(t *testing.T) { assertMetricValue(t, metrics["obol_x402_buyer_confirm_spend_failure_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index d6bbbced..6b97548d 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -10,6 +10,7 @@ import ( "strings" "sync/atomic" "testing" + "time" x402types "github.com/coinbase/x402/go/types" dto "github.com/prometheus/client_model/go" @@ -823,6 +824,114 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { assertVerifierMetricMissing(t, rejectMetrics["obol_x402_verifier_charged_requests_total"], labels) } +// TestVerifier_LastPaymentSuccessGauge asserts that the +// obol_x402_verifier_last_payment_success_seconds gauge is 
stamped to the +// current wall-clock time when a paid request succeeds, and is NOT touched +// when an unpaid request is rejected with 402. +// +// The gauge is labeled identically to the verifier counters; for this rule +// `chain` is the empty string because the test RouteRule has no Network set. +func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { + labels := map[string]string{ + "route": "/rpc/*", + "offer_namespace": "llm", + "offer_name": "paid-rpc", + "chain": "", + } + + tests := []struct { + name string + setPayment bool + rejectPayment bool + wantStatus int + wantGaugeFresh bool // assert gauge ~= now() + }{ + { + name: "successful paid request stamps gauge", + setPayment: true, + rejectPayment: false, + wantStatus: http.StatusOK, + wantGaugeFresh: true, + }, + { + name: "unpaid 402 leaves gauge untouched", + setPayment: false, + rejectPayment: false, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + { + name: "rejected payment leaves gauge untouched", + setPayment: true, + rejectPayment: true, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{rejectPayment: tt.rejectPayment}) + v := newTestVerifier(t, fac.URL, []RouteRule{{ + Pattern: "/rpc/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "paid-rpc", + }}) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/rpc/mainnet") + req.Header.Set("X-Forwarded-Host", "obol.stack") + if tt.setPayment { + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + } + + before := time.Now().Unix() + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + after := time.Now().Unix() + + if rec.Code != tt.wantStatus { + t.Fatalf("status = %d, want %d", rec.Code, tt.wantStatus) + } + + families := scrapeVerifierMetrics(t, v) + gauge := 
families["obol_x402_verifier_last_payment_success_seconds"] + + if !tt.wantGaugeFresh { + // Either the family is absent (no series emitted) or no + // series exists for these labels — both are acceptable for + // an untouched gauge. + assertVerifierMetricMissing(t, gauge, labels) + return + } + + if gauge == nil { + t.Fatalf("missing metric family obol_x402_verifier_last_payment_success_seconds") + } + got := findVerifierMetricValue(t, gauge, labels) + // Allow ±5s slack for clock skew / slow CI. + if got < float64(before-5) || got > float64(after+5) { + t.Fatalf("gauge = %v, want within [%d, %d]", got, before-5, after+5) + } + }) + } +} + +// findVerifierMetricValue returns the value of the series in `family` whose +// labels match `wantLabels` exactly, failing the test if no such series exists. +func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 { + t.Helper() + + for _, metric := range family.GetMetric() { + if verifierLabelsMatch(metric, wantLabels) { + return verifierMetricValue(metric) + } + } + t.Fatalf("metric %s missing labels %v", family.GetName(), wantLabels) + return 0 +} + func scrapeVerifierMetrics(t *testing.T, v *Verifier) map[string]*dto.MetricFamily { t.Helper() From 27e1ac594dc53f4b0736bd566efa5fa4219b063f Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 21:28:14 +0400 Subject: [PATCH 4/5] chore(x402): RBAC trim, ServiceMonitor, relocate monitoring YAML, recording rules Phase 1 + Phase 2 hardening on top of the chain-label/last-settlement work, incorporating findings from the 4-agent K8s architecture review. Skips the auth-on-mutating-endpoints item per operator clarification: the obol-stack frontend is local-only behind the obol.stack hostname restriction, so it's not the primary trust boundary. RBAC trims: - Drop `secrets get/list` from obol-frontend-openclaw-discovery ClusterRole; pre-existing dangling grant, no code reads them. 
- Drop /status subresource from purchaserequests rule; frontend never writes status (only the controller does). Monitoring + RBAC co-location (kills 3 bedag/raw helmfile releases): - x402-verifier: PodMonitor -> ServiceMonitor in base/templates/x402.yaml. Verifier has a stable Service on port http:8080; ServiceMonitor scrapes the endpoint cleanly across replicas. - litellm-x402-buyer: PodMonitor moved into base/templates/llm.yaml. Stays a PodMonitor because the sidecar's port 8402 is per-pod, not fronted by a Service. - obol-frontend RBAC moved into base/templates/obol-frontend-rbac.yaml next to the workload it grants. Label cardinality: - Drop `route` label from verifier metrics. (offer_namespace, offer_name, chain) already uniquely scopes a paid route; `route` (= rule.Pattern) was redundant and unbounded by path fragments. PrometheusRule (new base/templates/x402-prometheus-rules.yaml): - Recording: x402:revenue:24h_by_offer_chain, x402:revenue:7d_by_offer_chain, x402:revenue:lifetime_by_offer, x402:settlement_rate:1h_by_offer_chain. The frontend's PrometheusClient reads these so renaming raw metrics no longer breaks the UI, and the `increase()` 2-sample minimum no longer leaves cold offers at "0" for the first 30s of traffic. - Alerting: X402PaymentFailureRateHigh (>10% over 1h), X402NoSettlementsAfterChallenge (402s issued, no charges). Deferred (out of scope for this hardening pass): - Frontend-egress NetworkPolicy: on k3s + Flannel the apiserver Service endpoints point at the host process, outside the cluster pod/service CIDRs. A clean allowlist policy can't target the apiserver portably without an install-specific ipBlock; revisit when obol-stack ships a non-k3s deployment surface. - obol-marketplace-api aggregator service: overkill for the local single-operator context. - Three-deployment-paths consolidation (helmfile + bedag/raw + Go `EnsureVerifier`): larger refactor; tracked as separate workstream. 
Live validation: - 2 paid requests against demo-hello survive both the RBAC trims and the ServiceMonitor swap. `x402:revenue:7d_by_offer_chain` returns 1.0076 for chain=eip155:84532 (matches the underlying obol_x402_verifier_charged_requests_total counter at value 2 over 2 samples). - /api/marketplace/purchases still returns 200 after dropping the /status grant. - /api/agents/wallets returns the agent wallet via the new batched listAllWalletMetadata path (1 ConfigMap list vs N+1 per-instance). --- .../infrastructure/base/templates/llm.yaml | 24 +++ .../base/templates/obol-frontend-rbac.yaml | 53 +++++++ .../base/templates/x402-prometheus-rules.yaml | 139 ++++++++++++++++++ .../infrastructure/base/templates/x402.yaml | 26 ++++ internal/embed/infrastructure/helmfile.yaml | 111 ++------------ internal/x402/metrics.go | 12 +- internal/x402/verifier.go | 5 +- internal/x402/verifier_test.go | 3 - 8 files changed, 266 insertions(+), 107 deletions(-) create mode 100644 internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml create mode 100644 internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..18982349 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -298,3 +298,27 @@ spec: port: 4000 targetPort: http protocol: TCP + +--- +# PodMonitor for the x402-buyer sidecar — kept as PodMonitor (not +# ServiceMonitor) because the sidecar listens on a per-pod port (8402) +# that is NOT exposed via the litellm Service. Lives alongside the +# Deployment that hosts it so changing the buyer port here is one edit. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label. 
+apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: litellm-x402-buyer + namespace: llm + labels: + release: monitoring + app: litellm +spec: + selector: + matchLabels: + app: litellm + podMetricsEndpoints: + - port: buyer-http + path: /metrics + interval: 30s diff --git a/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml new file mode 100644 index 00000000..038df594 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml @@ -0,0 +1,53 @@ +--- +# RBAC for the obol-frontend pod's ServiceAccount. +# +# The frontend pod uses this SA's bearer token to: +# - Discover OpenClaw / Hermes instances (namespaces, pods, configmaps) +# - List + mutate ServiceOffer CRs (sell-modal + pause/resume/delete row actions) +# - List PurchaseRequest CRs (My Purchases page; never writes) +# +# The frontend is local-only behind the obol.stack hostname restriction +# (the operator owns the cluster), so this is a single trust boundary. +# Defense-in-depth note: the `secrets` rule is intentionally omitted — +# no code path reads them and the SA token shouldn't have that reach. +# /status subresources are omitted from PurchaseRequest because the +# controller is the only writer. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods", "configmaps"] + verbs: ["get", "list"] + # ServiceOffer CRD — frontend sell modal creates offers, row actions + # pause/resume (annotation patch) and delete. + - apiGroups: ["obol.org"] + resources: ["serviceoffers", "serviceoffers/status"] + verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — frontend My Purchases page lists buyer-side + # records. 
Read-only; agent buy.py and the controller are the writers. + - apiGroups: ["obol.org"] + resources: ["purchaserequests"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: obol-frontend-openclaw-discovery +subjects: + - kind: ServiceAccount + name: obol-frontend + namespace: obol-frontend diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml new file mode 100644 index 00000000..73b10f94 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -0,0 +1,139 @@ +--- +# Recording + alerting rules for x402 verifier traffic. +# +# Recording rules pre-aggregate the queries that the frontend's +# /api/sell/list joins use (chargedSalesByOfferAndChain, +# chargedRequests24hByOffer). The frontend reads the recorded series +# directly, which: +# * removes the `increase()` 2-sample minimum quirk (cold offers no +# longer show "0" for the first 30s after they receive traffic), +# * decouples the page from raw metric names (renaming +# obol_x402_verifier_charged_requests_total no longer breaks the UI), +# * cuts query cost on dashboards / page reloads (sum is done once at +# evaluation time, not per page-load). +# +# Alerting rules surface the two operator-meaningful failure modes the +# release-smoke flows historically caught manually. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl). 
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: x402-verifier
+  namespace: x402
+  labels:
+    release: monitoring
+    app: x402-verifier
+spec:
+  groups:
+    - name: x402.recording
+      interval: 30s
+      rules:
+        # 24h charged-request count per (offer, chain). Replaces the
+        # frontend's `increase(charged_requests_total[24h])` query — same
+        # math, pre-computed every 30s.
+        - record: x402:revenue:24h_by_offer_chain
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              increase(obol_x402_verifier_charged_requests_total[24h])
+            )
+
+        # 7d charged-request count per (offer, chain). Powers the
+        # EarningsStrip per-chain × CRD price multiplication.
+        - record: x402:revenue:7d_by_offer_chain
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              increase(obol_x402_verifier_charged_requests_total[7d])
+            )
+
+        # Lifetime charged-request count per offer (sum across replicas
+        # + chains). Used in the My Listings "today · X earned" header
+        # text and the Browse catalog usage badge.
+        - record: x402:revenue:lifetime_by_offer
+          expr: |
+            sum by (offer_namespace, offer_name) (
+              obol_x402_verifier_charged_requests_total
+            )
+
+        # Settlement rate (verified / attempted) over the last hour, per (offer, chain), from request counts so
+        # clamp_min floors at 1 request (not 1 req/s); the `outcome`-tagged `or` keeps outcomes whose counter has no series yet.
+        - record: x402:settlement_rate:1h_by_offer_chain
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              increase(obol_x402_verifier_payment_verified_total[1h])
+            )
+            /
+            clamp_min(
+              sum by (offer_namespace, offer_name, chain) (
+                label_replace(increase(obol_x402_verifier_payment_required_total[1h]), "outcome", "required", "", "")
+                or
+                label_replace(increase(obol_x402_verifier_payment_verified_total[1h]), "outcome", "verified", "", "")
+                or
+                label_replace(increase(obol_x402_verifier_payment_failed_total[1h]), "outcome", "failed", "", "")
+              ),
+              1
+            )
+
+    - name: x402.alerting
+      rules:
+        # Payment-failure ratio on a paid route crossed 10% of attempts over
+        # the last hour. Computed on request counts (floored at one attempt)
+        # and still evaluates when no payment has verified yet. Typical cause:
+        # facilitator unreachable, chain pruning, or seller's CA bundle missing (CLAUDE.md pitfall #8).
+        - alert: X402PaymentFailureRateHigh
+          expr: |
+            (
+              sum by (offer_namespace, offer_name, chain) (
+                increase(obol_x402_verifier_payment_failed_total[1h])
+              )
+              /
+              clamp_min(
+                sum by (offer_namespace, offer_name, chain) (
+                  label_replace(increase(obol_x402_verifier_payment_failed_total[1h]), "outcome", "failed", "", "")
+                  or
+                  label_replace(increase(obol_x402_verifier_payment_verified_total[1h]), "outcome", "verified", "", "")
+                ),
+                1
+              )
+            ) > 0.10
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "x402 payment failures > 10% on {{ $labels.offer_namespace }}/{{ $labels.offer_name }} ({{ $labels.chain }})"
+            description: |
+              More than 10% of paid requests to
+              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} on
+              {{ $labels.chain }} have failed verification over the last
+              hour. Check the verifier logs for x509/facilitator errors and
+              the seller's `ca-certificates` ConfigMap.
+
+        # An offer received a 402 (payment_required) within the last hour
+        # but no charged_requests happened in the same window (`unless`, so an
+        # offer whose charged counter has no series yet still matches). Either
+        # buyers aren't completing the flow, or settlement is broken downstream of the verifier.
+        - alert: X402NoSettlementsAfterChallenge
+          expr: |
+            (
+              sum by (offer_namespace, offer_name) (
+                increase(obol_x402_verifier_payment_required_total[1h])
+              ) > 0
+            )
+            unless
+            (
+              sum by (offer_namespace, offer_name) (
+                increase(obol_x402_verifier_charged_requests_total[1h])
+              ) > 0
+            )
+          for: 30m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ $labels.offer_namespace }}/{{ $labels.offer_name }} returns 402 but never settles"
+            description: |
+              The x402 verifier issued 402 responses for
+              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} in the
+              last hour but observed no settled requests. Check the buyer
+              sidecar's auth pool (/status) and the facilitator's settlement
+              endpoint.
diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml index 9dcc933e..38482384 100644 --- a/internal/embed/infrastructure/base/templates/x402.yaml +++ b/internal/embed/infrastructure/base/templates/x402.yaml @@ -332,3 +332,29 @@ spec: selector: matchLabels: app: x402-verifier + +--- +# ServiceMonitor for x402-verifier — scrapes the stable Service endpoint +# rather than per-pod IPs (which is what a PodMonitor would do). Lives +# alongside the Service it observes so adding/changing the port or +# selector here is a single-file change. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl as the serviceMonitorSelector). +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + app: x402-verifier +spec: + selector: + matchLabels: + app: x402-verifier + endpoints: + - port: http + path: /metrics + interval: 30s + scrapeTimeout: 10s diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index fe6eb001..95f7c8b5 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -44,57 +44,12 @@ releases: values: - ./values/monitoring.yaml.gotmpl - - name: llm-buyer-podmonitor - namespace: llm - createNamespace: true - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: litellm-x402-buyer - namespace: llm - labels: - release: monitoring - spec: - selector: - matchLabels: - app: litellm - podMetricsEndpoints: - - port: buyer-http - path: /metrics - interval: 30s - - - name: x402-verifier-podmonitor - namespace: x402 - createNamespace: false - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - 
kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: x402-verifier - namespace: x402 - labels: - release: monitoring - spec: - selector: - matchLabels: - app: x402-verifier - podMetricsEndpoints: - - port: http - path: /metrics - interval: 30s + # NOTE: PodMonitor for litellm-x402-buyer and ServiceMonitor for + # x402-verifier moved into base/templates/llm.yaml and + # base/templates/x402.yaml respectively. They live alongside the + # workloads they observe so a port/selector edit is one-file. Kills + # two `bedag/raw` releases. kube-prometheus-stack picks them up via + # the `release: monitoring` label. # Traefik ingress controller with Gateway API support # Traefik v38+ bundles Gateway API CRDs in its crds/ directory @@ -305,49 +260,11 @@ releases: - name: obol-frontend-obol-app port: 3000 - # Obol Frontend RBAC (OpenClaw instance discovery via Kubernetes API) - - name: obol-frontend-rbac - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRole - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - rules: - - apiGroups: [""] - resources: ["namespaces"] - verbs: ["get", "list"] - - apiGroups: [""] - resources: ["pods", "configmaps", "secrets"] - verbs: ["get", "list"] - # ServiceOffer CRD — frontend sell modal creates offers - - apiGroups: ["obol.org"] - resources: ["serviceoffers", "serviceoffers/status"] - verbs: ["get", "list", "create", "update", "patch", "delete"] - # PurchaseRequest CRD — frontend My Purchases page lists buyer-side - # purchase records. Read-only; agent buy.py and the controller - # remain the only writers. 
- - apiGroups: ["obol.org"] - resources: ["purchaserequests", "purchaserequests/status"] - verbs: ["get", "list", "watch"] - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: obol-frontend-openclaw-discovery - subjects: - - kind: ServiceAccount - name: obol-frontend - namespace: obol-frontend + # NOTE: obol-frontend-rbac ClusterRole + ClusterRoleBinding moved into + # base/templates/obol-frontend-rbac.yaml. Co-located with the workload + # they grant; kills a `bedag/raw` release. Frontend-egress NetworkPolicy + # was attempted and reverted — on k3s + Flannel (k3d's default CNI) the + # kubernetes apiserver Service Endpoints point at the host process, + # outside the cluster pod/service CIDRs. A clean allowlist can't target + # the apiserver portably without an install-specific ipBlock for the k3s + # host IP. Tracking as a deferred hardening item. 
diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 42266c2b..8734a58d 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -26,42 +26,42 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentRequired: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests rejected after a provided x402 payment failed verification.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + []string{"offer_namespace", "offer_name", "chain"}, ), lastPaymentSuccess: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_verifier_last_payment_success_seconds", Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", }, - []string{"route", "offer_namespace", "offer_name", "chain"}, + 
[]string{"offer_namespace", "offer_name", "chain"}, ), } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index b4451508..38437ec3 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -448,8 +448,11 @@ func (r *statusRecorder) WriteHeader(status int) { } func prometheusLabels(rule *RouteRule) prometheus.Labels { + // `route` (= rule.Pattern) was dropped in favor of (offer_namespace, + // offer_name) which already uniquely identifies a paid route — the + // pattern was redundant and unbounded by path fragments, which would + // have ballooned series count for sellers running many granular routes. return prometheus.Labels{ - "route": rule.Pattern, "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, "chain": rule.Network, diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 6b97548d..4bfd46a4 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -753,7 +753,6 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { metrics := scrapeVerifierMetrics(t, v) labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", @@ -767,7 +766,6 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", @@ -833,7 +831,6 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { // `chain` is the empty string because the test RouteRule has no Network set. 
func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", "chain": "", From 0fbb99ae93c60975f64beed49b92e4802b0bbac5 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sat, 23 May 2026 21:53:16 +0400 Subject: [PATCH 5/5] fix(x402): GC verifier metric series for deleted offers The verifier's per-offer counters and the last_payment_success_seconds gauge were created on first use and never removed. Deleting an offer (via `obol sell delete`, ServiceOffer CR deletion, or pricing config edit) left stale series in the registry forever, which: * pollutes My Listings / dashboards with rows for offers that no longer exist, * lets X402NoSettlementsAfterChallenge keep referencing dead labels, * silently inflates the "last successful charge" gauge with timestamps from offers the operator already retired. Verifier.load() now diffs the incoming route set against the live label tuples in the registry and calls DeletePartialMatch on each vec for every (offer_namespace, offer_name, chain) triple that is no longer served. Both reload paths (file config watcher and the kube ServiceOffer informer via ConfigAccumulator) funnel through load(), so one hook covers everything. Also fixes a guard test from the prior hardening commit that was still asserting the old "no ServiceMonitor here" invariant after we intentionally relocated the ServiceMonitor into this manifest. Flipped to assert presence so a future cleanup can't silently drop it. Test: TestVerifier_Reload_PrunesDeletedOfferSeries stamps two offers' worth of metrics, reloads with one removed, and asserts the removed offer is gone from all six vecs while the kept offer survives. 
--- internal/x402/metrics.go | 58 ++++++++++++++++++++++++ internal/x402/setup_test.go | 8 +++- internal/x402/verifier.go | 14 ++++++ internal/x402/verifier_test.go | 83 ++++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 2 deletions(-) diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 8734a58d..b445d4c3 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -80,3 +80,61 @@ func newVerifierMetrics() *verifierMetrics { func (m *verifierMetrics) handler() http.Handler { return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{}) } + +// pruneSeriesNotIn drops every (offer_namespace, offer_name, chain) series +// from the verifier's counter/gauge vecs that is not present in `keep`. +// Called from Verifier.load whenever the route set changes so deleted offers +// (e.g. `obol sell delete`) stop emitting stale series — most importantly the +// last_payment_success_seconds gauge, which would otherwise hold the deleted +// offer's last-success timestamp forever and falsely satisfy "recent activity" +// alerts and dashboards. +// +// Key shape: "ns\x00name\x00chain" — \x00 is forbidden in Kubernetes object +// names and CAIP-2 chain ids, so the byte-join can't collide. 
+func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) {
+	gathered, err := m.registry.Gather()
+	if err != nil {
+		// Best effort: without a snapshot there is nothing to diff against,
+		// so any stale series stay until a later call succeeds.
+		return
+	}
+
+	// Pass 1: collect every label tuple in the snapshot that `keep` does
+	// not list. Keyed by the same "ns\x00name\x00chain" join as `keep`, so a
+	// tuple seen in several families is recorded once.
+	stale := make(map[string]prometheus.Labels)
+	for _, family := range gathered {
+		for _, series := range family.GetMetric() {
+			var ns, name, chain string
+			for _, pair := range series.GetLabel() {
+				switch pair.GetName() {
+				case "offer_namespace":
+					ns = pair.GetValue()
+				case "offer_name":
+					name = pair.GetValue()
+				case "chain":
+					chain = pair.GetValue()
+				}
+			}
+			key := ns + "\x00" + name + "\x00" + chain
+			if _, live := keep[key]; live || (ns == "" && name == "") {
+				continue
+			}
+			stale[key] = prometheus.Labels{
+				"offer_namespace": ns,
+				"offer_name":      name,
+				"chain":           chain,
+			}
+		}
+	}
+
+	// Pass 2: drop each stale tuple from all six vecs.
+	for _, match := range stale {
+		m.requestsTotal.DeletePartialMatch(match)
+		m.paymentRequired.DeletePartialMatch(match)
+		m.paymentVerified.DeletePartialMatch(match)
+		m.paymentFailed.DeletePartialMatch(match)
+		m.chargedRequests.DeletePartialMatch(match)
+		m.lastPaymentSuccess.DeletePartialMatch(match)
+	}
+}
diff --git a/internal/x402/setup_test.go b/internal/x402/setup_test.go
index ff8b7652..7dba813e 100644
--- a/internal/x402/setup_test.go
+++ b/internal/x402/setup_test.go
@@ -258,7 +258,11 @@ func TestX402Manifest_UsesServiceOfferControllerModel(t *testing.T) {
 	if !strings.Contains(manifest, "resources: [\"serviceoffers\"]") {
 		t.Fatalf("x402 manifest missing serviceoffer watch RBAC:\n%s", manifest)
 	}
-	if strings.Contains(manifest, "kind: ServiceMonitor") {
-		t.Fatalf("x402 manifest still includes legacy ServiceMonitor stanza:\n%s", manifest)
+	// ServiceMonitor now lives in this manifest by design — relocated here
+	// from a bedag/raw helmfile release so the scrape config sits next to
+	// the Service it observes. Assert presence so a future cleanup can't
+	// silently drop it.
+ if !strings.Contains(manifest, "kind: ServiceMonitor") { + t.Fatalf("x402 manifest missing ServiceMonitor (relocated from bedag/raw helmfile in PR #513 hardening):\n%s", manifest) } } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 38437ec3..60e2fa80 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -64,6 +64,20 @@ func (v *Verifier) load(cfg *PricingConfig) error { v.chains.Store(&chains) v.config.Store(cfg) + // Drop metric series for offers that are no longer in the route set. + // Without this, deleting an offer leaves its counters + last-success + // gauge in the registry forever, polluting dashboards and silently + // keeping alerts (e.g. "no settlements after challenge") tied to dead + // labels. + live := make(map[string]struct{}, len(cfg.Routes)) + for _, r := range cfg.Routes { + if r.OfferNamespace == "" && r.OfferName == "" { + continue + } + live[r.OfferNamespace+"\x00"+r.OfferName+"\x00"+r.Network] = struct{}{} + } + v.metrics.pruneSeriesNotIn(live) + return nil } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 4bfd46a4..3b62c815 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -915,6 +915,89 @@ func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { } } +// TestVerifier_Reload_PrunesDeletedOfferSeries asserts that when an offer is +// removed from the route set (via Reload, the same path used by both the +// file-config watcher and the kube ServiceOffer informer), its previously +// stamped metric series are dropped from the registry. Without this, deleted +// offers' last_payment_success_seconds gauge would survive forever and keep +// firing/silencing alerts on dead labels. 
+func TestVerifier_Reload_PrunesDeletedOfferSeries(t *testing.T) {
+	fac := newMockFacilitator(t, mockFacilitatorOpts{})
+	keptRoute := RouteRule{
+		Pattern:        "/keep/*",
+		Price:          "0.0001",
+		OfferNamespace: "llm",
+		OfferName:      "keep",
+	}
+	removedRoute := RouteRule{
+		Pattern:        "/gone/*",
+		Price:          "0.0001",
+		OfferNamespace: "llm",
+		OfferName:      "gone",
+	}
+	v := newTestVerifier(t, fac.URL, []RouteRule{keptRoute, removedRoute})
+
+	// Stamp metrics for both offers with a successful paid request each.
+	for _, path := range []string{"/keep/x", "/gone/x"} {
+		req := httptest.NewRequest(http.MethodPost, "/verify", nil)
+		req.Header.Set("X-Forwarded-Uri", path)
+		req.Header.Set("X-Forwarded-Host", "obol.stack")
+		req.Header.Set("X-PAYMENT", testPaymentHeader(t))
+		rec := httptest.NewRecorder()
+		v.HandleVerify(rec, req)
+		if rec.Code != http.StatusOK {
+			t.Fatalf("setup paid request to %s: status=%d", path, rec.Code)
+		}
+	}
+
+	keptLabels := map[string]string{"offer_namespace": "llm", "offer_name": "keep", "chain": ""}
+	goneLabels := map[string]string{"offer_namespace": "llm", "offer_name": "gone", "chain": ""}
+
+	families := scrapeVerifierMetrics(t, v)
+	for _, name := range []string{
+		"obol_x402_verifier_charged_requests_total",
+		"obol_x402_verifier_last_payment_success_seconds",
+	} {
+		family := families[name]
+		if family == nil {
+			t.Fatalf("baseline: missing %s before reload", name)
+		}
+		findVerifierMetricValue(t, family, keptLabels)
+		findVerifierMetricValue(t, family, goneLabels)
+	}
+
+	// Reload with the second offer dropped — the same path ServiceOffer
+	// deletion takes through ConfigAccumulator.SetRoutes.
+	if err := v.Reload(&PricingConfig{
+		Wallet:         "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef",
+		Chain:          "base-sepolia",
+		FacilitatorURL: fac.URL,
+		Routes:         []RouteRule{keptRoute},
+	}); err != nil {
+		t.Fatalf("Reload: %v", err)
+	}
+
+	families = scrapeVerifierMetrics(t, v)
+	for _, name := range []string{
+		"obol_x402_verifier_requests_total",
+		"obol_x402_verifier_payment_required_total",
+		"obol_x402_verifier_payment_verified_total",
+		"obol_x402_verifier_payment_failed_total",
+		"obol_x402_verifier_charged_requests_total",
+		"obol_x402_verifier_last_payment_success_seconds",
+	} {
+		assertVerifierMetricMissing(t, families[name], goneLabels)
+	}
+
+	// Kept offer's series must survive the prune; a family missing from the scrape outright fails too, rather than being skipped.
+	for _, name := range []string{"obol_x402_verifier_charged_requests_total", "obol_x402_verifier_last_payment_success_seconds"} {
+		if families[name] == nil {
+			t.Fatalf("after reload: kept offer lost metric family %s", name)
+		}
+		findVerifierMetricValue(t, families[name], keptLabels)
+	}
+}
+
 // findVerifierMetricValue returns the value of the series in `family` whose
 // labels match `wantLabels` exactly, failing the test if no such series exists.
 func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 {