diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml index cf34841f..18982349 100644 --- a/internal/embed/infrastructure/base/templates/llm.yaml +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -298,3 +298,27 @@ spec: port: 4000 targetPort: http protocol: TCP + +--- +# PodMonitor for the x402-buyer sidecar — kept as PodMonitor (not +# ServiceMonitor) because the sidecar listens on a per-pod port (8402) +# that is NOT exposed via the litellm Service. Lives alongside the +# Deployment that hosts it so changing the buyer port here is one edit. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label. +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: litellm-x402-buyer + namespace: llm + labels: + release: monitoring + app: litellm +spec: + selector: + matchLabels: + app: litellm + podMetricsEndpoints: + - port: buyer-http + path: /metrics + interval: 30s diff --git a/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml new file mode 100644 index 00000000..038df594 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/obol-frontend-rbac.yaml @@ -0,0 +1,53 @@ +--- +# RBAC for the obol-frontend pod's ServiceAccount. +# +# The frontend pod uses this SA's bearer token to: +# - Discover OpenClaw / Hermes instances (namespaces, pods, configmaps) +# - List + mutate ServiceOffer CRs (sell-modal + pause/resume/delete row actions) +# - List PurchaseRequest CRs (My Purchases page; never writes) +# +# The frontend is local-only behind the obol.stack hostname restriction +# (the operator owns the cluster), so this is a single trust boundary. +# Defense-in-depth note: the `secrets` rule is intentionally omitted — +# no code path reads them and the SA token shouldn't have that reach. 
+# /status subresources are omitted from PurchaseRequest because the +# controller is the only writer. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +rules: + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] + - apiGroups: [""] + resources: ["pods", "configmaps"] + verbs: ["get", "list"] + # ServiceOffer CRD — frontend sell modal creates offers, row actions + # pause/resume (annotation patch) and delete. + - apiGroups: ["obol.org"] + resources: ["serviceoffers", "serviceoffers/status"] + verbs: ["get", "list", "create", "update", "patch", "delete"] + # PurchaseRequest CRD — frontend My Purchases page lists buyer-side + # records. Read-only; agent buy.py and the controller are the writers. + - apiGroups: ["obol.org"] + resources: ["purchaserequests"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: obol-frontend-openclaw-discovery + labels: + app.kubernetes.io/name: obol-frontend +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: obol-frontend-openclaw-discovery +subjects: + - kind: ServiceAccount + name: obol-frontend + namespace: obol-frontend diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml new file mode 100644 index 00000000..73b10f94 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -0,0 +1,139 @@ +--- +# Recording + alerting rules for x402 verifier traffic. +# +# Recording rules pre-aggregate the queries that the frontend's +# /api/sell/list joins use (chargedSalesByOfferAndChain, +# chargedRequests24hByOffer). 
The frontend reads the recorded series +# directly, which: +# * removes the `increase()` 2-sample minimum quirk (cold offers no +# longer show "0" for the first 30s after they receive traffic), +# * decouples the page from raw metric names (renaming +# obol_x402_verifier_charged_requests_total no longer breaks the UI), +# * cuts query cost on dashboards / page reloads (sum is done once at +# evaluation time, not per page-load). +# +# Alerting rules surface the two operator-meaningful failure modes the +# release-smoke flows historically caught manually. +# +# Picked up by kube-prometheus-stack via the `release: monitoring` label +# (configured in values/monitoring.yaml.gotmpl). +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + app: x402-verifier +spec: + groups: + - name: x402.recording + interval: 30s + rules: + # 24h charged-request count per (offer, chain). Replaces the + # frontend's `increase(charged_requests_total[24h])` query — same + # math, pre-computed every 30s. + - record: x402:revenue:24h_by_offer_chain + expr: | + sum by (offer_namespace, offer_name, chain) ( + increase(obol_x402_verifier_charged_requests_total[24h]) + ) + + # 7d charged-request count per (offer, chain). Powers the + # EarningsStrip per-chain × CRD price multiplication. + - record: x402:revenue:7d_by_offer_chain + expr: | + sum by (offer_namespace, offer_name, chain) ( + increase(obol_x402_verifier_charged_requests_total[7d]) + ) + + # Lifetime charged-request count per offer (sum across replicas + # + chains). Used in the My Listings "today · X earned" header + # text and the Browse catalog usage badge. + - record: x402:revenue:lifetime_by_offer + expr: | + sum by (offer_namespace, offer_name) ( + obol_x402_verifier_charged_requests_total + ) + + # Settlement rate (verified / attempted) over the last hour, per + # (offer, chain). Useful for the dashboard + the alert below. 
+        - record: x402:settlement_rate:1h_by_offer_chain
+          # The denominator unions the three outcome counters with `or` (each
+          # tagged with a distinct `outcome` label so no sample is deduplicated)
+          # instead of adding them with `+`. Binary `+` only yields a sample
+          # when every operand has a matching series, and a labelled counter
+          # series only exists after its first increment — an offer that has
+          # never seen a failed payment would otherwise have no denominator.
+          # The `> 0` guard drops idle offers. Do not clamp the denominator to
+          # 1: these are per-second rates, so clamping turns the ratio into a
+          # raw rate for any offer below 1 req/s.
+          expr: |
+            sum by (offer_namespace, offer_name, chain) (
+              rate(obol_x402_verifier_payment_verified_total[1h])
+            )
+            /
+            (
+              sum by (offer_namespace, offer_name, chain) (
+                label_replace(rate(obol_x402_verifier_payment_required_total[1h]), "outcome", "required", "", "")
+                or
+                label_replace(rate(obol_x402_verifier_payment_verified_total[1h]), "outcome", "verified", "", "")
+                or
+                label_replace(rate(obol_x402_verifier_payment_failed_total[1h]), "outcome", "failed", "", "")
+              )
+              > 0
+            )
+
+    - name: x402.alerting
+      rules:
+        # Payment-failure ratio crossed 10% over the last hour for a paid
+        # route that's actually receiving traffic. Typical cause:
+        # facilitator unreachable, chain pruning, or seller's CA bundle
+        # missing (CLAUDE.md pitfall #8).
+        #
+        # The denominator unions the failed and verified rates with `or`
+        # (tagged with distinct `outcome` labels) rather than `+`, so an offer
+        # where every payment fails — and no verified series exists yet —
+        # still produces a ratio. `> 0` skips idle offers; raise that bound to
+        # require a minimum request rate before the ratio is evaluated.
+        - alert: X402PaymentFailureRateHigh
+          expr: |
+            (
+              sum by (offer_namespace, offer_name, chain) (
+                rate(obol_x402_verifier_payment_failed_total[1h])
+              )
+              /
+              (
+                sum by (offer_namespace, offer_name, chain) (
+                  label_replace(rate(obol_x402_verifier_payment_failed_total[1h]), "outcome", "failed", "", "")
+                  or
+                  label_replace(rate(obol_x402_verifier_payment_verified_total[1h]), "outcome", "verified", "", "")
+                )
+                > 0
+              )
+            ) > 0.10
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "x402 payment failures > 10% on {{ $labels.offer_namespace }}/{{ $labels.offer_name }} ({{ $labels.chain }})"
+            description: |
+              More than 10% of paid requests to
+              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} on
+              {{ $labels.chain }} have failed verification over the last
+              hour. Check the verifier logs for x509/facilitator errors and
+              the seller's `ca-certificates` ConfigMap.
+
+        # An offer received a 402 (payment_required) within the last hour
+        # but no charged_requests happened in the same window. Either
+        # buyers aren't completing the flow, or settlement is broken
+        # downstream of the verifier.
+        - alert: X402NoSettlementsAfterChallenge
+          # Uses `unless (... > 0)` instead of `and (... == 0)`: a labelled
+          # counter series only exists after its first increment, so an offer
+          # that has never settled has no charged_requests series at all. An
+          # `== 0` comparison would then match nothing and silence the alert
+          # in exactly the case it is meant to catch.
+          expr: |
+            (
+              sum by (offer_namespace, offer_name) (
+                increase(obol_x402_verifier_payment_required_total[1h])
+              ) > 0
+            )
+            unless
+            (
+              sum by (offer_namespace, offer_name) (
+                increase(obol_x402_verifier_charged_requests_total[1h])
+              ) > 0
+            )
+          for: 30m
+          labels:
+            severity: warning
+          annotations:
+            summary: "{{ $labels.offer_namespace }}/{{ $labels.offer_name }} returns 402 but never settles"
+            description: |
+              The x402 verifier issued 402 responses for
+              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} in the
+              last hour but observed no settled requests. Check the buyer
+              sidecar's auth pool (/status) and the facilitator's settlement
+              endpoint.
diff --git a/internal/embed/infrastructure/base/templates/x402.yaml b/internal/embed/infrastructure/base/templates/x402.yaml
index 9dcc933e..38482384 100644
--- a/internal/embed/infrastructure/base/templates/x402.yaml
+++ b/internal/embed/infrastructure/base/templates/x402.yaml
@@ -332,3 +332,29 @@ spec:
   selector:
     matchLabels:
       app: x402-verifier
+
+---
+# ServiceMonitor for x402-verifier — discovers scrape targets through the
+# Service's endpoints rather than by selecting pods directly (which is what
+# a PodMonitor would do). Lives alongside the Service it observes so
+# adding/changing the port or selector here is a single-file change.
+#
+# Picked up by kube-prometheus-stack via the `release: monitoring` label
+# (configured in values/monitoring.yaml.gotmpl as the serviceMonitorSelector).
+apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: x402-verifier + namespace: x402 + labels: + release: monitoring + app: x402-verifier +spec: + selector: + matchLabels: + app: x402-verifier + endpoints: + - port: http + path: /metrics + interval: 30s + scrapeTimeout: 10s diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml index aa7fc052..95f7c8b5 100644 --- a/internal/embed/infrastructure/helmfile.yaml +++ b/internal/embed/infrastructure/helmfile.yaml @@ -44,31 +44,12 @@ releases: values: - ./values/monitoring.yaml.gotmpl - - name: llm-buyer-podmonitor - namespace: llm - createNamespace: true - chart: bedag/raw - version: 2.0.2 - needs: - - monitoring/monitoring - - kube-system/base - values: - - resources: - - apiVersion: monitoring.coreos.com/v1 - kind: PodMonitor - metadata: - name: litellm-x402-buyer - namespace: llm - labels: - release: monitoring - spec: - selector: - matchLabels: - app: litellm - podMetricsEndpoints: - - port: buyer-http - path: /metrics - interval: 30s + # NOTE: PodMonitor for litellm-x402-buyer and ServiceMonitor for + # x402-verifier moved into base/templates/llm.yaml and + # base/templates/x402.yaml respectively. They live alongside the + # workloads they observe so a port/selector edit is one-file. Kills + # two `bedag/raw` releases. kube-prometheus-stack picks them up via + # the `release: monitoring` label. 
# Traefik ingress controller with Gateway API support # Traefik v38+ bundles Gateway API CRDs in its crds/ directory @@ -279,43 +260,11 @@ releases: - name: obol-frontend-obol-app port: 3000 - # Obol Frontend RBAC (OpenClaw instance discovery via Kubernetes API) - - name: obol-frontend-rbac - namespace: obol-frontend - chart: bedag/raw - version: 2.0.2 - needs: - - obol-frontend/obol-frontend - values: - - resources: - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRole - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - rules: - - apiGroups: [""] - resources: ["namespaces"] - verbs: ["get", "list"] - - apiGroups: [""] - resources: ["pods", "configmaps", "secrets"] - verbs: ["get", "list"] - # ServiceOffer CRD — frontend sell modal creates offers - - apiGroups: ["obol.org"] - resources: ["serviceoffers", "serviceoffers/status"] - verbs: ["get", "list", "create", "update", "patch", "delete"] - - apiVersion: rbac.authorization.k8s.io/v1 - kind: ClusterRoleBinding - metadata: - name: obol-frontend-openclaw-discovery - labels: - app.kubernetes.io/name: obol-frontend - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: obol-frontend-openclaw-discovery - subjects: - - kind: ServiceAccount - name: obol-frontend - namespace: obol-frontend + # NOTE: obol-frontend-rbac ClusterRole + ClusterRoleBinding moved into + # base/templates/obol-frontend-rbac.yaml. Co-located with the workload + # they grant; kills a `bedag/raw` release. Frontend-egress NetworkPolicy + # was attempted and reverted — on k3s + Flannel (k3d's default CNI) the + # kubernetes apiserver Service Endpoints point at the host process, + # outside the cluster pod/service CIDRs. A clean allowlist can't target + # the apiserver portably without an install-specific ipBlock for the k3s + # host IP. Tracking as a deferred hardening item. 
diff --git a/internal/monetizeapi/types.go b/internal/monetizeapi/types.go index 6e905eee..403cffdf 100644 --- a/internal/monetizeapi/types.go +++ b/internal/monetizeapi/types.go @@ -276,9 +276,11 @@ type PreSignedAuth struct { } type PurchaseAutoRefill struct { - Enabled bool `json:"enabled,omitempty"` - Threshold int `json:"threshold,omitempty"` - Count int `json:"count,omitempty"` + Enabled bool `json:"enabled,omitempty"` + Threshold int `json:"threshold,omitempty"` + Count int `json:"count,omitempty"` + MaxTotal int `json:"maxTotal,omitempty"` + MaxSpendPerDay string `json:"maxSpendPerDay,omitempty"` } type PurchasePayment struct { diff --git a/internal/monetizeapi/types_test.go b/internal/monetizeapi/types_test.go new file mode 100644 index 00000000..77a15a90 --- /dev/null +++ b/internal/monetizeapi/types_test.go @@ -0,0 +1,108 @@ +package monetizeapi + +import ( + "encoding/json" + "testing" +) + +// TestPurchaseAutoRefill_JSONRoundTrip asserts every field on +// PurchaseAutoRefill marshals to JSON and unmarshals back without loss. The +// MaxTotal + MaxSpendPerDay fields were added to match the CRD spec; this test +// pins the wire format and `omitempty` semantics so silent drift between the +// Go struct and the CRD surfaces as a test failure. 
+func TestPurchaseAutoRefill_JSONRoundTrip(t *testing.T) { + tests := []struct { + name string + in PurchaseAutoRefill + wantJSON string + }{ + { + name: "all fields populated", + in: PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + }, + wantJSON: `{"enabled":true,"threshold":5,"count":10,"maxTotal":100,"maxSpendPerDay":"1.50"}`, + }, + { + name: "only enabled + new caps", + in: PurchaseAutoRefill{ + Enabled: true, + MaxTotal: 42, + MaxSpendPerDay: "0.05", + }, + wantJSON: `{"enabled":true,"maxTotal":42,"maxSpendPerDay":"0.05"}`, + }, + { + name: "zero values omit every field", + in: PurchaseAutoRefill{}, + wantJSON: `{}`, + }, + { + name: "MaxSpendPerDay alone", + in: PurchaseAutoRefill{ + MaxSpendPerDay: "0.0001", + }, + wantJSON: `{"maxSpendPerDay":"0.0001"}`, + }, + { + name: "MaxTotal alone", + in: PurchaseAutoRefill{ + MaxTotal: 7, + }, + wantJSON: `{"maxTotal":7}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotJSON, err := json.Marshal(tt.in) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if string(gotJSON) != tt.wantJSON { + t.Fatalf("marshal:\n got: %s\nwant: %s", gotJSON, tt.wantJSON) + } + + var roundTripped PurchaseAutoRefill + if err := json.Unmarshal(gotJSON, &roundTripped); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if roundTripped != tt.in { + t.Fatalf("round-trip mismatch:\n got: %+v\nwant: %+v", roundTripped, tt.in) + } + }) + } +} + +// TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm asserts that a JSON document +// shaped like the CRD spec deserialises into every Go field — this is the +// inverse of the marshal direction and catches accidental json-tag drift. 
+func TestPurchaseAutoRefill_UnmarshalAcceptsCRDForm(t *testing.T) { + const crdJSON = `{ + "enabled": true, + "threshold": 5, + "count": 10, + "maxTotal": 100, + "maxSpendPerDay": "1.50" + }` + + want := PurchaseAutoRefill{ + Enabled: true, + Threshold: 5, + Count: 10, + MaxTotal: 100, + MaxSpendPerDay: "1.50", + } + + var got PurchaseAutoRefill + if err := json.Unmarshal([]byte(crdJSON), &got); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if got != want { + t.Fatalf("unmarshal mismatch:\n got: %+v\nwant: %+v", got, want) + } +} diff --git a/internal/x402/buyer/metrics.go b/internal/x402/buyer/metrics.go index 0df96424..5079f0a0 100644 --- a/internal/x402/buyer/metrics.go +++ b/internal/x402/buyer/metrics.go @@ -29,35 +29,35 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_requests_total", Help: "Total requests routed through the x402 buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentAttempts: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_attempts_total", Help: "Total x402 payment attempts made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentSuccessTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_success_total", Help: "Total successful x402 payments made by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), paymentFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_payment_failure_total", Help: "Total failed x402 payments attempted by the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), confirmSpendFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_buyer_confirm_spend_failure_total", Help: "Successful upstream responses whose consumed-auth state 
could not be persisted.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), // paymentUnsettledConfirmations counts the occurrences of an upstream // returning 2xx without X-PAYMENT-RESPONSE. The buyer still marks the @@ -69,28 +69,28 @@ func newMetrics() *metrics { Name: "obol_x402_buyer_payment_unsettled_confirmations_total", Help: "Upstream 2xx responses with no X-PAYMENT-RESPONSE header — auth consumed locally without observed on-chain settlement.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authRemaining: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_remaining", Help: "Remaining pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), authSpent: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_auth_spent", Help: "Consumed pre-signed authorizations for an upstream model mapping.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), activeModelMappings: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "obol_x402_buyer_active_model_mappings", Help: "Active paid model mappings loaded in the buyer sidecar.", }, - []string{"upstream", "remote_model"}, + []string{"upstream", "remote_model", "chain"}, ), } diff --git a/internal/x402/buyer/metrics_test.go b/internal/x402/buyer/metrics_test.go new file mode 100644 index 00000000..9ce4fabc --- /dev/null +++ b/internal/x402/buyer/metrics_test.go @@ -0,0 +1,191 @@ +package buyer + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" +) + +// TestPrometheusLabels_ChainPropagation asserts that prometheusLabels surfaces +// the `chain` label sourced from UpstreamConfig.Network so paid-request metrics +// can be partitioned by chain (base, 
base-sepolia, etc.). The empty-chain case +// is also exercised so the label is always rendered cleanly even when an +// upstream has no Network set. +func TestPrometheusLabels_ChainPropagation(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + want map[string]string + }{ + { + name: "base-sepolia chain propagates", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + want: map[string]string{ + "upstream": "upstream-a", + "remote_model": "qwen3.5:9b", + "chain": "base-sepolia", + }, + }, + { + name: "base mainnet chain propagates", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "base", + want: map[string]string{ + "upstream": "upstream-b", + "remote_model": "qwen3.5:4b", + "chain": "base", + }, + }, + { + name: "empty chain renders cleanly", + upstream: "upstream-c", + remoteModel: "qwen3.5:1b", + chain: "", + want: map[string]string{ + "upstream": "upstream-c", + "remote_model": "qwen3.5:1b", + "chain": "", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + if len(got) != len(tt.want) { + t.Fatalf("got %d labels, want %d (%v vs %v)", len(got), len(tt.want), got, tt.want) + } + for k, v := range tt.want { + if got[k] != v { + t.Errorf("label %q = %q, want %q", k, got[k], v) + } + } + }) + } +} + +// TestMetrics_ChainLabelScrapeRoundtrip increments each of the 9 buyer +// counters/gauges using prometheusLabels and then scrapes /metrics through the +// registry's handler, asserting the `chain` label appears (with the expected +// value) on every series. 
+func TestMetrics_ChainLabelScrapeRoundtrip(t *testing.T) { + tests := []struct { + name string + upstream string + remoteModel string + chain string + }{ + { + name: "base-sepolia label visible on every series", + upstream: "upstream-a", + remoteModel: "qwen3.5:9b", + chain: "base-sepolia", + }, + { + name: "empty chain label is present and empty", + upstream: "upstream-b", + remoteModel: "qwen3.5:4b", + chain: "", + }, + } + + // Every metric registered by newMetrics carries the same {upstream, + // remote_model, chain} label set. + wantFamilies := []string{ + "obol_x402_buyer_requests_total", + "obol_x402_buyer_payment_attempts_total", + "obol_x402_buyer_payment_success_total", + "obol_x402_buyer_payment_failure_total", + "obol_x402_buyer_confirm_spend_failure_total", + "obol_x402_buyer_payment_unsettled_confirmations_total", + "obol_x402_buyer_auth_remaining", + "obol_x402_buyer_auth_spent", + "obol_x402_buyer_active_model_mappings", + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m := newMetrics() + labels := prometheusLabels(tt.upstream, tt.remoteModel, tt.chain) + + // Counters: incremented once each. + m.requestsTotal.With(labels).Inc() + m.paymentAttempts.With(labels).Inc() + m.paymentSuccessTotal.With(labels).Inc() + m.paymentFailureTotal.With(labels).Inc() + m.confirmSpendFailureTotal.With(labels).Inc() + m.paymentUnsettledConfirmations.With(labels).Inc() + // Gauges: stamped with arbitrary non-zero values. 
+ m.authRemaining.With(labels).Set(7) + m.authSpent.With(labels).Set(3) + m.activeModelMappings.With(labels).Set(1) + + families := scrapeBuyerMetrics(t, m) + + wantLabels := map[string]string{ + "upstream": tt.upstream, + "remote_model": tt.remoteModel, + "chain": tt.chain, + } + for _, name := range wantFamilies { + fam, ok := families[name] + if !ok { + t.Errorf("missing metric family %s", name) + continue + } + if !buyerHasSeriesWithLabels(fam, wantLabels) { + t.Errorf("metric %s missing series with labels %v", name, wantLabels) + } + } + }) + } +} + +// scrapeBuyerMetrics renders the metrics registry through its HTTP handler and +// parses the Prometheus text exposition into a name → MetricFamily map. +func scrapeBuyerMetrics(t *testing.T, m *metrics) map[string]*dto.MetricFamily { + t.Helper() + + rec := httptest.NewRecorder() + m.handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/metrics", nil)) + if rec.Code != http.StatusOK { + t.Fatalf("metrics status = %d, want 200", rec.Code) + } + + var parser expfmt.TextParser + families, err := parser.TextToMetricFamilies(strings.NewReader(rec.Body.String())) + if err != nil { + t.Fatalf("parse metrics: %v", err) + } + return families +} + +// buyerHasSeriesWithLabels returns true iff `family` contains at least one +// series whose label set exactly equals `want`. 
+func buyerHasSeriesWithLabels(family *dto.MetricFamily, want map[string]string) bool {
+	for _, metric := range family.GetMetric() {
+		// Equal label counts plus a per-label match below gives set equality.
+		if len(metric.GetLabel()) != len(want) {
+			continue
+		}
+		match := true
+		for _, label := range metric.GetLabel() {
+			// Comma-ok lookup: a plain want[name] read returns "" for a
+			// missing key, which would wrongly accept an unexpected label
+			// whose value happens to be empty (e.g. the empty-chain case).
+			wantValue, ok := want[label.GetName()]
+			if !ok || wantValue != label.GetValue() {
+				match = false
+				break
+			}
+		}
+		if match {
+			return true
+		}
+	}
+	return false
+}
diff --git a/internal/x402/buyer/proxy.go b/internal/x402/buyer/proxy.go
index afcb3d5c..b1b644e0 100644
--- a/internal/x402/buyer/proxy.go
+++ b/internal/x402/buyer/proxy.go
@@ -203,17 +203,18 @@ func (p *Proxy) syncMetricsLocked() {
 
 	for name, upstream := range p.upstreams {
 		signer := p.signers[name]
-		labels := prometheusLabels(name, upstream.remoteModel)
+		labels := prometheusLabels(name, upstream.remoteModel, upstream.config.Network)
 		p.metrics.activeModelMappings.With(labels).Set(1)
 		p.metrics.authRemaining.With(labels).Set(float64(signer.Remaining()))
 		p.metrics.authSpent.With(labels).Set(float64(signer.Spent()))
 	}
 }
 
-func prometheusLabels(name, remoteModel string) map[string]string {
+func prometheusLabels(name, remoteModel, chain string) map[string]string {
 	return map[string]string{
 		"upstream":     name,
 		"remote_model": remoteModel,
+		"chain":        chain,
 	}
 }
 
@@ -226,7 +227,7 @@ func (p *Proxy) buildUpstreamHandler(name, remoteModel string, cfg UpstreamConfi
 		return nil, fmt.Errorf("parse upstream URL %q: %w", cfg.URL, err)
 	}
 
-	labels := prometheusLabels(name, remoteModel)
+	labels := prometheusLabels(name, remoteModel, cfg.Network)
 	rp := &httputil.ReverseProxy{
 		Rewrite: func(pr *httputil.ProxyRequest) {
 			pr.SetURL(target)
@@ -298,7 +299,7 @@ func (p *Proxy) handleModelRequest(w http.ResponseWriter, r *http.Request) {
 		return io.NopCloser(bytes.NewReader(rewrittenBody)), nil
 	}
 
-	labels := prometheusLabels(entry.name, remoteModel)
+	labels := prometheusLabels(entry.name, remoteModel, entry.config.Network)
 	p.metrics.requestsTotal.With(labels).Inc()
 	entry.handler.ServeHTTP(w, r)
 }
diff --git 
a/internal/x402/buyer/proxy_test.go b/internal/x402/buyer/proxy_test.go index 55281c46..5b135720 100644 --- a/internal/x402/buyer/proxy_test.go +++ b/internal/x402/buyer/proxy_test.go @@ -1045,7 +1045,7 @@ func TestProxy_ModelRoutingAndMetrics(t *testing.T) { } metrics := scrapeMetricFamilies(t, proxy) - labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b"} + labels := map[string]string{"upstream": "seller-qwen", "remote_model": "qwen3:32b", "chain": "base-sepolia"} assertMetricValue(t, metrics["obol_x402_buyer_requests_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_attempts_total"], labels, 1) assertMetricValue(t, metrics["obol_x402_buyer_payment_success_total"], labels, 1) @@ -1295,10 +1295,10 @@ func TestProxy_ReloadSkipsConsumedAuthsAndReplacesModelMapping(t *testing.T) { t.Fatalf("active model mapping series = %d, want 1", metricFamilyLen(activeMappings)) } - assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) - assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model"}, 1) - assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-old", "remote_model": "old-model"}) + assertMetricValue(t, activeMappings, map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + assertMetricMissing(t, activeMappings, map[string]string{"upstream": "seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) + assertMetricValue(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": "seller-new", "remote_model": "new-model", "chain": "base-sepolia"}, 1) + assertMetricMissing(t, metrics["obol_x402_buyer_auth_remaining"], map[string]string{"upstream": 
"seller-old", "remote_model": "old-model", "chain": "base-sepolia"}) } func TestProxy_ReloadSamePurchasePreservesSpentAndAppendsAuthPool(t *testing.T) { @@ -1752,6 +1752,7 @@ func TestProxy_UpstreamSuccessNoSettlementHeader_IncrementsUnsettledMetric(t *te assertMetricValue(t, family, map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } @@ -1826,6 +1827,7 @@ func TestProxy_UpstreamSuccessWithSettlementHeader_DoesNotIncrementUnsettledMetr assertMetricMissing(t, metrics["obol_x402_buyer_payment_unsettled_confirmations_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }) } @@ -1901,6 +1903,7 @@ func TestProxy_ConfirmSpendFailure_IncrementsMetric(t *testing.T) { assertMetricValue(t, metrics["obol_x402_buyer_confirm_spend_failure_total"], map[string]string{ "upstream": "paid", "remote_model": "paid", + "chain": "base-sepolia", }, 1) } diff --git a/internal/x402/metrics.go b/internal/x402/metrics.go index 21d87b68..b445d4c3 100644 --- a/internal/x402/metrics.go +++ b/internal/x402/metrics.go @@ -10,11 +10,12 @@ import ( type verifierMetrics struct { registry *prometheus.Registry - requestsTotal *prometheus.CounterVec - paymentRequired *prometheus.CounterVec - paymentVerified *prometheus.CounterVec - paymentFailed *prometheus.CounterVec - chargedRequests *prometheus.CounterVec + requestsTotal *prometheus.CounterVec + paymentRequired *prometheus.CounterVec + paymentVerified *prometheus.CounterVec + paymentFailed *prometheus.CounterVec + chargedRequests *prometheus.CounterVec + lastPaymentSuccess *prometheus.GaugeVec } func newVerifierMetrics() *verifierMetrics { @@ -25,35 +26,42 @@ func newVerifierMetrics() *verifierMetrics { Name: "obol_x402_verifier_requests_total", Help: "Requests evaluated by the x402 verifier for matched paid routes.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentRequired: 
prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_required_total", Help: "Requests rejected with 402 because payment was required.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentVerified: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_verified_total", Help: "Requests approved after successful x402 payment verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain"}, ), paymentFailed: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_payment_failed_total", Help: "Requests rejected after a provided x402 payment failed verification.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain"}, ), chargedRequests: prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "obol_x402_verifier_charged_requests_total", Help: "Requests that incurred a paid x402 charge.", }, - []string{"route", "offer_namespace", "offer_name"}, + []string{"offer_namespace", "offer_name", "chain"}, + ), + lastPaymentSuccess: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "obol_x402_verifier_last_payment_success_seconds", + Help: "Unix timestamp (seconds) of the most recent successful paid x402 charge for a route.", + }, + []string{"offer_namespace", "offer_name", "chain"}, ), } @@ -63,6 +71,7 @@ func newVerifierMetrics() *verifierMetrics { m.paymentVerified, m.paymentFailed, m.chargedRequests, + m.lastPaymentSuccess, ) return m @@ -71,3 +80,61 @@ func newVerifierMetrics() *verifierMetrics { func (m *verifierMetrics) handler() http.Handler { return promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{}) } + +// pruneSeriesNotIn drops every (offer_namespace, offer_name, chain) series +// from the verifier's counter/gauge vecs that is not present in `keep`. 
+// Called from Verifier.load whenever the route set changes so deleted offers +// (e.g. `obol sell delete`) stop emitting stale series — most importantly the +// last_payment_success_seconds gauge, which would otherwise hold the deleted +// offer's last-success timestamp forever and falsely satisfy "recent activity" +// alerts and dashboards. +// +// Key shape: "ns\x00name\x00chain" — \x00 is forbidden in Kubernetes object +// names and CAIP-2 chain ids, so the byte-join can't collide. +func (m *verifierMetrics) pruneSeriesNotIn(keep map[string]struct{}) { + vecs := []interface { + DeletePartialMatch(prometheus.Labels) int + }{ + m.requestsTotal, + m.paymentRequired, + m.paymentVerified, + m.paymentFailed, + m.chargedRequests, + m.lastPaymentSuccess, + } + + gathered, err := m.registry.Gather() + if err != nil { + return + } + for _, family := range gathered { + for _, metric := range family.GetMetric() { + labels := metric.GetLabel() + ns, name, chain := "", "", "" + for _, l := range labels { + switch l.GetName() { + case "offer_namespace": + ns = l.GetValue() + case "offer_name": + name = l.GetValue() + case "chain": + chain = l.GetValue() + } + } + if ns == "" && name == "" { + continue + } + if _, ok := keep[ns+"\x00"+name+"\x00"+chain]; ok { + continue + } + match := prometheus.Labels{ + "offer_namespace": ns, + "offer_name": name, + "chain": chain, + } + for _, vec := range vecs { + vec.DeletePartialMatch(match) + } + } + } +} diff --git a/internal/x402/setup_test.go b/internal/x402/setup_test.go index ff8b7652..7dba813e 100644 --- a/internal/x402/setup_test.go +++ b/internal/x402/setup_test.go @@ -258,7 +258,11 @@ func TestX402Manifest_UsesServiceOfferControllerModel(t *testing.T) { if !strings.Contains(manifest, "resources: [\"serviceoffers\"]") { t.Fatalf("x402 manifest missing serviceoffer watch RBAC:\n%s", manifest) } - if strings.Contains(manifest, "kind: ServiceMonitor") { - t.Fatalf("x402 manifest still includes legacy ServiceMonitor stanza:\n%s", 
manifest) + // ServiceMonitor now lives in this manifest by design — relocated here + // from a bedag/raw helmfile release so the scrape config sits next to + // the Service it observes. Assert presence so a future cleanup can't + // silently drop it. + if !strings.Contains(manifest, "kind: ServiceMonitor") { + t.Fatalf("x402 manifest missing ServiceMonitor (relocated from bedag/raw helmfile in PR #513 hardening):\n%s", manifest) } } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 65374ea4..60e2fa80 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -64,6 +64,20 @@ func (v *Verifier) load(cfg *PricingConfig) error { v.chains.Store(&chains) v.config.Store(cfg) + // Drop metric series for offers that are no longer in the route set. + // Without this, deleting an offer leaves its counters + last-success + // gauge in the registry forever, polluting dashboards and silently + // keeping alerts (e.g. "no settlements after challenge") tied to dead + // labels. 
+ live := make(map[string]struct{}, len(cfg.Routes)) + for _, r := range cfg.Routes { + if r.OfferNamespace == "" && r.OfferName == "" { + continue + } + live[r.OfferNamespace+"\x00"+r.OfferName+"\x00"+r.Network] = struct{}{} + } + v.metrics.pruneSeriesNotIn(live) + return nil } @@ -144,6 +158,7 @@ func (v *Verifier) HandleVerify(w http.ResponseWriter, r *http.Request) { case tracker.status == http.StatusOK && r.Header.Get("X-Payment") != "": v.metrics.paymentVerified.With(labels).Inc() v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() case tracker.status == http.StatusPaymentRequired && r.Header.Get("X-Payment") != "": v.metrics.paymentFailed.With(labels).Inc() case tracker.status == http.StatusPaymentRequired: @@ -198,6 +213,7 @@ func (v *Verifier) HandleProxy(w http.ResponseWriter, r *http.Request) { v.metrics.paymentVerified.With(labels).Inc() if tracker.Header().Get("X-PAYMENT-RESPONSE") != "" { v.metrics.chargedRequests.With(labels).Inc() + v.metrics.lastPaymentSuccess.With(labels).SetToCurrentTime() } } } @@ -446,9 +462,13 @@ func (r *statusRecorder) WriteHeader(status int) { } func prometheusLabels(rule *RouteRule) prometheus.Labels { + // `route` (= rule.Pattern) was dropped in favor of (offer_namespace, + // offer_name) which already uniquely identifies a paid route — the + // pattern was redundant and its cardinality unbounded (path fragments), which would + // have ballooned series count for sellers running many granular routes. 
return prometheus.Labels{ - "route": rule.Pattern, "offer_namespace": rule.OfferNamespace, "offer_name": rule.OfferName, + "chain": rule.Network, } } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 083604a0..3b62c815 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -10,6 +10,7 @@ import ( "strings" "sync/atomic" "testing" + "time" x402types "github.com/coinbase/x402/go/types" dto "github.com/prometheus/client_model/go" @@ -752,9 +753,9 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { metrics := scrapeVerifierMetrics(t, v) labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", } assertVerifierMetricValue(t, metrics["obol_x402_verifier_requests_total"], labels, 1) assertVerifierMetricValue(t, metrics["obol_x402_verifier_payment_required_total"], labels, 1) @@ -765,9 +766,9 @@ func TestVerifier_MetricsPaymentRequired(t *testing.T) { func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { labels := map[string]string{ - "route": "/rpc/*", "offer_namespace": "llm", "offer_name": "paid-rpc", + "chain": "", } okFac := newMockFacilitator(t, mockFacilitatorOpts{}) @@ -821,6 +822,196 @@ func TestVerifier_MetricsVerifiedAndRejectedPayments(t *testing.T) { assertVerifierMetricMissing(t, rejectMetrics["obol_x402_verifier_charged_requests_total"], labels) } +// TestVerifier_LastPaymentSuccessGauge asserts that the +// obol_x402_verifier_last_payment_success_seconds gauge is stamped to the +// current wall-clock time when a paid request succeeds, and is NOT touched +// when a request is rejected with 402 (unpaid, or payment fails to verify). +// +// The gauge is labeled identically to the verifier counters; for this rule +// `chain` is the empty string because the test RouteRule has no Network set. 
+func TestVerifier_LastPaymentSuccessGauge(t *testing.T) { + labels := map[string]string{ + "offer_namespace": "llm", + "offer_name": "paid-rpc", + "chain": "", + } + + tests := []struct { + name string + setPayment bool + rejectPayment bool + wantStatus int + wantGaugeFresh bool // assert gauge ~= now() + }{ + { + name: "successful paid request stamps gauge", + setPayment: true, + rejectPayment: false, + wantStatus: http.StatusOK, + wantGaugeFresh: true, + }, + { + name: "unpaid 402 leaves gauge untouched", + setPayment: false, + rejectPayment: false, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + { + name: "rejected payment leaves gauge untouched", + setPayment: true, + rejectPayment: true, + wantStatus: http.StatusPaymentRequired, + wantGaugeFresh: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{rejectPayment: tt.rejectPayment}) + v := newTestVerifier(t, fac.URL, []RouteRule{{ + Pattern: "/rpc/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "paid-rpc", + }}) + + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", "/rpc/mainnet") + req.Header.Set("X-Forwarded-Host", "obol.stack") + if tt.setPayment { + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + } + + before := time.Now().Unix() + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + after := time.Now().Unix() + + if rec.Code != tt.wantStatus { + t.Fatalf("status = %d, want %d", rec.Code, tt.wantStatus) + } + + families := scrapeVerifierMetrics(t, v) + gauge := families["obol_x402_verifier_last_payment_success_seconds"] + + if !tt.wantGaugeFresh { + // Either the family is absent (no series emitted) or no + // series exists for these labels — both are acceptable for + // an untouched gauge. 
+ assertVerifierMetricMissing(t, gauge, labels) + return + } + + if gauge == nil { + t.Fatalf("missing metric family obol_x402_verifier_last_payment_success_seconds") + } + got := findVerifierMetricValue(t, gauge, labels) + // Allow ±5s slack for clock skew / slow CI. + if got < float64(before-5) || got > float64(after+5) { + t.Fatalf("gauge = %v, want within [%d, %d]", got, before-5, after+5) + } + }) + } +} + +// TestVerifier_Reload_PrunesDeletedOfferSeries asserts that when an offer is +// removed from the route set (via Reload, the same path used by both the +// file-config watcher and the kube ServiceOffer informer), its previously +// stamped metric series are dropped from the registry. Without this, deleted +// offers' last_payment_success_seconds gauge would survive forever and keep +// firing/silencing alerts on dead labels. +func TestVerifier_Reload_PrunesDeletedOfferSeries(t *testing.T) { + fac := newMockFacilitator(t, mockFacilitatorOpts{}) + keptRoute := RouteRule{ + Pattern: "/keep/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "keep", + } + removedRoute := RouteRule{ + Pattern: "/gone/*", + Price: "0.0001", + OfferNamespace: "llm", + OfferName: "gone", + } + v := newTestVerifier(t, fac.URL, []RouteRule{keptRoute, removedRoute}) + + // Stamp metrics for both offers with a successful paid request each. 
+ for _, path := range []string{"/keep/x", "/gone/x"} { + req := httptest.NewRequest(http.MethodPost, "/verify", nil) + req.Header.Set("X-Forwarded-Uri", path) + req.Header.Set("X-Forwarded-Host", "obol.stack") + req.Header.Set("X-PAYMENT", testPaymentHeader(t)) + rec := httptest.NewRecorder() + v.HandleVerify(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("setup paid request to %s: status=%d", path, rec.Code) + } + } + + keptLabels := map[string]string{"offer_namespace": "llm", "offer_name": "keep", "chain": ""} + goneLabels := map[string]string{"offer_namespace": "llm", "offer_name": "gone", "chain": ""} + + families := scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + family := families[name] + if family == nil { + t.Fatalf("baseline: missing %s before reload", name) + } + findVerifierMetricValue(t, family, keptLabels) + findVerifierMetricValue(t, family, goneLabels) + } + + // Reload with the second offer dropped — the same path ServiceOffer + // deletion takes through ConfigAccumulator.SetRoutes. + if err := v.Reload(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: fac.URL, + Routes: []RouteRule{keptRoute}, + }); err != nil { + t.Fatalf("Reload: %v", err) + } + + families = scrapeVerifierMetrics(t, v) + for _, name := range []string{ + "obol_x402_verifier_requests_total", + "obol_x402_verifier_payment_required_total", + "obol_x402_verifier_payment_verified_total", + "obol_x402_verifier_payment_failed_total", + "obol_x402_verifier_charged_requests_total", + "obol_x402_verifier_last_payment_success_seconds", + } { + assertVerifierMetricMissing(t, families[name], goneLabels) + } + + // Kept offer's series must survive the prune. 
+ if charged := families["obol_x402_verifier_charged_requests_total"]; charged != nil { + findVerifierMetricValue(t, charged, keptLabels) + } + if gauge := families["obol_x402_verifier_last_payment_success_seconds"]; gauge != nil { + findVerifierMetricValue(t, gauge, keptLabels) + } +} + +// findVerifierMetricValue returns the value of the series in `family` whose +// labels match `wantLabels` exactly, failing the test if no such series exists. +func findVerifierMetricValue(t *testing.T, family *dto.MetricFamily, wantLabels map[string]string) float64 { + t.Helper() + + for _, metric := range family.GetMetric() { + if verifierLabelsMatch(metric, wantLabels) { + return verifierMetricValue(metric) + } + } + t.Fatalf("metric %s missing labels %v", family.GetName(), wantLabels) + return 0 +} + func scrapeVerifierMetrics(t *testing.T, v *Verifier) map[string]*dto.MetricFamily { t.Helper()