Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions internal/embed/infrastructure/base/templates/llm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,27 @@ spec:
port: 4000
targetPort: http
protocol: TCP

---
# PodMonitor for the x402-buyer sidecar — kept as PodMonitor (not
# ServiceMonitor) because the sidecar listens on a per-pod port (8402)
# that is NOT exposed via the litellm Service. Lives alongside the
# Deployment that hosts it so changing the buyer port here is one edit.
#
# Picked up by kube-prometheus-stack via the `release: monitoring` label.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: litellm-x402-buyer
  namespace: llm
  labels:
    app: litellm
    # Label that kube-prometheus-stack selects monitors on.
    release: monitoring
spec:
  # Scrape every pod carrying the litellm app label.
  selector:
    matchLabels:
      app: litellm
  podMetricsEndpoints:
    # Named port of the x402-buyer sidecar; scraped directly on the pod.
    - interval: 30s
      path: /metrics
      port: buyer-http
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
---
# RBAC for the obol-frontend pod's ServiceAccount.
#
# The frontend pod uses this SA's bearer token to:
# - Discover OpenClaw / Hermes instances (namespaces, pods, configmaps)
# - List + mutate ServiceOffer CRs (sell-modal + pause/resume/delete row actions)
# - List PurchaseRequest CRs (My Purchases page; never writes)
#
# The frontend is local-only behind the obol.stack hostname restriction
# (the operator owns the cluster), so this is a single trust boundary.
# Defense-in-depth note: the `secrets` rule is intentionally omitted —
# no code path reads them and the SA token shouldn't have that reach.
# /status subresources are omitted from PurchaseRequest because the
# controller is the only writer.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: obol-frontend-openclaw-discovery
  labels:
    app.kubernetes.io/name: obol-frontend
rules:
  # Core-group discovery: read-only access to namespaces, pods and
  # configmaps. `secrets` is deliberately not listed.
  - apiGroups: [""]
    resources: ["namespaces", "pods", "configmaps"]
    verbs: ["get", "list"]
  # ServiceOffer CRD: the sell modal creates offers; row actions
  # pause/resume them (annotation patch) and delete them.
  - apiGroups: ["obol.org"]
    resources: ["serviceoffers", "serviceoffers/status"]
    verbs: ["get", "list", "create", "update", "patch", "delete"]
  # PurchaseRequest CRD: read-only listing for the My Purchases page.
  # Writes come from the agent's buy.py and the controller.
  - apiGroups: ["obol.org"]
    resources: ["purchaserequests"]
    verbs: ["get", "list", "watch"]

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: obol-frontend-openclaw-discovery
  labels:
    app.kubernetes.io/name: obol-frontend
# Grant the ClusterRole of the same name to the frontend's ServiceAccount.
subjects:
  - kind: ServiceAccount
    name: obol-frontend
    namespace: obol-frontend
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: obol-frontend-openclaw-discovery
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
---
# Recording + alerting rules for x402 verifier traffic.
#
# Recording rules pre-aggregate the queries that the frontend's
# /api/sell/list joins use (chargedSalesByOfferAndChain,
# chargedRequests24hByOffer). The frontend reads the recorded series
# directly, which:
# * removes the `increase()` 2-sample minimum quirk (cold offers no
# longer show "0" for the first 30s after they receive traffic),
# * decouples the page from raw metric names (renaming
# obol_x402_verifier_charged_requests_total no longer breaks the UI),
# * cuts query cost on dashboards / page reloads (sum is done once at
# evaluation time, not per page-load).
#
# Alerting rules surface the two operator-meaningful failure modes the
# release-smoke flows historically caught manually.
#
# Picked up by kube-prometheus-stack via the `release: monitoring` label
# (configured in values/monitoring.yaml.gotmpl).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: x402-verifier
  namespace: x402
  labels:
    release: monitoring
    app: x402-verifier
spec:
  groups:
    - name: x402.recording
      interval: 30s
      rules:
        # 24h charged-request count per (offer, chain). Replaces the
        # frontend's `increase(charged_requests_total[24h])` query — same
        # math, pre-computed every 30s.
        - record: x402:revenue:24h_by_offer_chain
          expr: |
            sum by (offer_namespace, offer_name, chain) (
              increase(obol_x402_verifier_charged_requests_total[24h])
            )

        # 7d charged-request count per (offer, chain). Powers the
        # EarningsStrip per-chain × CRD price multiplication.
        - record: x402:revenue:7d_by_offer_chain
          expr: |
            sum by (offer_namespace, offer_name, chain) (
              increase(obol_x402_verifier_charged_requests_total[7d])
            )

        # Lifetime charged-request count per offer (sum across replicas
        # + chains). Used in the My Listings "today · X earned" header
        # text and the Browse catalog usage badge.
        # NOTE(review): this sums the raw counter values, so the recorded
        # number presumably drops whenever a verifier pod restarts and
        # its counter starts again from zero — confirm that is acceptable
        # for a "lifetime" figure.
        - record: x402:revenue:lifetime_by_offer
          expr: |
            sum by (offer_namespace, offer_name) (
              obol_x402_verifier_charged_requests_total
            )

        # Settlement rate (verified / attempted) over the last hour, per
        # (offer, chain). Useful for the dashboard + the alert below.
        #
        # The denominator is built with `or` over label_replace()-tagged
        # vectors instead of `+`: binary `+` only keeps label sets that
        # exist on BOTH sides, so an offer with no failed (or no
        # required) series yet would lose its denominator entirely.
        # The `> 0` filter guards the division; the trailing `or ... * 0`
        # keeps reporting 0 for offers whose counters exist but saw no
        # traffic in the window. A numeric floor such as clamp_min(..., 1)
        # must not be used here: rate() is per-second, so flooring it at 1
        # turns the ratio into the raw numerator rate for any offer below
        # 1 req/s.
        - record: x402:settlement_rate:1h_by_offer_chain
          expr: |
            (
              sum by (offer_namespace, offer_name, chain) (
                rate(obol_x402_verifier_payment_verified_total[1h])
              )
              /
              (
                sum by (offer_namespace, offer_name, chain) (
                  label_replace(rate(obol_x402_verifier_payment_required_total[1h]), "outcome", "required", "", "")
                  or
                  label_replace(rate(obol_x402_verifier_payment_verified_total[1h]), "outcome", "verified", "", "")
                  or
                  label_replace(rate(obol_x402_verifier_payment_failed_total[1h]), "outcome", "failed", "", "")
                ) > 0
              )
            )
            or
            (
              sum by (offer_namespace, offer_name, chain) (
                rate(obol_x402_verifier_payment_verified_total[1h])
              ) * 0
            )

    - name: x402.alerting
      rules:
        # Payment-failure ratio crossed 10% over the last hour for a paid
        # route that's actually receiving traffic. Typical cause:
        # facilitator unreachable, chain pruning, or seller's CA bundle
        # missing (CLAUDE.md pitfall #8).
        #
        # The ratio is failed / (failed + verified). The denominator is a
        # union (`or`) of tagged vectors rather than `failed + verified`,
        # so the alert still evaluates when NO payment has been verified
        # (100% failure) and the verified series does not exist. With no
        # traffic the division yields NaN, which never satisfies `> 0.10`.
        # NOTE(review): there is no minimum-volume floor, so a single
        # failed payment on an otherwise idle offer counts as 100% —
        # consider adding a request-count threshold if that is too noisy.
        - alert: X402PaymentFailureRateHigh
          expr: |
            (
              sum by (offer_namespace, offer_name, chain) (
                rate(obol_x402_verifier_payment_failed_total[1h])
              )
              /
              sum by (offer_namespace, offer_name, chain) (
                label_replace(rate(obol_x402_verifier_payment_failed_total[1h]), "outcome", "failed", "", "")
                or
                label_replace(rate(obol_x402_verifier_payment_verified_total[1h]), "outcome", "verified", "", "")
              )
            ) > 0.10
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "x402 payment failures > 10% on {{ $labels.offer_namespace }}/{{ $labels.offer_name }} ({{ $labels.chain }})"
            description: |
              More than 10% of paid requests to
              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} on
              {{ $labels.chain }} have failed verification over the last
              hour. Check the verifier logs for x509/facilitator errors and
              the seller's `ca-certificates` ConfigMap.

        # An offer received a 402 (payment_required) within the last hour
        # but no charged_requests happened in the same window. Either
        # buyers aren't completing the flow, or settlement is broken
        # downstream of the verifier.
        #
        # `unless (... > 0)` is used instead of `and (... == 0)` so the
        # alert also fires when the charged counter has no series at all
        # for the offer (nothing has ever settled); `== 0` returns an
        # empty vector in that case and the `and` would match nothing.
        - alert: X402NoSettlementsAfterChallenge
          expr: |
            (
              sum by (offer_namespace, offer_name) (
                increase(obol_x402_verifier_payment_required_total[1h])
              ) > 0
            )
            unless
            (
              sum by (offer_namespace, offer_name) (
                increase(obol_x402_verifier_charged_requests_total[1h])
              ) > 0
            )
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "{{ $labels.offer_namespace }}/{{ $labels.offer_name }} returns 402 but never settles"
            description: |
              The x402 verifier issued 402 responses for
              {{ $labels.offer_namespace }}/{{ $labels.offer_name }} in the
              last hour but observed no settled requests. Check the buyer
              sidecar's auth pool (/status) and the facilitator's settlement
              endpoint.
26 changes: 26 additions & 0 deletions internal/embed/infrastructure/base/templates/x402.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -332,3 +332,29 @@ spec:
selector:
matchLabels:
app: x402-verifier

---
# ServiceMonitor for x402-verifier — scrapes the stable Service endpoint
# rather than per-pod IPs (which is what a PodMonitor would do). Lives
# alongside the Service it observes so adding/changing the port or
# selector here is a single-file change.
#
# Picked up by kube-prometheus-stack via the `release: monitoring` label
# (configured in values/monitoring.yaml.gotmpl as the serviceMonitorSelector).
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: x402-verifier
  namespace: x402
  labels:
    app: x402-verifier
    # Label that kube-prometheus-stack selects monitors on.
    release: monitoring
spec:
  # Select the Service labelled for the x402 verifier.
  selector:
    matchLabels:
      app: x402-verifier
  endpoints:
    # Scrape the Service port named `http` every 30s, giving up after 10s.
    - interval: 30s
      scrapeTimeout: 10s
      path: /metrics
      port: http
79 changes: 14 additions & 65 deletions internal/embed/infrastructure/helmfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,31 +44,12 @@ releases:
values:
- ./values/monitoring.yaml.gotmpl

- name: llm-buyer-podmonitor
namespace: llm
createNamespace: true
chart: bedag/raw
version: 2.0.2
needs:
- monitoring/monitoring
- kube-system/base
values:
- resources:
- apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: litellm-x402-buyer
namespace: llm
labels:
release: monitoring
spec:
selector:
matchLabels:
app: litellm
podMetricsEndpoints:
- port: buyer-http
path: /metrics
interval: 30s
# NOTE: The PodMonitor for litellm-x402-buyer and the ServiceMonitor for
# x402-verifier moved into base/templates/llm.yaml and
# base/templates/x402.yaml respectively. They live alongside the
# workloads they observe, so a port or selector edit is a one-file
# change. This removes two `bedag/raw` releases; kube-prometheus-stack
# picks the monitors up via the `release: monitoring` label.

# Traefik ingress controller with Gateway API support
# Traefik v38+ bundles Gateway API CRDs in its crds/ directory
Expand Down Expand Up @@ -279,43 +260,11 @@ releases:
- name: obol-frontend-obol-app
port: 3000

# Obol Frontend RBAC (OpenClaw instance discovery via Kubernetes API)
- name: obol-frontend-rbac
namespace: obol-frontend
chart: bedag/raw
version: 2.0.2
needs:
- obol-frontend/obol-frontend
values:
- resources:
- apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: obol-frontend-openclaw-discovery
labels:
app.kubernetes.io/name: obol-frontend
rules:
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods", "configmaps", "secrets"]
verbs: ["get", "list"]
# ServiceOffer CRD — frontend sell modal creates offers
- apiGroups: ["obol.org"]
resources: ["serviceoffers", "serviceoffers/status"]
verbs: ["get", "list", "create", "update", "patch", "delete"]
- apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: obol-frontend-openclaw-discovery
labels:
app.kubernetes.io/name: obol-frontend
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: obol-frontend-openclaw-discovery
subjects:
- kind: ServiceAccount
name: obol-frontend
namespace: obol-frontend
# NOTE: obol-frontend-rbac ClusterRole + ClusterRoleBinding moved into
# base/templates/obol-frontend-rbac.yaml. Co-located with the workload
# they grant; kills a `bedag/raw` release. Frontend-egress NetworkPolicy
# was attempted and reverted — on k3s + Flannel (k3d's default CNI) the
# kubernetes apiserver Service Endpoints point at the host process,
# outside the cluster pod/service CIDRs. A clean allowlist can't target
# the apiserver portably without an install-specific ipBlock for the k3s
# host IP. Tracking as a deferred hardening item.
8 changes: 5 additions & 3 deletions internal/monetizeapi/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,11 @@ type PreSignedAuth struct {
}

// PurchaseAutoRefill holds the auto-refill settings of a purchase.
// Every field is optional in JSON (omitempty), so zero values are
// dropped when the struct is serialized.
//
// NOTE(review): field meanings below are inferred from the names only —
// confirm against the controller that consumes this type.
type PurchaseAutoRefill struct {
	// Enabled presumably switches auto-refill on or off.
	Enabled bool `json:"enabled,omitempty"`
	// Threshold is presumably the remaining amount that triggers a refill.
	Threshold int `json:"threshold,omitempty"`
	// Count is presumably how many units each refill adds.
	Count int `json:"count,omitempty"`
	// MaxTotal is presumably an upper bound on the total refilled; TODO confirm.
	MaxTotal int `json:"maxTotal,omitempty"`
	// MaxSpendPerDay is a string-encoded daily spend limit; format and
	// units are not visible here — TODO confirm.
	MaxSpendPerDay string `json:"maxSpendPerDay,omitempty"`
}

type PurchasePayment struct {
Expand Down
Loading
Loading