Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
0b9a9b3
feat(x402): chain label on buyer + verifier metrics, sync PurchaseAut…
bussyjd May 23, 2026
08b303e
feat(x402): last-settlement gauge + verifier PodMonitor
bussyjd May 23, 2026
da721d4
test(x402): cover new chain label + last-payment gauge + PurchaseAuto…
bussyjd May 23, 2026
27e1ac5
chore(x402): RBAC trim, ServiceMonitor, relocate monitoring YAML, rec…
bussyjd May 23, 2026
0fbb99a
fix(x402): GC verifier metric series for deleted offers
bussyjd May 23, 2026
9be9de8
fix(x402): verifier replicas: 2 → 1 to keep metric GC correct
bussyjd May 23, 2026
522aeae
fix(x402-metrics): align Prometheus retention with recording-rule win…
bussyjd May 23, 2026
fdb86b3
chore(images): digest-pin verifier, controller, litellm, cloudflared
bussyjd May 23, 2026
7896384
feat(controller): wire client-go leader-election so HA scaling is safe
bussyjd May 23, 2026
d8912eb
fix(x402): gate verifier /readyz on informer cache sync
bussyjd May 23, 2026
08b4808
refactor(x402): drive verifier deployment from helmfile, not Go-side …
bussyjd May 23, 2026
04b9a6e
feat(security): Restricted Pod Security Standard across embedded work…
bussyjd May 23, 2026
5c9a879
fix(x402-buyer): persist consumed-nonce state to PVC instead of emptyDir
bussyjd May 23, 2026
fb594ea
refactor: relocate remaining bedag/raw helmfile releases into base chart
bussyjd May 23, 2026
6bec651
fix(x402): fail-closed when URI is under a paid prefix but no rule ma…
bussyjd May 23, 2026
4353948
feat(monetizeapi): controller-gen as canonical CRD schema source
bussyjd May 23, 2026
9481e4e
fix(prometheus-rules): escape PromQL $labels for Helm rendering
bussyjd May 24, 2026
7919a36
docs(migration): bedag/raw → base release ownership transfer script
bussyjd May 24, 2026
938b380
fix(controller/render): Restricted PSS securityContext on httpd workl…
bussyjd May 24, 2026
f9f1ff5
fix(prometheus-rules): use increase() for the per-offer revenue rule
bussyjd May 24, 2026
b700f34
feat(x402-metrics): add asset_symbol label for per-token queries
bussyjd May 24, 2026
9022f37
fix(prometheus-rules): use epsilon floor not 1.0 to avoid under-repor…
bussyjd May 24, 2026
7c66408
ci: add helm-template-smoke job to catch chart-render parse errors
bussyjd May 24, 2026
e2d4add
docs(observability): record the thin-layer architecture decisions
bussyjd May 24, 2026
5de3d4a
feat(monetize): replace pause annotation with ERC-8004-friendly drain
bussyjd May 24, 2026
ffdd459
merge: chore/digest-pin-cluster-images (#517) - digest-pin verifier, …
bussyjd May 24, 2026
b8f0e09
merge: feat/controller-leader-election (#518) - wire client-go leader…
bussyjd May 24, 2026
3693513
merge: refactor/ensure-verifier-via-helmfile (#520) - drive verifier …
bussyjd May 24, 2026
4b58459
merge: feat/restricted-pss-sweep (#521) - Restricted PSS across embed…
bussyjd May 24, 2026
22971d7
merge: fix/x402-buyer-state-pvc (#522) - persist consumed-nonce state…
bussyjd May 24, 2026
b83a6e4
merge: refactor/eliminate-bedag-raw-releases (#523) - move bedag/raw …
bussyjd May 24, 2026
8e7e371
merge: feat/controller-gen-codegen (#525) - controller-gen as canonic…
bussyjd May 24, 2026
91f11a4
merge: ci/helm-template-smoke (#533) - add helm-template-smoke job
bussyjd May 24, 2026
d085287
merge: docs/observability-thin-layer-architecture (#534) - record thi…
bussyjd May 24, 2026
ef31561
merge: feat/drain-replaces-pause (#535) - ERC-8004-friendly drain ins…
bussyjd May 24, 2026
8c94219
merge: docs/bedag-raw-migration-script (#528) - bedag/raw migration s…
bussyjd May 24, 2026
8dad18e
merge: fix/controller-render-restricted-pss (#529) - Restricted PSS s…
bussyjd May 24, 2026
f4e07b3
merge: feat/x402-marketplace-metrics (#513) - chain label, last-settl…
bussyjd May 24, 2026
7b00484
merge: fix/verifier-single-replica (#515) - verifier replicas 2->1 fo…
bussyjd May 24, 2026
27471b8
merge: fix/prom-retention-window-alignment (#516) - align Prometheus …
bussyjd May 24, 2026
d425181
merge: fix/verifier-readyz-on-informer-sync (#519) - gate verifier /r…
bussyjd May 24, 2026
186a4f0
merge: fix/verifier-fail-closed-on-paid-prefix (#524) - fail-closed w…
bussyjd May 24, 2026
dbc3ee0
merge: fix/prometheus-rules-helm-template-escape (#527) - escape Prom…
bussyjd May 24, 2026
8d85a28
merge: fix/prometheus-rule-increase-7d (#530) - use increase() for pe…
bussyjd May 24, 2026
ff89758
merge: feat/x402-asset-symbol-label (#531) - asset_symbol label for p…
bussyjd May 24, 2026
a3cb0a3
merge: fix/alert-clamp-min-epsilon (#532) - use epsilon floor not 1.0…
bussyjd May 24, 2026
04ed1ab
test(stack): allow multi-line emptyDir after PSS sweep sizeLimit addi…
bussyjd May 24, 2026
c3ba469
fix: resolve marketplace bundle architecture blockers
bussyjd May 24, 2026
82cbfae
chore: remove pre-release migration script
bussyjd May 24, 2026
94418db
docs: warn pre-release testers about stack reset
bussyjd May 24, 2026
46189cd
docs: clarify pre-release ownership warning
bussyjd May 24, 2026
1dbbf60
merge: fix/marketplace-bundle-architecture-review (#541) - resolve ar…
bussyjd May 24, 2026
7453339
ci: restrict workflow token permissions
bussyjd May 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/release-template.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,19 @@ repositories or docs.]
## Breaking changes / Migration notes

- [Delete this section if there are no breaking changes.]
- **Pre-release tester warning**: If you ran an unreleased marketplace or
chart-consolidation branch before this release, `obol stack up` may fail
with Helm `invalid ownership metadata` errors for resources or namespaces
that moved into the `base` chart. An in-place upgrade from such a
pre-release branch is not a supported production migration path. Back up
anything you need from the local test stack, then recreate it:

```bash
obol stack down
obol stack purge --force
obol stack init
obol stack up
```

## Known issues

Expand Down
116 changes: 116 additions & 0 deletions .github/workflows/helm-template-smoke.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
name: Helm Template Smoke

on:
pull_request:
branches: [ main ]
paths:
- 'internal/embed/infrastructure/**'
- '.github/workflows/helm-template-smoke.yml'
push:
branches: [ main ]
paths:
- 'internal/embed/infrastructure/**'
- '.github/workflows/helm-template-smoke.yml'

permissions:
contents: read

jobs:
helm-template-smoke:
name: helm template embedded chart
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1

- name: Set up Helm
uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1
with:
version: v3.20.1 # match obolup.sh pinned version

- name: helm template ./base
run: |
# Render the embedded `base` chart and fail on Go-template parse
# errors. Catches bugs like the unescaped `{{ $labels }}` in
# PrometheusRule annotations that broke `helm upgrade base` on
# every `obol stack up` (see PR #527). `go test ./...` does not
# exercise Helm rendering, so this is the only pre-merge gate
# for chart parse errors.
#
# The base chart contains `{{PLACEHOLDER}}` strings (e.g.
# `{{OLLAMA_HOST_IP}}`, `{{CLUSTER_ID}}`) that are substituted
# by `internal/defaults/defaults.go::InfrastructureReplacements`
# before helmfile runs. Helm's Go-template parser would treat
# them as actions and fail, so we substitute stub values into
# a working copy first — mirroring what `obol stack init` does.
set -euo pipefail
workdir="$(mktemp -d)"
cp -R internal/embed/infrastructure/base "$workdir/base"
# Mirror internal/defaults InfrastructureReplacements with CI stubs.
find "$workdir/base" -type f -name '*.yaml' -print0 \
| xargs -0 sed -i \
-e 's/{{OLLAMA_HOST_IP}}/127.0.0.1/g' \
-e 's/{{OLLAMA_HOST}}/localhost/g' \
-e 's/{{CLUSTER_ID}}/ci-helm-smoke/g'
# Match values passed by helmfile.yaml `releases[base]`.
helm template base "$workdir/base" \
--set dataDir=/data \
--set network=mainnet \
> "$workdir/base-rendered.yaml"

# Kubernetes object identity must be unique within one rendered
# chart. Helm will happily render duplicate apiVersion/kind/name
# tuples and leave the actual outcome to manifest ordering; this
# caught the duplicated obol-frontend ClusterRole/Binding review bug.
awk '
function flush() {
if (api && kind && name) {
key = api "/" kind "/" ns "/" name
count[key]++
}
api = kind = name = ns = ""; inmeta = 0
}
/^---/ { flush(); next }
/^apiVersion:/ { api = $2; next }
/^kind:/ { kind = $2; next }
/^metadata:/ { inmeta = 1; next }
inmeta && /^ name:/ { name = $2; next }
inmeta && /^ namespace:/ { ns = $2; next }
/^[^ ]/ && $0 !~ /^(apiVersion|kind|metadata):/ { inmeta = 0 }
END {
flush()
for (k in count) {
if (count[k] > 1) {
print count[k] " " k
dup = 1
}
}
exit dup
}' "$workdir/base-rendered.yaml"

- name: helm template ./cloudflared
run: |
# The cloudflared chart has no placeholder substitution and uses
# default values from values.yaml.
set -euo pipefail
helm template cloudflared internal/embed/infrastructure/cloudflared \
> /dev/null

- name: helm lint ./base
run: |
set -euo pipefail
workdir="$(mktemp -d)"
cp -R internal/embed/infrastructure/base "$workdir/base"
find "$workdir/base" -type f -name '*.yaml' -print0 \
| xargs -0 sed -i \
-e 's/{{OLLAMA_HOST_IP}}/127.0.0.1/g' \
-e 's/{{OLLAMA_HOST}}/localhost/g' \
-e 's/{{CLUSTER_ID}}/ci-helm-smoke/g'
helm lint "$workdir/base" \
--set dataDir=/data \
--set network=mainnet

- name: helm lint ./cloudflared
run: |
set -euo pipefail
helm lint internal/embed/infrastructure/cloudflared
31 changes: 31 additions & 0 deletions .github/workflows/lint-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ on:
pull_request:
branches: [ main ]

permissions:
contents: read

jobs:
lint-test:
runs-on: ubuntu-latest
Expand Down Expand Up @@ -43,3 +46,31 @@ jobs:

- name: Run chart-testing (install)
run: ct install --target-branch ${{ github.event.repository.default_branch }}

generate-check:
name: CRD generation up-to-date
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1

- name: Set up Go
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
with:
go-version-file: 'go.mod'

- name: Set up just
uses: extractions/setup-just@dd310ad5a97d8e7b41793f8ef055398d51ad4de6 # v2.0.2

- name: Regenerate CRDs + DeepCopy
run: just generate

- name: Fail if regeneration changed any tracked files
run: |
if [ -n "$(git status --porcelain)" ]; then
echo "::error::CRD manifests or DeepCopy methods are out of date."
echo "::error::Run 'just generate' locally and commit the result."
git status
git --no-pager diff
exit 1
fi
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
2 changes: 2 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,8 @@ A registry digest pin instead of `:latest` on the verifier means your dev rewrit

For a fuller debug catalog with symptom→fix mapping, see `.agents/skills/obol-stack-dev/references/release-smoke-debugging.md`.

For observability architecture decisions (Prometheus retention vs. on-chain canonical record, counter-reset semantics, recording-rule naming, label conventions, CRD versioning stance, `clamp_min` epsilon), see `docs/observability.md` — read this before adding a new metric, recording rule, or proposing counter persistence.

### Security: Tunnel Exposure

The Cloudflare tunnel exposes the cluster to the public internet. Only x402-gated endpoints and discovery metadata should be reachable via the tunnel hostname. Internal services (frontend, eRPC, LiteLLM, monitoring) MUST have `hostnames: ["obol.stack"]` on their HTTPRoutes to restrict them to local access.
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.serviceoffer-controller
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@ RUN go mod download
COPY . .
RUN CGO_ENABLED=0 go build -o /serviceoffer-controller ./cmd/serviceoffer-controller

FROM gcr.io/distroless/static-debian12
FROM gcr.io/distroless/static-debian12:nonroot
COPY --from=builder /serviceoffer-controller /serviceoffer-controller
ENTRYPOINT ["/serviceoffer-controller"]
73 changes: 56 additions & 17 deletions cmd/obol/sell.go
Original file line number Diff line number Diff line change
Expand Up @@ -2331,15 +2331,42 @@ Examples:
func sellStopCommand(cfg *config.Config) *cli.Command {
return &cli.Command{
Name: "stop",
Usage: "Pause a ServiceOffer without deleting it",
Usage: "Drain a ServiceOffer gracefully (advertises wind-down via discovery, then tears down the route)",
ArgsUsage: "<name>",
Description: `Marks a ServiceOffer as draining. While draining:
- The offer stays in /skill.md and /.well-known/agent-registration.json
with available=false and a drainEndsAt timestamp, so external
discovery (and ERC-8004 reputation scorers) can see the wind-down.
- The HTTPRoute and x402 payment gate STAY UP for the grace period
so buyers can complete in-flight payments.
- When the grace period elapses, the controller tears down the route
and marks PaymentGateReady/RoutePublished False with reason=Drained.

The ServiceOffer CR itself is preserved — use 'obol sell delete' to
remove it entirely (which also tombstones the ERC-8004 record).

Flags:
--grace 30m Override the grace period (default 1h).
--force Skip the drain window (equivalent to --grace 0). Use
this when the abrupt-teardown behavior of the old
pause annotation is required for behavior parity.`,
Flags: []cli.Flag{
&cli.StringFlag{
Name: "namespace",
Aliases: []string{"n"},
Usage: "Namespace of the ServiceOffer",
Required: true,
},
&cli.DurationFlag{
Name: "grace",
Usage: "Drain grace period (e.g. 30m, 2h). Defaults to 1h.",
Value: monetizeapi.DefaultDrainGracePeriod,
},
&cli.BoolFlag{
Name: "force",
Aliases: []string{"now"},
Usage: "Skip the drain window and tear the route down on the next reconcile (alias: --now)",
},
},
Action: func(ctx context.Context, cmd *cli.Command) error {
u := getUI(cmd)
Expand All @@ -2352,19 +2379,37 @@ func sellStopCommand(cfg *config.Config) *cli.Command {
return err
}
ns := cmd.String("namespace")
grace := cmd.Duration("grace")
if cmd.Bool("force") {
grace = 0
}
if grace < 0 {
return errors.New("--grace must be >= 0")
}

u.Infof("Stopping the service offering %s/%s...", ns, name)

removePricingRoute(cfg, u, name)

patchJSON := `{"status":{"conditions":[{"type":"Ready","status":"False","reason":"Stopped","message":"Offer stopped by user"}]}}`
err := kubectlRun(cfg, "patch", "serviceoffers.obol.org", name, "-n", ns,
"--type=merge", "-p", patchJSON)
if err != nil {
return fmt.Errorf("failed to pause serviceoffer: %w", err)
now := time.Now().UTC()
drainEndsAt := now.Add(grace)

// metav1.Duration JSON-marshals as the string form (e.g.
// "1h0m0s"), and metav1.Time marshals as RFC3339. We can
// emit a tiny JSON merge patch (`--type=merge`) directly
// without importing the meta types into the CLI. Custom
// resources do not support strategic-merge patches.
patchJSON := fmt.Sprintf(
`{"spec":{"drainAt":%q,"drainGracePeriod":%q}}`,
now.Format(time.RFC3339),
grace.String(),
)
if err := kubectlRun(cfg, "patch", "serviceoffers.obol.org", name, "-n", ns,
"--type=merge", "-p", patchJSON); err != nil {
return fmt.Errorf("failed to drain serviceoffer: %w", err)
}

u.Successf("Service offering %s/%s stopped.", ns, name)
if grace == 0 {
u.Successf("ServiceOffer %s/%s draining; route will be removed on the next reconcile (--force).", ns, name)
} else {
u.Successf("ServiceOffer %s/%s draining; route will be removed at %s.", ns, name, drainEndsAt.Format(time.RFC3339))
}
u.Infof("In-flight buyers can complete payments until then. Run `obol sell delete %s -n %s` to fully remove.", name, ns)
return nil
},
}
Expand Down Expand Up @@ -2518,8 +2563,6 @@ func sellDeleteCommand(cfg *config.Config) *cli.Command {
}
}

removePricingRoute(cfg, u, name)

// Identity-level registration ownership lives in the AgentIdentity
// CR and is managed by the controller. The CLI no longer patches
// the registration ConfigMap here; deleting the ServiceOffer is
Expand Down Expand Up @@ -4126,7 +4169,3 @@ func manifestNSName(manifest map[string]any) (string, string) {
return ns, name
}

// removePricingRoute is a no-op retained for compatibility.
// The serviceoffer-controller now manages pricing routes via the ServiceOffer
// informer; static ConfigMap routes are no longer used.
//
// All three parameters are ignored (blank identifiers) and the body is empty,
// so calling it has no effect.
func removePricingRoute(_ *config.Config, _ *ui.UI, _ string) {}
13 changes: 12 additions & 1 deletion cmd/obol/sell_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -626,9 +626,20 @@ func TestSellStop_Structure(t *testing.T) {
stop := findSubcommand(t, cmd, "stop")
flags := flagMap(stop)

requireFlags(t, flags, "namespace")
requireFlags(t, flags, "namespace", "grace", "force")
assertFlagRequired(t, flags, "namespace")
assertFlagHasAlias(t, flags, "namespace", "n")
// --now is the documented alias for --force; if it disappears,
// scripted operators that rely on it break silently.
assertFlagHasAlias(t, flags, "force", "now")

graceFlag, ok := flags["grace"].(*cli.DurationFlag)
if !ok {
t.Fatalf("--grace should be *cli.DurationFlag, got %T", flags["grace"])
}
if graceFlag.Value != monetizeapi.DefaultDrainGracePeriod {
t.Errorf("--grace default = %v, want %v", graceFlag.Value, monetizeapi.DefaultDrainGracePeriod)
}
}

func TestSellDelete_Structure(t *testing.T) {
Expand Down
Loading
Loading