diff --git a/cmd/x402-verifier/main.go b/cmd/x402-verifier/main.go index d9538c22..e1d52f63 100644 --- a/cmd/x402-verifier/main.go +++ b/cmd/x402-verifier/main.go @@ -60,6 +60,13 @@ func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // File-sourced routes are populated synchronously by LoadConfig above, + // so they are "loaded" as soon as NewVerifier returns. The kube branch + // below flips this flag only after the first informer apply succeeds. + if *routeSource == "file" { + v.MarkRoutesLoaded() + } + if *watch { switch *routeSource { case "file": @@ -76,7 +83,7 @@ func main() { log.Fatalf("load kube route source config: %v", err) } go func() { - if err := x402verifier.WatchServiceOffers(ctx, kubeCfg, accumulator.SetRoutes); err != nil { + if err := x402verifier.WatchServiceOffers(ctx, kubeCfg, accumulator.SetRoutes, v.MarkRoutesLoaded); err != nil { log.Printf("x402-serviceoffer-source: stopped: %v", err) } }() diff --git a/internal/x402/serviceoffer_source.go b/internal/x402/serviceoffer_source.go index f0b1999a..1442bd72 100644 --- a/internal/x402/serviceoffer_source.go +++ b/internal/x402/serviceoffer_source.go @@ -20,7 +20,12 @@ import ( "k8s.io/client-go/tools/cache" ) -func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]RouteRule) error) error { +// WatchServiceOffers runs the ServiceOffer + litellm-secrets informers and +// pushes rendered RouteRules to apply on every change. The optional +// onFirstApply callback is invoked at most once, and only if the post-cache-sync +// refresh succeeds; it is the signal that the route source has produced its +// first usable snapshot. Pass nil to skip. 
+func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]RouteRule) error, onFirstApply func()) error { client, err := dynamic.NewForConfig(cfg) if err != nil { return fmt.Errorf("create dynamic client: %w", err) @@ -33,17 +38,18 @@ func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]Rout offers := offerFactory.ForResource(monetizeapi.ServiceOfferGVR).Informer() secrets := secretFactory.ForResource(monetizeapi.SecretGVR).Informer() - refresh := func() { + refresh := func() (ok bool) { routes, err := routesFromStore(offers.GetStore().List(), secrets.GetStore().List()) if err != nil { log.Printf("x402-serviceoffer-source: render routes: %v", err) - return + return false } if err := apply(routes); err != nil { log.Printf("x402-serviceoffer-source: apply routes: %v", err) - return + return false } log.Printf("x402-serviceoffer-source: routes reloaded (%d routes)", len(routes)) + return true } handler := cache.ResourceEventHandlerFuncs{ @@ -60,7 +66,9 @@ func WatchServiceOffers(ctx context.Context, cfg *rest.Config, apply func([]Rout return fmt.Errorf("wait for serviceoffer informer sync") } - refresh() + if refresh() && onFirstApply != nil { + onFirstApply() + } <-ctx.Done() return nil } diff --git a/internal/x402/verifier.go b/internal/x402/verifier.go index 60e2fa80..2b27f29a 100644 --- a/internal/x402/verifier.go +++ b/internal/x402/verifier.go @@ -21,8 +21,19 @@ type Verifier struct { chain atomic.Pointer[ChainInfo] chains atomic.Pointer[map[string]ChainInfo] // pre-resolved: chain name → config metrics *verifierMetrics + + // routesLoaded is set true after the first route source apply completes. + // Until then HandleReadyz returns 503 so kubelet keeps the pod out of + // the Service Endpoints, preventing the "no rule -> 200 free pass" + // window during informer warmup (CLAUDE.md pitfall #14). + routesLoaded atomic.Bool } +// MarkRoutesLoaded signals that the route source has produced its first +// non-error apply. 
Idempotent. After this, HandleReadyz returns 200 +// once config is also loaded. +func (v *Verifier) MarkRoutesLoaded() { v.routesLoaded.Store(true) } + // NewVerifier creates a Verifier with the given initial configuration. func NewVerifier(cfg *PricingConfig) (*Verifier, error) { v := &Verifier{metrics: newVerifierMetrics()} @@ -224,10 +235,18 @@ func (v *Verifier) HandleHealthz(w http.ResponseWriter, r *http.Request) { fmt.Fprintln(w, `{"status":"ok"}`) } -// HandleReadyz returns 200 OK if pricing config is loaded, 503 otherwise. +// HandleReadyz returns 200 OK once BOTH pricing config and the first route +// source apply have completed. Until then it returns 503 with a cause-specific +// body so kubelet keeps the pod out of Service Endpoints, preventing the +// "no rule -> 200 free pass" window during informer warmup +// (CLAUDE.md pitfall #14). func (v *Verifier) HandleReadyz(w http.ResponseWriter, r *http.Request) { if v.config.Load() == nil { - http.Error(w, "not ready", http.StatusServiceUnavailable) + http.Error(w, "not ready: config not loaded", http.StatusServiceUnavailable) + return + } + if !v.routesLoaded.Load() { + http.Error(w, "not ready: routes not loaded", http.StatusServiceUnavailable) return } diff --git a/internal/x402/verifier_test.go b/internal/x402/verifier_test.go index 3b62c815..89239542 100644 --- a/internal/x402/verifier_test.go +++ b/internal/x402/verifier_test.go @@ -100,6 +100,8 @@ func testPaymentHeaderFor(t *testing.T, payTo, amount string) string { } // newTestVerifier creates a Verifier backed by the given facilitator URL. +// It also marks routes as loaded so /readyz returns 200 immediately, which +// matches what the production wire-up does once the route source warms up. 
func newTestVerifier(t *testing.T, facilitatorURL string, routes []RouteRule) *Verifier { t.Helper() v, err := NewVerifier(&PricingConfig{ @@ -112,6 +114,7 @@ func newTestVerifier(t *testing.T, facilitatorURL string, routes []RouteRule) *V if err != nil { t.Fatalf("NewVerifier: %v", err) } + v.MarkRoutesLoaded() return v } @@ -488,6 +491,55 @@ func TestVerifier_ReadyzNotReady(t *testing.T) { if w.Code != http.StatusServiceUnavailable { t.Errorf("expected 503 when config is nil, got %d", w.Code) } + if got := w.Body.String(); !strings.Contains(got, "config not loaded") { + t.Errorf("expected body to mention %q, got %q", "config not loaded", got) + } +} + +// TestVerifier_Readyz_BlocksUntilRoutesLoaded asserts the fix for +// CLAUDE.md pitfall #14: /readyz must return 503 between "config loaded" +// and "first route source apply completed" so kubelet keeps the pod out +// of the Service Endpoints during informer warm-up. +func TestVerifier_Readyz_BlocksUntilRoutesLoaded(t *testing.T) { + v, err := NewVerifier(&PricingConfig{ + Wallet: "0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef", + Chain: "base-sepolia", + FacilitatorURL: "http://example.invalid", + }) + if err != nil { + t.Fatalf("NewVerifier: %v", err) + } + + // Config is loaded by NewVerifier, but routes have NOT been marked + // loaded yet — /readyz must still 503 with a routes-specific message + // so kubectl describe pod surfaces the actual cause. + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + v.HandleReadyz(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("expected 503 before routes loaded, got %d", w.Code) + } + if got := w.Body.String(); !strings.Contains(got, "routes not loaded") { + t.Errorf("expected body to mention %q, got %q", "routes not loaded", got) + } + + // After the route source signals first apply, /readyz flips to 200. 
+ v.MarkRoutesLoaded() + + w = httptest.NewRecorder() + v.HandleReadyz(w, req) + if w.Code != http.StatusOK { + t.Fatalf("expected 200 after MarkRoutesLoaded, got %d (body=%q)", w.Code, w.Body.String()) + } + + // MarkRoutesLoaded is idempotent — calling it again must not regress. + v.MarkRoutesLoaded() + w = httptest.NewRecorder() + v.HandleReadyz(w, req) + if w.Code != http.StatusOK { + t.Fatalf("expected 200 after second MarkRoutesLoaded, got %d", w.Code) + } } // ── Per-route PayTo / Network override tests ─────────────────────────────────