From 9022f37e46cf0c31a1adb09177d83df4b8dab4f7 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Sun, 24 May 2026 11:40:57 +0400 Subject: [PATCH] fix(prometheus-rules): use epsilon floor not 1.0 to avoid under-reporting low-traffic alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X402PaymentFailureRateHigh and the settlement_rate recording rule used clamp_min(denominator, 1) as a div-by-zero guard. For paid endpoints under light load (sub-1 req/s), the floor is 1.0 instead of the true denominator, so the ratio collapses to numerator/1 (the raw per-second rate) and stays near zero even when 50%+ of requests are failing — the alert cannot fire unless failures alone exceed 0.1 req/s. Switch the floor to 1e-9. Epsilon prevents division-by-zero while keeping the actual ratio accurate at any non-zero traffic level. Surfaced by Expert #2 review of the PromQL design (plans/integration-test-L7-paid-flow-20260524.md follow-ups). Stacks on PR #531 (asset_symbol label) which is the tip of the rules-file chain. Will rebase onto main as the chain merges. --- .../base/templates/x402-prometheus-rules.yaml | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml index 77b6429..5115a7d 100644 --- a/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml +++ b/internal/embed/infrastructure/base/templates/x402-prometheus-rules.yaml @@ -105,6 +105,15 @@ spec: # Settlement rate (verified / attempted) over the last hour, per # (offer, chain). Useful for the dashboard + the alert below. + # + # The `clamp_min(..., 1e-9)` is a division-by-zero guard, not a + # traffic floor. An earlier revision used `clamp_min(..., 1)`, + # which floored the denominator at 1 req/s and silently + # distorted the ratio on low-traffic offers (e.g. verified= + # 0.001/s ÷ floored_denominator=1 ≈ 0 instead of the real + # 0.001/0.002 = 0.5). 
Epsilon keeps the answer accurate at any + # non-zero traffic level while still avoiding a 0/0 NaN when + # the counters exist but did not increase in the window. - record: x402:settlement_rate:1h_by_offer_chain expr: | sum by (offer_namespace, offer_name, chain) ( @@ -119,7 +128,7 @@ spec: + rate(obol_x402_verifier_payment_failed_total[1h]) ), - 1 + 1e-9 ) - name: x402.alerting @@ -128,6 +137,14 @@ spec: # route that's actually receiving traffic. Typical cause: # facilitator unreachable, chain pruning, or seller's CA bundle # missing (CLAUDE.md pitfall #8). + # + # The `clamp_min(..., 1e-9)` here is a div-by-zero guard only. + # A prior `clamp_min(..., 1)` floored the denominator at 1 req/s, + # which under-reported the failure ratio on light-traffic + # endpoints (failed=0.001/s ÷ floored_denominator=1 = 0.001 + # instead of the true 0.001/0.002 = 0.5) and prevented the + # alert from firing unless failures alone exceeded 0.1/s. Epsilon + # avoids NaN without distorting the ratio. - alert: X402PaymentFailureRateHigh expr: | ( @@ -141,7 +158,7 @@ spec: + rate(obol_x402_verifier_payment_verified_total[1h]) ), - 1 + 1e-9 ) ) > 0.10 for: 10m