diff --git a/.server-changes/mollifier-decision-enrolled-org-labels.md b/.server-changes/mollifier-decision-enrolled-org-labels.md new file mode 100644 index 00000000000..b9e8a11f84a --- /dev/null +++ b/.server-changes/mollifier-decision-enrolled-org-labels.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Add bounded `enrolled` and `org` labels to the `mollifier.decisions` metric so per-enrolled-org pass-through vs mollify is visible (the `org` label is attached only for the enrolled cohort to keep cardinality bounded). diff --git a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts index 63146b4c323..461faa8d3b0 100644 --- a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts @@ -7,6 +7,7 @@ import { recordDecision, type DecisionOutcome, type DecisionReason, + type RecordDecisionOptions, } from "./mollifierTelemetry.server"; // `count` is the fleet-wide fixed-window counter for the env (INCR with a @@ -80,7 +81,7 @@ export type GateDependencies = { inputs: GateInputs, decision: Extract, ) => void; - recordDecision: (outcome: DecisionOutcome, reason?: DecisionReason) => void; + recordDecision: (outcome: DecisionOutcome, opts: RecordDecisionOptions) => void; }; // `options` is a thunk so env reads happen per-evaluation, not at module load. @@ -152,52 +153,59 @@ export async function evaluateGate( ): Promise { const d = { ...defaultGateDependencies, ...deps }; + // Resolve the per-org flag up front so every decision below — including + // the bypasses — can be labelled enrolled vs not on the + // `mollifier.decisions` counter. Fail open: a transient error must not + // block triggers. The resolver is purely in-memory (reads + // `Organization.featureFlags`); it adds no DB round-trip to the hot path. + let orgFlagEnabled: boolean; + try { + orgFlagEnabled = await d.resolveOrgFlag(inputs); + } catch (error) { + logger.warn("mollifier.resolve_org_flag_failed", { + envId: inputs.envId, + orgId: inputs.orgId, + taskId: inputs.taskId, + error: error instanceof Error ? error.message : String(error), + }); + orgFlagEnabled = false; + } + // Passed to every `recordDecision`. `org` only becomes a label for the + // (operationally capped) enrolled cohort — the guard is in + // `decisionLabels`, so passing orgId unconditionally here is safe. + const labels: RecordDecisionOptions = { enrolled: orgFlagEnabled, orgId: inputs.orgId }; + // Debounce bypass. onDebounced is a closure over webapp state and // can't be snapshotted into the buffer for drainer replay. Skip before the // trip evaluator so debounce traffic is never counted against the rate. if (inputs.options?.debounce) { - d.recordDecision("pass_through"); + d.recordDecision("pass_through", labels); return { action: "pass_through" }; } // OneTimeUseToken bypass. OTU is a security feature on the PUBLIC_JWT // auth path; its synchronous-rejection contract is materially worse to // break than the idempotency-key contract. if (inputs.options?.oneTimeUseToken) { - d.recordDecision("pass_through"); + d.recordDecision("pass_through", labels); return { action: "pass_through" }; } // Single triggerAndWait bypass. batchTriggerAndWait still funnels // through TriggerTaskService.call per item so the dominant burst pattern // remains covered. if (inputs.options?.parentTaskRunId && inputs.options?.resumeParentOnCompletion) { - d.recordDecision("pass_through"); + d.recordDecision("pass_through", labels); return { action: "pass_through" }; } if (!d.isMollifierEnabled()) { - d.recordDecision("pass_through"); + d.recordDecision("pass_through", labels); return { action: "pass_through" }; } - // Fail open: a transient DB error resolving the per-org flag must not - // block triggers. Mirror the evaluator's fail-open posture in - // `mollifierTripEvaluator.server.ts`. - let orgFlagEnabled: boolean; - try { - orgFlagEnabled = await d.resolveOrgFlag(inputs); - } catch (error) { - logger.warn("mollifier.resolve_org_flag_failed", { - envId: inputs.envId, - orgId: inputs.orgId, - taskId: inputs.taskId, - error: error instanceof Error ? error.message : String(error), - }); - orgFlagEnabled = false; - } const shadowOn = d.isShadowModeOn(); if (!orgFlagEnabled && !shadowOn) { - d.recordDecision("pass_through"); + d.recordDecision("pass_through", labels); return { action: "pass_through" }; } @@ -226,17 +234,17 @@ export async function evaluateGate( decision = { divert: false }; } if (!decision.divert) { - d.recordDecision("pass_through"); + d.recordDecision("pass_through", labels); return { action: "pass_through" }; } if (orgFlagEnabled) { d.logMollified(inputs, decision); - d.recordDecision("mollify", decision.reason); + d.recordDecision("mollify", { ...labels, reason: decision.reason }); return { action: "mollify", decision }; } d.logShadow(inputs, decision); - d.recordDecision("shadow_log", decision.reason); + d.recordDecision("shadow_log", { ...labels, reason: decision.reason }); return { action: "shadow_log", decision }; } diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts index deaa32bb74d..4bd2c45eb54 100644 --- a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts @@ -9,11 +9,39 @@ export const mollifierDecisionsCounter = meter.createCounter("mollifier.decision export type DecisionOutcome = "pass_through" | "shadow_log" | "mollify"; export type DecisionReason = "per_env_rate"; -export function recordDecision(outcome: DecisionOutcome, reason?: DecisionReason): void { - mollifierDecisionsCounter.add(1, { +export type RecordDecisionOptions = { + reason?: DecisionReason; + // Whether the org has the per-org mollifier flag enabled. Emitted as the + // bounded `enrolled` label so we can see how often enrolled orgs pass + // through instead of mollifying — the whole point of this instrumentation. + enrolled: boolean; + // Org id, attached as the `org` label ONLY when `enrolled` is true. The + // enrolled cohort is capped operationally (<= 10 orgs), so this stays + // low-cardinality. It must NEVER be attached for non-enrolled orgs — that + // would fan the metric out across every org id in production (unbounded; + // the same high-cardinality ban that keeps envId/orgId off the other + // mollifier metrics). The guard lives in `decisionLabels`, so callers can + // pass orgId unconditionally. + orgId?: string; +}; + +// Pure: builds the metric label set for a gate decision. Extracted from +// `recordDecision` so the org-only-when-enrolled cardinality guard is +// unit-testable without standing up an OTel meter. +export function decisionLabels( + outcome: DecisionOutcome, + opts: RecordDecisionOptions, +): Record { + return { outcome, - ...(reason ? { reason } : {}), - }); + enrolled: opts.enrolled ? "true" : "false", + ...(opts.reason ? { reason: opts.reason } : {}), + ...(opts.enrolled && opts.orgId ? { org: opts.orgId } : {}), + }; +} + +export function recordDecision(outcome: DecisionOutcome, opts: RecordDecisionOptions): void { + mollifierDecisionsCounter.add(1, decisionLabels(outcome, opts)); } // Counts subscriptions hitting `/realtime/v1/runs/` for a run that diff --git a/apps/webapp/test/mollifierDecisionLabels.test.ts b/apps/webapp/test/mollifierDecisionLabels.test.ts new file mode 100644 index 00000000000..8206edabb0b --- /dev/null +++ b/apps/webapp/test/mollifierDecisionLabels.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from "vitest"; + +import { decisionLabels } from "~/v3/mollifier/mollifierTelemetry.server"; + +// The cardinality guard. `org` is a bounded label (enrolled cohort is capped +// at <= 10 orgs operationally), so it may ONLY be attached when the org is +// enrolled. Attaching it for non-enrolled orgs would fan `mollifier.decisions` +// out across every org id in production — the high-cardinality blow-up these +// labels are explicitly designed to avoid. +describe("decisionLabels", () => { + it("always emits a bounded `enrolled` label (true/false)", () => { + expect(decisionLabels("pass_through", { enrolled: false })).toEqual({ + outcome: "pass_through", + enrolled: "false", + }); + expect(decisionLabels("pass_through", { enrolled: true, orgId: "org_1" })).toMatchObject({ + enrolled: "true", + }); + }); + + it("attaches the `org` label ONLY when enrolled — never for non-enrolled, even if orgId is passed", () => { + // Non-enrolled: orgId passed but MUST be dropped (cardinality guard). + expect(decisionLabels("pass_through", { enrolled: false, orgId: "org_unbounded" })).toEqual({ + outcome: "pass_through", + enrolled: "false", + }); + + // Enrolled: org label present. + expect( + decisionLabels("mollify", { enrolled: true, orgId: "org_1", reason: "per_env_rate" }), + ).toEqual({ + outcome: "mollify", + enrolled: "true", + reason: "per_env_rate", + org: "org_1", + }); + }); + + it("omits `org` when enrolled but no orgId is supplied", () => { + expect(decisionLabels("pass_through", { enrolled: true })).toEqual({ + outcome: "pass_through", + enrolled: "true", + }); + }); + + it("includes `reason` only when supplied", () => { + expect(decisionLabels("pass_through", { enrolled: true, orgId: "org_1" })).not.toHaveProperty( + "reason", + ); + expect( + decisionLabels("shadow_log", { enrolled: false, reason: "per_env_rate" }), + ).toMatchObject({ reason: "per_env_rate" }); + }); +}); diff --git a/apps/webapp/test/mollifierGate.test.ts b/apps/webapp/test/mollifierGate.test.ts index e40a29b2481..5f3f28e668c 100644 --- a/apps/webapp/test/mollifierGate.test.ts +++ b/apps/webapp/test/mollifierGate.test.ts @@ -27,7 +27,12 @@ type Spies = { evaluatorCalls: number; logShadowCalls: Array<{ inputs: GateInputs; decision: Extract }>; logMollifiedCalls: Array<{ inputs: GateInputs; decision: Extract }>; - recordDecisionCalls: Array<{ outcome: DecisionOutcome; reason?: DecisionReason }>; + recordDecisionCalls: Array<{ + outcome: DecisionOutcome; + reason?: DecisionReason; + enrolled?: boolean; + orgId?: string; + }>; }; type Toggles = { @@ -58,8 +63,13 @@ function makeDeps(toggles: Toggles): { deps: GateDependencies; spies: Spies } { logMollified: (inputs, decision) => { spies.logMollifiedCalls.push({ inputs, decision }); }, - recordDecision: (outcome, reason) => { - spies.recordDecisionCalls.push({ outcome, reason }); + recordDecision: (outcome, opts) => { + spies.recordDecisionCalls.push({ + outcome, + reason: opts.reason, + enrolled: opts.enrolled, + orgId: opts.orgId, + }); }, }; return { deps, spies }; @@ -152,6 +162,12 @@ describe("evaluateGate cascade — exhaustive truth table", () => { expect(spies.recordDecisionCalls).toHaveLength(1); expect(spies.recordDecisionCalls[0].outcome).toBe(row.expected.recordedOutcome); expect(spies.recordDecisionCalls[0].reason).toBe(row.expected.expectedReason); + // enrolled label = the resolved per-org flag, now hoisted above the + // bypasses so it's set on every decision. orgId is always passed by the + // gate; the telemetry layer drops it for non-enrolled (covered in + // mollifierDecisionLabels.test.ts). + expect(spies.recordDecisionCalls[0].enrolled).toBe(row.flag); + expect(spies.recordDecisionCalls[0].orgId).toBe(inputs.orgId); }, ); @@ -254,8 +270,13 @@ describe("evaluateGate — fail open on evaluator error", () => { logMollified: (inputs, decision) => { spies.logMollifiedCalls.push({ inputs, decision }); }, - recordDecision: (outcome, reason) => { - spies.recordDecisionCalls.push({ outcome, reason }); + recordDecision: (outcome, opts) => { + spies.recordDecisionCalls.push({ + outcome, + reason: opts.reason, + enrolled: opts.enrolled, + orgId: opts.orgId, + }); }, }; @@ -265,7 +286,13 @@ describe("evaluateGate — fail open on evaluator error", () => { expect(spies.evaluatorCalls).toBe(1); expect(spies.logMollifiedCalls).toHaveLength(0); expect(spies.logShadowCalls).toHaveLength(0); - expect(spies.recordDecisionCalls).toEqual([{ outcome: "pass_through", reason: undefined }]); + expect(spies.recordDecisionCalls).toHaveLength(1); + expect(spies.recordDecisionCalls[0]).toMatchObject({ + outcome: "pass_through", + reason: undefined, + enrolled: true, + orgId: inputs.orgId, + }); }); }); @@ -293,8 +320,13 @@ describe("evaluateGate — fail open on resolveOrgFlag error", () => { logMollified: (inputs, decision) => { spies.logMollifiedCalls.push({ inputs, decision }); }, - recordDecision: (outcome, reason) => { - spies.recordDecisionCalls.push({ outcome, reason }); + recordDecision: (outcome, opts) => { + spies.recordDecisionCalls.push({ + outcome, + reason: opts.reason, + enrolled: opts.enrolled, + orgId: opts.orgId, + }); }, }; @@ -302,7 +334,13 @@ describe("evaluateGate — fail open on resolveOrgFlag error", () => { expect(outcome.action).toBe("pass_through"); expect(spies.evaluatorCalls).toBe(0); - expect(spies.recordDecisionCalls).toEqual([{ outcome: "pass_through", reason: undefined }]); + expect(spies.recordDecisionCalls).toHaveLength(1); + expect(spies.recordDecisionCalls[0]).toMatchObject({ + outcome: "pass_through", + reason: undefined, + enrolled: false, + orgId: inputs.orgId, + }); }); }); @@ -333,8 +371,13 @@ describe("evaluateGate — per-org isolation via Organization.featureFlags", () logMollified: (inputs, decision) => { spies.logMollifiedCalls.push({ inputs, decision }); }, - recordDecision: (outcome, reason) => { - spies.recordDecisionCalls.push({ outcome, reason }); + recordDecision: (outcome, opts) => { + spies.recordDecisionCalls.push({ + outcome, + reason: opts.reason, + enrolled: opts.enrolled, + orgId: opts.orgId, + }); }, }; return { deps, spies };