From 6154e9feba134ad574424815ca6ce9db093c40a5 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 19:14:17 -0400 Subject: [PATCH 01/16] Capture a wedged render's stack out-of-process (Debugger.pause + gated --prof) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a prerender render pegs the renderer's main thread, the existing timeout-path diagnostics can't name the offending function: the CDP `Profiler` only yields samples at `stop`, which the pegged thread can't serialize, and the out-of-band trace stream samples continuously, so its own overhead can perturb a timing-sensitive wedge enough to dissolve it. Add a capture that reads the peg from outside without instrumenting it: - `Debugger.pause` (primary, always on the timeout path): V8 honors the pause at the next interrupt check (a loop back-edge / call), so it lands inside a synchronous loop without the loop yielding — the mechanism behind the DevTools "pause" button on a hung page. Zero overhead until the single pause, so it can't mask the wedge. Reports the call frames, the live stack depth (a runaway recursion shows a huge total), and heap used (flat → tight compute/recursion; climbing → combinatorial breadth). Runs out-of-process over CDP, so it survives a pegged JS thread, and only on the timeout path, so it's free on a healthy render. - `--prof` (gated behind `PRERENDER_V8_PROF`, off by default): V8's kernel-SIGPROF sampler, armed at Chrome launch. The signal preempts the thread regardless of what it's running, so it also samples a peg stuck in a non-yielding NATIVE call — where `Debugger.pause` can't get a back-edge and returns `pause-timeout`. It samples every render, so it can perturb a timing-sensitive wedge; enable it only as the native-peg fallback. Both surface in the prerender-server timeout log line (`pausedStack:`, `v8ProfTopFrames:`). Also lower the render timeout 90s → 60s: a healthy render finishes well under 30s, so 60s cleanly separates a genuine wedge from the slowest legitimate cards, and — with the request-timeout overhead — fires the render-level timeout (and this capture) before the request-level abort gives up on the render. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../realm-server/prerender/browser-manager.ts | 15 ++ .../realm-server/prerender/pause-capture.ts | 201 ++++++++++++++++++ .../prerender/prerender-constants.ts | 8 +- packages/realm-server/prerender/utils.ts | 34 +++ packages/realm-server/prerender/v8-prof.ts | 94 ++++++++ 5 files changed, 350 insertions(+), 2 deletions(-) create mode 100644 packages/realm-server/prerender/pause-capture.ts create mode 100644 packages/realm-server/prerender/v8-prof.ts diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts index ef7ba02657..b424635d24 100644 --- a/packages/realm-server/prerender/browser-manager.ts +++ b/packages/realm-server/prerender/browser-manager.ts @@ -7,6 +7,7 @@ import { execFile } from 'child_process'; import { promisify } from 'util'; import { isHttpsLoopback } from '../lib/is-https-loopback.ts'; +import { v8ProfEnabled, v8ProfJsFlags } from './v8-prof.ts'; const log = logger('prerenderer'); const PUPPETEER_PROFILE_PREFIX = 'puppeteer_dev_chrome_profile-'; @@ -61,6 +62,20 @@ export class BrowserManager { launchArgs.push(...extraArgs); } + // Diagnostic (off by default): arm V8's kernel-signal CPU sampler + // (`--prof`) in the renderer at launch. The SIGPROF timer preempts the + // thread on a schedule it can't refuse, so it samples a CPU-pegged + // loop — even one stuck in a non-yielding NATIVE call where + // `Debugger.pause` can't get a back-edge — and it never needs the + // pegged thread to service a CDP `stop`. The cost: it samples EVERY + // render, so it can perturb a timing-sensitive wedge; enable it only + // for a run that needs the native-peg fallback. Per-pid logfile so + // concurrent renderer processes don't clobber one another; the timeout + // path post-processes the log into the prerender-server logs. + if (v8ProfEnabled()) { + launchArgs.push(`--js-flags=${v8ProfJsFlags()}`); + } + this.#browser = await puppeteer.launch({ headless: process.env.BOXEL_SHOW_PRERENDER !== 'true', ...(launchArgs.length > 0 ? { args: launchArgs } : {}), diff --git a/packages/realm-server/prerender/pause-capture.ts b/packages/realm-server/prerender/pause-capture.ts new file mode 100644 index 0000000000..ba2abfdcdd --- /dev/null +++ b/packages/realm-server/prerender/pause-capture.ts @@ -0,0 +1,201 @@ +// One-shot stack capture of a wedged prerender via CDP `Debugger.pause`. +// +// Why this exists alongside the CPU profiler / trace stream: +// +// A hard wedge is a synchronous JS loop pegging the renderer's main +// thread. The CDP `Profiler` can't capture it — its samples only arrive +// at `Profiler.stop`, which the pegged thread can't serialize (see +// `trace-capture.ts`). The out-of-band trace stream CAN capture it, but +// it samples continuously, so its own overhead can perturb a +// timing-sensitive wedge enough to dissolve it. +// +// `Debugger.pause` is the missing tool: it adds ZERO overhead until the +// single pause, so it cannot mask the wedge — we let the loop run at +// full native speed and only look once it's already stuck. And it reads +// a synchronous loop: V8 honors the pause at the next interrupt check (a +// loop back-edge or call), so it lands inside the spin without the loop +// yielding — exactly the mechanism behind the DevTools "pause" button on +// a hung page. The returned `callFrames` name the function the loop is +// in. +// +// Limit: if the peg is one long non-yielding NATIVE call (a catastrophic +// regex, a native sort) there is no back-edge to honor the pause, so it +// times out (`reason: 'pause-timeout'`) — which is itself a signal, +// pivoting to the kernel-signal `--prof` sampler that preempts native +// code too. +// +// Everything here runs in the prerender SERVER (Node) over CDP, never in +// the page, so it uses the real Node timer — the page-side +// render-timer-stub does not apply. It is invoked only on the render +// timeout path, so it costs nothing on a healthy render. + +import type { CDPSession, Page } from 'puppeteer'; +import { logger } from '@cardstack/runtime-common'; + +const log = logger('prerenderer'); + +const TIMED_OUT = Symbol('node-timeout'); + +export interface PausedStackCapture { + // Top JS frames, innermost first, formatted `fn @ url:line:col`. + frames: string[]; + // True when the live stack was deeper than `frames` (recursion depth + // is itself diagnostic — a runaway recursion shows a huge total). + truncated: boolean; + totalFrames: number; + // V8 used heap at the moment of the wedge. Flat across the peg → a tight + // compute/recursion loop; climbing → a combinatorial re-build (breadth). + heapUsedMB: number | null; + // Set when no usable stack was captured, naming why. + reason?: string; +} + +// Race a Node-side promise against a real Node timer. The prerender server +// process is not the rendered page, so `setTimeout` here is the genuine +// one (the render-timer-stub only replaces the page's timers). +async function raceNodeTimeout( + p: Promise, + ms: number, +): Promise { + let timer: ReturnType | undefined; + try { + return await Promise.race([ + p, + new Promise((resolve) => { + timer = setTimeout(() => resolve(TIMED_OUT), ms); + }), + ]); + } finally { + clearTimeout(timer); + } +} + +async function detachQuietly(client: CDPSession | undefined): Promise { + if (!client) { + return; + } + try { + await client.detach(); + } catch { + // session already gone / page closing — nothing to do + } +} + +function empty(heapUsedMB: number | null, reason: string): PausedStackCapture { + return { frames: [], truncated: false, totalFrames: 0, heapUsedMB, reason }; +} + +export async function capturePausedCallStack( + page: Page, + opts?: { budgetMs?: number; maxFrames?: number }, +): Promise { + let budgetMs = opts?.budgetMs ?? 8000; + let maxFrames = opts?.maxFrames ?? 60; + if (page.isClosed()) { + return null; + } + let client: CDPSession | undefined; + try { + let session = await raceNodeTimeout(page.createCDPSession(), budgetMs); + if (session === TIMED_OUT) { + return empty(null, 'cdp-session-timeout'); + } + client = session; + + // Heap usage first: `Runtime.getHeapUsage` is answered from V8 stats, + // so it returns even while the JS thread is pegged. + let heapUsedMB: number | null = null; + try { + let usage = (await raceNodeTimeout( + client.send('Runtime.getHeapUsage'), + budgetMs, + )) as { usedSize?: number } | typeof TIMED_OUT; + if (usage !== TIMED_OUT && typeof usage.usedSize === 'number') { + heapUsedMB = usage.usedSize / (1024 * 1024); + } + } catch { + // best-effort + } + + // Arm the paused listener BEFORE requesting the pause so we can't miss + // the event. + let pausedFrames = new Promise((resolve) => { + client!.once('Debugger.paused', (e: any) => + resolve(Array.isArray(e?.callFrames) ? e.callFrames : null), + ); + }); + + let enabled = await raceNodeTimeout( + client.send('Debugger.enable'), + budgetMs, + ); + if (enabled === TIMED_OUT) { + return empty(heapUsedMB, 'debugger-enable-timeout'); + } + // Fire-and-forget: the pause lands at the next V8 interrupt check. + void client.send('Debugger.pause').catch(() => {}); + + let callFrames = await raceNodeTimeout(pausedFrames, budgetMs); + + // Best-effort unpause so teardown isn't left in a paused state. + try { + await raceNodeTimeout(client.send('Debugger.resume'), 2000); + } catch { + // ignore + } + try { + await raceNodeTimeout(client.send('Debugger.disable'), 2000); + } catch { + // ignore + } + + if (callFrames === TIMED_OUT || !Array.isArray(callFrames)) { + // No back-edge honored the pause within budget — most likely a + // non-yielding native peg. The `--prof` sampler is the fallback. + return empty(heapUsedMB, 'pause-timeout'); + } + + let totalFrames = callFrames.length; + let frames = callFrames.slice(0, maxFrames).map((f: any) => { + let name = + typeof f.functionName === 'string' && f.functionName.length > 0 + ? f.functionName + : '(anonymous)'; + let loc = f.location ?? {}; + let url = + (typeof f.url === 'string' && f.url.length > 0 && f.url) || + f.functionLocation?.scriptId || + ''; + let line = (loc.lineNumber ?? 0) + 1; + let col = (loc.columnNumber ?? 0) + 1; + return `${name} @ ${url}:${line}:${col}`; + }); + return { + frames, + truncated: totalFrames > maxFrames, + totalFrames, + heapUsedMB, + }; + } catch (e) { + log.debug('paused stack capture failed:', e); + return null; + } finally { + await detachQuietly(client); + } +} + +// Format the capture for a single log line / diagnostics field. +export function formatPausedStack( + capture: PausedStackCapture | null, +): string | null { + if (!capture) { + return null; + } + if (capture.frames.length === 0) { + return `<${capture.reason ?? 'no-frames'}>`; + } + let depth = capture.truncated + ? `${capture.frames.length}/${capture.totalFrames}` + : `${capture.totalFrames}`; + return `[depth=${depth}] ` + capture.frames.join(' <- '); +} diff --git a/packages/realm-server/prerender/prerender-constants.ts b/packages/realm-server/prerender/prerender-constants.ts index 7ec8005b19..b6f727c5b7 100644 --- a/packages/realm-server/prerender/prerender-constants.ts +++ b/packages/realm-server/prerender/prerender-constants.ts @@ -75,8 +75,12 @@ export { DURING_PRERENDER_HEADER } from '@cardstack/runtime-common'; export { sanitizePrerenderJobId } from '@cardstack/runtime-common'; // Base timeout for a single prerender capture on the prerender server -// (DOM rendering + data loading inside the headless browser). -const DEFAULT_RENDER_TIMEOUT_MS = 90_000; +// (DOM rendering + data loading inside the headless browser). A healthy +// render completes in well under 30s; 60s cleanly separates a genuine +// wedge from the slowest legitimate cards, and (with the request-timeout +// overhead below) fires the render-level timeout — and its hang +// diagnostics — before the request-level abort gives up on the render. +const DEFAULT_RENDER_TIMEOUT_MS = 60_000; // Additional budget for request-level timeouts that wrap render work across // process/network boundaries (manager proxying, serialization, retries, etc). // Request timeout defaults are computed as: diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts index 3120896ee5..c5454c9f7b 100644 --- a/packages/realm-server/prerender/utils.ts +++ b/packages/realm-server/prerender/utils.ts @@ -17,6 +17,12 @@ import { } from './cpu-profiler.ts'; import { fromAffinityKey } from './affinity.ts'; import { captureTraceStream } from './trace-capture.ts'; +import { + capturePausedCallStack, + formatPausedStack, + type PausedStackCapture, +} from './pause-capture.ts'; +import { v8ProfEnabled, processV8ProfTopFrames } from './v8-prof.ts'; import { ensureHeapSampling, captureHeapSamplingProfile, @@ -1686,6 +1692,27 @@ export async function withTimeout( }); cpuProfileTopFrames = formatTopFrames(summary); } + // One-shot stack capture of the still-pegged renderer via + // `Debugger.pause`. Where the CPU profiler above structurally can't + // capture a hard wedge (its `stop` needs the pegged thread to + // serialize the profile), a debugger pause lands inside the + // synchronous loop at the next V8 interrupt check and reports the + // call frames directly — and it adds zero overhead until the single + // pause, so it can't mask a timing-sensitive wedge the way continuous + // sampling can. Out-of-process (CDP), so it survives a pegged JS + // thread; only on the timeout path, so it's free on a healthy render. + let pausedStack: PausedStackCapture | null = null; + if (!page.isClosed()) { + pausedStack = await capturePausedCallStack(page, { budgetMs: 8000 }); + } + // Native-peg fallback (armed only when `PRERENDER_V8_PROF=true`): + // summarize the renderer's kernel-sampled V8 `--prof` log. Worth + // reading mainly when `pausedStack` came back `pause-timeout` — i.e. + // the peg is a non-yielding native call the debugger couldn't enter. + let v8ProfTopFrames: string | null = null; + if (v8ProfEnabled()) { + v8ProfTopFrames = await processV8ProfTopFrames(); + } // Out-of-band CDP view of fetches still outstanding — survives a // wedged JS thread, so it's available even when the in-page hooks // below are skipped. The longest-lived entry is what a waiting @@ -1791,6 +1818,13 @@ export async function withTimeout( (topPending ? `(top: ${topPending.url} ${topPending.ageMs}ms)` : '') + ` concurrentRenders=${concurrentRenders}` + (cpuProfileTopFrames ? ` cpuTopFrames: ${cpuProfileTopFrames}` : '') + + (pausedStack?.heapUsedMB != null + ? ` jsHeapUsedMB=${pausedStack.heapUsedMB.toFixed(0)}` + : '') + + (formatPausedStack(pausedStack) + ? ` pausedStack: ${formatPausedStack(pausedStack)}` + : '') + + (v8ProfTopFrames ? `\nv8ProfTopFrames:\n${v8ProfTopFrames}` : '') + ` DOM:\n${dom?.trim()}`, ); // Diagnostics ride on the outer `RenderError.diagnostics` as a diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts new file mode 100644 index 0000000000..8270ad0e67 --- /dev/null +++ b/packages/realm-server/prerender/v8-prof.ts @@ -0,0 +1,94 @@ +// Diagnostic V8 `--prof` (kernel-signal CPU sampling) for the prerender +// renderer. Off by default (`PRERENDER_V8_PROF=true` to arm). +// +// Why this exists alongside `Debugger.pause` (pause-capture.ts): +// +// `Debugger.pause` reads a synchronous JS loop (V8 honors the pause at a +// back-edge) with zero continuous overhead. But if the wedge is one long +// NON-YIELDING native call there is no back-edge, and pause times out. +// `--prof` covers that gap: the kernel SIGPROF timer preempts the thread +// mid-instruction regardless of what it's running (JS or native), so the +// busiest frame wins the samples — exactly the case `Debugger.pause` +// can't reach. It also never needs the pegged thread to service a CDP +// `stop` (the failure mode of the CDP `Profiler`; see trace-capture.ts). +// +// The trade-off, and why it's gated off: it samples EVERY render, so its +// own interrupts can perturb a timing-sensitive wedge enough to dissolve +// it. Run it only when `Debugger.pause` reports `pause-timeout` (a +// suspected native peg). + +import { logger } from '@cardstack/runtime-common'; +import { promises as fs } from 'fs'; +import { tmpdir } from 'os'; +import path from 'path'; +import { execFile } from 'child_process'; +import { promisify } from 'util'; + +const log = logger('prerenderer'); +const execFileAsync = promisify(execFile); + +export const V8_PROF_LOG_DIR = tmpdir(); +export const V8_PROF_LOG_PREFIX = 'prerender-v8-prof-'; + +export function v8ProfEnabled(): boolean { + return process.env.PRERENDER_V8_PROF === 'true'; +} + +// The `--js-flags` value to pass at Chrome launch when armed. Per-pid +// logfile so concurrent renderer processes don't clobber one another. +export function v8ProfJsFlags(): string { + return `--prof --logfile=${V8_PROF_LOG_DIR}/${V8_PROF_LOG_PREFIX}%p.log`; +} + +// Best-effort summary of the hottest frames from the renderer's `--prof` +// log via `node --prof-process`. Called on the timeout path only when +// armed. Time-boxed — the log can be large, and the pegged render's +// samples dominate it, so the top self-time frames name the wedge. +export async function processV8ProfTopFrames( + budgetMs = 20000, +): Promise { + if (!v8ProfEnabled()) { + return null; + } + try { + let entries = await fs.readdir(V8_PROF_LOG_DIR); + let logs = entries.filter( + (e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log'), + ); + if (logs.length === 0) { + return ''; + } + // The pegged render produces by far the most samples, so its + // renderer's log is the largest. + let withSizes = await Promise.all( + logs.map(async (name) => { + let full = path.join(V8_PROF_LOG_DIR, name); + try { + let st = await fs.stat(full); + return { full, size: st.size }; + } catch { + return { full, size: 0 }; + } + }), + ); + withSizes.sort((a, b) => b.size - a.size); + let logPath = withSizes[0].full; + let { stdout } = await execFileAsync( + process.execPath, + ['--prof-process', logPath], + { timeout: budgetMs, maxBuffer: 32 * 1024 * 1024 }, + ); + let lines = stdout.split('\n'); + // The bottom-up "[Summary]" section lists self-time by category; the + // top JS/C++ entries that follow name the hot function. + let summaryIdx = lines.findIndex((l) => /\[Summary\]/.test(l)); + let slice = + summaryIdx >= 0 + ? lines.slice(summaryIdx, summaryIdx + 60) + : lines.slice(0, 60); + return slice.join('\n').trim() || ''; + } catch (e) { + log.debug('v8 --prof processing failed:', e); + return null; + } +} From 55ec800351625059dec1790da3e727172a098875 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 19:20:45 -0400 Subject: [PATCH 02/16] Isolate V8 --prof logs to the current browser run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The timeout-path processor scanned the shared OS temp dir and picked the largest matching `--prof` log, so a stale log from an earlier run in a reused container (or another renderer that ran longer without wedging) could outrank the timed-out page and point `v8ProfTopFrames` at the wrong stack. Clear pre-existing logs at browser launch and stamp the launch time, then on the timeout path consider only logs written since launch and pick the most-recently-written one — the renderer still spinning into the timeout, not an older completed render's larger-but-stale log. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../realm-server/prerender/browser-manager.ts | 7 ++- packages/realm-server/prerender/v8-prof.ts | 50 ++++++++++++++++--- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts index b424635d24..cda7cca4a8 100644 --- a/packages/realm-server/prerender/browser-manager.ts +++ b/packages/realm-server/prerender/browser-manager.ts @@ -7,7 +7,11 @@ import { execFile } from 'child_process'; import { promisify } from 'util'; import { isHttpsLoopback } from '../lib/is-https-loopback.ts'; -import { v8ProfEnabled, v8ProfJsFlags } from './v8-prof.ts'; +import { + v8ProfEnabled, + v8ProfJsFlags, + prepareV8ProfForLaunch, +} from './v8-prof.ts'; const log = logger('prerenderer'); const PUPPETEER_PROFILE_PREFIX = 'puppeteer_dev_chrome_profile-'; @@ -73,6 +77,7 @@ export class BrowserManager { // concurrent renderer processes don't clobber one another; the timeout // path post-processes the log into the prerender-server logs. if (v8ProfEnabled()) { + await prepareV8ProfForLaunch(); launchArgs.push(`--js-flags=${v8ProfJsFlags()}`); } diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index 8270ad0e67..aefc9e90a9 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -40,6 +40,34 @@ export function v8ProfJsFlags(): string { return `--prof --logfile=${V8_PROF_LOG_DIR}/${V8_PROF_LOG_PREFIX}%p.log`; } +// Browser-launch wall-clock; logs older than this are from a previous run. +let v8ProfLaunchAt = 0; + +// Call once at browser launch (when armed) to isolate this run's profile +// logs. The OS temp dir is shared and the container can be reused across +// deploys, so a stale `--prof` log from an earlier run could otherwise be +// picked as the timeout diagnostic and point at the wrong stack. Delete +// any pre-existing logs and stamp the launch time so the timeout-path +// processor only ever considers logs written by THIS browser run. +export async function prepareV8ProfForLaunch(): Promise { + if (!v8ProfEnabled()) { + return; + } + v8ProfLaunchAt = Date.now(); + try { + let entries = await fs.readdir(V8_PROF_LOG_DIR); + await Promise.all( + entries + .filter((e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log')) + .map((e) => + fs.rm(path.join(V8_PROF_LOG_DIR, e), { force: true }).catch(() => {}), + ), + ); + } catch (e) { + log.debug('v8 --prof stale-log cleanup failed:', e); + } +} + // Best-effort summary of the hottest frames from the renderer's `--prof` // log via `node --prof-process`. Called on the timeout path only when // armed. Time-boxed — the log can be large, and the pegged render's @@ -58,21 +86,29 @@ export async function processV8ProfTopFrames( if (logs.length === 0) { return ''; } - // The pegged render produces by far the most samples, so its - // renderer's log is the largest. - let withSizes = await Promise.all( + let withStats = await Promise.all( logs.map(async (name) => { let full = path.join(V8_PROF_LOG_DIR, name); try { let st = await fs.stat(full); - return { full, size: st.size }; + return { full, mtimeMs: st.mtimeMs, size: st.size }; } catch { - return { full, size: 0 }; + return { full, mtimeMs: 0, size: 0 }; } }), ); - withSizes.sort((a, b) => b.size - a.size); - let logPath = withSizes[0].full; + // Only this browser run's logs (cleared + stamped at launch), and + // among those the most-recently-written one — the renderer that was + // still spinning up to the timeout, not an earlier completed render's + // larger-but-stale log. + let fromThisRun = withStats.filter( + (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt, + ); + if (fromThisRun.length === 0) { + return ''; + } + fromThisRun.sort((a, b) => b.mtimeMs - a.mtimeMs); + let logPath = fromThisRun[0].full; let { stdout } = await execFileAsync( process.execPath, ['--prof-process', logPath], From 5cfd32632e9485ee09c76e9978f97d3410d14fde Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 20:39:46 -0400 Subject: [PATCH 03/16] indexing-diagnostics: document the hard-CPU-peg capture (Mode I) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Mode I — reading a render wedged in a synchronous CPU peg from outside, when the thread is too pegged to service the CDP message that arms a debugger/profiler (`Debugger.enable` / `Profiler.enable` time out) and Mode H's captures are starved: - `pausedStack` — the always-on one-shot `Debugger.pause` on the timeout path (zero overhead until the pause, so it can't mask), with its `` / `` states. - `v8ProfTopFrames` — the `--prof` kernel-`SIGPROF` sampler (knob `PRERENDER_V8_PROF`) that preempts the pegged thread without cooperation, the fallback that reads a hard peg the CDP captures can't. Also adds the `PRERENDER_V8_PROF` row to the Mode H knobs table and a cross-reference from Mode H's limits, and notes the 60s render timeout that fires the capture before the request-level abort. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/skills/indexing-diagnostics/SKILL.md | 68 ++++++++++++++++++-- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md index 53cc1e7c6a..4dcdbb3758 100644 --- a/.claude/skills/indexing-diagnostics/SKILL.md +++ b/.claude/skills/indexing-diagnostics/SKILL.md @@ -881,13 +881,14 @@ Rule of thumb: a render that **completes but is heavy** → `.cpuprofile` (+ hea All live at `//boxel/` (Systems Manager → Parameter Store). The bucket itself (`PRERENDER_ARTIFACTS_BUCKET`) and the key prefix (`PRERENDER_ARTIFACTS_ENV`) are wired by Terraform — don't set them by hand. -| Parameter | Values | Effect | -| ------------------------------------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `PRERENDER_PROFILE_AFFINITY` | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert. | -| `PRERENDER_PROFILE_CPUPROFILE` | `true` / `false` | Persist the full `.cpuprofile` for targeted renders. | -| `PRERENDER_PROFILE_TRACE` | `true` / `false` | Capture the streaming trace for targeted renders. | -| `PRERENDER_PROFILE_HEAP` | `true` / `false` | Capture the heap allocation-sampling profile for targeted renders. | -| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated). | +| Parameter | Values | Effect | +| ------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `PRERENDER_PROFILE_AFFINITY` | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert. | +| `PRERENDER_PROFILE_CPUPROFILE` | `true` / `false` | Persist the full `.cpuprofile` for targeted renders. | +| `PRERENDER_PROFILE_TRACE` | `true` / `false` | Capture the streaming trace for targeted renders. | +| `PRERENDER_PROFILE_HEAP` | `true` / `false` | Capture the heap allocation-sampling profile for targeted renders. | +| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated). | +| `PRERENDER_V8_PROF` | `true` / `false` | Arm V8's `--prof` kernel-`SIGPROF` CPU sampler at Chrome launch — **renderer-wide, NOT affinity-gated**. The fallback for a hard CPU peg that starves the CDP captures above (kernel preemption needs no thread cooperation). Surfaces as `v8ProfTopFrames` on the render-timeout line; see **Mode I**. Needs a prerender-server task **restart** (not just redeploy) to take effect. | The mode flags are Terraform-seeded sentinels (default `false` / `0`); `PRERENDER_PROFILE_AFFINITY` is operator-managed and must already exist. The affinity key is `realm:` + the realm's canonical URL **with trailing slash** — the same value Mode A/B logs print as `affinity=…`. @@ -927,6 +928,59 @@ The key schema is `env/realm/jobId/card/step/-.` — eve - The `.cpuprofile` and `.heapprofile` need the renderer thread to serialize, so a **fully-wedged** render produces neither — only the streaming trace comes back. That's by design (Tier 1's summary has the same limit); the trace is the wedge tool. - Browser-wide tracing is **single-flight** — only one trace runs at a time across the whole pool. Concurrent targeted renders skip their trace (logged at `debug`), so don't expect a trace for _every_ render under load; constrain concurrency or accept the gaps. The summary and the cpuprofile/heap captures are per-render and unaffected. - Captured artifacts are anonymized only at the key level (host stripped). The blobs themselves contain card URLs and code paths — treat them as you would any prerender diagnostic. +- If the renderer is so pegged that even `Debugger.enable` / `Profiler.enable` time out, **all of Mode H is starved** — the cpuprofile, heap, _and_ the trace's sampler setup all need the renderer to service a CDP message. Read that case from the timeout-path `pausedStack` / `--prof` signals instead — **Mode I**. + +## Mode I — a render is wedged in a synchronous CPU peg (reading the renderer from outside) + +**When to use this mode.** Mode A classified a render as a hard CPU peg (`mainThreadResponsive=false`, `scriptBusy≈1`), or a from-scratch index rejected mid-run on a card that never wrote a row (Mode C), and Mode H's heavyweight captures came back empty / `(idle)`. The renderer's main thread is spinning in a synchronous JS (or native) loop — the worst case for inspection, because a fully-pegged thread can't service the CDP protocol message that _arms_ a debugger or profiler (`Debugger.enable` / `Profiler.enable` time out), so the CDP captures are starved. Two timeout-path signals read it anyway, because neither needs the pegged thread to cooperate. They ride on the same render-timeout log line as `cpuTopFrames`, in the **prerender-server** log. + +A healthy render finishes well under 30s; the render-level timeout (`RENDER_TIMEOUT_MS`, default 60s) fires before the request-level abort (render + 60s overhead), so these are captured and logged before the worker gives up on the visit. (If a wedge ever rejects with no timeout line at all, the diagnostic block itself was starved — that is itself the `` signal below.) + +### `pausedStack` — a one-shot debugger pause (always on, can't mask) + +``` +pausedStack: [depth=N] fn @ url:line:col <- caller <- … jsHeapUsedMB=… +``` + +A single CDP `Debugger.pause` on the timeout path. V8 honors the pause at the next interrupt check (a loop back-edge or call), so it lands _inside_ a synchronous loop without the loop yielding — the same mechanism as the DevTools "pause" button on a hung `while(true)` page. It adds **zero overhead until the one pause**, so unlike a continuous sampler it can't perturb (mask) a timing-sensitive wedge. Three outcomes: + +- **A frame list** → the function the loop is in (`fn @ url:line`) and the call chain out to the render driver. `depth` is the live JS stack depth: a huge depth is runaway recursion; a small depth is a tight loop. A bare scriptId for `url` (no path) means the frame is in eval'd / dynamically-built code. +- **``** → the pause was requested but no back-edge honored it in budget. Most likely a long **non-yielding native call** (a catastrophic regex, a native sort) — no JS back-edge to interrupt. Pivot to `--prof`. +- **``** → the thread is so pegged it couldn't even service `Debugger.enable`. The hardest peg; CDP can't read it at all. Pivot to `--prof`. + +### `v8ProfTopFrames` — V8 `--prof` kernel sampler (gated; reads even a hard peg) + +When `pausedStack` is starved, the kernel-signal sampler is the fallback. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and the pre-installed handler walks the stack. It samples even a non-yielding native peg, and it never needs the thread to serialize a profile at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated; the top self-time frames surface as `v8ProfTopFrames:` (processed in-process via `node --prof-process`). + +Arm it: + +1. Set SSM `//boxel/PRERENDER_V8_PROF` = `true`. +2. **Restart** the prerender-server task — the renderer must relaunch to pick up the `--js-flags=--prof` launch flag (a value flip alone, or a redeploy that reuses the running browser, won't arm it). `aws ecs update-service … --force-new-deployment` restarts the tasks. +3. Re-run the index; read `v8ProfTopFrames:` on the wedged card's timeout line. +4. Set it back to `false` + restart when done — it samples **every** render while on. + +**Masking note.** `--prof` samples continuously, but for a _synchronous_ peg that doesn't matter: a sync loop's control flow doesn't depend on timing, so sampling records _where_ it spins without changing _that_ it spins. The heavier CDP profiler / trace masking that's bitten before is the risk for _timing-sensitive async_ work — so confirm `mainThreadResponsive=false` before reaching for it. + +### Reading it + +- **Frame / top sample** → the function to fix. Follow the chain: a card computed inside a Glimmer `revalidate → rerender → evaluate` chain is a render-invalidation storm (a computed that dirties tracked state during render → re-render → re-evaluate → …), not a data or serialize cost — and not something the field-getter ceiling or a serialize cycle-guard will catch. +- **`jsHeapUsedMB`** (on the `pausedStack` line) → flat across the peg is a tight compute / recursion loop; climbing is a combinatorial re-build of shared subtrees (breadth). + +### Getting the logs + +```sh +aws --profile claude- logs filter-log-events \ + --log-group-name ecs-boxel-prerender-server- \ + --filter-pattern '"" "timed out after"' --start-time \ + --query 'events[*].message' --output text +``` + +Lines are FireLens-wrapped (`{"log":"…"}`); pull `.log`. Strip everything after ` DOM:` — the `pausedStack:` / `v8ProfTopFrames:` / discriminators are all before it. + +### What this mode can't tell you + +- If `pausedStack` is starved AND `--prof` shows the time in opaque native frames with no JS attribution, the peg is inside a native builtin — the frame names it, but the _why_ needs reading that builtin's inputs (e.g. a pathological regex / string). +- It's a CPU-peg mode. A render that's idle-waiting (`mainThreadResponsive=true`, `cpuTopFrames (idle)`, a long-pending fetch) is a **data stall** — Mode A's `pendingFetches` / Mode G, not this. ## Field-by-field reading From 90c3db5945a35852fa7c2be360c1271956460229 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 21:05:04 -0400 Subject: [PATCH 04/16] Match V8's per-isolate --prof log filename The renderer's V8 `--prof` log was being written (PRERENDER_V8_PROF=true, browser launched with the flag) but the timeout-path summarizer reported ``: V8's per-isolate logging prepends `isolate--` to the --logfile name, and the reader matched with `startsWith('prerender-v8-prof-')`, which never matches the prefixed file. Match the prefix with `includes` (in both the launch cleanup and the reader), pick the LARGEST this-run log (the pegged isolate accumulates the most samples over the peg), and on a miss list the `.log` files actually present so a still-wrong pattern is self-diagnosing. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/realm-server/prerender/v8-prof.ts | 29 +++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index aefc9e90a9..b0be0ba9bf 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -58,7 +58,9 @@ export async function prepareV8ProfForLaunch(): Promise { let entries = await fs.readdir(V8_PROF_LOG_DIR); await Promise.all( entries - .filter((e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log')) + // `includes`, not `startsWith`: V8's per-isolate logging prepends + // `isolate--` to the --logfile name. + .filter((e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log')) .map((e) => fs.rm(path.join(V8_PROF_LOG_DIR, e), { force: true }).catch(() => {}), ), @@ -80,11 +82,16 @@ export async function processV8ProfTopFrames( } try { let entries = await fs.readdir(V8_PROF_LOG_DIR); + // `includes`, not `startsWith`: V8's per-isolate logging prepends + // `isolate--` to the --logfile name, so the file is e.g. + // `isolate-0x…-prerender-v8-prof-.log`. let logs = entries.filter( - (e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log'), + (e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'), ); if (logs.length === 0) { - return ''; + // Name what IS present, so a still-missed pattern is self-diagnosing. + let present = entries.filter((e) => e.endsWith('.log')).slice(0, 12); + return ``; } let withStats = await Promise.all( logs.map(async (name) => { @@ -97,17 +104,21 @@ export async function processV8ProfTopFrames( } }), ); - // Only this browser run's logs (cleared + stamped at launch), and - // among those the most-recently-written one — the renderer that was - // still spinning up to the timeout, not an earlier completed render's - // larger-but-stale log. + // Only this browser run's logs (stale prior-run logs were cleared + + // the launch time stamped). Among those pick the LARGEST: the pegged + // isolate accumulates by far the most samples over the 60s peg, so its + // log dwarfs sibling isolates' and earlier renders' — and the peg's + // samples dominate its top frames regardless of what else it rendered. let fromThisRun = withStats.filter( (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt, ); if (fromThisRun.length === 0) { - return ''; + let seen = withStats + .map((s) => `${path.basename(s.full)}(${s.size}b)`) + .slice(0, 8); + return ``; } - fromThisRun.sort((a, b) => b.mtimeMs - a.mtimeMs); + fromThisRun.sort((a, b) => b.size - a.size); let logPath = fromThisRun[0].full; let { stdout } = await execFileAsync( process.execPath, From 65db985871c9068e51301609cff148b171ddc1df Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 21:10:13 -0400 Subject: [PATCH 05/16] Sweep stale --prof logs on every browser launch (not just when armed) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The renderer's --prof logs live only in the container's ephemeral /tmp (no EFS mount on the prerenderer), so they're reclaimed when the ECS task replaces — but a long-lived task that had PRERENDER_V8_PROF on, then off, would keep the "on" period's logs around because the launch-time cleanup was gated on the flag being enabled. Run the stale-log sweep on every browser launch regardless of the flag, and only stamp the launch time / add the --prof launch flag when enabled. So flipping the flag off + restarting now actually clears the logs. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/realm-server/prerender/browser-manager.ts | 6 +++++- packages/realm-server/prerender/v8-prof.ts | 10 +++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts index cda7cca4a8..a96c271833 100644 --- a/packages/realm-server/prerender/browser-manager.ts +++ b/packages/realm-server/prerender/browser-manager.ts @@ -76,8 +76,12 @@ export class BrowserManager { // for a run that needs the native-peg fallback. Per-pid logfile so // concurrent renderer processes don't clobber one another; the timeout // path post-processes the log into the prerender-server logs. + // Always sweep stale --prof logs from the container's ephemeral /tmp + // (no EFS here — the only filesystem is the container's), even when + // disabled, so flipping the flag off + restarting cleans up the prior + // "on" period's logs. Arm the sampler itself only when enabled. + await prepareV8ProfForLaunch(); if (v8ProfEnabled()) { - await prepareV8ProfForLaunch(); launchArgs.push(`--js-flags=${v8ProfJsFlags()}`); } diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index b0be0ba9bf..3cba7c38cf 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -50,10 +50,14 @@ let v8ProfLaunchAt = 0; // any pre-existing logs and stamp the launch time so the timeout-path // processor only ever considers logs written by THIS browser run. export async function prepareV8ProfForLaunch(): Promise { - if (!v8ProfEnabled()) { - return; + // Sweep stale logs on EVERY browser launch, even when disabled — so + // flipping PRERENDER_V8_PROF off and restarting clears the logs the + // prior "on" period left behind (the OS temp dir is shared and the + // container can outlive a single browser). Only stamp the launch time + // when armed (the reader uses it to scope to this run's logs). + if (v8ProfEnabled()) { + v8ProfLaunchAt = Date.now(); } - v8ProfLaunchAt = Date.now(); try { let entries = await fs.readdir(V8_PROF_LOG_DIR); await Promise.all( From b5e29dc78832aa50dcf21b739b53456a45916cf9 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 21:22:07 -0400 Subject: [PATCH 06/16] Always disable the Chromium sandbox for the prerender renderer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prerender renderer only ever renders the realm's own trusted cards (never untrusted web content) and always runs in a container where the sandbox must be off for Chrome to launch — and for the renderer to write the V8 `--prof` diagnostic log to /tmp. Drop the CI / PUPPETEER_DISABLE_SANDBOX gate and pass --no-sandbox unconditionally, so it never silently depends on an env var being set. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/realm-server/prerender/browser-manager.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts index a96c271833..cbabf02bd9 100644 --- a/packages/realm-server/prerender/browser-manager.ts +++ b/packages/realm-server/prerender/browser-manager.ts @@ -29,12 +29,13 @@ export class BrowserManager { await this.cleanupUserDataDirs(); let launchArgs: string[] = []; - let disableSandbox = - process.env.CI === 'true' || - process.env.PUPPETEER_DISABLE_SANDBOX === 'true'; - if (disableSandbox) { - launchArgs.push('--no-sandbox', '--disable-setuid-sandbox'); - } + // Always disable the Chromium sandbox for the prerender renderer. It + // only ever renders the realm's own trusted cards (never untrusted web + // content) and always runs in a container where the sandbox must be off + // for Chrome to launch — and for the renderer to write the V8 `--prof` + // diagnostic log. Unconditional rather than gated on CI / + // PUPPETEER_DISABLE_SANDBOX so it never silently depends on the env. + launchArgs.push('--no-sandbox', '--disable-setuid-sandbox'); // When the realm-server speaks HTTPS (local dev with a mkcert leaf // cert), Chromium needs to be told to accept it. mkcert's root CA From 834d848a7b3a3cfb94e15457049235f31533decb Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Mon, 15 Jun 2026 22:23:39 -0400 Subject: [PATCH 07/16] Make the --prof reader report why it has no frames (never silent null) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v8ProfTopFrames came back absent on the bxl wedge with no reason, because the reader's catch swallowed the error and returned null. That left the live-vs-exit question (is the on-disk --prof log usable while the renderer is still running, or only after it closes?) unanswerable. Surface it: report the picked log basename + size in every path; on a prof-process failure return a descriptive string (incl. a "killed: timed out" marker) instead of null; on any reader error return the message rather than null; raise the stdout cap to 128MB and the time-box to 40s. Now the timeout line always shows either the frames + log size, or the exact reason + size — which decides whether the log is written live or is empty/incomplete mid-run. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/realm-server/prerender/v8-prof.ts | 29 +++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index 3cba7c38cf..73b34322bc 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -79,7 +79,7 @@ export async function prepareV8ProfForLaunch(): Promise { // armed. Time-boxed — the log can be large, and the pegged render's // samples dominate it, so the top self-time frames name the wedge. export async function processV8ProfTopFrames( - budgetMs = 20000, + budgetMs = 40000, ): Promise { if (!v8ProfEnabled()) { return null; @@ -124,11 +124,21 @@ export async function processV8ProfTopFrames( } fromThisRun.sort((a, b) => b.size - a.size); let logPath = fromThisRun[0].full; - let { stdout } = await execFileAsync( - process.execPath, - ['--prof-process', logPath], - { timeout: budgetMs, maxBuffer: 32 * 1024 * 1024 }, - ); + let sizeMB = (fromThisRun[0].size / (1024 * 1024)).toFixed(1); + let stdout: string; + try { + ({ stdout } = await execFileAsync( + process.execPath, + ['--prof-process', logPath], + { timeout: budgetMs, maxBuffer: 128 * 1024 * 1024 }, + )); + } catch (e) { + // Report the reason + size rather than swallowing it: a large + // accumulated log (every render since browser launch) can blow the + // time-box or the stdout cap, and we need to SEE that, not get null. + let err = e as { code?: string; killed?: boolean; message?: string }; + return ``; + } let lines = stdout.split('\n'); // The bottom-up "[Summary]" section lists self-time by category; the // top JS/C++ entries that follow name the hot function. @@ -137,9 +147,10 @@ export async function processV8ProfTopFrames( summaryIdx >= 0 ? lines.slice(summaryIdx, summaryIdx + 60) : lines.slice(0, 60); - return slice.join('\n').trim() || ''; + let summary = slice.join('\n').trim(); + return `[${path.basename(logPath)} ${sizeMB}MB]\n${summary || ''}`; } catch (e) { - log.debug('v8 --prof processing failed:', e); - return null; + // Never silently null — surface the reason on the timeout line. + return ``; } } From 0b06230530770226f9dd0da6cfca28e19f01b344 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 10:09:25 -0400 Subject: [PATCH 08/16] Ship the wedged renderer's V8 --prof log to S3 instead of parsing in-container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hard synchronous CPU peg starves CDP (Debugger.enable / Profiler.enable time out), so V8 `--prof` — kernel-SIGPROF sampled, written by a separate thread to a file — is the only capture that survives it. But that file accumulates every render on the isolate since browser launch (tens of MB), and `node --prof-process` on it blows the render-timeout budget — which is why the in-container summarizer kept returning nothing. Stop parsing in-container. Add a `v8log` artifact kind and, on the render timeout (when armed), stream the pegged isolate's raw `--prof` log to the prerender S3 artifacts bucket via the existing sink — keyed by realm/card/ job so the wedging task's log is self-identifying across the fleet (no shelling into 4 servers to find it). Symbolize it offline with `node --prof-process`, where there's no deadline and the peg dominates the top self-time frames. The timeout log line reports the upload + key. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../realm-server/prerender/artifact-sink.ts | 9 +- packages/realm-server/prerender/utils.ts | 17 +- packages/realm-server/prerender/v8-prof.ts | 176 +++++++++--------- 3 files changed, 105 insertions(+), 97 deletions(-) diff --git a/packages/realm-server/prerender/artifact-sink.ts b/packages/realm-server/prerender/artifact-sink.ts index c2133cca50..073638b22c 100644 --- a/packages/realm-server/prerender/artifact-sink.ts +++ b/packages/realm-server/prerender/artifact-sink.ts @@ -52,12 +52,19 @@ const DEFAULT_REGION = 'us-east-1'; // usual tools recognise: `.cpuprofile` (Chrome DevTools / speedscope), // `.trace.json` (Chrome tracing / Perfetto), `.heapprofile` (DevTools // allocation-sampling view). -export type ArtifactKind = 'cpuprofile' | 'trace' | 'heap'; +export type ArtifactKind = 'cpuprofile' | 'trace' | 'heap' | 'v8log'; const SUFFIX_BY_KIND: Record = { cpuprofile: 'cpuprofile', trace: 'trace.json', heap: 'heapprofile', + // Raw V8 `--prof` tick log (the renderer's `isolate-…-prerender-v8-prof` + // file), uploaded as-is and symbolized offline with `node --prof-process`. + // This is the one capture that survives a hard synchronous CPU peg: the + // kernel SIGPROF sampler writes it from a separate thread, so it lands even + // when the main thread is too pegged to service CDP — but it's too large to + // `--prof-process` inside the render-timeout budget, so we ship the bytes. + v8log: 'v8log', }; // The render-identifying fields that key an artifact. All but `kind` are diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts index c5454c9f7b..77b5ff543a 100644 --- a/packages/realm-server/prerender/utils.ts +++ b/packages/realm-server/prerender/utils.ts @@ -22,7 +22,7 @@ import { formatPausedStack, type PausedStackCapture, } from './pause-capture.ts'; -import { v8ProfEnabled, processV8ProfTopFrames } from './v8-prof.ts'; +import { v8ProfEnabled, uploadV8ProfLog } from './v8-prof.ts'; import { ensureHeapSampling, captureHeapSamplingProfile, @@ -1705,13 +1705,14 @@ export async function withTimeout( if (!page.isClosed()) { pausedStack = await capturePausedCallStack(page, { budgetMs: 8000 }); } - // Native-peg fallback (armed only when `PRERENDER_V8_PROF=true`): - // summarize the renderer's kernel-sampled V8 `--prof` log. Worth - // reading mainly when `pausedStack` came back `pause-timeout` — i.e. - // the peg is a non-yielding native call the debugger couldn't enter. - let v8ProfTopFrames: string | null = null; + // Hard-peg capture (armed only when `PRERENDER_V8_PROF=true`): ship the + // pegged isolate's raw kernel-sampled V8 `--prof` log to the artifact + // bucket, keyed by this render's realm/card/job, to be symbolized offline. + // This is the capture that survives a peg so hard it starves CDP — i.e. + // exactly when `pausedStack` comes back `debugger-enable-timeout`. + let v8ProfStatus: string | null = null; if (v8ProfEnabled()) { - v8ProfTopFrames = await processV8ProfTopFrames(); + v8ProfStatus = await uploadV8ProfLog(artifactKeyPartsFor(profileContext)); } // Out-of-band CDP view of fetches still outstanding — survives a // wedged JS thread, so it's available even when the in-page hooks @@ -1824,7 +1825,7 @@ export async function withTimeout( (formatPausedStack(pausedStack) ? ` pausedStack: ${formatPausedStack(pausedStack)}` : '') + - (v8ProfTopFrames ? `\nv8ProfTopFrames:\n${v8ProfTopFrames}` : '') + + (v8ProfStatus ? ` v8ProfLog: ${v8ProfStatus}` : '') + ` DOM:\n${dom?.trim()}`, ); // Diagnostics ride on the outer `RenderError.diagnostics` as a diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index 73b34322bc..9e49c263fc 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -1,31 +1,29 @@ // Diagnostic V8 `--prof` (kernel-signal CPU sampling) for the prerender -// renderer. Off by default (`PRERENDER_V8_PROF=true` to arm). +// renderer — the one capture that survives a hard synchronous CPU peg. // -// Why this exists alongside `Debugger.pause` (pause-capture.ts): +// When a render pegs the main thread hard, CDP can't read it: `Debugger.enable` +// / `Profiler.enable` time out because the pegged thread can't service the +// protocol (see pause-capture.ts / trace-capture.ts). V8's `--prof` sampler is +// driven by the kernel SIGPROF timer and written by a separate thread, so it +// records the spinning frame regardless — straight to a file on disk. // -// `Debugger.pause` reads a synchronous JS loop (V8 honors the pause at a -// back-edge) with zero continuous overhead. But if the wedge is one long -// NON-YIELDING native call there is no back-edge, and pause times out. -// `--prof` covers that gap: the kernel SIGPROF timer preempts the thread -// mid-instruction regardless of what it's running (JS or native), so the -// busiest frame wins the samples — exactly the case `Debugger.pause` -// can't reach. It also never needs the pegged thread to service a CDP -// `stop` (the failure mode of the CDP `Profiler`; see trace-capture.ts). +// The catch: that file accumulates every render on the isolate since browser +// launch (tens of MB), so `node --prof-process` on it blows the render-timeout +// budget. So we don't parse it in-container. On the render timeout we ship the +// RAW log to the prerender S3 artifacts bucket, keyed by realm/card/job, and +// symbolize it offline (`node --prof-process`) where there's no deadline and +// the peg dominates the top self-time frames. // -// The trade-off, and why it's gated off: it samples EVERY render, so its -// own interrupts can perturb a timing-sensitive wedge enough to dissolve -// it. Run it only when `Debugger.pause` reports `pause-timeout` (a -// suspected native peg). +// Off by default (`PRERENDER_V8_PROF=true` to arm) — it samples every render, +// so it can perturb a timing-sensitive wedge; arm it deliberately. import { logger } from '@cardstack/runtime-common'; -import { promises as fs } from 'fs'; +import { promises as fs, createReadStream } from 'fs'; import { tmpdir } from 'os'; import path from 'path'; -import { execFile } from 'child_process'; -import { promisify } from 'util'; +import { uploadArtifact, type ArtifactKeyParts } from './artifact-sink.ts'; const log = logger('prerenderer'); -const execFileAsync = promisify(execFile); export const V8_PROF_LOG_DIR = tmpdir(); export const V8_PROF_LOG_PREFIX = 'prerender-v8-prof-'; @@ -74,83 +72,85 @@ export async function prepareV8ProfForLaunch(): Promise { } } -// Best-effort summary of the hottest frames from the renderer's `--prof` -// log via `node --prof-process`. Called on the timeout path only when -// armed. Time-boxed — the log can be large, and the pegged render's -// samples dominate it, so the top self-time frames name the wedge. -export async function processV8ProfTopFrames( - budgetMs = 40000, +// Find the pegged isolate's `--prof` log among this browser run's logs. +// Returns the chosen file (or a self-diagnosing reason string). +async function pickV8ProfLog(): Promise< + { full: string; sizeMB: string } | { reason: string } +> { + let entries = await fs.readdir(V8_PROF_LOG_DIR); + // `includes`, not `startsWith`: V8's per-isolate logging prepends + // `isolate--` to the --logfile name, so the file is e.g. + // `isolate-0x…-prerender-v8-prof-.log`. + let logs = entries.filter( + (e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'), + ); + if (logs.length === 0) { + let present = entries.filter((e) => e.endsWith('.log')).slice(0, 12); + return { + reason: ``, + }; + } + let withStats = await Promise.all( + logs.map(async (name) => { + let full = path.join(V8_PROF_LOG_DIR, name); + try { + let st = await fs.stat(full); + return { full, mtimeMs: st.mtimeMs, size: st.size }; + } catch { + return { full, mtimeMs: 0, size: 0 }; + } + }), + ); + // This browser run's logs only (stale ones were cleared + the launch time + // stamped). Pick the LARGEST: the pegged isolate accumulates by far the + // most samples over the peg, so its log dwarfs sibling isolates' and + // earlier renders' — and the peg dominates its frames regardless of what + // else it rendered. + let fromThisRun = withStats.filter( + (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt, + ); + if (fromThisRun.length === 0) { + let seen = withStats + .map((s) => `${path.basename(s.full)}(${s.size}b)`) + .slice(0, 8); + return { + reason: ``, + }; + } + fromThisRun.sort((a, b) => b.size - a.size); + return { + full: fromThisRun[0].full, + sizeMB: (fromThisRun[0].size / (1024 * 1024)).toFixed(1), + }; +} + +// On the render timeout (when armed), ship the pegged isolate's RAW `--prof` +// log to the artifact sink, keyed by realm/card/job so the wedging task's log +// is self-identifying across the fleet. We don't `--prof-process` it here: the +// accumulated log is too large to symbolize inside the timeout budget, and the +// sink streams it (managed multipart) so memory stays bounded. Symbolize the +// artifact offline with `node --prof-process`. Returns a short status for the +// timeout log line. +export async function uploadV8ProfLog( + keyParts: Omit, ): Promise { if (!v8ProfEnabled()) { return null; } try { - let entries = await fs.readdir(V8_PROF_LOG_DIR); - // `includes`, not `startsWith`: V8's per-isolate logging prepends - // `isolate--` to the --logfile name, so the file is e.g. - // `isolate-0x…-prerender-v8-prof-.log`. - let logs = entries.filter( - (e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'), - ); - if (logs.length === 0) { - // Name what IS present, so a still-missed pattern is self-diagnosing. - let present = entries.filter((e) => e.endsWith('.log')).slice(0, 12); - return ``; - } - let withStats = await Promise.all( - logs.map(async (name) => { - let full = path.join(V8_PROF_LOG_DIR, name); - try { - let st = await fs.stat(full); - return { full, mtimeMs: st.mtimeMs, size: st.size }; - } catch { - return { full, mtimeMs: 0, size: 0 }; - } - }), - ); - // Only this browser run's logs (stale prior-run logs were cleared + - // the launch time stamped). Among those pick the LARGEST: the pegged - // isolate accumulates by far the most samples over the 60s peg, so its - // log dwarfs sibling isolates' and earlier renders' — and the peg's - // samples dominate its top frames regardless of what else it rendered. - let fromThisRun = withStats.filter( - (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt, - ); - if (fromThisRun.length === 0) { - let seen = withStats - .map((s) => `${path.basename(s.full)}(${s.size}b)`) - .slice(0, 8); - return ``; - } - fromThisRun.sort((a, b) => b.size - a.size); - let logPath = fromThisRun[0].full; - let sizeMB = (fromThisRun[0].size / (1024 * 1024)).toFixed(1); - let stdout: string; - try { - ({ stdout } = await execFileAsync( - process.execPath, - ['--prof-process', logPath], - { timeout: budgetMs, maxBuffer: 128 * 1024 * 1024 }, - )); - } catch (e) { - // Report the reason + size rather than swallowing it: a large - // accumulated log (every render since browser launch) can blow the - // time-box or the stdout cap, and we need to SEE that, not get null. - let err = e as { code?: string; killed?: boolean; message?: string }; - return ``; + let picked = await pickV8ProfLog(); + if ('reason' in picked) { + return picked.reason; } - let lines = stdout.split('\n'); - // The bottom-up "[Summary]" section lists self-time by category; the - // top JS/C++ entries that follow name the hot function. - let summaryIdx = lines.findIndex((l) => /\[Summary\]/.test(l)); - let slice = - summaryIdx >= 0 - ? lines.slice(summaryIdx, summaryIdx + 60) - : lines.slice(0, 60); - let summary = slice.join('\n').trim(); - return `[${path.basename(logPath)} ${sizeMB}MB]\n${summary || ''}`; + await uploadArtifact({ + ...keyParts, + kind: 'v8log', + body: createReadStream(picked.full), + contentType: 'text/plain', + }); + return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket — symbolize offline with \`node --prof-process\``; } catch (e) { // Never silently null — surface the reason on the timeout line. - return ``; + return ``; } } From 6df42db0f627740c4be0e407fcc2c5299001ffd2 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 10:12:00 -0400 Subject: [PATCH 09/16] indexing-diagnostics: document the v8log S3-artifact + offline symbolization Mode I and the Mode H knob table now describe the real flow: on a render timeout the raw V8 --prof log is streamed to the prerender artifacts bucket as a `v8log` (the timeout line reports the upload + key), and is symbolized OFFLINE with `node --prof-process` (the log self-contains its code-creation map, so JS frames resolve without binaries/sourcemaps). Replaces the old "v8ProfTopFrames on the log line" in-container-parse model that blew the timeout budget on a large log. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/skills/indexing-diagnostics/SKILL.md | 39 +++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md index 4dcdbb3758..1379aa6417 100644 --- a/.claude/skills/indexing-diagnostics/SKILL.md +++ b/.claude/skills/indexing-diagnostics/SKILL.md @@ -881,14 +881,14 @@ Rule of thumb: a render that **completes but is heavy** → `.cpuprofile` (+ hea All live at `//boxel/` (Systems Manager → Parameter Store). The bucket itself (`PRERENDER_ARTIFACTS_BUCKET`) and the key prefix (`PRERENDER_ARTIFACTS_ENV`) are wired by Terraform — don't set them by hand. -| Parameter | Values | Effect | -| ------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `PRERENDER_PROFILE_AFFINITY` | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert. | -| `PRERENDER_PROFILE_CPUPROFILE` | `true` / `false` | Persist the full `.cpuprofile` for targeted renders. | -| `PRERENDER_PROFILE_TRACE` | `true` / `false` | Capture the streaming trace for targeted renders. | -| `PRERENDER_PROFILE_HEAP` | `true` / `false` | Capture the heap allocation-sampling profile for targeted renders. | -| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated). | -| `PRERENDER_V8_PROF` | `true` / `false` | Arm V8's `--prof` kernel-`SIGPROF` CPU sampler at Chrome launch — **renderer-wide, NOT affinity-gated**. The fallback for a hard CPU peg that starves the CDP captures above (kernel preemption needs no thread cooperation). Surfaces as `v8ProfTopFrames` on the render-timeout line; see **Mode I**. Needs a prerender-server task **restart** (not just redeploy) to take effect. | +| Parameter | Values | Effect | +| ------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `PRERENDER_PROFILE_AFFINITY` | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert. | +| `PRERENDER_PROFILE_CPUPROFILE` | `true` / `false` | Persist the full `.cpuprofile` for targeted renders. | +| `PRERENDER_PROFILE_TRACE` | `true` / `false` | Capture the streaming trace for targeted renders. | +| `PRERENDER_PROFILE_HEAP` | `true` / `false` | Capture the heap allocation-sampling profile for targeted renders. | +| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated). | +| `PRERENDER_V8_PROF` | `true` / `false` | Arm V8's `--prof` kernel-`SIGPROF` CPU sampler at Chrome launch — **renderer-wide, NOT affinity-gated**. The capture that survives a hard CPU peg that starves the CDP captures above (kernel preemption needs no thread cooperation). On a render timeout the raw log is streamed to the artifacts bucket as a `v8log` (symbolized offline with `node --prof-process`); the timeout line reports the upload. See **Mode I**. Needs a prerender-server task **restart** (not just redeploy) to take effect. | The mode flags are Terraform-seeded sentinels (default `false` / `0`); `PRERENDER_PROFILE_AFFINITY` is operator-managed and must already exist. The affinity key is `realm:` + the realm's canonical URL **with trailing slash** — the same value Mode A/B logs print as `affinity=…`. @@ -948,17 +948,30 @@ A single CDP `Debugger.pause` on the timeout path. V8 honors the pause at the ne - **``** → the pause was requested but no back-edge honored it in budget. Most likely a long **non-yielding native call** (a catastrophic regex, a native sort) — no JS back-edge to interrupt. Pivot to `--prof`. - **``** → the thread is so pegged it couldn't even service `Debugger.enable`. The hardest peg; CDP can't read it at all. Pivot to `--prof`. -### `v8ProfTopFrames` — V8 `--prof` kernel sampler (gated; reads even a hard peg) +### V8 `--prof` log → S3 artifact (gated; the only capture that survives a hard peg) -When `pausedStack` is starved, the kernel-signal sampler is the fallback. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and the pre-installed handler walks the stack. It samples even a non-yielding native peg, and it never needs the thread to serialize a profile at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated; the top self-time frames surface as `v8ProfTopFrames:` (processed in-process via `node --prof-process`). +When `pausedStack` is starved, the kernel-signal sampler is the capture that still works. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and a **separate thread** writes the samples to a file. So it records the spinning frame even when CDP is dead, and never needs the thread to serialize at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated. + +It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). The timeout log line just reports the upload: `v8ProfLog: uploaded … (NMB) to artifact bucket …`, or a self-diagnosing reason (``) if none was found. Arm it: 1. Set SSM `//boxel/PRERENDER_V8_PROF` = `true`. -2. **Restart** the prerender-server task — the renderer must relaunch to pick up the `--js-flags=--prof` launch flag (a value flip alone, or a redeploy that reuses the running browser, won't arm it). `aws ecs update-service … --force-new-deployment` restarts the tasks. -3. Re-run the index; read `v8ProfTopFrames:` on the wedged card's timeout line. +2. **Restart** the prerender-server tasks — the renderer must relaunch to pick up the `--js-flags=--prof` launch flag (a value flip alone, or a redeploy reusing the running browser, won't arm it): `aws ecs update-service --cluster --service boxel-prerender-server- --force-new-deployment`. +3. Re-run the index. On a wedge, grep the prerender-server log for `v8ProfLog: uploaded` — and pull the artifact (next section). 4. Set it back to `false` + restart when done — it samples **every** render while on. +**Symbolize offline** (the `boxel-claude-readonly` role has `s3:GetObject` on `boxel-prerender-artifacts-*`): + +```sh +aws s3 ls --recursive s3://boxel-prerender-artifacts-/// # find the .v8log key +aws s3 cp s3://boxel-prerender-artifacts-/ /tmp/wedge.v8log +node --prof-process /tmp/wedge.v8log \ + | sed -n '/\[Summary\]/,+14p; /\[JavaScript\]/,+30p; /\[Bottom up (heavy) profile\]/,+50p' +``` + +The peg dominates the cumulative log (a 60s spin at 1 kHz is ~60k ticks; every other render on the tab is sub-second), so the top self-time frame — and the heaviest `[Bottom up]` path — is the wedge. + **Masking note.** `--prof` samples continuously, but for a _synchronous_ peg that doesn't matter: a sync loop's control flow doesn't depend on timing, so sampling records _where_ it spins without changing _that_ it spins. The heavier CDP profiler / trace masking that's bitten before is the risk for _timing-sensitive async_ work — so confirm `mainThreadResponsive=false` before reaching for it. ### Reading it @@ -975,7 +988,7 @@ aws --profile claude- logs filter-log-events \ --query 'events[*].message' --output text ``` -Lines are FireLens-wrapped (`{"log":"…"}`); pull `.log`. Strip everything after ` DOM:` — the `pausedStack:` / `v8ProfTopFrames:` / discriminators are all before it. +Lines are FireLens-wrapped (`{"log":"…"}`); pull `.log`. Strip everything after ` DOM:` — the `pausedStack:` / `v8ProfLog:` / discriminators are all before it. The `v8ProfLog:` line carries the S3 key of the uploaded `--prof` log; fetch + symbolize it per the previous section. ### What this mode can't tell you From addb9499f59a22154d9a22d3b2ecd474f9cd96b4 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 10:19:41 -0400 Subject: [PATCH 10/16] Add a one-command helper to fetch + symbolize a prerender wedge's v8log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit symbolize-prerender-wedge.sh resolves the newest `.v8log` artifact for a realm in the prerender artifacts bucket (or an exact --key), downloads it, and runs `node --prof-process` to print the [Summary]/[JavaScript]/[Bottom up] sections — the push-button version of the three manual aws+node steps. The log self-contains its code-creation map, so JS frames resolve offline without binaries or source maps; native/Chrome frames stay opaque. Mode I now points at the helper as the primary path and keeps the raw commands as the under-the-hood reference. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/skills/indexing-diagnostics/SKILL.md | 13 ++- .../scripts/symbolize-prerender-wedge.sh | 79 +++++++++++++++++++ 2 files changed, 89 insertions(+), 3 deletions(-) create mode 100755 packages/realm-server/scripts/symbolize-prerender-wedge.sh diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md index 1379aa6417..177ff44742 100644 --- a/.claude/skills/indexing-diagnostics/SKILL.md +++ b/.claude/skills/indexing-diagnostics/SKILL.md @@ -961,16 +961,23 @@ Arm it: 3. Re-run the index. On a wedge, grep the prerender-server log for `v8ProfLog: uploaded` — and pull the artifact (next section). 4. Set it back to `false` + restart when done — it samples **every** render while on. -**Symbolize offline** (the `boxel-claude-readonly` role has `s3:GetObject` on `boxel-prerender-artifacts-*`): +**Symbolize offline.** With an aws-access session for the env (`boxel-claude-readonly` has `s3:GetObject`/`ListBucket` on `boxel-prerender-artifacts-*`), the helper fetches the newest matching `.v8log` and runs `node --prof-process` for you: ```sh -aws s3 ls --recursive s3://boxel-prerender-artifacts-/// # find the .v8log key +packages/realm-server/scripts/symbolize-prerender-wedge.sh --env staging --realm bxl-dependency-order-test +# --key for a specific artifact; --list to just enumerate candidates; --top N for deeper sections +``` + +It self-documents why this is offline rather than in-container. Under the hood it's just: + +```sh +aws s3api list-objects-v2 --bucket boxel-prerender-artifacts- --prefix / ... # newest .v8log for the realm aws s3 cp s3://boxel-prerender-artifacts-/ /tmp/wedge.v8log node --prof-process /tmp/wedge.v8log \ | sed -n '/\[Summary\]/,+14p; /\[JavaScript\]/,+30p; /\[Bottom up (heavy) profile\]/,+50p' ``` -The peg dominates the cumulative log (a 60s spin at 1 kHz is ~60k ticks; every other render on the tab is sub-second), so the top self-time frame — and the heaviest `[Bottom up]` path — is the wedge. +The log self-contains its `code-creation` records, so `--prof-process` names the JS frames from the file alone — no binaries or source maps. Native/Chrome frames stay opaque (that'd need Chrome debug symbols); the JS layer is what names the wedge. The peg dominates the cumulative log (a 60s spin at 1 kHz is ~60k ticks; every other render on the tab is sub-second), so the top self-time frame — and the heaviest `[Bottom up]` path — is the wedge. **Masking note.** `--prof` samples continuously, but for a _synchronous_ peg that doesn't matter: a sync loop's control flow doesn't depend on timing, so sampling records _where_ it spins without changing _that_ it spins. The heavier CDP profiler / trace masking that's bitten before is the risk for _timing-sensitive async_ work — so confirm `mainThreadResponsive=false` before reaching for it. diff --git a/packages/realm-server/scripts/symbolize-prerender-wedge.sh b/packages/realm-server/scripts/symbolize-prerender-wedge.sh new file mode 100755 index 0000000000..3c9977cc67 --- /dev/null +++ b/packages/realm-server/scripts/symbolize-prerender-wedge.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +# +# Symbolize a prerender wedge from its V8 `--prof` artifact. +# +# When a render times out with PRERENDER_V8_PROF=true, the prerender server +# ships the pegged isolate's raw V8 `--prof` log to the prerender artifacts +# bucket as a `.v8log` object, keyed env/realm/jobId/card/step/.v8log (see +# packages/realm-server/prerender/v8-prof.ts and the indexing-diagnostics +# skill, Mode I). The raw log is too large to `--prof-process` in-container +# under the render-timeout budget; this script fetches it and symbolizes it +# offline, where there's no deadline. +# +# The log self-contains its `code-creation` records, so `node --prof-process` +# resolves the JS frames from the file alone — no binaries or source maps. +# Native/Chrome frames stay opaque (that needs Chrome debug symbols); the JS +# layer is what names the wedge. The 60s peg dominates the cumulative log, so +# the top [JavaScript] self-time frame and heaviest [Bottom up] path are it. +# +# Requires an aws-access session for the env (`mise run claude-aws `) +# — the boxel-claude-readonly role has s3:GetObject/ListBucket on +# boxel-prerender-artifacts-*. And `node` on PATH. +# +# Usage: +# symbolize-prerender-wedge.sh --realm bxl-dependency-order-test +# symbolize-prerender-wedge.sh --env staging --realm ctse/foo --top 80 +# symbolize-prerender-wedge.sh --key staging/////.v8log +# symbolize-prerender-wedge.sh --realm bxl --list # just list candidates +set -euo pipefail + +ENV=staging +REALM= +PROFILE= +KEY= +TOP=50 +LIST_ONLY= + +while [[ $# -gt 0 ]]; do + case "$1" in + --env) ENV="$2"; shift 2 ;; + --realm) REALM="$2"; shift 2 ;; + --profile) PROFILE="$2"; shift 2 ;; + --key) KEY="$2"; shift 2 ;; + --top) TOP="$2"; shift 2 ;; + --list) LIST_ONLY=1; shift ;; + -h | --help) sed -n '2,30p' "$0"; exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 2 ;; + esac +done + +PROFILE="${PROFILE:-claude-$ENV}" +BUCKET="boxel-prerender-artifacts-$ENV" +AWS=(aws --profile "$PROFILE" --region "${AWS_REGION:-us-east-1}") + +command -v node >/dev/null || { echo "node not found on PATH" >&2; exit 127; } + +# Resolve the artifact key: newest `.v8log` whose key matches the realm +# substring (artifacts auto-expire after 14 days, so the listing stays small). +if [[ -z "$KEY" ]]; then + [[ -n "$REALM" ]] || { echo "need --realm or --key " >&2; exit 2; } + echo "Listing .v8log artifacts in s3://$BUCKET (realm ~ '$REALM') ..." >&2 + mapfile -t MATCHES < <( + "${AWS[@]}" s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ENV/" \ + --query "reverse(sort_by(Contents[?ends_with(Key, '.v8log')], &LastModified))[].Key" \ + --output text 2>/dev/null | tr '\t' '\n' | grep -i -- "$REALM" || true + ) + [[ ${#MATCHES[@]} -gt 0 ]] || { echo "no .v8log artifact for realm '$REALM' in $BUCKET" >&2; exit 1; } + if [[ -n "$LIST_ONLY" ]]; then printf '%s\n' "${MATCHES[@]}"; exit 0; fi + KEY="${MATCHES[0]}" +fi + +echo "Artifact: s3://$BUCKET/$KEY" >&2 +TMP="$(mktemp "${TMPDIR:-/tmp}/wedge-XXXXXX.v8log")" +trap 'rm -f "$TMP"' EXIT +"${AWS[@]}" s3 cp "s3://$BUCKET/$KEY" "$TMP" >&2 +echo "Downloaded $(( $(wc -c < "$TMP") / 1024 / 1024 ))MB; running node --prof-process ..." >&2 +echo >&2 + +node --prof-process "$TMP" 2>/dev/null | sed -n \ + "/\[Summary\]/,+16p; /\[JavaScript\]/,+${TOP}p; /\[Bottom up (heavy) profile\]/,+${TOP}p" From d996c0e3be95efcb5d4d7b1295d5caf13eaeeca0 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 11:24:22 -0400 Subject: [PATCH 11/16] Delete the V8 --prof log from the container after a durable S3 upload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The raw --prof log runs tens of MB and `--logfile` accumulates them in the container's temp dir across the browser's long life. Now that the wedged page is evicted on a render timeout (its isolate tears down and the next visit writes a fresh log), the uploaded log strands nothing — so reclaim the disk by deleting it once the upload is durable. uploadArtifact now returns whether the object actually landed, so the delete is gated on a confirmed upload: a disabled/declined/failed sink keeps the local copy rather than destroying a capture that was never persisted. The timeout log line reports which path ran. Mode I of the indexing-diagnostics skill is updated to match the status wording. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/skills/indexing-diagnostics/SKILL.md | 6 ++++- .../realm-server/prerender/artifact-sink.ts | 21 +++++++++------ packages/realm-server/prerender/utils.ts | 7 ++--- packages/realm-server/prerender/v8-prof.ts | 27 ++++++++++++++++--- 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md index 177ff44742..6f04ec7b4d 100644 --- a/.claude/skills/indexing-diagnostics/SKILL.md +++ b/.claude/skills/indexing-diagnostics/SKILL.md @@ -952,7 +952,11 @@ A single CDP `Debugger.pause` on the timeout path. V8 honors the pause at the ne When `pausedStack` is starved, the kernel-signal sampler is the capture that still works. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and a **separate thread** writes the samples to a file. So it records the spinning frame even when CDP is dead, and never needs the thread to serialize at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated. -It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). The timeout log line just reports the upload: `v8ProfLog: uploaded … (NMB) to artifact bucket …`, or a self-diagnosing reason (``) if none was found. +It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). Once the upload is durable the server **deletes the local log** (the wedged page is evicted, so its isolate writes no more), keeping these tens-of-MB files from piling up on the container disk. The timeout log line reports the outcome: + +- `v8ProfLog: uploaded … (NMB) to artifact bucket (local log removed) …` — success; fetch the artifact (below). +- `v8ProfLog: ` — sink disabled / budget spent / upload failed, so the local copy is **retained** (not destroyed for a capture you can't fetch). +- `v8ProfLog: ` — armed but no log written (e.g. the browser didn't relaunch after the flag flip). Arm it: diff --git a/packages/realm-server/prerender/artifact-sink.ts b/packages/realm-server/prerender/artifact-sink.ts index 073638b22c..d838bc218c 100644 --- a/packages/realm-server/prerender/artifact-sink.ts +++ b/packages/realm-server/prerender/artifact-sink.ts @@ -229,15 +229,18 @@ let sessionBytesUsed = 0; let uploadSeq = 0; let budgetExhaustedLogged = false; -// Uploads one artifact. Resolves once the object is durable in S3 (so a -// per-render `await` genuinely persists before the next render), or sooner -// if the sink is disabled / the budget is spent / the upload fails — none -// of which ever throw or reject. Declines (does not truncate) once the -// session budget is reached, so it never produces an invalid blob. -export async function uploadArtifact(upload: ArtifactUpload): Promise { +// Uploads one artifact. Resolves `true` once the object is durable in S3 (so +// a per-render `await` genuinely persists before the next render), and `false` +// if it didn't land — the sink is disabled, the session budget is spent, or +// the upload failed. Never throws or rejects, and declines (does not truncate) +// once the budget is reached, so it never produces an invalid blob. The +// boolean lets a caller that holds the only local copy (the V8 `--prof` log) +// gate its post-upload cleanup on a genuine success rather than destroying a +// copy that was never persisted. +export async function uploadArtifact(upload: ArtifactUpload): Promise { let bucket = artifactBucket(); if (!bucket) { - return; + return false; } if (sessionBytesUsed >= getMaxSessionBytes()) { if (!budgetExhaustedLogged) { @@ -247,7 +250,7 @@ export async function uploadArtifact(upload: ArtifactUpload): Promise { `declining further artifact uploads for this process`, ); } - return; + return false; } let key = buildArtifactKey(upload, new Date(), uploadSeq++); @@ -273,10 +276,12 @@ export async function uploadArtifact(upload: ArtifactUpload): Promise { `artifact-sink uploaded ${upload.kind} key=${key} bytes=${loaded} ` + `sessionBytes=${sessionBytesUsed}/${getMaxSessionBytes()}`, ); + return true; } catch (e) { // Best-effort: a failed flush is a missing diagnostic, not a render // failure. Count nothing against the budget. log.warn(`artifact-sink failed to upload ${upload.kind} key=${key}:`, e); + return false; } } diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts index 77b5ff543a..84a4a93f92 100644 --- a/packages/realm-server/prerender/utils.ts +++ b/packages/realm-server/prerender/utils.ts @@ -1544,12 +1544,13 @@ async function runWithAffinityProfile( maxRunMs: timeoutMs, run: observe, onRawProfile: wantCpuProfile - ? (rawProfile) => - uploadArtifact({ + ? async (rawProfile) => { + await uploadArtifact({ ...keyParts, kind: 'cpuprofile', body: Buffer.from(JSON.stringify(rawProfile), 'utf8'), - }) + }); + } : undefined, }) .then(async (summary) => { diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index 9e49c263fc..c3b8565e0c 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -12,7 +12,9 @@ // budget. So we don't parse it in-container. On the render timeout we ship the // RAW log to the prerender S3 artifacts bucket, keyed by realm/card/job, and // symbolize it offline (`node --prof-process`) where there's no deadline and -// the peg dominates the top self-time frames. +// the peg dominates the top self-time frames. Once the upload is durable we +// delete the local log so these tens-of-MB files don't pile up on the +// container's disk over the browser's long life. // // Off by default (`PRERENDER_V8_PROF=true` to arm) — it samples every render, // so it can perturb a timing-sensitive wedge; arm it deliberately. @@ -142,13 +144,32 @@ export async function uploadV8ProfLog( if ('reason' in picked) { return picked.reason; } - await uploadArtifact({ + let uploaded = await uploadArtifact({ ...keyParts, kind: 'v8log', body: createReadStream(picked.full), contentType: 'text/plain', }); - return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket — symbolize offline with \`node --prof-process\``; + if (!uploaded) { + // The bytes did NOT land in S3 (sink disabled / budget spent / upload + // failed). Keep the local log — it's the only copy — rather than + // destroying it for a capture we can't retrieve. + return ``; + } + // Durable in S3, so reclaim the container's local disk: these logs run + // tens of MB and `--logfile` accumulates them in the OS temp dir across + // the browser's long life. A render timeout evicts the wedged page (its + // isolate tears down; the next visit writes a FRESH log), so deleting + // this one strands nothing. `await uploadArtifact` has fully drained the + // read stream by now, so the unlink is safe; it frees the dirent + // immediately and the bytes once the evicted renderer process exits. + let cleanup = ' (local log removed)'; + try { + await fs.rm(picked.full, { force: true }); + } catch (e) { + cleanup = ` (local delete failed: ${String((e as { message?: string }).message ?? e).slice(0, 80)})`; + } + return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket${cleanup} — symbolize offline with \`node --prof-process\``; } catch (e) { // Never silently null — surface the reason on the timeout line. return ``; From c6d2697538d395dc4159b4bf8bded479db3b5b78 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 11:29:08 -0400 Subject: [PATCH 12/16] Test the v8log artifact kind, the upload-success contract, and --prof selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the artifact-sink unit test for the `v8log` kind suffix and the new boolean return (disabled sink → false, the contract the --prof delete gates on). Add a v8-prof unit test covering flag gating, the launch flags, the stale-log sweep, and how uploadV8ProfLog selects the pegged isolate's log: largest this-run wins, isolate-prefixed names match, logs from before the run are excluded, and a capture that can't be persisted is KEPT on disk rather than deleted. All run standalone (no PG / Chrome / S3). Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/realm-server/tests/index.ts | 1 + .../tests/prerender-artifact-sink-test.ts | 19 +- .../tests/prerender-v8-prof-test.ts | 193 ++++++++++++++++++ 3 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 packages/realm-server/tests/prerender-v8-prof-test.ts diff --git a/packages/realm-server/tests/index.ts b/packages/realm-server/tests/index.ts index 8f1f93aeb2..5156aafb9f 100644 --- a/packages/realm-server/tests/index.ts +++ b/packages/realm-server/tests/index.ts @@ -208,6 +208,7 @@ const ALL_TEST_FILES: string[] = [ './is-json-content-type-test', './prerender-diagnostics-persistence-test', './prerender-proxy-test', + './prerender-v8-prof-test', './queue-test', './finalize-orphan-reservations-test', './finalize-child-fatal-failure-test', diff --git a/packages/realm-server/tests/prerender-artifact-sink-test.ts b/packages/realm-server/tests/prerender-artifact-sink-test.ts index 993ef750ed..587a83b5ee 100644 --- a/packages/realm-server/tests/prerender-artifact-sink-test.ts +++ b/packages/realm-server/tests/prerender-artifact-sink-test.ts @@ -7,6 +7,9 @@ import { shouldCaptureCpuProfile, shouldCaptureTrace, shouldCaptureHeap, + uploadArtifact, + __resetArtifactSinkSessionForTests, + type ArtifactKind, } from '../prerender/artifact-sink.ts'; // The heavyweight artifact sink is gated by env (a configured bucket plus @@ -88,11 +91,12 @@ module('prerender artifact-sink key', function (hooks) { }); test('each artifact kind maps to its conventional file suffix', function (assert) { - let suffix = (kind: 'cpuprofile' | 'trace' | 'heap') => + let suffix = (kind: ArtifactKind) => buildArtifactKey({ kind }, now, 0).split('.').slice(1).join('.'); assert.strictEqual(suffix('cpuprofile'), 'cpuprofile'); assert.strictEqual(suffix('trace'), 'trace.json'); assert.strictEqual(suffix('heap'), 'heapprofile'); + assert.strictEqual(suffix('v8log'), 'v8log'); }); test('the seq disambiguates artifacts that share a millisecond', function (assert) { @@ -159,6 +163,19 @@ module('prerender artifact-sink gates', function (hooks) { 'a bucket with every mode off → nothing to capture', ); }); + + test('uploadArtifact reports false when the sink is disabled, so a caller never destroys its only local copy', async function (assert) { + // The V8 --prof path deletes its local log only on a confirmed upload; + // this is the contract it gates on. No bucket configured → no upload + // attempt, no S3 — a pure, dependency-free assertion of the false return. + __resetArtifactSinkSessionForTests(); + delete process.env.PRERENDER_ARTIFACTS_BUCKET; + let uploaded = await uploadArtifact({ + kind: 'v8log', + body: Buffer.from('not going anywhere'), + }); + assert.false(uploaded, 'disabled sink → false'); + }); }); module('prerender artifact-sink session budget', function (hooks) { diff --git a/packages/realm-server/tests/prerender-v8-prof-test.ts b/packages/realm-server/tests/prerender-v8-prof-test.ts new file mode 100644 index 0000000000..33ff95b6e1 --- /dev/null +++ b/packages/realm-server/tests/prerender-v8-prof-test.ts @@ -0,0 +1,193 @@ +import { module, test } from 'qunit'; +import { promises as fs } from 'fs'; +import path from 'path'; +import { + v8ProfEnabled, + v8ProfJsFlags, + prepareV8ProfForLaunch, + uploadV8ProfLog, + V8_PROF_LOG_DIR, + V8_PROF_LOG_PREFIX, +} from '../prerender/v8-prof.ts'; + +// The V8 `--prof` capture is gated by env and reads/writes the OS temp dir. +// These tests pin the dependency-free behavior — flag gating, the launch +// flags, the stale-log sweep, and how `uploadV8ProfLog` selects the pegged +// isolate's log and what it does when the upload can't land — without S3 or +// Chrome. With no bucket configured the sink returns false, which drives the +// "keep the local log" branch: that lets us assert WHICH file was selected +// (it appears in the status string) and that a non-persisted capture is never +// deleted. The env is read at call time, so each case sets what it needs and +// the hooks restore the process environment and clean up fixtures afterward. + +const ENV_KEYS = ['PRERENDER_V8_PROF', 'PRERENDER_ARTIFACTS_BUCKET']; + +function snapshotEnv(): Record { + let snapshot: Record = {}; + for (let key of ENV_KEYS) { + snapshot[key] = process.env[key]; + } + return snapshot; +} + +function restoreEnv(snapshot: Record): void { + for (let key of ENV_KEYS) { + if (snapshot[key] === undefined) { + delete process.env[key]; + } else { + process.env[key] = snapshot[key]; + } + } +} + +// A `--prof` log fixture. `isolatePrefix` reproduces V8's per-isolate +// `isolate--` prepend so we exercise the `includes` (not `startsWith`) +// filename match. `ageMs` backdates the mtime to simulate a log from before +// this browser run (stale, must be excluded from selection). +async function writeLog(basename: string, sizeBytes: number): Promise { + let full = path.join(V8_PROF_LOG_DIR, basename); + await fs.writeFile(full, Buffer.alloc(sizeBytes, 'v')); + return full; +} + +async function backdate(full: string, ageMs: number): Promise { + let when = new Date(Date.now() - ageMs); + await fs.utimes(full, when, when); +} + +module('prerender v8-prof', function (hooks) { + let saved: Record; + let created: string[]; + + hooks.beforeEach(async function () { + saved = snapshotEnv(); + created = []; + // Arm, point at no bucket (sink disabled), and sweep + stamp the launch + // time so each case starts from a clean slate with a known "this run" + // boundary. + process.env.PRERENDER_V8_PROF = 'true'; + delete process.env.PRERENDER_ARTIFACTS_BUCKET; + await prepareV8ProfForLaunch(); + }); + + hooks.afterEach(async function () { + for (let full of created) { + await fs.rm(full, { force: true }).catch(() => {}); + } + restoreEnv(saved); + }); + + test('v8ProfEnabled mirrors the env flag (exact "true")', function (assert) { + process.env.PRERENDER_V8_PROF = 'true'; + assert.true(v8ProfEnabled()); + process.env.PRERENDER_V8_PROF = 'false'; + assert.false(v8ProfEnabled()); + delete process.env.PRERENDER_V8_PROF; + assert.false(v8ProfEnabled()); + }); + + test('v8ProfJsFlags arms --prof with a per-pid logfile under the prefix', function (assert) { + let flags = v8ProfJsFlags(); + assert.true(flags.includes('--prof'), 'arms the sampler'); + assert.true(flags.includes('--logfile='), 'directs the log to a file'); + assert.true(flags.includes(V8_PROF_LOG_PREFIX), 'uses the known prefix'); + assert.true(flags.includes('%p'), 'per-pid so concurrent renderers split'); + }); + + test('uploadV8ProfLog is inert (null) when not armed', async function (assert) { + process.env.PRERENDER_V8_PROF = 'false'; + let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); + assert.strictEqual(status, null, 'disabled → no work, no status'); + }); + + test('armed with no log present → self-diagnosing reason', async function (assert) { + let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); + assert.true( + typeof status === 'string' && status.includes('no v8 --prof log'), + `reason names the absence, got: ${status}`, + ); + }); + + test('selects the largest this-run log and, on a non-persisted upload, KEEPS it', async function (assert) { + let small = await writeLog( + `isolate-0xaaa-${V8_PROF_LOG_PREFIX}101.log`, + 4096, + ); + let big = await writeLog( + `isolate-0xbbb-${V8_PROF_LOG_PREFIX}102.log`, + 64 * 1024, + ); + created.push(small, big); + + let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); + + assert.true( + typeof status === 'string' && status.includes(path.basename(big)), + `picks the largest (pegged-isolate) log, got: ${status}`, + ); + assert.true( + typeof status === 'string' && status.includes('kept local'), + 'sink disabled → reports the local copy was retained', + ); + // The safety property: a capture that never reached S3 is not destroyed. + assert.true( + await fs + .stat(big) + .then(() => true) + .catch(() => false), + 'the only copy of the log is still on disk', + ); + }); + + test('excludes stale logs written before this browser run', async function (assert) { + // A big but STALE log (mtime before the stamped launch time) must lose to + // a small fresh one — otherwise a previous run's log would be mis-picked. + let staleBig = await writeLog(`${V8_PROF_LOG_PREFIX}900.log`, 128 * 1024); + await backdate(staleBig, 60_000); + let freshSmall = await writeLog(`${V8_PROF_LOG_PREFIX}901.log`, 2048); + created.push(staleBig, freshSmall); + + let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); + + assert.true( + typeof status === 'string' && status.includes(path.basename(freshSmall)), + `picks the fresh log over the larger stale one, got: ${status}`, + ); + }); + + test('only stale logs present → "no log from this run"', async function (assert) { + let stale = await writeLog(`${V8_PROF_LOG_PREFIX}910.log`, 8192); + await backdate(stale, 60_000); + created.push(stale); + + let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); + + assert.true( + typeof status === 'string' && status.includes('from this run'), + `distinguishes stale-only from absent, got: ${status}`, + ); + }); + + test('prepareV8ProfForLaunch sweeps prior --prof logs', async function (assert) { + let orphan = await writeLog( + `isolate-0xccc-${V8_PROF_LOG_PREFIX}777.log`, + 1024, + ); + created.push(orphan); + assert.true( + await fs + .stat(orphan) + .then(() => true) + .catch(() => false), + 'fixture present before the sweep', + ); + await prepareV8ProfForLaunch(); + assert.false( + await fs + .stat(orphan) + .then(() => true) + .catch(() => false), + 'launch sweep removed the prior-run log', + ); + }); +}); From 003f1e5afc8506c8244b019b62455580c865a3e1 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 11:30:25 -0400 Subject: [PATCH 13/16] Satisfy qunit/no-assert-logical-expression in the v8-prof test Coerce the nullable status to a string before asserting on it rather than guarding with '&&' inside the assertion argument. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/realm-server/tests/prerender-v8-prof-test.ts | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/realm-server/tests/prerender-v8-prof-test.ts b/packages/realm-server/tests/prerender-v8-prof-test.ts index 33ff95b6e1..5291825608 100644 --- a/packages/realm-server/tests/prerender-v8-prof-test.ts +++ b/packages/realm-server/tests/prerender-v8-prof-test.ts @@ -103,7 +103,7 @@ module('prerender v8-prof', function (hooks) { test('armed with no log present → self-diagnosing reason', async function (assert) { let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); assert.true( - typeof status === 'string' && status.includes('no v8 --prof log'), + (status ?? '').includes('no v8 --prof log'), `reason names the absence, got: ${status}`, ); }); @@ -120,13 +120,14 @@ module('prerender v8-prof', function (hooks) { created.push(small, big); let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); + let s = status ?? ''; assert.true( - typeof status === 'string' && status.includes(path.basename(big)), + s.includes(path.basename(big)), `picks the largest (pegged-isolate) log, got: ${status}`, ); assert.true( - typeof status === 'string' && status.includes('kept local'), + s.includes('kept local'), 'sink disabled → reports the local copy was retained', ); // The safety property: a capture that never reached S3 is not destroyed. @@ -150,7 +151,7 @@ module('prerender v8-prof', function (hooks) { let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); assert.true( - typeof status === 'string' && status.includes(path.basename(freshSmall)), + (status ?? '').includes(path.basename(freshSmall)), `picks the fresh log over the larger stale one, got: ${status}`, ); }); @@ -163,7 +164,7 @@ module('prerender v8-prof', function (hooks) { let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' }); assert.true( - typeof status === 'string' && status.includes('from this run'), + (status ?? '').includes('from this run'), `distinguishes stale-only from absent, got: ${status}`, ); }); From d4ff6731fc5168237a6d73d3070bd7f3f09cd1d8 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 11:41:35 -0400 Subject: [PATCH 14/16] Address review: gate the sandbox, detach late CDP sessions, small cleanups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - browser-manager: the renderer executes user-authored card code, so keep the Chromium sandbox on where it works and disable it only where it must be off — in the container (PUPPETEER_DISABLE_SANDBOX), in CI, and when the V8 --prof diagnostic is armed (a sandboxed renderer can't write the log). - pause-capture: if createCDPSession outlives its timeout, detach the late-resolving session so it isn't orphaned on the page. - utils: compute formatPausedStack once for the timeout log line. - symbolize-prerender-wedge.sh: match the realm with grep -F so a realm containing regex metacharacters can't match unintended keys. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../realm-server/prerender/browser-manager.ts | 20 ++++++++++++------- .../realm-server/prerender/pause-capture.ts | 6 +++++- packages/realm-server/prerender/utils.ts | 5 ++--- .../scripts/symbolize-prerender-wedge.sh | 2 +- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts index cbabf02bd9..c9e30160ce 100644 --- a/packages/realm-server/prerender/browser-manager.ts +++ b/packages/realm-server/prerender/browser-manager.ts @@ -29,13 +29,19 @@ export class BrowserManager { await this.cleanupUserDataDirs(); let launchArgs: string[] = []; - // Always disable the Chromium sandbox for the prerender renderer. It - // only ever renders the realm's own trusted cards (never untrusted web - // content) and always runs in a container where the sandbox must be off - // for Chrome to launch — and for the renderer to write the V8 `--prof` - // diagnostic log. Unconditional rather than gated on CI / - // PUPPETEER_DISABLE_SANDBOX so it never silently depends on the env. - launchArgs.push('--no-sandbox', '--disable-setuid-sandbox'); + // The Chromium sandbox is a real boundary here — the renderer executes + // user-authored card code — so keep it on wherever it works and disable + // it only where it must be off: in the container (the sandbox can't + // initialize there, so the image sets PUPPETEER_DISABLE_SANDBOX), in CI, + // and when the V8 `--prof` diagnostic is armed (a sandboxed renderer + // can't write the profile log to disk). + let disableSandbox = + process.env.CI === 'true' || + process.env.PUPPETEER_DISABLE_SANDBOX === 'true' || + v8ProfEnabled(); + if (disableSandbox) { + launchArgs.push('--no-sandbox', '--disable-setuid-sandbox'); + } // When the realm-server speaks HTTPS (local dev with a mkcert leaf // cert), Chromium needs to be told to accept it. mkcert's root CA diff --git a/packages/realm-server/prerender/pause-capture.ts b/packages/realm-server/prerender/pause-capture.ts index ba2abfdcdd..15d2fa3f5a 100644 --- a/packages/realm-server/prerender/pause-capture.ts +++ b/packages/realm-server/prerender/pause-capture.ts @@ -96,8 +96,12 @@ export async function capturePausedCallStack( } let client: CDPSession | undefined; try { - let session = await raceNodeTimeout(page.createCDPSession(), budgetMs); + let sessionPromise = page.createCDPSession(); + let session = await raceNodeTimeout(sessionPromise, budgetMs); if (session === TIMED_OUT) { + // We gave up waiting, but the create may still resolve later — detach it + // then so a slow CDP session isn't orphaned on the page until it closes. + void sessionPromise.then((s) => detachQuietly(s)).catch(() => {}); return empty(null, 'cdp-session-timeout'); } client = session; diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts index 84a4a93f92..8dc04f713b 100644 --- a/packages/realm-server/prerender/utils.ts +++ b/packages/realm-server/prerender/utils.ts @@ -1802,6 +1802,7 @@ export async function withTimeout( } let topPending = pendingNetworkRequests?.[0]; + let pausedStackStr = formatPausedStack(pausedStack); log.warn( `render of ${id} timed out after ${timeoutMs}ms` + ` stage=${richDiagnostics?.renderStage ?? ''}` + @@ -1823,9 +1824,7 @@ export async function withTimeout( (pausedStack?.heapUsedMB != null ? ` jsHeapUsedMB=${pausedStack.heapUsedMB.toFixed(0)}` : '') + - (formatPausedStack(pausedStack) - ? ` pausedStack: ${formatPausedStack(pausedStack)}` - : '') + + (pausedStackStr ? ` pausedStack: ${pausedStackStr}` : '') + (v8ProfStatus ? ` v8ProfLog: ${v8ProfStatus}` : '') + ` DOM:\n${dom?.trim()}`, ); diff --git a/packages/realm-server/scripts/symbolize-prerender-wedge.sh b/packages/realm-server/scripts/symbolize-prerender-wedge.sh index 3c9977cc67..ec393ea319 100755 --- a/packages/realm-server/scripts/symbolize-prerender-wedge.sh +++ b/packages/realm-server/scripts/symbolize-prerender-wedge.sh @@ -61,7 +61,7 @@ if [[ -z "$KEY" ]]; then mapfile -t MATCHES < <( "${AWS[@]}" s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ENV/" \ --query "reverse(sort_by(Contents[?ends_with(Key, '.v8log')], &LastModified))[].Key" \ - --output text 2>/dev/null | tr '\t' '\n' | grep -i -- "$REALM" || true + --output text 2>/dev/null | tr '\t' '\n' | grep -iF -- "$REALM" || true ) [[ ${#MATCHES[@]} -gt 0 ]] || { echo "no .v8log artifact for realm '$REALM' in $BUCKET" >&2; exit 1; } if [[ -n "$LIST_ONLY" ]]; then printf '%s\n' "${MATCHES[@]}"; exit 0; fi From fa10cf1d3439091d19101de2a77cdd0e0864537b Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 11:48:51 -0400 Subject: [PATCH 15/16] Make --prof log selection transparent and delete only on an unambiguous pick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting the largest this-run isolate log is a heuristic: a 60s CPU peg samples continuously so its log dwarfs IO-bound renderers', but under several concurrent renderers a long-lived busy isolate could rival it. pickV8ProfLog now reports the candidate count + runner-up size, and the timeout line surfaces them so a mis-pick is visible (re-symbolize another with the helper's --key). The post-upload delete now runs only when there's a single this-run log — then it's unmistakably the wedged, evicted isolate's and the unlink frees real disk. With multiple candidates the largest may be a still-live renderer's log, where unlinking frees nothing (V8 holds the fd) and loses its samples; keep them all for the next browser-launch sweep instead. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/skills/indexing-diagnostics/SKILL.md | 2 +- packages/realm-server/prerender/v8-prof.ts | 62 +++++++++++++------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md index 6f04ec7b4d..bc8662430c 100644 --- a/.claude/skills/indexing-diagnostics/SKILL.md +++ b/.claude/skills/indexing-diagnostics/SKILL.md @@ -954,7 +954,7 @@ When `pausedStack` is starved, the kernel-signal sampler is the capture that sti It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). Once the upload is durable the server **deletes the local log** (the wedged page is evicted, so its isolate writes no more), keeping these tens-of-MB files from piling up on the container disk. The timeout log line reports the outcome: -- `v8ProfLog: uploaded … (NMB) to artifact bucket (local log removed) …` — success; fetch the artifact (below). +- `v8ProfLog: uploaded … (NMB) to artifact bucket (local log removed) …` — success; fetch the artifact (below). If several renderer isolates were active this run the line instead reads `(largest of N this-run logs, runner-up …MB — kept …; re-symbolize another with the helper's --key)` — the largest is a heuristic for the pegged isolate, so if its frames don't look wedged, `--list` the candidates and `--key` a different one. - `v8ProfLog: ` — sink disabled / budget spent / upload failed, so the local copy is **retained** (not destroyed for a capture you can't fetch). - `v8ProfLog: ` — armed but no log written (e.g. the browser didn't relaunch after the flag flip). diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts index c3b8565e0c..aa85358e1a 100644 --- a/packages/realm-server/prerender/v8-prof.ts +++ b/packages/realm-server/prerender/v8-prof.ts @@ -13,8 +13,9 @@ // RAW log to the prerender S3 artifacts bucket, keyed by realm/card/job, and // symbolize it offline (`node --prof-process`) where there's no deadline and // the peg dominates the top self-time frames. Once the upload is durable we -// delete the local log so these tens-of-MB files don't pile up on the -// container's disk over the browser's long life. +// delete the local log (when the pegged isolate is unambiguous) so these +// tens-of-MB files don't pile up on the container's disk over the browser's +// long life. // // Off by default (`PRERENDER_V8_PROF=true` to arm) — it samples every render, // so it can perturb a timing-sensitive wedge; arm it deliberately. @@ -77,7 +78,13 @@ export async function prepareV8ProfForLaunch(): Promise { // Find the pegged isolate's `--prof` log among this browser run's logs. // Returns the chosen file (or a self-diagnosing reason string). async function pickV8ProfLog(): Promise< - { full: string; sizeMB: string } | { reason: string } + | { + full: string; + sizeMB: string; + candidates: number; + runnerUpMB: string | null; + } + | { reason: string } > { let entries = await fs.readdir(V8_PROF_LOG_DIR); // `includes`, not `startsWith`: V8's per-isolate logging prepends @@ -104,10 +111,13 @@ async function pickV8ProfLog(): Promise< }), ); // This browser run's logs only (stale ones were cleared + the launch time - // stamped). Pick the LARGEST: the pegged isolate accumulates by far the - // most samples over the peg, so its log dwarfs sibling isolates' and - // earlier renders' — and the peg dominates its frames regardless of what - // else it rendered. + // stamped). Pick the LARGEST: a CPU peg samples continuously at the kernel + // timer's rate, so over a 60s wedge the pegged isolate's log dwarfs + // IO-bound healthy renderers' and earlier renders'. This is a heuristic by + // cumulative size — under several concurrent renderers a long-lived busy + // isolate's log could rival the peg's — so we return the candidate count + // and runner-up size; the caller surfaces them and the helper's `--key` + // lets you symbolize a different one if the largest wasn't the wedge. let fromThisRun = withStats.filter( (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt, ); @@ -120,9 +130,12 @@ async function pickV8ProfLog(): Promise< }; } fromThisRun.sort((a, b) => b.size - a.size); + let toMB = (n: number) => (n / (1024 * 1024)).toFixed(1); return { full: fromThisRun[0].full, - sizeMB: (fromThisRun[0].size / (1024 * 1024)).toFixed(1), + sizeMB: toMB(fromThisRun[0].size), + candidates: fromThisRun.length, + runnerUpMB: fromThisRun[1] ? toMB(fromThisRun[1].size) : null, }; } @@ -156,20 +169,27 @@ export async function uploadV8ProfLog( // destroying it for a capture we can't retrieve. return ``; } - // Durable in S3, so reclaim the container's local disk: these logs run - // tens of MB and `--logfile` accumulates them in the OS temp dir across - // the browser's long life. A render timeout evicts the wedged page (its - // isolate tears down; the next visit writes a FRESH log), so deleting - // this one strands nothing. `await uploadArtifact` has fully drained the - // read stream by now, so the unlink is safe; it frees the dirent - // immediately and the bytes once the evicted renderer process exits. - let cleanup = ' (local log removed)'; - try { - await fs.rm(picked.full, { force: true }); - } catch (e) { - cleanup = ` (local delete failed: ${String((e as { message?: string }).message ?? e).slice(0, 80)})`; + // Reclaim the container's local disk, but only when the pick is + // unambiguous. A single this-run log is unmistakably the wedged isolate's, + // and a render timeout evicts that page — so its renderer exits and the + // unlink frees the bytes (the next visit writes a fresh log, stranding + // nothing). With multiple this-run logs the largest may belong to a + // still-live concurrent renderer; unlinking that frees no space (V8 holds + // the fd open) and loses its future samples — so keep them all and let the + // next browser-launch sweep reclaim them. `await uploadArtifact` has + // drained the read stream by now, so the unlink is safe. + let disposition: string; + if (picked.candidates === 1) { + try { + await fs.rm(picked.full, { force: true }); + disposition = ' (local log removed)'; + } catch (e) { + disposition = ` (local delete failed: ${String((e as { message?: string }).message ?? e).slice(0, 80)})`; + } + } else { + disposition = ` (largest of ${picked.candidates} this-run logs, runner-up ${picked.runnerUpMB}MB — kept for the next-launch sweep; re-symbolize another with the helper's --key)`; } - return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket${cleanup} — symbolize offline with \`node --prof-process\``; + return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket${disposition} — symbolize offline with \`node --prof-process\``; } catch (e) { // Never silently null — surface the reason on the timeout line. return ``; From 87ad20da0385c78c2dcdaf60b03a9ce67bde8934 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Tue, 16 Jun 2026 12:24:49 -0400 Subject: [PATCH 16/16] Always launch the prerender renderer with the Chromium sandbox off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prerender task runs only in a container where the sandbox can't initialize, so Chrome won't start with it on — it has always run sandbox-off and is required to. Force it unconditionally rather than gating on CI / PUPPETEER_DISABLE_SANDBOX, so a missing env var can't silently break launch. The security boundary is the task itself: the prerenderer is a separate, segregated ECS task isolated from the realm-server. The comment no longer claims the renderer only runs trusted cards. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../realm-server/prerender/browser-manager.ts | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts index c9e30160ce..2e71db5037 100644 --- a/packages/realm-server/prerender/browser-manager.ts +++ b/packages/realm-server/prerender/browser-manager.ts @@ -29,19 +29,16 @@ export class BrowserManager { await this.cleanupUserDataDirs(); let launchArgs: string[] = []; - // The Chromium sandbox is a real boundary here — the renderer executes - // user-authored card code — so keep it on wherever it works and disable - // it only where it must be off: in the container (the sandbox can't - // initialize there, so the image sets PUPPETEER_DISABLE_SANDBOX), in CI, - // and when the V8 `--prof` diagnostic is armed (a sandboxed renderer - // can't write the profile log to disk). - let disableSandbox = - process.env.CI === 'true' || - process.env.PUPPETEER_DISABLE_SANDBOX === 'true' || - v8ProfEnabled(); - if (disableSandbox) { - launchArgs.push('--no-sandbox', '--disable-setuid-sandbox'); - } + // Always launch the prerender renderer with the Chromium sandbox off — it + // always has, and we require it to. This task runs only in a container + // where the sandbox can't initialize, so Chrome won't start with it on + // (and the V8 `--prof` diagnostic needs the renderer able to write its log + // to disk, which the sandbox blocks). The security boundary is the task + // itself: the prerenderer is a separate, segregated ECS task, isolated + // from the realm-server. Forced unconditionally rather than gated on CI / + // PUPPETEER_DISABLE_SANDBOX so a missing env var can't silently break the + // launch. + launchArgs.push('--no-sandbox', '--disable-setuid-sandbox'); // When the realm-server speaks HTTPS (local dev with a mkcert leaf // cert), Chromium needs to be told to accept it. mkcert's root CA