Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
6154e9f
Capture a wedged render's stack out-of-process (Debugger.pause + gate…
habdelra Jun 15, 2026
55ec800
Isolate V8 --prof logs to the current browser run
habdelra Jun 15, 2026
d9bd432
Merge remote-tracking branch 'origin/main' into worktree-prerender-we…
habdelra Jun 15, 2026
5cfd326
indexing-diagnostics: document the hard-CPU-peg capture (Mode I)
habdelra Jun 16, 2026
90c3db5
Match V8's per-isolate --prof log filename
habdelra Jun 16, 2026
65db985
Sweep stale --prof logs on every browser launch (not just when armed)
habdelra Jun 16, 2026
b5e29dc
Always disable the Chromium sandbox for the prerender renderer
habdelra Jun 16, 2026
834d848
Make the --prof reader report why it has no frames (never silent null)
habdelra Jun 16, 2026
174d035
Merge remote-tracking branch 'origin/main' into worktree-prerender-we…
habdelra Jun 16, 2026
4e7409c
Merge remote-tracking branch 'origin/main' into worktree-prerender-we…
habdelra Jun 16, 2026
0b06230
Ship the wedged renderer's V8 --prof log to S3 instead of parsing in-…
habdelra Jun 16, 2026
6df42db
indexing-diagnostics: document the v8log S3-artifact + offline symbol…
habdelra Jun 16, 2026
addb949
Add a one-command helper to fetch + symbolize a prerender wedge's v8log
habdelra Jun 16, 2026
d996c0e
Delete the V8 --prof log from the container after a durable S3 upload
habdelra Jun 16, 2026
c6d2697
Test the v8log artifact kind, the upload-success contract, and --prof…
habdelra Jun 16, 2026
003f1e5
Satisfy qunit/no-assert-logical-expression in the v8-prof test
habdelra Jun 16, 2026
d4ff673
Address review: gate the sandbox, detach late CDP sessions, small cle…
habdelra Jun 16, 2026
fa10cf1
Make --prof log selection transparent and delete only on an unambiguo…
habdelra Jun 16, 2026
87ad20d
Always launch the prerender renderer with the Chromium sandbox off
habdelra Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 85 additions & 7 deletions .claude/skills/indexing-diagnostics/SKILL.md

Large diffs are not rendered by default.

30 changes: 21 additions & 9 deletions packages/realm-server/prerender/artifact-sink.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,19 @@ const DEFAULT_REGION = 'us-east-1';
// usual tools recognise: `.cpuprofile` (Chrome DevTools / speedscope),
// `.trace.json` (Chrome tracing / Perfetto), `.heapprofile` (DevTools
// allocation-sampling view).
export type ArtifactKind = 'cpuprofile' | 'trace' | 'heap';
export type ArtifactKind = 'cpuprofile' | 'trace' | 'heap' | 'v8log';

const SUFFIX_BY_KIND: Record<ArtifactKind, string> = {
cpuprofile: 'cpuprofile',
trace: 'trace.json',
heap: 'heapprofile',
// Raw V8 `--prof` tick log (the renderer's `isolate-…-prerender-v8-prof`
// file), uploaded as-is and symbolized offline with `node --prof-process`.
// This is the one capture that survives a hard synchronous CPU peg: the
// kernel SIGPROF sampler writes it from a separate thread, so it lands even
// when the main thread is too pegged to service CDP — but it's too large to
// `--prof-process` inside the render-timeout budget, so we ship the bytes.
v8log: 'v8log',
};

// The render-identifying fields that key an artifact. All but `kind` are
Expand Down Expand Up @@ -222,15 +229,18 @@ let sessionBytesUsed = 0;
let uploadSeq = 0;
let budgetExhaustedLogged = false;

// Uploads one artifact. Resolves once the object is durable in S3 (so a
// per-render `await` genuinely persists before the next render), or sooner
// if the sink is disabled / the budget is spent / the upload fails — none
// of which ever throw or reject. Declines (does not truncate) once the
// session budget is reached, so it never produces an invalid blob.
export async function uploadArtifact(upload: ArtifactUpload): Promise<void> {
// Uploads one artifact. Resolves `true` once the object is durable in S3 (so
// a per-render `await` genuinely persists before the next render), and `false`
// if it didn't land — the sink is disabled, the session budget is spent, or
// the upload failed. Never throws or rejects, and declines (does not truncate)
// once the budget is reached, so it never produces an invalid blob. The
// boolean lets a caller that holds the only local copy (the V8 `--prof` log)
// gate its post-upload cleanup on a genuine success rather than destroying a
// copy that was never persisted.
export async function uploadArtifact(upload: ArtifactUpload): Promise<boolean> {
let bucket = artifactBucket();
if (!bucket) {
return;
return false;
}
if (sessionBytesUsed >= getMaxSessionBytes()) {
if (!budgetExhaustedLogged) {
Expand All @@ -240,7 +250,7 @@ export async function uploadArtifact(upload: ArtifactUpload): Promise<void> {
`declining further artifact uploads for this process`,
);
}
return;
return false;
}

let key = buildArtifactKey(upload, new Date(), uploadSeq++);
Expand All @@ -266,10 +276,12 @@ export async function uploadArtifact(upload: ArtifactUpload): Promise<void> {
`artifact-sink uploaded ${upload.kind} key=${key} bytes=${loaded} ` +
`sessionBytes=${sessionBytesUsed}/${getMaxSessionBytes()}`,
);
return true;
} catch (e) {
// Best-effort: a failed flush is a missing diagnostic, not a render
// failure. Count nothing against the budget.
log.warn(`artifact-sink failed to upload ${upload.kind} key=${key}:`, e);
return false;
}
}

Expand Down
40 changes: 34 additions & 6 deletions packages/realm-server/prerender/browser-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ import { execFile } from 'child_process';
import { promisify } from 'util';

import { isHttpsLoopback } from '../lib/is-https-loopback.ts';
import {
v8ProfEnabled,
v8ProfJsFlags,
prepareV8ProfForLaunch,
} from './v8-prof.ts';

const log = logger('prerenderer');
const PUPPETEER_PROFILE_PREFIX = 'puppeteer_dev_chrome_profile-';
Expand All @@ -24,12 +29,16 @@ export class BrowserManager {
await this.cleanupUserDataDirs();

let launchArgs: string[] = [];
let disableSandbox =
process.env.CI === 'true' ||
process.env.PUPPETEER_DISABLE_SANDBOX === 'true';
if (disableSandbox) {
launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
}
// Always launch the prerender renderer with the Chromium sandbox off — it
// always has, and we require it to. This task runs only in a container
// where the sandbox can't initialize, so Chrome won't start with it on
// (and the V8 `--prof` diagnostic needs the renderer able to write its log
// to disk, which the sandbox blocks). The security boundary is the task
// itself: the prerenderer is a separate, segregated ECS task, isolated
// from the realm-server. Forced unconditionally rather than gated on CI /
// PUPPETEER_DISABLE_SANDBOX so a missing env var can't silently break the
// launch.
launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');

// When the realm-server speaks HTTPS (local dev with a mkcert leaf
// cert), Chromium needs to be told to accept it. mkcert's root CA
Expand Down Expand Up @@ -61,6 +70,25 @@ export class BrowserManager {
launchArgs.push(...extraArgs);
}

// Diagnostic (off by default): arm V8's kernel-signal CPU sampler
// (`--prof`) in the renderer at launch. The SIGPROF timer preempts the
// thread on a schedule it can't refuse, so it samples a CPU-pegged
// loop — even one stuck in a non-yielding NATIVE call where
// `Debugger.pause` can't get a back-edge — and it never needs the
// pegged thread to service a CDP `stop`. The cost: it samples EVERY
// render, so it can perturb a timing-sensitive wedge; enable it only
// for a run that needs the native-peg fallback. Per-pid logfile so
// concurrent renderer processes don't clobber one another; the timeout
// path ships the raw log to S3 as a `v8log` artifact for offline
// symbolization (`node --prof-process`) rather than parsing it in-process.
// Always sweep stale --prof logs from the container's ephemeral /tmp
// (no EFS here — the only filesystem is the container's), even when
// disabled, so flipping the flag off + restarting cleans up the prior
// "on" period's logs. Arm the sampler itself only when enabled.
await prepareV8ProfForLaunch();
if (v8ProfEnabled()) {
launchArgs.push(`--js-flags=${v8ProfJsFlags()}`);
}

this.#browser = await puppeteer.launch({
headless: process.env.BOXEL_SHOW_PRERENDER !== 'true',
...(launchArgs.length > 0 ? { args: launchArgs } : {}),
Expand Down
205 changes: 205 additions & 0 deletions packages/realm-server/prerender/pause-capture.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// One-shot stack capture of a wedged prerender via CDP `Debugger.pause`.
//
// Why this exists alongside the CPU profiler / trace stream:
//
// A hard wedge is a synchronous JS loop pegging the renderer's main
// thread. The CDP `Profiler` can't capture it — its samples only arrive
// at `Profiler.stop`, which the pegged thread can't serialize (see
// `trace-capture.ts`). The out-of-band trace stream CAN capture it, but
// it samples continuously, so its own overhead can perturb a
// timing-sensitive wedge enough to dissolve it.
//
// `Debugger.pause` is the missing tool: it adds ZERO overhead until the
// single pause, so it cannot mask the wedge — we let the loop run at
// full native speed and only look once it's already stuck. And it reads
// a synchronous loop: V8 honors the pause at the next interrupt check (a
// loop back-edge or call), so it lands inside the spin without the loop
// yielding — exactly the mechanism behind the DevTools "pause" button on
// a hung page. The returned `callFrames` name the function the loop is
// in.
//
// Limit: if the peg is one long non-yielding NATIVE call (a catastrophic
// regex, a native sort) there is no back-edge to honor the pause, so it
// times out (`reason: 'pause-timeout'`) — which is itself a signal,
// pivoting to the kernel-signal `--prof` sampler that preempts native
// code too.
//
// Everything here runs in the prerender SERVER (Node) over CDP, never in
// the page, so it uses the real Node timer — the page-side
// render-timer-stub does not apply. It is invoked only on the render
// timeout path, so it costs nothing on a healthy render.

import type { CDPSession, Page } from 'puppeteer';
import { logger } from '@cardstack/runtime-common';

const log = logger('prerenderer');

const TIMED_OUT = Symbol('node-timeout');

// Outcome of one `capturePausedCallStack` attempt. Either `frames` is
// populated, or it is empty and `reason` names the stage that produced
// nothing (`cdp-session-timeout`, `debugger-enable-timeout`,
// `pause-timeout`).
export interface PausedStackCapture {
// Top JS frames, innermost first, formatted `fn @ url:line:col`.
// Line and column are reported 1-based.
frames: string[];
// True when the live stack was deeper than `frames` (recursion depth
// is itself diagnostic — a runaway recursion shows a huge total).
truncated: boolean;
// Depth of the live stack before it was cut down to `frames`.
totalFrames: number;
// V8 used heap at the moment of the wedge. Flat across the peg → a tight
// compute/recursion loop; climbing → a combinatorial re-build (breadth).
// `null` when the heap figure could not be read (query failed, timed out,
// or was never reached).
heapUsedMB: number | null;
// Set when no usable stack was captured, naming why.
reason?: string;
}

// Settle with whichever comes first: `p`, or a Node-side deadline of `ms`
// milliseconds (signalled by the `TIMED_OUT` sentinel). This runs in the
// prerender server process, so `setTimeout` is the genuine Node timer — the
// render-timer-stub only swaps out the page's timers. The timer is always
// cleared on the way out, so a fast `p` leaves no pending handle behind.
async function raceNodeTimeout<T>(
  p: Promise<T>,
  ms: number,
): Promise<T | typeof TIMED_OUT> {
  let handle: ReturnType<typeof setTimeout> | undefined;
  let deadline = new Promise<typeof TIMED_OUT>((resolve) => {
    handle = setTimeout(() => resolve(TIMED_OUT), ms);
  });
  try {
    let winner = await Promise.race([p, deadline]);
    return winner;
  } finally {
    clearTimeout(handle);
  }
}

// Detach a CDP session if there is one, swallowing any failure: a session
// that is already gone (or a page that is closing) is not worth reporting.
async function detachQuietly(client: CDPSession | undefined): Promise<void> {
  if (client) {
    try {
      await client.detach();
    } catch {
      // nothing to clean up — the session was already torn down
    }
  }
}

// Build the "nothing captured" result: no frames, plus whatever heap figure
// was read before giving up and the `reason` naming the stage that failed.
function empty(heapUsedMB: number | null, reason: string): PausedStackCapture {
  let capture: PausedStackCapture = {
    frames: [],
    truncated: false,
    totalFrames: 0,
    heapUsedMB,
    reason,
  };
  return capture;
}

// Render one CDP `Debugger.CallFrame` as `fn @ url:line:col`, with line and
// column converted from CDP's 0-based values to 1-based.
function describePausedCallFrame(f: any): string {
  let name =
    typeof f?.functionName === 'string' && f.functionName.length > 0
      ? f.functionName
      : '(anonymous)';
  let loc = f?.location ?? {};
  // `url` is empty for scripts without one (eval / injected code). Fall back
  // to the script id, preferring `location` — it is the required field on a
  // call frame, whereas `functionLocation` is optional — so such frames are
  // still attributable instead of collapsing to `<unknown>`.
  let url =
    (typeof f?.url === 'string' && f.url.length > 0 && f.url) ||
    loc.scriptId ||
    f?.functionLocation?.scriptId ||
    '<unknown>';
  let line = (loc.lineNumber ?? 0) + 1;
  let col = (loc.columnNumber ?? 0) + 1;
  return `${name} @ ${url}:${line}:${col}`;
}

// Pause the page's JS once over CDP and report the stack it was stopped in.
//
// `opts.budgetMs` (default 8000) bounds EACH CDP stage separately — session
// creation, heap query, `Debugger.enable`, and the wait for the paused
// event — so the worst-case wall time is a multiple of it, plus up to 2s
// each for the best-effort resume / disable. `opts.maxFrames` (default 60)
// caps how many frames are formatted; a negative value is treated as 0.
//
// Returns `null` when the page is already closed or an unexpected error is
// thrown (logged at debug level). Otherwise returns a capture; when a stage
// times out the capture has no frames and `reason` names the stage. Never
// throws, and always detaches the CDP session it opened.
export async function capturePausedCallStack(
  page: Page,
  opts?: { budgetMs?: number; maxFrames?: number },
): Promise<PausedStackCapture | null> {
  let budgetMs = opts?.budgetMs ?? 8000;
  // Clamp: a negative count handed to `slice(0, n)` would drop frames from
  // the END of the stack rather than limit how many are kept.
  let maxFrames = Math.max(0, opts?.maxFrames ?? 60);
  if (page.isClosed()) {
    return null;
  }
  let client: CDPSession | undefined;
  try {
    let sessionPromise = page.createCDPSession();
    let session = await raceNodeTimeout(sessionPromise, budgetMs);
    if (session === TIMED_OUT) {
      // We gave up waiting, but the create may still resolve later — detach it
      // then so a slow CDP session isn't orphaned on the page until it closes.
      void sessionPromise.then((s) => detachQuietly(s)).catch(() => {});
      return empty(null, 'cdp-session-timeout');
    }
    // A definitely-assigned local for use inside closures; `client` is kept
    // only so the `finally` below can detach.
    let cdp: CDPSession = session;
    client = cdp;

    // Heap usage first. It is expected to be answerable while the JS thread
    // is pegged, but it is raced against the budget regardless and a failure
    // just leaves the figure null.
    let heapUsedMB: number | null = null;
    try {
      let usage = (await raceNodeTimeout(
        cdp.send('Runtime.getHeapUsage'),
        budgetMs,
      )) as { usedSize?: number } | typeof TIMED_OUT;
      if (usage !== TIMED_OUT && typeof usage.usedSize === 'number') {
        heapUsedMB = usage.usedSize / (1024 * 1024);
      }
    } catch {
      // best-effort
    }

    // Arm the paused listener BEFORE requesting the pause so we can't miss
    // the event.
    let pausedFrames = new Promise<any[] | null>((resolve) => {
      cdp.once('Debugger.paused', (e: any) =>
        resolve(Array.isArray(e?.callFrames) ? e.callFrames : null),
      );
    });

    let enabled = await raceNodeTimeout(cdp.send('Debugger.enable'), budgetMs);
    if (enabled === TIMED_OUT) {
      return empty(heapUsedMB, 'debugger-enable-timeout');
    }
    // Fire-and-forget: the pause lands at the next V8 interrupt check.
    void cdp.send('Debugger.pause').catch(() => {});

    let callFrames = await raceNodeTimeout(pausedFrames, budgetMs);

    // Best-effort unpause so teardown isn't left in a paused state.
    try {
      await raceNodeTimeout(cdp.send('Debugger.resume'), 2000);
    } catch {
      // ignore
    }
    try {
      await raceNodeTimeout(cdp.send('Debugger.disable'), 2000);
    } catch {
      // ignore
    }

    if (callFrames === TIMED_OUT || !Array.isArray(callFrames)) {
      // No back-edge honored the pause within budget — most likely a
      // non-yielding native peg. The `--prof` sampler is the fallback.
      return empty(heapUsedMB, 'pause-timeout');
    }

    let totalFrames = callFrames.length;
    let frames = callFrames
      .slice(0, maxFrames)
      .map((f: any) => describePausedCallFrame(f));
    return {
      frames,
      truncated: totalFrames > maxFrames,
      totalFrames,
      heapUsedMB,
    };
  } catch (e) {
    log.debug('paused stack capture failed:', e);
    return null;
  } finally {
    await detachQuietly(client);
  }
}

// Flatten a capture into one log-line / diagnostics-field string:
//   - no capture            → null
//   - capture with no frames → `<reason>` (or `<no-frames>` if none given)
//   - otherwise              → `[depth=N] a <- b <- …`, where a truncated
//                              stack shows depth as `shown/total`.
export function formatPausedStack(
  capture: PausedStackCapture | null,
): string | null {
  if (capture == null) {
    return null;
  }
  let { frames, totalFrames, truncated, reason } = capture;
  if (frames.length === 0) {
    return '<' + (reason ?? 'no-frames') + '>';
  }
  let depth: string;
  if (truncated) {
    depth = `${frames.length}/${totalFrames}`;
  } else {
    depth = String(totalFrames);
  }
  return `[depth=${depth}] ${frames.join(' <- ')}`;
}
8 changes: 6 additions & 2 deletions packages/realm-server/prerender/prerender-constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,12 @@ export { DURING_PRERENDER_HEADER } from '@cardstack/runtime-common';
export { sanitizePrerenderJobId } from '@cardstack/runtime-common';

// Base timeout for a single prerender capture on the prerender server
// (DOM rendering + data loading inside the headless browser).
const DEFAULT_RENDER_TIMEOUT_MS = 90_000;
// (DOM rendering + data loading inside the headless browser). A healthy
// render completes in well under 30s; 60s cleanly separates a genuine
// wedge from the slowest legitimate cards, and (with the request-timeout
// overhead below) fires the render-level timeout — and its hang
// diagnostics — before the request-level abort gives up on the render.
const DEFAULT_RENDER_TIMEOUT_MS = 60_000;
// Additional budget for request-level timeouts that wrap render work across
// process/network boundaries (manager proxying, serialization, retries, etc).
// Request timeout defaults are computed as:
Expand Down
Loading
Loading