From 6154e9feba134ad574424815ca6ce9db093c40a5 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 19:14:17 -0400
Subject: [PATCH 01/16] Capture a wedged render's stack out-of-process
 (Debugger.pause + gated --prof)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a prerender render pegs the renderer's main thread, the existing
timeout-path diagnostics can't name the offending function: the CDP
`Profiler` only yields samples at `stop`, which the pegged thread can't
serialize, and the out-of-band trace stream samples continuously, so its
own overhead can perturb a timing-sensitive wedge enough to dissolve it.

Add a capture that reads the peg from outside without instrumenting it:

- `Debugger.pause` (primary, always on the timeout path): V8 honors the
  pause at the next interrupt check (a loop back-edge / call), so it lands
  inside a synchronous loop without the loop yielding — the mechanism
  behind the DevTools "pause" button on a hung page. Zero overhead until
  the single pause, so it can't mask the wedge. Reports the call frames,
  the live stack depth (a runaway recursion shows a huge total), and heap
  used (flat → tight compute/recursion; climbing → combinatorial breadth).
  Runs out-of-process over CDP, so it survives a pegged JS thread, and only
  on the timeout path, so it's free on a healthy render.

- `--prof` (gated behind `PRERENDER_V8_PROF`, off by default): V8's
  kernel-SIGPROF sampler, armed at Chrome launch. The signal preempts the
  thread regardless of what it's running, so it also samples a peg stuck in
  a non-yielding NATIVE call — where `Debugger.pause` can't get a back-edge
  and returns `pause-timeout`. It samples every render, so it can perturb a
  timing-sensitive wedge; enable it only as the native-peg fallback.

Both surface in the prerender-server timeout log line (`pausedStack:`,
`v8ProfTopFrames:`).

Also lower the render timeout 90s → 60s: a healthy render finishes well
under 30s, so 60s cleanly separates a genuine wedge from the slowest
legitimate cards, and — with the request-timeout overhead — fires the
render-level timeout (and this capture) before the request-level abort
gives up on the render.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../realm-server/prerender/browser-manager.ts |  15 ++
 .../realm-server/prerender/pause-capture.ts   | 201 ++++++++++++++++++
 .../prerender/prerender-constants.ts          |   8 +-
 packages/realm-server/prerender/utils.ts      |  34 +++
 packages/realm-server/prerender/v8-prof.ts    |  94 ++++++++
 5 files changed, 350 insertions(+), 2 deletions(-)
 create mode 100644 packages/realm-server/prerender/pause-capture.ts
 create mode 100644 packages/realm-server/prerender/v8-prof.ts

diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts
index ef7ba02657..b424635d24 100644
--- a/packages/realm-server/prerender/browser-manager.ts
+++ b/packages/realm-server/prerender/browser-manager.ts
@@ -7,6 +7,7 @@ import { execFile } from 'child_process';
 import { promisify } from 'util';
 
 import { isHttpsLoopback } from '../lib/is-https-loopback.ts';
+import { v8ProfEnabled, v8ProfJsFlags } from './v8-prof.ts';
 
 const log = logger('prerenderer');
 const PUPPETEER_PROFILE_PREFIX = 'puppeteer_dev_chrome_profile-';
@@ -61,6 +62,20 @@ export class BrowserManager {
       launchArgs.push(...extraArgs);
     }
 
+    // Diagnostic (off by default): arm V8's kernel-signal CPU sampler
+    // (`--prof`) in the renderer at launch. The SIGPROF timer preempts the
+    // thread on a schedule it can't refuse, so it samples a CPU-pegged
+    // loop — even one stuck in a non-yielding NATIVE call where
+    // `Debugger.pause` can't get a back-edge — and it never needs the
+    // pegged thread to service a CDP `stop`. The cost: it samples EVERY
+    // render, so it can perturb a timing-sensitive wedge; enable it only
+    // for a run that needs the native-peg fallback. Per-pid logfile so
+    // concurrent renderer processes don't clobber one another; the timeout
+    // path post-processes the log into the prerender-server logs.
+    if (v8ProfEnabled()) {
+      launchArgs.push(`--js-flags=${v8ProfJsFlags()}`);
+    }
+
     this.#browser = await puppeteer.launch({
       headless: process.env.BOXEL_SHOW_PRERENDER !== 'true',
       ...(launchArgs.length > 0 ? { args: launchArgs } : {}),
diff --git a/packages/realm-server/prerender/pause-capture.ts b/packages/realm-server/prerender/pause-capture.ts
new file mode 100644
index 0000000000..ba2abfdcdd
--- /dev/null
+++ b/packages/realm-server/prerender/pause-capture.ts
@@ -0,0 +1,201 @@
+// One-shot stack capture of a wedged prerender via CDP `Debugger.pause`.
+//
+// Why this exists alongside the CPU profiler / trace stream:
+//
+//   A hard wedge is a synchronous JS loop pegging the renderer's main
+//   thread. The CDP `Profiler` can't capture it — its samples only arrive
+//   at `Profiler.stop`, which the pegged thread can't serialize (see
+//   `trace-capture.ts`). The out-of-band trace stream CAN capture it, but
+//   it samples continuously, so its own overhead can perturb a
+//   timing-sensitive wedge enough to dissolve it.
+//
+//   `Debugger.pause` is the missing tool: it adds ZERO overhead until the
+//   single pause, so it cannot mask the wedge — we let the loop run at
+//   full native speed and only look once it's already stuck. And it reads
+//   a synchronous loop: V8 honors the pause at the next interrupt check (a
+//   loop back-edge or call), so it lands inside the spin without the loop
+//   yielding — exactly the mechanism behind the DevTools "pause" button on
+//   a hung page. The returned `callFrames` name the function the loop is
+//   in.
+//
+//   Limit: if the peg is one long non-yielding NATIVE call (a catastrophic
+//   regex, a native sort) there is no back-edge to honor the pause, so it
+//   times out (`reason: 'pause-timeout'`) — which is itself a signal,
+//   pivoting to the kernel-signal `--prof` sampler that preempts native
+//   code too.
+//
+// Everything here runs in the prerender SERVER (Node) over CDP, never in
+// the page, so it uses the real Node timer — the page-side
+// render-timer-stub does not apply. It is invoked only on the render
+// timeout path, so it costs nothing on a healthy render.
+
+import type { CDPSession, Page } from 'puppeteer';
+import { logger } from '@cardstack/runtime-common';
+
+const log = logger('prerenderer');
+
+const TIMED_OUT = Symbol('node-timeout');
+
+export interface PausedStackCapture {
+  // Top JS frames, innermost first, formatted `fn @ url:line:col`.
+  frames: string[];
+  // True when the live stack was deeper than `frames` (recursion depth
+  // is itself diagnostic — a runaway recursion shows a huge total).
+  truncated: boolean;
+  totalFrames: number;
+  // V8 used heap at the moment of the wedge. Flat across the peg → a tight
+  // compute/recursion loop; climbing → a combinatorial re-build (breadth).
+  heapUsedMB: number | null;
+  // Set when no usable stack was captured, naming why.
+  reason?: string;
+}
+
+// Race a Node-side promise against a real Node timer. The prerender server
+// process is not the rendered page, so `setTimeout` here is the genuine
+// one (the render-timer-stub only replaces the page's timers).
+async function raceNodeTimeout<T>(
+  p: Promise<T>,
+  ms: number,
+): Promise<T | typeof TIMED_OUT> {
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  try {
+    return await Promise.race([
+      p,
+      new Promise<typeof TIMED_OUT>((resolve) => {
+        timer = setTimeout(() => resolve(TIMED_OUT), ms);
+      }),
+    ]);
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
+async function detachQuietly(client: CDPSession | undefined): Promise<void> {
+  if (!client) {
+    return;
+  }
+  try {
+    await client.detach();
+  } catch {
+    // session already gone / page closing — nothing to do
+  }
+}
+
+function empty(heapUsedMB: number | null, reason: string): PausedStackCapture {
+  return { frames: [], truncated: false, totalFrames: 0, heapUsedMB, reason };
+}
+
+export async function capturePausedCallStack(
+  page: Page,
+  opts?: { budgetMs?: number; maxFrames?: number },
+): Promise<PausedStackCapture | null> {
+  let budgetMs = opts?.budgetMs ?? 8000;
+  let maxFrames = opts?.maxFrames ?? 60;
+  if (page.isClosed()) {
+    return null;
+  }
+  let client: CDPSession | undefined;
+  try {
+    let session = await raceNodeTimeout(page.createCDPSession(), budgetMs);
+    if (session === TIMED_OUT) {
+      return empty(null, 'cdp-session-timeout');
+    }
+    client = session;
+
+    // Heap usage first: `Runtime.getHeapUsage` is answered from V8 stats,
+    // so it returns even while the JS thread is pegged.
+    let heapUsedMB: number | null = null;
+    try {
+      let usage = (await raceNodeTimeout(
+        client.send('Runtime.getHeapUsage'),
+        budgetMs,
+      )) as { usedSize?: number } | typeof TIMED_OUT;
+      if (usage !== TIMED_OUT && typeof usage.usedSize === 'number') {
+        heapUsedMB = usage.usedSize / (1024 * 1024);
+      }
+    } catch {
+      // best-effort
+    }
+
+    // Arm the paused listener BEFORE requesting the pause so we can't miss
+    // the event.
+    let pausedFrames = new Promise<any[] | null>((resolve) => {
+      client!.once('Debugger.paused', (e: any) =>
+        resolve(Array.isArray(e?.callFrames) ? e.callFrames : null),
+      );
+    });
+
+    let enabled = await raceNodeTimeout(
+      client.send('Debugger.enable'),
+      budgetMs,
+    );
+    if (enabled === TIMED_OUT) {
+      return empty(heapUsedMB, 'debugger-enable-timeout');
+    }
+    // Fire-and-forget: the pause lands at the next V8 interrupt check.
+    void client.send('Debugger.pause').catch(() => {});
+
+    let callFrames = await raceNodeTimeout(pausedFrames, budgetMs);
+
+    // Best-effort unpause so teardown isn't left in a paused state.
+    try {
+      await raceNodeTimeout(client.send('Debugger.resume'), 2000);
+    } catch {
+      // ignore
+    }
+    try {
+      await raceNodeTimeout(client.send('Debugger.disable'), 2000);
+    } catch {
+      // ignore
+    }
+
+    if (callFrames === TIMED_OUT || !Array.isArray(callFrames)) {
+      // No back-edge honored the pause within budget — most likely a
+      // non-yielding native peg. The `--prof` sampler is the fallback.
+      return empty(heapUsedMB, 'pause-timeout');
+    }
+
+    let totalFrames = callFrames.length;
+    let frames = callFrames.slice(0, maxFrames).map((f: any) => {
+      let name =
+        typeof f.functionName === 'string' && f.functionName.length > 0
+          ? f.functionName
+          : '(anonymous)';
+      let loc = f.location ?? {};
+      let url =
+        (typeof f.url === 'string' && f.url.length > 0 && f.url) ||
+        f.functionLocation?.scriptId ||
+        '<unknown>';
+      let line = (loc.lineNumber ?? 0) + 1;
+      let col = (loc.columnNumber ?? 0) + 1;
+      return `${name} @ ${url}:${line}:${col}`;
+    });
+    return {
+      frames,
+      truncated: totalFrames > maxFrames,
+      totalFrames,
+      heapUsedMB,
+    };
+  } catch (e) {
+    log.debug('paused stack capture failed:', e);
+    return null;
+  } finally {
+    await detachQuietly(client);
+  }
+}
+
+// Format the capture for a single log line / diagnostics field.
+export function formatPausedStack(
+  capture: PausedStackCapture | null,
+): string | null {
+  if (!capture) {
+    return null;
+  }
+  if (capture.frames.length === 0) {
+    return `<${capture.reason ?? 'no-frames'}>`;
+  }
+  let depth = capture.truncated
+    ? `${capture.frames.length}/${capture.totalFrames}`
+    : `${capture.totalFrames}`;
+  return `[depth=${depth}] ` + capture.frames.join('  <-  ');
+}
diff --git a/packages/realm-server/prerender/prerender-constants.ts b/packages/realm-server/prerender/prerender-constants.ts
index 7ec8005b19..b6f727c5b7 100644
--- a/packages/realm-server/prerender/prerender-constants.ts
+++ b/packages/realm-server/prerender/prerender-constants.ts
@@ -75,8 +75,12 @@ export { DURING_PRERENDER_HEADER } from '@cardstack/runtime-common';
 export { sanitizePrerenderJobId } from '@cardstack/runtime-common';
 
 // Base timeout for a single prerender capture on the prerender server
-// (DOM rendering + data loading inside the headless browser).
-const DEFAULT_RENDER_TIMEOUT_MS = 90_000;
+// (DOM rendering + data loading inside the headless browser). A healthy
+// render completes in well under 30s; 60s cleanly separates a genuine
+// wedge from the slowest legitimate cards, and (with the request-timeout
+// overhead below) fires the render-level timeout — and its hang
+// diagnostics — before the request-level abort gives up on the render.
+const DEFAULT_RENDER_TIMEOUT_MS = 60_000;
 // Additional budget for request-level timeouts that wrap render work across
 // process/network boundaries (manager proxying, serialization, retries, etc).
 // Request timeout defaults are computed as:
diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts
index 3120896ee5..c5454c9f7b 100644
--- a/packages/realm-server/prerender/utils.ts
+++ b/packages/realm-server/prerender/utils.ts
@@ -17,6 +17,12 @@ import {
 } from './cpu-profiler.ts';
 import { fromAffinityKey } from './affinity.ts';
 import { captureTraceStream } from './trace-capture.ts';
+import {
+  capturePausedCallStack,
+  formatPausedStack,
+  type PausedStackCapture,
+} from './pause-capture.ts';
+import { v8ProfEnabled, processV8ProfTopFrames } from './v8-prof.ts';
 import {
   ensureHeapSampling,
   captureHeapSamplingProfile,
@@ -1686,6 +1692,27 @@ export async function withTimeout<T>(
       });
       cpuProfileTopFrames = formatTopFrames(summary);
     }
+    // One-shot stack capture of the still-pegged renderer via
+    // `Debugger.pause`. Where the CPU profiler above structurally can't
+    // capture a hard wedge (its `stop` needs the pegged thread to
+    // serialize the profile), a debugger pause lands inside the
+    // synchronous loop at the next V8 interrupt check and reports the
+    // call frames directly — and it adds zero overhead until the single
+    // pause, so it can't mask a timing-sensitive wedge the way continuous
+    // sampling can. Out-of-process (CDP), so it survives a pegged JS
+    // thread; only on the timeout path, so it's free on a healthy render.
+    let pausedStack: PausedStackCapture | null = null;
+    if (!page.isClosed()) {
+      pausedStack = await capturePausedCallStack(page, { budgetMs: 8000 });
+    }
+    // Native-peg fallback (armed only when `PRERENDER_V8_PROF=true`):
+    // summarize the renderer's kernel-sampled V8 `--prof` log. Worth
+    // reading mainly when `pausedStack` came back `pause-timeout` — i.e.
+    // the peg is a non-yielding native call the debugger couldn't enter.
+    let v8ProfTopFrames: string | null = null;
+    if (v8ProfEnabled()) {
+      v8ProfTopFrames = await processV8ProfTopFrames();
+    }
     // Out-of-band CDP view of fetches still outstanding — survives a
     // wedged JS thread, so it's available even when the in-page hooks
     // below are skipped. The longest-lived entry is what a waiting
@@ -1791,6 +1818,13 @@ export async function withTimeout<T>(
         (topPending ? `(top: ${topPending.url} ${topPending.ageMs}ms)` : '') +
         ` concurrentRenders=${concurrentRenders}` +
         (cpuProfileTopFrames ? ` cpuTopFrames: ${cpuProfileTopFrames}` : '') +
+        (pausedStack?.heapUsedMB != null
+          ? ` jsHeapUsedMB=${pausedStack.heapUsedMB.toFixed(0)}`
+          : '') +
+        (formatPausedStack(pausedStack)
+          ? ` pausedStack: ${formatPausedStack(pausedStack)}`
+          : '') +
+        (v8ProfTopFrames ? `\nv8ProfTopFrames:\n${v8ProfTopFrames}` : '') +
         ` DOM:\n${dom?.trim()}`,
     );
     // Diagnostics ride on the outer `RenderError.diagnostics` as a
diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
new file mode 100644
index 0000000000..8270ad0e67
--- /dev/null
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -0,0 +1,94 @@
+// Diagnostic V8 `--prof` (kernel-signal CPU sampling) for the prerender
+// renderer. Off by default (`PRERENDER_V8_PROF=true` to arm).
+//
+// Why this exists alongside `Debugger.pause` (pause-capture.ts):
+//
+//   `Debugger.pause` reads a synchronous JS loop (V8 honors the pause at a
+//   back-edge) with zero continuous overhead. But if the wedge is one long
+//   NON-YIELDING native call there is no back-edge, and pause times out.
+//   `--prof` covers that gap: the kernel SIGPROF timer preempts the thread
+//   mid-instruction regardless of what it's running (JS or native), so the
+//   busiest frame wins the samples — exactly the case `Debugger.pause`
+//   can't reach. It also never needs the pegged thread to service a CDP
+//   `stop` (the failure mode of the CDP `Profiler`; see trace-capture.ts).
+//
+//   The trade-off, and why it's gated off: it samples EVERY render, so its
+//   own interrupts can perturb a timing-sensitive wedge enough to dissolve
+//   it. Run it only when `Debugger.pause` reports `pause-timeout` (a
+//   suspected native peg).
+
+import { logger } from '@cardstack/runtime-common';
+import { promises as fs } from 'fs';
+import { tmpdir } from 'os';
+import path from 'path';
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+
+const log = logger('prerenderer');
+const execFileAsync = promisify(execFile);
+
+export const V8_PROF_LOG_DIR = tmpdir();
+export const V8_PROF_LOG_PREFIX = 'prerender-v8-prof-';
+
+export function v8ProfEnabled(): boolean {
+  return process.env.PRERENDER_V8_PROF === 'true';
+}
+
+// The `--js-flags` value to pass at Chrome launch when armed. Per-pid
+// logfile so concurrent renderer processes don't clobber one another.
+export function v8ProfJsFlags(): string {
+  return `--prof --logfile=${V8_PROF_LOG_DIR}/${V8_PROF_LOG_PREFIX}%p.log`;
+}
+
+// Best-effort summary of the hottest frames from the renderer's `--prof`
+// log via `node --prof-process`. Called on the timeout path only when
+// armed. Time-boxed — the log can be large, and the pegged render's
+// samples dominate it, so the top self-time frames name the wedge.
+export async function processV8ProfTopFrames(
+  budgetMs = 20000,
+): Promise<string | null> {
+  if (!v8ProfEnabled()) {
+    return null;
+  }
+  try {
+    let entries = await fs.readdir(V8_PROF_LOG_DIR);
+    let logs = entries.filter(
+      (e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log'),
+    );
+    if (logs.length === 0) {
+      return '<no v8 --prof log found>';
+    }
+    // The pegged render produces by far the most samples, so its
+    // renderer's log is the largest.
+    let withSizes = await Promise.all(
+      logs.map(async (name) => {
+        let full = path.join(V8_PROF_LOG_DIR, name);
+        try {
+          let st = await fs.stat(full);
+          return { full, size: st.size };
+        } catch {
+          return { full, size: 0 };
+        }
+      }),
+    );
+    withSizes.sort((a, b) => b.size - a.size);
+    let logPath = withSizes[0].full;
+    let { stdout } = await execFileAsync(
+      process.execPath,
+      ['--prof-process', logPath],
+      { timeout: budgetMs, maxBuffer: 32 * 1024 * 1024 },
+    );
+    let lines = stdout.split('\n');
+    // The bottom-up "[Summary]" section lists self-time by category; the
+    // top JS/C++ entries that follow name the hot function.
+    let summaryIdx = lines.findIndex((l) => /\[Summary\]/.test(l));
+    let slice =
+      summaryIdx >= 0
+        ? lines.slice(summaryIdx, summaryIdx + 60)
+        : lines.slice(0, 60);
+    return slice.join('\n').trim() || '<empty prof summary>';
+  } catch (e) {
+    log.debug('v8 --prof processing failed:', e);
+    return null;
+  }
+}

From 55ec800351625059dec1790da3e727172a098875 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 19:20:45 -0400
Subject: [PATCH 02/16] Isolate V8 --prof logs to the current browser run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The timeout-path processor scanned the shared OS temp dir and picked the
largest matching `--prof` log, so a stale log from an earlier run in a
reused container (or another renderer that ran longer without wedging)
could outrank the timed-out page and point `v8ProfTopFrames` at the wrong
stack. Clear pre-existing logs at browser launch and stamp the launch
time, then on the timeout path consider only logs written since launch and
pick the most-recently-written one — the renderer still spinning into the
timeout, not an older completed render's larger-but-stale log.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../realm-server/prerender/browser-manager.ts |  7 ++-
 packages/realm-server/prerender/v8-prof.ts    | 50 ++++++++++++++++---
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts
index b424635d24..cda7cca4a8 100644
--- a/packages/realm-server/prerender/browser-manager.ts
+++ b/packages/realm-server/prerender/browser-manager.ts
@@ -7,7 +7,11 @@ import { execFile } from 'child_process';
 import { promisify } from 'util';
 
 import { isHttpsLoopback } from '../lib/is-https-loopback.ts';
-import { v8ProfEnabled, v8ProfJsFlags } from './v8-prof.ts';
+import {
+  v8ProfEnabled,
+  v8ProfJsFlags,
+  prepareV8ProfForLaunch,
+} from './v8-prof.ts';
 
 const log = logger('prerenderer');
 const PUPPETEER_PROFILE_PREFIX = 'puppeteer_dev_chrome_profile-';
@@ -73,6 +77,7 @@ export class BrowserManager {
     // concurrent renderer processes don't clobber one another; the timeout
     // path post-processes the log into the prerender-server logs.
     if (v8ProfEnabled()) {
+      await prepareV8ProfForLaunch();
       launchArgs.push(`--js-flags=${v8ProfJsFlags()}`);
     }
 
diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index 8270ad0e67..aefc9e90a9 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -40,6 +40,34 @@ export function v8ProfJsFlags(): string {
   return `--prof --logfile=${V8_PROF_LOG_DIR}/${V8_PROF_LOG_PREFIX}%p.log`;
 }
 
+// Browser-launch wall-clock; logs older than this are from a previous run.
+let v8ProfLaunchAt = 0;
+
+// Call once at browser launch (when armed) to isolate this run's profile
+// logs. The OS temp dir is shared and the container can be reused across
+// deploys, so a stale `--prof` log from an earlier run could otherwise be
+// picked as the timeout diagnostic and point at the wrong stack. Delete
+// any pre-existing logs and stamp the launch time so the timeout-path
+// processor only ever considers logs written by THIS browser run.
+export async function prepareV8ProfForLaunch(): Promise<void> {
+  if (!v8ProfEnabled()) {
+    return;
+  }
+  v8ProfLaunchAt = Date.now();
+  try {
+    let entries = await fs.readdir(V8_PROF_LOG_DIR);
+    await Promise.all(
+      entries
+        .filter((e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log'))
+        .map((e) =>
+          fs.rm(path.join(V8_PROF_LOG_DIR, e), { force: true }).catch(() => {}),
+        ),
+    );
+  } catch (e) {
+    log.debug('v8 --prof stale-log cleanup failed:', e);
+  }
+}
+
 // Best-effort summary of the hottest frames from the renderer's `--prof`
 // log via `node --prof-process`. Called on the timeout path only when
 // armed. Time-boxed — the log can be large, and the pegged render's
@@ -58,21 +86,29 @@ export async function processV8ProfTopFrames(
     if (logs.length === 0) {
       return '<no v8 --prof log found>';
     }
-    // The pegged render produces by far the most samples, so its
-    // renderer's log is the largest.
-    let withSizes = await Promise.all(
+    let withStats = await Promise.all(
       logs.map(async (name) => {
         let full = path.join(V8_PROF_LOG_DIR, name);
         try {
           let st = await fs.stat(full);
-          return { full, size: st.size };
+          return { full, mtimeMs: st.mtimeMs, size: st.size };
         } catch {
-          return { full, size: 0 };
+          return { full, mtimeMs: 0, size: 0 };
         }
       }),
     );
-    withSizes.sort((a, b) => b.size - a.size);
-    let logPath = withSizes[0].full;
+    // Only this browser run's logs (cleared + stamped at launch), and
+    // among those the most-recently-written one — the renderer that was
+    // still spinning up to the timeout, not an earlier completed render's
+    // larger-but-stale log.
+    let fromThisRun = withStats.filter(
+      (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt,
+    );
+    if (fromThisRun.length === 0) {
+      return '<no v8 --prof log from this run>';
+    }
+    fromThisRun.sort((a, b) => b.mtimeMs - a.mtimeMs);
+    let logPath = fromThisRun[0].full;
     let { stdout } = await execFileAsync(
       process.execPath,
       ['--prof-process', logPath],

From 5cfd32632e9485ee09c76e9978f97d3410d14fde Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 20:39:46 -0400
Subject: [PATCH 03/16] indexing-diagnostics: document the hard-CPU-peg capture
 (Mode I)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds Mode I — reading a render wedged in a synchronous CPU peg from
outside, when the thread is too pegged to service the CDP message that
arms a debugger/profiler (`Debugger.enable` / `Profiler.enable` time out)
and Mode H's captures are starved:

- `pausedStack` — the always-on one-shot `Debugger.pause` on the timeout
  path (zero overhead until the pause, so it can't mask), with its
  `<pause-timeout>` / `<debugger-enable-timeout>` states.
- `v8ProfTopFrames` — the `--prof` kernel-`SIGPROF` sampler (knob
  `PRERENDER_V8_PROF`) that preempts the pegged thread without cooperation,
  the fallback that reads a hard peg the CDP captures can't.

Also adds the `PRERENDER_V8_PROF` row to the Mode H knobs table and a
cross-reference from Mode H's limits, and notes the 60s render timeout
that fires the capture before the request-level abort.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/skills/indexing-diagnostics/SKILL.md | 68 ++++++++++++++++++--
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md
index 53cc1e7c6a..4dcdbb3758 100644
--- a/.claude/skills/indexing-diagnostics/SKILL.md
+++ b/.claude/skills/indexing-diagnostics/SKILL.md
@@ -881,13 +881,14 @@ Rule of thumb: a render that **completes but is heavy** → `.cpuprofile` (+ hea
 
 All live at `/<env>/boxel/<NAME>` (Systems Manager → Parameter Store). The bucket itself (`PRERENDER_ARTIFACTS_BUCKET`) and the key prefix (`PRERENDER_ARTIFACTS_ENV`) are wired by Terraform — don't set them by hand.
 
-| Parameter                             | Values                                                                             | Effect                                                                                                                                                                     |
-| ------------------------------------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `PRERENDER_PROFILE_AFFINITY`          | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert.                           |
-| `PRERENDER_PROFILE_CPUPROFILE`        | `true` / `false`                                                                   | Persist the full `.cpuprofile` for targeted renders.                                                                                                                       |
-| `PRERENDER_PROFILE_TRACE`             | `true` / `false`                                                                   | Capture the streaming trace for targeted renders.                                                                                                                          |
-| `PRERENDER_PROFILE_HEAP`              | `true` / `false`                                                                   | Capture the heap allocation-sampling profile for targeted renders.                                                                                                         |
-| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default                                           | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated). |
+| Parameter                             | Values                                                                             | Effect                                                                                                                                                                                                                                                                                                                                                                                |
+| ------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `PRERENDER_PROFILE_AFFINITY`          | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert.                                                                                                                                                                                                                                      |
+| `PRERENDER_PROFILE_CPUPROFILE`        | `true` / `false`                                                                   | Persist the full `.cpuprofile` for targeted renders.                                                                                                                                                                                                                                                                                                                                  |
+| `PRERENDER_PROFILE_TRACE`             | `true` / `false`                                                                   | Capture the streaming trace for targeted renders.                                                                                                                                                                                                                                                                                                                                     |
+| `PRERENDER_PROFILE_HEAP`              | `true` / `false`                                                                   | Capture the heap allocation-sampling profile for targeted renders.                                                                                                                                                                                                                                                                                                                    |
+| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default                                           | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated).                                                                                                                                                                                                            |
+| `PRERENDER_V8_PROF`                   | `true` / `false`                                                                   | Arm V8's `--prof` kernel-`SIGPROF` CPU sampler at Chrome launch — **renderer-wide, NOT affinity-gated**. The fallback for a hard CPU peg that starves the CDP captures above (kernel preemption needs no thread cooperation). Surfaces as `v8ProfTopFrames` on the render-timeout line; see **Mode I**. Needs a prerender-server task **restart** (not just redeploy) to take effect. |
 
 The mode flags are Terraform-seeded sentinels (default `false` / `0`); `PRERENDER_PROFILE_AFFINITY` is operator-managed and must already exist. The affinity key is `realm:` + the realm's canonical URL **with trailing slash** — the same value Mode A/B logs print as `affinity=…`.
 
@@ -927,6 +928,59 @@ The key schema is `env/realm/jobId/card/step/<timestamp>-<seq>.<suffix>` — eve
 - The `.cpuprofile` and `.heapprofile` need the renderer thread to serialize, so a **fully-wedged** render produces neither — only the streaming trace comes back. That's by design (Tier 1's summary has the same limit); the trace is the wedge tool.
 - Browser-wide tracing is **single-flight** — only one trace runs at a time across the whole pool. Concurrent targeted renders skip their trace (logged at `debug`), so don't expect a trace for _every_ render under load; constrain concurrency or accept the gaps. The summary and the cpuprofile/heap captures are per-render and unaffected.
 - Captured artifacts are anonymized only at the key level (host stripped). The blobs themselves contain card URLs and code paths — treat them as you would any prerender diagnostic.
+- If the renderer is so pegged that even `Debugger.enable` / `Profiler.enable` time out, **all of Mode H is starved** — the cpuprofile, heap, _and_ the trace's sampler setup all need the renderer to service a CDP message. Read that case from the timeout-path `pausedStack` / `--prof` signals instead — **Mode I**.
+
+## Mode I — a render is wedged in a synchronous CPU peg (reading the renderer from outside)
+
+**When to use this mode.** Mode A classified a render as a hard CPU peg (`mainThreadResponsive=false`, `scriptBusy≈1`), or a from-scratch index rejected mid-run on a card that never wrote a row (Mode C), and Mode H's heavyweight captures came back empty / `(idle)`. The renderer's main thread is spinning in a synchronous JS (or native) loop — the worst case for inspection, because a fully-pegged thread can't service the CDP protocol message that _arms_ a debugger or profiler (`Debugger.enable` / `Profiler.enable` time out), so the CDP captures are starved. Two timeout-path signals read it anyway, because neither needs the pegged thread to cooperate. They ride on the same render-timeout log line as `cpuTopFrames`, in the **prerender-server** log.
+
+A healthy render finishes well under 30s; the render-level timeout (`RENDER_TIMEOUT_MS`, default 60s) fires before the request-level abort (render + 60s overhead), so these are captured and logged before the worker gives up on the visit. (If a wedge ever rejects with no timeout line at all, the diagnostic block itself was starved — that is itself the `<debugger-enable-timeout>` signal below.)
+
+### `pausedStack` — a one-shot debugger pause (always on, can't mask)
+
+```
+pausedStack: [depth=N] fn @ url:line:col  <-  caller  <-  …      jsHeapUsedMB=…
+```
+
+A single CDP `Debugger.pause` on the timeout path. V8 honors the pause at the next interrupt check (a loop back-edge or call), so it lands _inside_ a synchronous loop without the loop yielding — the same mechanism as the DevTools "pause" button on a hung `while(true)` page. It adds **zero overhead until the one pause**, so unlike a continuous sampler it can't perturb (mask) a timing-sensitive wedge. Three outcomes:
+
+- **A frame list** → the function the loop is in (`fn @ url:line`) and the call chain out to the render driver. `depth` is the live JS stack depth: a huge depth is runaway recursion; a small depth is a tight loop. A bare scriptId for `url` (no path) means the frame is in eval'd / dynamically-built code.
+- **`<pause-timeout>`** → the pause was requested but no back-edge honored it in budget. Most likely a long **non-yielding native call** (a catastrophic regex, a native sort) — no JS back-edge to interrupt. Pivot to `--prof`.
+- **`<debugger-enable-timeout>`** → the thread is so pegged it couldn't even service `Debugger.enable`. The hardest peg; CDP can't read it at all. Pivot to `--prof`.
+
+### `v8ProfTopFrames` — V8 `--prof` kernel sampler (gated; reads even a hard peg)
+
+When `pausedStack` is starved, the kernel-signal sampler is the fallback. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and the pre-installed handler walks the stack. It samples even a non-yielding native peg, and it never needs the thread to serialize a profile at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated; the top self-time frames surface as `v8ProfTopFrames:` (processed in-process via `node --prof-process`).
+
+Arm it:
+
+1. Set SSM `/<env>/boxel/PRERENDER_V8_PROF` = `true`.
+2. **Restart** the prerender-server task — the renderer must relaunch to pick up the `--js-flags=--prof` launch flag (a value flip alone, or a redeploy that reuses the running browser, won't arm it). `aws ecs update-service … --force-new-deployment` restarts the tasks.
+3. Re-run the index; read `v8ProfTopFrames:` on the wedged card's timeout line.
+4. Set it back to `false` + restart when done — it samples **every** render while on.
+
+**Masking note.** `--prof` samples continuously, but for a _synchronous_ peg that doesn't matter: a sync loop's control flow doesn't depend on timing, so sampling records _where_ it spins without changing _that_ it spins. The heavier CDP profiler / trace masking that's bitten before is the risk for _timing-sensitive async_ work — so confirm `mainThreadResponsive=false` before reaching for it.
+
+### Reading it
+
+- **Frame / top sample** → the function to fix. Follow the chain: a card computed inside a Glimmer `revalidate → rerender → evaluate` chain is a render-invalidation storm (a computed that dirties tracked state during render → re-render → re-evaluate → …), not a data or serialize cost — and not something the field-getter ceiling or a serialize cycle-guard will catch.
+- **`jsHeapUsedMB`** (on the `pausedStack` line) → flat across the peg is a tight compute / recursion loop; climbing is a combinatorial re-build of shared subtrees (breadth).
+
+### Getting the logs
+
+```sh
+aws --profile claude-<env> logs filter-log-events \
+  --log-group-name ecs-boxel-prerender-server-<env> \
+  --filter-pattern '"<realm-or-card>" "timed out after"' --start-time <epoch-ms> \
+  --query 'events[*].message' --output text
+```
+
+Lines are FireLens-wrapped (`{"log":"…"}`); pull `.log`. Strip everything after ` DOM:` — the `pausedStack:` / `v8ProfTopFrames:` / discriminators are all before it.
+
+### What this mode can't tell you
+
+- If `pausedStack` is starved AND `--prof` shows the time in opaque native frames with no JS attribution, the peg is inside a native builtin — the frame names it, but the _why_ needs reading that builtin's inputs (e.g. a pathological regex / string).
+- It's a CPU-peg mode. A render that's idle-waiting (`mainThreadResponsive=true`, `cpuTopFrames (idle)`, a long-pending fetch) is a **data stall** — Mode A's `pendingFetches` / Mode G, not this.
 
 ## Field-by-field reading
 

From 90c3db5945a35852fa7c2be360c1271956460229 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 21:05:04 -0400
Subject: [PATCH 04/16] Match V8's per-isolate --prof log filename

The renderer's V8 `--prof` log was being written (PRERENDER_V8_PROF=true,
browser launched with the flag) but the timeout-path summarizer reported
`<no v8 --prof log found>`: V8's per-isolate logging prepends
`isolate-<addr>-` to the --logfile name, and the reader matched with
`startsWith('prerender-v8-prof-')`, which never matches the prefixed file.

Match the prefix with `includes` (in both the launch cleanup and the
reader), pick the LARGEST this-run log (the pegged isolate accumulates the
most samples over the peg), and on a miss list the `.log` files actually
present so a still-wrong pattern is self-diagnosing.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/realm-server/prerender/v8-prof.ts | 29 +++++++++++++++-------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index aefc9e90a9..b0be0ba9bf 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -58,7 +58,9 @@ export async function prepareV8ProfForLaunch(): Promise<void> {
     let entries = await fs.readdir(V8_PROF_LOG_DIR);
     await Promise.all(
       entries
-        .filter((e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log'))
+        // `includes`, not `startsWith`: V8's per-isolate logging prepends
+        // `isolate-<addr>-` to the --logfile name.
+        .filter((e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'))
         .map((e) =>
           fs.rm(path.join(V8_PROF_LOG_DIR, e), { force: true }).catch(() => {}),
         ),
@@ -80,11 +82,16 @@ export async function processV8ProfTopFrames(
   }
   try {
     let entries = await fs.readdir(V8_PROF_LOG_DIR);
+    // `includes`, not `startsWith`: V8's per-isolate logging prepends
+    // `isolate-<addr>-` to the --logfile name, so the file is e.g.
+    // `isolate-0x…-prerender-v8-prof-<pid>.log`.
     let logs = entries.filter(
-      (e) => e.startsWith(V8_PROF_LOG_PREFIX) && e.endsWith('.log'),
+      (e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'),
     );
     if (logs.length === 0) {
-      return '<no v8 --prof log found>';
+      // Name what IS present, so a still-missed pattern is self-diagnosing.
+      let present = entries.filter((e) => e.endsWith('.log')).slice(0, 12);
+      return `<no v8 --prof log found; .log present: ${present.join(', ') || 'none'}>`;
     }
     let withStats = await Promise.all(
       logs.map(async (name) => {
@@ -97,17 +104,21 @@ export async function processV8ProfTopFrames(
         }
       }),
     );
-    // Only this browser run's logs (cleared + stamped at launch), and
-    // among those the most-recently-written one — the renderer that was
-    // still spinning up to the timeout, not an earlier completed render's
-    // larger-but-stale log.
+    // Only this browser run's logs (stale prior-run logs were cleared +
+    // the launch time stamped). Among those pick the LARGEST: the pegged
+    // isolate accumulates by far the most samples over the 60s peg, so its
+    // log dwarfs sibling isolates' and earlier renders' — and the peg's
+    // samples dominate its top frames regardless of what else it rendered.
     let fromThisRun = withStats.filter(
       (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt,
     );
     if (fromThisRun.length === 0) {
-      return '<no v8 --prof log from this run>';
+      let seen = withStats
+        .map((s) => `${path.basename(s.full)}(${s.size}b)`)
+        .slice(0, 8);
+      return `<no v8 --prof log from this run; seen: ${seen.join(', ')}>`;
     }
-    fromThisRun.sort((a, b) => b.mtimeMs - a.mtimeMs);
+    fromThisRun.sort((a, b) => b.size - a.size);
     let logPath = fromThisRun[0].full;
     let { stdout } = await execFileAsync(
       process.execPath,

From 65db985871c9068e51301609cff148b171ddc1df Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 21:10:13 -0400
Subject: [PATCH 05/16] Sweep stale --prof logs on every browser launch (not
 just when armed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The renderer's --prof logs live only in the container's ephemeral /tmp
(no EFS mount on the prerenderer), so they're reclaimed when the ECS task
replaces — but a long-lived task that had PRERENDER_V8_PROF on, then off,
would keep the "on" period's logs around because the launch-time cleanup
was gated on the flag being enabled.

Run the stale-log sweep on every browser launch regardless of the flag,
and only stamp the launch time / add the --prof launch flag when enabled.
So flipping the flag off + restarting now actually clears the logs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/realm-server/prerender/browser-manager.ts |  6 +++++-
 packages/realm-server/prerender/v8-prof.ts         | 10 +++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts
index cda7cca4a8..a96c271833 100644
--- a/packages/realm-server/prerender/browser-manager.ts
+++ b/packages/realm-server/prerender/browser-manager.ts
@@ -76,8 +76,12 @@ export class BrowserManager {
     // for a run that needs the native-peg fallback. Per-pid logfile so
     // concurrent renderer processes don't clobber one another; the timeout
     // path post-processes the log into the prerender-server logs.
+    // Always sweep stale --prof logs from the container's ephemeral /tmp
+    // (no EFS here — the only filesystem is the container's), even when
+    // disabled, so flipping the flag off + restarting cleans up the prior
+    // "on" period's logs. Arm the sampler itself only when enabled.
+    await prepareV8ProfForLaunch();
     if (v8ProfEnabled()) {
-      await prepareV8ProfForLaunch();
       launchArgs.push(`--js-flags=${v8ProfJsFlags()}`);
     }
 
diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index b0be0ba9bf..3cba7c38cf 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -50,10 +50,14 @@ let v8ProfLaunchAt = 0;
 // any pre-existing logs and stamp the launch time so the timeout-path
 // processor only ever considers logs written by THIS browser run.
 export async function prepareV8ProfForLaunch(): Promise<void> {
-  if (!v8ProfEnabled()) {
-    return;
+  // Sweep stale logs on EVERY browser launch, even when disabled — so
+  // flipping PRERENDER_V8_PROF off and restarting clears the logs the
+  // prior "on" period left behind (the OS temp dir is shared and the
+  // container can outlive a single browser). Only stamp the launch time
+  // when armed (the reader uses it to scope to this run's logs).
+  if (v8ProfEnabled()) {
+    v8ProfLaunchAt = Date.now();
   }
-  v8ProfLaunchAt = Date.now();
   try {
     let entries = await fs.readdir(V8_PROF_LOG_DIR);
     await Promise.all(

From b5e29dc78832aa50dcf21b739b53456a45916cf9 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 21:22:07 -0400
Subject: [PATCH 06/16] Always disable the Chromium sandbox for the prerender
 renderer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prerender renderer only ever renders the realm's own trusted cards
(never untrusted web content) and always runs in a container where the
sandbox must be off for Chrome to launch — and for the renderer to write
the V8 `--prof` diagnostic log to /tmp. Drop the CI /
PUPPETEER_DISABLE_SANDBOX gate and pass --no-sandbox unconditionally, so it
never silently depends on an env var being set.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/realm-server/prerender/browser-manager.ts | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts
index a96c271833..cbabf02bd9 100644
--- a/packages/realm-server/prerender/browser-manager.ts
+++ b/packages/realm-server/prerender/browser-manager.ts
@@ -29,12 +29,13 @@ export class BrowserManager {
     await this.cleanupUserDataDirs();
 
     let launchArgs: string[] = [];
-    let disableSandbox =
-      process.env.CI === 'true' ||
-      process.env.PUPPETEER_DISABLE_SANDBOX === 'true';
-    if (disableSandbox) {
-      launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
-    }
+    // Always disable the Chromium sandbox for the prerender renderer. It
+    // only ever renders the realm's own trusted cards (never untrusted web
+    // content) and always runs in a container where the sandbox must be off
+    // for Chrome to launch — and for the renderer to write the V8 `--prof`
+    // diagnostic log. Unconditional rather than gated on CI /
+    // PUPPETEER_DISABLE_SANDBOX so it never silently depends on the env.
+    launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
 
     // When the realm-server speaks HTTPS (local dev with a mkcert leaf
     // cert), Chromium needs to be told to accept it. mkcert's root CA

From 834d848a7b3a3cfb94e15457049235f31533decb Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Mon, 15 Jun 2026 22:23:39 -0400
Subject: [PATCH 07/16] Make the --prof reader report why it has no frames
 (never silent null)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v8ProfTopFrames came back absent on the bxl wedge with no reason, because
the reader's catch swallowed the error and returned null. That left the
live-vs-exit question (is the on-disk --prof log usable while the renderer
is still running, or only after it closes?) unanswerable.

Surface it: report the picked log basename + size in every path; on a
prof-process failure return a descriptive string (incl. a "killed: timed
out" marker) instead of null; on any reader error return the message rather
than null; raise the stdout cap to 128MB and the time-box to 40s. Now the
timeout line always shows either the frames + log size, or the exact
reason + size — which decides whether the log is written live or is
empty/incomplete mid-run.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/realm-server/prerender/v8-prof.ts | 29 +++++++++++++++-------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index 3cba7c38cf..73b34322bc 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -79,7 +79,7 @@ export async function prepareV8ProfForLaunch(): Promise<void> {
 // armed. Time-boxed — the log can be large, and the pegged render's
 // samples dominate it, so the top self-time frames name the wedge.
 export async function processV8ProfTopFrames(
-  budgetMs = 20000,
+  budgetMs = 40000,
 ): Promise<string | null> {
   if (!v8ProfEnabled()) {
     return null;
@@ -124,11 +124,21 @@ export async function processV8ProfTopFrames(
     }
     fromThisRun.sort((a, b) => b.size - a.size);
     let logPath = fromThisRun[0].full;
-    let { stdout } = await execFileAsync(
-      process.execPath,
-      ['--prof-process', logPath],
-      { timeout: budgetMs, maxBuffer: 32 * 1024 * 1024 },
-    );
+    let sizeMB = (fromThisRun[0].size / (1024 * 1024)).toFixed(1);
+    let stdout: string;
+    try {
+      ({ stdout } = await execFileAsync(
+        process.execPath,
+        ['--prof-process', logPath],
+        { timeout: budgetMs, maxBuffer: 128 * 1024 * 1024 },
+      ));
+    } catch (e) {
+      // Report the reason + size rather than swallowing it: a large
+      // accumulated log (every render since browser launch) can blow the
+      // time-box or the stdout cap, and we need to SEE that, not get null.
+      let err = e as { code?: string; killed?: boolean; message?: string };
+      return `<prof-process failed on ${path.basename(logPath)} (${sizeMB}MB)${err.killed ? ' [killed: timed out]' : ''}: ${err.code ?? ''} ${String(err.message ?? e).slice(0, 140)}>`;
+    }
     let lines = stdout.split('\n');
     // The bottom-up "[Summary]" section lists self-time by category; the
     // top JS/C++ entries that follow name the hot function.
@@ -137,9 +147,10 @@ export async function processV8ProfTopFrames(
       summaryIdx >= 0
         ? lines.slice(summaryIdx, summaryIdx + 60)
         : lines.slice(0, 60);
-    return slice.join('\n').trim() || '<empty prof summary>';
+    let summary = slice.join('\n').trim();
+    return `[${path.basename(logPath)} ${sizeMB}MB]\n${summary || '<empty prof summary>'}`;
   } catch (e) {
-    log.debug('v8 --prof processing failed:', e);
-    return null;
+    // Never silently null — surface the reason on the timeout line.
+    return `<v8 --prof reader error: ${String((e as { message?: string }).message ?? e).slice(0, 160)}>`;
   }
 }

From 0b06230530770226f9dd0da6cfca28e19f01b344 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 10:09:25 -0400
Subject: [PATCH 08/16] Ship the wedged renderer's V8 --prof log to S3 instead
 of parsing in-container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A hard synchronous CPU peg starves CDP (Debugger.enable / Profiler.enable
time out), so V8 `--prof` — kernel-SIGPROF sampled, written by a separate
thread to a file — is the only capture that survives it. But that file
accumulates every render on the isolate since browser launch (tens of MB),
and `node --prof-process` on it blows the render-timeout budget — which is
why the in-container summarizer kept returning nothing.

Stop parsing in-container. Add a `v8log` artifact kind and, on the render
timeout (when armed), stream the pegged isolate's raw `--prof` log to the
prerender S3 artifacts bucket via the existing sink — keyed by realm/card/
job so the wedging task's log is self-identifying across the fleet (no
shelling into 4 servers to find it). Symbolize it offline with
`node --prof-process`, where there's no deadline and the peg dominates the
top self-time frames. The timeout log line reports the upload + key.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../realm-server/prerender/artifact-sink.ts   |   9 +-
 packages/realm-server/prerender/utils.ts      |  17 +-
 packages/realm-server/prerender/v8-prof.ts    | 176 +++++++++---------
 3 files changed, 105 insertions(+), 97 deletions(-)

diff --git a/packages/realm-server/prerender/artifact-sink.ts b/packages/realm-server/prerender/artifact-sink.ts
index c2133cca50..073638b22c 100644
--- a/packages/realm-server/prerender/artifact-sink.ts
+++ b/packages/realm-server/prerender/artifact-sink.ts
@@ -52,12 +52,19 @@ const DEFAULT_REGION = 'us-east-1';
 // usual tools recognise: `.cpuprofile` (Chrome DevTools / speedscope),
 // `.trace.json` (Chrome tracing / Perfetto), `.heapprofile` (DevTools
 // allocation-sampling view).
-export type ArtifactKind = 'cpuprofile' | 'trace' | 'heap';
+export type ArtifactKind = 'cpuprofile' | 'trace' | 'heap' | 'v8log';
 
 const SUFFIX_BY_KIND: Record<ArtifactKind, string> = {
   cpuprofile: 'cpuprofile',
   trace: 'trace.json',
   heap: 'heapprofile',
+  // Raw V8 `--prof` tick log (the renderer's `isolate-…-prerender-v8-prof`
+  // file), uploaded as-is and symbolized offline with `node --prof-process`.
+  // This is the one capture that survives a hard synchronous CPU peg: the
+  // kernel SIGPROF sampler writes it from a separate thread, so it lands even
+  // when the main thread is too pegged to service CDP — but it's too large to
+  // `--prof-process` inside the render-timeout budget, so we ship the bytes.
+  v8log: 'v8log',
 };
 
 // The render-identifying fields that key an artifact. All but `kind` are
diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts
index c5454c9f7b..77b5ff543a 100644
--- a/packages/realm-server/prerender/utils.ts
+++ b/packages/realm-server/prerender/utils.ts
@@ -22,7 +22,7 @@ import {
   formatPausedStack,
   type PausedStackCapture,
 } from './pause-capture.ts';
-import { v8ProfEnabled, processV8ProfTopFrames } from './v8-prof.ts';
+import { v8ProfEnabled, uploadV8ProfLog } from './v8-prof.ts';
 import {
   ensureHeapSampling,
   captureHeapSamplingProfile,
@@ -1705,13 +1705,14 @@ export async function withTimeout<T>(
     if (!page.isClosed()) {
       pausedStack = await capturePausedCallStack(page, { budgetMs: 8000 });
     }
-    // Native-peg fallback (armed only when `PRERENDER_V8_PROF=true`):
-    // summarize the renderer's kernel-sampled V8 `--prof` log. Worth
-    // reading mainly when `pausedStack` came back `pause-timeout` — i.e.
-    // the peg is a non-yielding native call the debugger couldn't enter.
-    let v8ProfTopFrames: string | null = null;
+    // Hard-peg capture (armed only when `PRERENDER_V8_PROF=true`): ship the
+    // pegged isolate's raw kernel-sampled V8 `--prof` log to the artifact
+    // bucket, keyed by this render's realm/card/job, to be symbolized offline.
+    // This is the capture that survives a peg so hard it starves CDP — i.e.
+    // exactly when `pausedStack` comes back `debugger-enable-timeout`.
+    let v8ProfStatus: string | null = null;
     if (v8ProfEnabled()) {
-      v8ProfTopFrames = await processV8ProfTopFrames();
+      v8ProfStatus = await uploadV8ProfLog(artifactKeyPartsFor(profileContext));
     }
     // Out-of-band CDP view of fetches still outstanding — survives a
     // wedged JS thread, so it's available even when the in-page hooks
@@ -1824,7 +1825,7 @@ export async function withTimeout<T>(
         (formatPausedStack(pausedStack)
           ? ` pausedStack: ${formatPausedStack(pausedStack)}`
           : '') +
-        (v8ProfTopFrames ? `\nv8ProfTopFrames:\n${v8ProfTopFrames}` : '') +
+        (v8ProfStatus ? ` v8ProfLog: ${v8ProfStatus}` : '') +
         ` DOM:\n${dom?.trim()}`,
     );
     // Diagnostics ride on the outer `RenderError.diagnostics` as a
diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index 73b34322bc..9e49c263fc 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -1,31 +1,29 @@
 // Diagnostic V8 `--prof` (kernel-signal CPU sampling) for the prerender
-// renderer. Off by default (`PRERENDER_V8_PROF=true` to arm).
+// renderer — the one capture that survives a hard synchronous CPU peg.
 //
-// Why this exists alongside `Debugger.pause` (pause-capture.ts):
+// When a render pegs the main thread hard, CDP can't read it: `Debugger.enable`
+// / `Profiler.enable` time out because the pegged thread can't service the
+// protocol (see pause-capture.ts / trace-capture.ts). V8's `--prof` sampler is
+// driven by the kernel SIGPROF timer and written by a separate thread, so it
+// records the spinning frame regardless — straight to a file on disk.
 //
-//   `Debugger.pause` reads a synchronous JS loop (V8 honors the pause at a
-//   back-edge) with zero continuous overhead. But if the wedge is one long
-//   NON-YIELDING native call there is no back-edge, and pause times out.
-//   `--prof` covers that gap: the kernel SIGPROF timer preempts the thread
-//   mid-instruction regardless of what it's running (JS or native), so the
-//   busiest frame wins the samples — exactly the case `Debugger.pause`
-//   can't reach. It also never needs the pegged thread to service a CDP
-//   `stop` (the failure mode of the CDP `Profiler`; see trace-capture.ts).
+// The catch: that file accumulates every render on the isolate since browser
+// launch (tens of MB), so `node --prof-process` on it blows the render-timeout
+// budget. So we don't parse it in-container. On the render timeout we ship the
+// RAW log to the prerender S3 artifacts bucket, keyed by realm/card/job, and
+// symbolize it offline (`node --prof-process`) where there's no deadline and
+// the peg dominates the top self-time frames.
 //
-//   The trade-off, and why it's gated off: it samples EVERY render, so its
-//   own interrupts can perturb a timing-sensitive wedge enough to dissolve
-//   it. Run it only when `Debugger.pause` reports `pause-timeout` (a
-//   suspected native peg).
+// Off by default (`PRERENDER_V8_PROF=true` to arm) — it samples every render,
+// so it can perturb a timing-sensitive wedge; arm it deliberately.
 
 import { logger } from '@cardstack/runtime-common';
-import { promises as fs } from 'fs';
+import { promises as fs, createReadStream } from 'fs';
 import { tmpdir } from 'os';
 import path from 'path';
-import { execFile } from 'child_process';
-import { promisify } from 'util';
+import { uploadArtifact, type ArtifactKeyParts } from './artifact-sink.ts';
 
 const log = logger('prerenderer');
-const execFileAsync = promisify(execFile);
 
 export const V8_PROF_LOG_DIR = tmpdir();
 export const V8_PROF_LOG_PREFIX = 'prerender-v8-prof-';
@@ -74,83 +72,85 @@ export async function prepareV8ProfForLaunch(): Promise<void> {
   }
 }
 
-// Best-effort summary of the hottest frames from the renderer's `--prof`
-// log via `node --prof-process`. Called on the timeout path only when
-// armed. Time-boxed — the log can be large, and the pegged render's
-// samples dominate it, so the top self-time frames name the wedge.
-export async function processV8ProfTopFrames(
-  budgetMs = 40000,
+// Find the pegged isolate's `--prof` log among this browser run's logs.
+// Returns the chosen file (or a self-diagnosing reason string).
+async function pickV8ProfLog(): Promise<
+  { full: string; sizeMB: string } | { reason: string }
+> {
+  let entries = await fs.readdir(V8_PROF_LOG_DIR);
+  // `includes`, not `startsWith`: V8's per-isolate logging prepends
+  // `isolate-<addr>-` to the --logfile name, so the file is e.g.
+  // `isolate-0x…-prerender-v8-prof-<pid>.log`.
+  let logs = entries.filter(
+    (e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'),
+  );
+  if (logs.length === 0) {
+    let present = entries.filter((e) => e.endsWith('.log')).slice(0, 12);
+    return {
+      reason: `<no v8 --prof log found; .log present: ${present.join(', ') || 'none'}>`,
+    };
+  }
+  let withStats = await Promise.all(
+    logs.map(async (name) => {
+      let full = path.join(V8_PROF_LOG_DIR, name);
+      try {
+        let st = await fs.stat(full);
+        return { full, mtimeMs: st.mtimeMs, size: st.size };
+      } catch {
+        return { full, mtimeMs: 0, size: 0 };
+      }
+    }),
+  );
+  // This browser run's logs only (stale ones were cleared + the launch time
+  // stamped). Pick the LARGEST: the pegged isolate accumulates by far the
+  // most samples over the peg, so its log dwarfs sibling isolates' and
+  // earlier renders' — and the peg dominates its frames regardless of what
+  // else it rendered.
+  let fromThisRun = withStats.filter(
+    (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt,
+  );
+  if (fromThisRun.length === 0) {
+    let seen = withStats
+      .map((s) => `${path.basename(s.full)}(${s.size}b)`)
+      .slice(0, 8);
+    return {
+      reason: `<no v8 --prof log from this run; seen: ${seen.join(', ')}>`,
+    };
+  }
+  fromThisRun.sort((a, b) => b.size - a.size);
+  return {
+    full: fromThisRun[0].full,
+    sizeMB: (fromThisRun[0].size / (1024 * 1024)).toFixed(1),
+  };
+}
+
+// On the render timeout (when armed), ship the pegged isolate's RAW `--prof`
+// log to the artifact sink, keyed by realm/card/job so the wedging task's log
+// is self-identifying across the fleet. We don't `--prof-process` it here: the
+// accumulated log is too large to symbolize inside the timeout budget, and the
+// sink streams it (managed multipart) so memory stays bounded. Symbolize the
+// artifact offline with `node --prof-process`. Returns a short status for the
+// timeout log line.
+export async function uploadV8ProfLog(
+  keyParts: Omit<ArtifactKeyParts, 'kind'>,
 ): Promise<string | null> {
   if (!v8ProfEnabled()) {
     return null;
   }
   try {
-    let entries = await fs.readdir(V8_PROF_LOG_DIR);
-    // `includes`, not `startsWith`: V8's per-isolate logging prepends
-    // `isolate-<addr>-` to the --logfile name, so the file is e.g.
-    // `isolate-0x…-prerender-v8-prof-<pid>.log`.
-    let logs = entries.filter(
-      (e) => e.includes(V8_PROF_LOG_PREFIX) && e.endsWith('.log'),
-    );
-    if (logs.length === 0) {
-      // Name what IS present, so a still-missed pattern is self-diagnosing.
-      let present = entries.filter((e) => e.endsWith('.log')).slice(0, 12);
-      return `<no v8 --prof log found; .log present: ${present.join(', ') || 'none'}>`;
-    }
-    let withStats = await Promise.all(
-      logs.map(async (name) => {
-        let full = path.join(V8_PROF_LOG_DIR, name);
-        try {
-          let st = await fs.stat(full);
-          return { full, mtimeMs: st.mtimeMs, size: st.size };
-        } catch {
-          return { full, mtimeMs: 0, size: 0 };
-        }
-      }),
-    );
-    // Only this browser run's logs (stale prior-run logs were cleared +
-    // the launch time stamped). Among those pick the LARGEST: the pegged
-    // isolate accumulates by far the most samples over the 60s peg, so its
-    // log dwarfs sibling isolates' and earlier renders' — and the peg's
-    // samples dominate its top frames regardless of what else it rendered.
-    let fromThisRun = withStats.filter(
-      (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt,
-    );
-    if (fromThisRun.length === 0) {
-      let seen = withStats
-        .map((s) => `${path.basename(s.full)}(${s.size}b)`)
-        .slice(0, 8);
-      return `<no v8 --prof log from this run; seen: ${seen.join(', ')}>`;
-    }
-    fromThisRun.sort((a, b) => b.size - a.size);
-    let logPath = fromThisRun[0].full;
-    let sizeMB = (fromThisRun[0].size / (1024 * 1024)).toFixed(1);
-    let stdout: string;
-    try {
-      ({ stdout } = await execFileAsync(
-        process.execPath,
-        ['--prof-process', logPath],
-        { timeout: budgetMs, maxBuffer: 128 * 1024 * 1024 },
-      ));
-    } catch (e) {
-      // Report the reason + size rather than swallowing it: a large
-      // accumulated log (every render since browser launch) can blow the
-      // time-box or the stdout cap, and we need to SEE that, not get null.
-      let err = e as { code?: string; killed?: boolean; message?: string };
-      return `<prof-process failed on ${path.basename(logPath)} (${sizeMB}MB)${err.killed ? ' [killed: timed out]' : ''}: ${err.code ?? ''} ${String(err.message ?? e).slice(0, 140)}>`;
+    let picked = await pickV8ProfLog();
+    if ('reason' in picked) {
+      return picked.reason;
     }
-    let lines = stdout.split('\n');
-    // The bottom-up "[Summary]" section lists self-time by category; the
-    // top JS/C++ entries that follow name the hot function.
-    let summaryIdx = lines.findIndex((l) => /\[Summary\]/.test(l));
-    let slice =
-      summaryIdx >= 0
-        ? lines.slice(summaryIdx, summaryIdx + 60)
-        : lines.slice(0, 60);
-    let summary = slice.join('\n').trim();
-    return `[${path.basename(logPath)} ${sizeMB}MB]\n${summary || '<empty prof summary>'}`;
+    await uploadArtifact({
+      ...keyParts,
+      kind: 'v8log',
+      body: createReadStream(picked.full),
+      contentType: 'text/plain',
+    });
+    return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket — symbolize offline with \`node --prof-process\``;
   } catch (e) {
     // Never silently null — surface the reason on the timeout line.
-    return `<v8 --prof reader error: ${String((e as { message?: string }).message ?? e).slice(0, 160)}>`;
+    return `<v8 --prof upload error: ${String((e as { message?: string }).message ?? e).slice(0, 160)}>`;
   }
 }

From 6df42db0f627740c4be0e407fcc2c5299001ffd2 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 10:12:00 -0400
Subject: [PATCH 09/16] indexing-diagnostics: document the v8log S3-artifact +
 offline symbolization

Mode I and the Mode H knob table now describe the real flow: on a render
timeout the raw V8 --prof log is streamed to the prerender artifacts bucket
as a `v8log` (the timeout line reports the upload + key), and is symbolized
OFFLINE with `node --prof-process` (the log self-contains its code-creation
map, so JS frames resolve without binaries/sourcemaps). Replaces the old
"v8ProfTopFrames on the log line" in-container-parse model that blew the
timeout budget on a large log.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/skills/indexing-diagnostics/SKILL.md | 39 +++++++++++++-------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md
index 4dcdbb3758..1379aa6417 100644
--- a/.claude/skills/indexing-diagnostics/SKILL.md
+++ b/.claude/skills/indexing-diagnostics/SKILL.md
@@ -881,14 +881,14 @@ Rule of thumb: a render that **completes but is heavy** → `.cpuprofile` (+ hea
 
 All live at `/<env>/boxel/<NAME>` (Systems Manager → Parameter Store). The bucket itself (`PRERENDER_ARTIFACTS_BUCKET`) and the key prefix (`PRERENDER_ARTIFACTS_ENV`) are wired by Terraform — don't set them by hand.
 
-| Parameter                             | Values                                                                             | Effect                                                                                                                                                                                                                                                                                                                                                                                |
-| ------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `PRERENDER_PROFILE_AFFINITY`          | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert.                                                                                                                                                                                                                                      |
-| `PRERENDER_PROFILE_CPUPROFILE`        | `true` / `false`                                                                   | Persist the full `.cpuprofile` for targeted renders.                                                                                                                                                                                                                                                                                                                                  |
-| `PRERENDER_PROFILE_TRACE`             | `true` / `false`                                                                   | Capture the streaming trace for targeted renders.                                                                                                                                                                                                                                                                                                                                     |
-| `PRERENDER_PROFILE_HEAP`              | `true` / `false`                                                                   | Capture the heap allocation-sampling profile for targeted renders.                                                                                                                                                                                                                                                                                                                    |
-| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default                                           | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated).                                                                                                                                                                                                            |
-| `PRERENDER_V8_PROF`                   | `true` / `false`                                                                   | Arm V8's `--prof` kernel-`SIGPROF` CPU sampler at Chrome launch — **renderer-wide, NOT affinity-gated**. The fallback for a hard CPU peg that starves the CDP captures above (kernel preemption needs no thread cooperation). Surfaces as `v8ProfTopFrames` on the render-timeout line; see **Mode I**. Needs a prerender-server task **restart** (not just redeploy) to take effect. |
+| Parameter                             | Values                                                                             | Effect                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| ------------------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `PRERENDER_PROFILE_AFFINITY`          | comma-separated affinity keys, e.g. `realm:https://realms.cardstack.com/team/foo/` | **Required to target.** Only renders whose affinity key exactly matches are profiled at all (Tier 1 + Tier 2). Empty / `off` → everything inert.                                                                                                                                                                                                                                                                                                                                                            |
+| `PRERENDER_PROFILE_CPUPROFILE`        | `true` / `false`                                                                   | Persist the full `.cpuprofile` for targeted renders.                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| `PRERENDER_PROFILE_TRACE`             | `true` / `false`                                                                   | Capture the streaming trace for targeted renders.                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| `PRERENDER_PROFILE_HEAP`              | `true` / `false`                                                                   | Capture the heap allocation-sampling profile for targeted renders.                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| `PRERENDER_PROFILE_MAX_SESSION_BYTES` | positive integer, or `0` for the default                                           | Soft per-process byte budget across all artifacts. `0`/unset → 5 GiB. Once spent, the task declines further uploads (in-flight ones finish, so blobs are never truncated).                                                                                                                                                                                                                                                                                                                                  |
+| `PRERENDER_V8_PROF`                   | `true` / `false`                                                                   | Arm V8's `--prof` kernel-`SIGPROF` CPU sampler at Chrome launch — **renderer-wide, NOT affinity-gated**. The capture that survives a hard CPU peg that starves the CDP captures above (kernel preemption needs no thread cooperation). On a render timeout the raw log is streamed to the artifacts bucket as a `v8log` (symbolized offline with `node --prof-process`); the timeout line reports the upload. See **Mode I**. Needs a prerender-server task **restart** (not just redeploy) to take effect. |
 
 The mode flags are Terraform-seeded sentinels (default `false` / `0`); `PRERENDER_PROFILE_AFFINITY` is operator-managed and must already exist. The affinity key is `realm:` + the realm's canonical URL **with trailing slash** — the same value Mode A/B logs print as `affinity=…`.
 
@@ -948,17 +948,30 @@ A single CDP `Debugger.pause` on the timeout path. V8 honors the pause at the ne
 - **`<pause-timeout>`** → the pause was requested but no back-edge honored it in budget. Most likely a long **non-yielding native call** (a catastrophic regex, a native sort) — no JS back-edge to interrupt. Pivot to `--prof`.
 - **`<debugger-enable-timeout>`** → the thread is so pegged it couldn't even service `Debugger.enable`. The hardest peg; CDP can't read it at all. Pivot to `--prof`.
 
-### `v8ProfTopFrames` — V8 `--prof` kernel sampler (gated; reads even a hard peg)
+### V8 `--prof` log → S3 artifact (gated; the only capture that survives a hard peg)
 
-When `pausedStack` is starved, the kernel-signal sampler is the fallback. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and the pre-installed handler walks the stack. It samples even a non-yielding native peg, and it never needs the thread to serialize a profile at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated; the top self-time frames surface as `v8ProfTopFrames:` (processed in-process via `node --prof-process`).
+When `pausedStack` is starved, the kernel-signal sampler is the capture that still works. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and a **separate thread** writes the samples to a file. So it records the spinning frame even when CDP is dead, and never needs the thread to serialize at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated.
+
+It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/<ts>.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). The timeout log line just reports the upload: `v8ProfLog: uploaded … (NMB) to artifact bucket …`, or a self-diagnosing reason (`<no v8 --prof log from this run; seen: …>`) if none was found.
 
 Arm it:
 
 1. Set SSM `/<env>/boxel/PRERENDER_V8_PROF` = `true`.
-2. **Restart** the prerender-server task — the renderer must relaunch to pick up the `--js-flags=--prof` launch flag (a value flip alone, or a redeploy that reuses the running browser, won't arm it). `aws ecs update-service … --force-new-deployment` restarts the tasks.
-3. Re-run the index; read `v8ProfTopFrames:` on the wedged card's timeout line.
+2. **Restart** the prerender-server tasks — the renderer must relaunch to pick up the `--js-flags=--prof` launch flag (a value flip alone, or a redeploy reusing the running browser, won't arm it): `aws ecs update-service --cluster <env> --service boxel-prerender-server-<env> --force-new-deployment`.
+3. Re-run the index. On a wedge, grep the prerender-server log for `v8ProfLog: uploaded` — and pull the artifact (next section).
 4. Set it back to `false` + restart when done — it samples **every** render while on.
 
+**Symbolize offline** (the `boxel-claude-readonly` role has `s3:GetObject` on `boxel-prerender-artifacts-*`):
+
+```sh
+aws s3 ls --recursive s3://boxel-prerender-artifacts-<env>/<env>/<realm>/   # find the .v8log key
+aws s3 cp s3://boxel-prerender-artifacts-<env>/<key> /tmp/wedge.v8log
+node --prof-process /tmp/wedge.v8log \
+  | sed -n '/\[Summary\]/,+14p; /\[JavaScript\]/,+30p; /\[Bottom up (heavy) profile\]/,+50p'
+```
+
+The peg dominates the cumulative log (a 60s spin at 1 kHz is ~60k ticks; every other render on the tab is sub-second), so the top self-time frame — and the heaviest `[Bottom up]` path — is the wedge.
+
 **Masking note.** `--prof` samples continuously, but for a _synchronous_ peg that doesn't matter: a sync loop's control flow doesn't depend on timing, so sampling records _where_ it spins without changing _that_ it spins. The heavier CDP profiler / trace masking that's bitten before is the risk for _timing-sensitive async_ work — so confirm `mainThreadResponsive=false` before reaching for it.
 
 ### Reading it
@@ -975,7 +988,7 @@ aws --profile claude-<env> logs filter-log-events \
   --query 'events[*].message' --output text
 ```
 
-Lines are FireLens-wrapped (`{"log":"…"}`); pull `.log`. Strip everything after ` DOM:` — the `pausedStack:` / `v8ProfTopFrames:` / discriminators are all before it.
+Lines are FireLens-wrapped (`{"log":"…"}`); pull `.log`. Strip everything after ` DOM:` — the `pausedStack:` / `v8ProfLog:` / discriminators are all before it. The `v8ProfLog:` line carries the S3 key of the uploaded `--prof` log; fetch + symbolize it per the previous section.
 
 ### What this mode can't tell you
 

From addb9499f59a22154d9a22d3b2ecd474f9cd96b4 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 10:19:41 -0400
Subject: [PATCH 10/16] Add a one-command helper to fetch + symbolize a
 prerender wedge's v8log
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

symbolize-prerender-wedge.sh resolves the newest `.v8log` artifact for a
realm in the prerender artifacts bucket (or an exact --key), downloads it,
and runs `node --prof-process` to print the [Summary]/[JavaScript]/[Bottom
up] sections — the push-button version of the three manual aws+node steps.
The log self-contains its code-creation map, so JS frames resolve offline
without binaries or source maps; native/Chrome frames stay opaque.

Mode I now points at the helper as the primary path and keeps the raw
commands as the under-the-hood reference.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/skills/indexing-diagnostics/SKILL.md  | 13 ++-
 .../scripts/symbolize-prerender-wedge.sh      | 79 +++++++++++++++++++
 2 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100755 packages/realm-server/scripts/symbolize-prerender-wedge.sh

diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md
index 1379aa6417..177ff44742 100644
--- a/.claude/skills/indexing-diagnostics/SKILL.md
+++ b/.claude/skills/indexing-diagnostics/SKILL.md
@@ -961,16 +961,23 @@ Arm it:
 3. Re-run the index. On a wedge, grep the prerender-server log for `v8ProfLog: uploaded` — and pull the artifact (next section).
 4. Set it back to `false` + restart when done — it samples **every** render while on.
 
-**Symbolize offline** (the `boxel-claude-readonly` role has `s3:GetObject` on `boxel-prerender-artifacts-*`):
+**Symbolize offline.** With an aws-access session for the env (`boxel-claude-readonly` has `s3:GetObject`/`ListBucket` on `boxel-prerender-artifacts-*`), the helper fetches the newest matching `.v8log` and runs `node --prof-process` for you:
 
 ```sh
-aws s3 ls --recursive s3://boxel-prerender-artifacts-<env>/<env>/<realm>/   # find the .v8log key
+packages/realm-server/scripts/symbolize-prerender-wedge.sh --env staging --realm bxl-dependency-order-test
+# --key <exact-s3-key> for a specific artifact; --list to just enumerate candidates; --top N for deeper sections
+```
+
+It self-documents why this is offline rather than in-container. Under the hood it's just:
+
+```sh
+aws s3api list-objects-v2 --bucket boxel-prerender-artifacts-<env> --prefix <env>/ ...   # newest .v8log for the realm
 aws s3 cp s3://boxel-prerender-artifacts-<env>/<key> /tmp/wedge.v8log
 node --prof-process /tmp/wedge.v8log \
   | sed -n '/\[Summary\]/,+14p; /\[JavaScript\]/,+30p; /\[Bottom up (heavy) profile\]/,+50p'
 ```
 
-The peg dominates the cumulative log (a 60s spin at 1 kHz is ~60k ticks; every other render on the tab is sub-second), so the top self-time frame — and the heaviest `[Bottom up]` path — is the wedge.
+The log self-contains its `code-creation` records, so `--prof-process` names the JS frames from the file alone — no binaries or source maps. Native/Chrome frames stay opaque (that'd need Chrome debug symbols); the JS layer is what names the wedge. The peg dominates the cumulative log (a 60s spin at 1 kHz is ~60k ticks; every other render on the tab is sub-second), so the top self-time frame — and the heaviest `[Bottom up]` path — is the wedge.
 
 **Masking note.** `--prof` samples continuously, but for a _synchronous_ peg that doesn't matter: a sync loop's control flow doesn't depend on timing, so sampling records _where_ it spins without changing _that_ it spins. The heavier CDP profiler / trace masking that's bitten before is the risk for _timing-sensitive async_ work — so confirm `mainThreadResponsive=false` before reaching for it.
 
diff --git a/packages/realm-server/scripts/symbolize-prerender-wedge.sh b/packages/realm-server/scripts/symbolize-prerender-wedge.sh
new file mode 100755
index 0000000000..3c9977cc67
--- /dev/null
+++ b/packages/realm-server/scripts/symbolize-prerender-wedge.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+#
+# Symbolize a prerender wedge from its V8 `--prof` artifact.
+#
+# When a render times out with PRERENDER_V8_PROF=true, the prerender server
+# ships the pegged isolate's raw V8 `--prof` log to the prerender artifacts
+# bucket as a `.v8log` object, keyed env/realm/jobId/card/step/<ts>.v8log (see
+# packages/realm-server/prerender/v8-prof.ts and the indexing-diagnostics
+# skill, Mode I). The raw log is too large to `--prof-process` in-container
+# under the render-timeout budget; this script fetches it and symbolizes it
+# offline, where there's no deadline.
+#
+# The log self-contains its `code-creation` records, so `node --prof-process`
+# resolves the JS frames from the file alone — no binaries or source maps.
+# Native/Chrome frames stay opaque (that needs Chrome debug symbols); the JS
+# layer is what names the wedge. The 60s peg dominates the cumulative log, so
+# the top [JavaScript] self-time frame and heaviest [Bottom up] path are it.
+#
+# Requires an aws-access session for the env (`mise run claude-aws <env> <mfa>`)
+# — the boxel-claude-readonly role has s3:GetObject/ListBucket on
+# boxel-prerender-artifacts-*. And `node` on PATH.
+#
+# Usage:
+#   symbolize-prerender-wedge.sh --realm bxl-dependency-order-test
+#   symbolize-prerender-wedge.sh --env staging --realm ctse/foo --top 80
+#   symbolize-prerender-wedge.sh --key staging/<realm>/<job>/<card>/<step>/<ts>.v8log
+#   symbolize-prerender-wedge.sh --realm bxl --list        # just list candidates
+set -euo pipefail
+
+ENV=staging
+REALM=
+PROFILE=
+KEY=
+TOP=50
+LIST_ONLY=
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --env) ENV="$2"; shift 2 ;;
+    --realm) REALM="$2"; shift 2 ;;
+    --profile) PROFILE="$2"; shift 2 ;;
+    --key) KEY="$2"; shift 2 ;;
+    --top) TOP="$2"; shift 2 ;;
+    --list) LIST_ONLY=1; shift ;;
+    -h | --help) sed -n '2,30p' "$0"; exit 0 ;;
+    *) echo "unknown arg: $1" >&2; exit 2 ;;
+  esac
+done
+
+PROFILE="${PROFILE:-claude-$ENV}"
+BUCKET="boxel-prerender-artifacts-$ENV"
+AWS=(aws --profile "$PROFILE" --region "${AWS_REGION:-us-east-1}")
+
+command -v node >/dev/null || { echo "node not found on PATH" >&2; exit 127; }
+
+# Resolve the artifact key: newest `.v8log` whose key matches the realm
+# substring (artifacts auto-expire after 14 days, so the listing stays small).
+if [[ -z "$KEY" ]]; then
+  [[ -n "$REALM" ]] || { echo "need --realm <substring> or --key <s3-key>" >&2; exit 2; }
+  echo "Listing .v8log artifacts in s3://$BUCKET (realm ~ '$REALM') ..." >&2
+  mapfile -t MATCHES < <(
+    "${AWS[@]}" s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ENV/" \
+      --query "reverse(sort_by(Contents[?ends_with(Key, '.v8log')], &LastModified))[].Key" \
+      --output text 2>/dev/null | tr '\t' '\n' | grep -i -- "$REALM" || true
+  )
+  [[ ${#MATCHES[@]} -gt 0 ]] || { echo "no .v8log artifact for realm '$REALM' in $BUCKET" >&2; exit 1; }
+  if [[ -n "$LIST_ONLY" ]]; then printf '%s\n' "${MATCHES[@]}"; exit 0; fi
+  KEY="${MATCHES[0]}"
+fi
+
+echo "Artifact: s3://$BUCKET/$KEY" >&2
+TMP="$(mktemp "${TMPDIR:-/tmp}/wedge-XXXXXX.v8log")"
+trap 'rm -f "$TMP"' EXIT
+"${AWS[@]}" s3 cp "s3://$BUCKET/$KEY" "$TMP" >&2
+echo "Downloaded $(( $(wc -c < "$TMP") / 1024 / 1024 ))MB; running node --prof-process ..." >&2
+echo >&2
+
+node --prof-process "$TMP" 2>/dev/null | sed -n \
+  "/\[Summary\]/,+16p; /\[JavaScript\]/,+${TOP}p; /\[Bottom up (heavy) profile\]/,+${TOP}p"

From d996c0e3be95efcb5d4d7b1295d5caf13eaeeca0 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 11:24:22 -0400
Subject: [PATCH 11/16] Delete the V8 --prof log from the container after a
 durable S3 upload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The raw --prof log runs tens of MB and `--logfile` accumulates them in the
container's temp dir across the browser's long life. Now that the wedged
page is evicted on a render timeout (its isolate tears down and the next
visit writes a fresh log), the uploaded log strands nothing — so reclaim
the disk by deleting it once the upload is durable.

uploadArtifact now returns whether the object actually landed, so the
delete is gated on a confirmed upload: a disabled/declined/failed sink
keeps the local copy rather than destroying a capture that was never
persisted. The timeout log line reports which path ran. Mode I of the
indexing-diagnostics skill is updated to match the status wording.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/skills/indexing-diagnostics/SKILL.md  |  6 ++++-
 .../realm-server/prerender/artifact-sink.ts   | 21 +++++++++------
 packages/realm-server/prerender/utils.ts      |  7 ++---
 packages/realm-server/prerender/v8-prof.ts    | 27 ++++++++++++++++---
 4 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md
index 177ff44742..6f04ec7b4d 100644
--- a/.claude/skills/indexing-diagnostics/SKILL.md
+++ b/.claude/skills/indexing-diagnostics/SKILL.md
@@ -952,7 +952,11 @@ A single CDP `Debugger.pause` on the timeout path. V8 honors the pause at the ne
 
 When `pausedStack` is starved, the kernel-signal sampler is the capture that still works. `--prof` (knob: `PRERENDER_V8_PROF`, Mode H table) arms V8's `SIGPROF`-driven sampler at Chrome **launch**: the kernel timer preempts the pegged thread mid-instruction on a schedule it can't refuse — no protocol message, no back-edge, no cooperation — and a **separate thread** writes the samples to a file. So it records the spinning frame even when CDP is dead, and never needs the thread to serialize at `stop` (the failure mode of the CDP `Profiler`). Renderer-wide, not affinity-gated.
 
-It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/<ts>.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). The timeout log line just reports the upload: `v8ProfLog: uploaded … (NMB) to artifact bucket …`, or a self-diagnosing reason (`<no v8 --prof log from this run; seen: …>`) if none was found.
+It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/<ts>.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). Once the upload is durable the server **deletes the local log** (the wedged page is evicted, so its isolate writes no more), keeping these tens-of-MB files from piling up on the container disk. The timeout log line reports the outcome:
+
+- `v8ProfLog: uploaded … (NMB) to artifact bucket (local log removed) …` — success; fetch the artifact (below).
+- `v8ProfLog: <v8 --prof log not persisted …; kept local …>` — sink disabled / budget spent / upload failed, so the local copy is **retained** (not destroyed for a capture you can't fetch).
+- `v8ProfLog: <no v8 --prof log from this run; seen: …>` — armed but no log written (e.g. the browser didn't relaunch after the flag flip).
 
 Arm it:
 
diff --git a/packages/realm-server/prerender/artifact-sink.ts b/packages/realm-server/prerender/artifact-sink.ts
index 073638b22c..d838bc218c 100644
--- a/packages/realm-server/prerender/artifact-sink.ts
+++ b/packages/realm-server/prerender/artifact-sink.ts
@@ -229,15 +229,18 @@ let sessionBytesUsed = 0;
 let uploadSeq = 0;
 let budgetExhaustedLogged = false;
 
-// Uploads one artifact. Resolves once the object is durable in S3 (so a
-// per-render `await` genuinely persists before the next render), or sooner
-// if the sink is disabled / the budget is spent / the upload fails — none
-// of which ever throw or reject. Declines (does not truncate) once the
-// session budget is reached, so it never produces an invalid blob.
-export async function uploadArtifact(upload: ArtifactUpload): Promise<void> {
+// Uploads one artifact. Resolves `true` once the object is durable in S3 (so
+// a per-render `await` genuinely persists before the next render), and `false`
+// if it didn't land — the sink is disabled, the session budget is spent, or
+// the upload failed. Never throws or rejects, and declines (does not truncate)
+// once the budget is reached, so it never produces an invalid blob. The
+// boolean lets a caller that holds the only local copy (the V8 `--prof` log)
+// gate its post-upload cleanup on a genuine success rather than destroying a
+// copy that was never persisted.
+export async function uploadArtifact(upload: ArtifactUpload): Promise<boolean> {
   let bucket = artifactBucket();
   if (!bucket) {
-    return;
+    return false;
   }
   if (sessionBytesUsed >= getMaxSessionBytes()) {
     if (!budgetExhaustedLogged) {
@@ -247,7 +250,7 @@ export async function uploadArtifact(upload: ArtifactUpload): Promise<void> {
           `declining further artifact uploads for this process`,
       );
     }
-    return;
+    return false;
   }
 
   let key = buildArtifactKey(upload, new Date(), uploadSeq++);
@@ -273,10 +276,12 @@ export async function uploadArtifact(upload: ArtifactUpload): Promise<void> {
       `artifact-sink uploaded ${upload.kind} key=${key} bytes=${loaded} ` +
         `sessionBytes=${sessionBytesUsed}/${getMaxSessionBytes()}`,
     );
+    return true;
   } catch (e) {
     // Best-effort: a failed flush is a missing diagnostic, not a render
     // failure. Count nothing against the budget.
     log.warn(`artifact-sink failed to upload ${upload.kind} key=${key}:`, e);
+    return false;
   }
 }
 
diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts
index 77b5ff543a..84a4a93f92 100644
--- a/packages/realm-server/prerender/utils.ts
+++ b/packages/realm-server/prerender/utils.ts
@@ -1544,12 +1544,13 @@ async function runWithAffinityProfile<T>(
     maxRunMs: timeoutMs,
     run: observe,
     onRawProfile: wantCpuProfile
-      ? (rawProfile) =>
-          uploadArtifact({
+      ? async (rawProfile) => {
+          await uploadArtifact({
             ...keyParts,
             kind: 'cpuprofile',
             body: Buffer.from(JSON.stringify(rawProfile), 'utf8'),
-          })
+          });
+        }
       : undefined,
   })
     .then(async (summary) => {
diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index 9e49c263fc..c3b8565e0c 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -12,7 +12,9 @@
 // budget. So we don't parse it in-container. On the render timeout we ship the
 // RAW log to the prerender S3 artifacts bucket, keyed by realm/card/job, and
 // symbolize it offline (`node --prof-process`) where there's no deadline and
-// the peg dominates the top self-time frames.
+// the peg dominates the top self-time frames. Once the upload is durable we
+// delete the local log so these tens-of-MB files don't pile up on the
+// container's disk over the browser's long life.
 //
 // Off by default (`PRERENDER_V8_PROF=true` to arm) — it samples every render,
 // so it can perturb a timing-sensitive wedge; arm it deliberately.
@@ -142,13 +144,32 @@ export async function uploadV8ProfLog(
     if ('reason' in picked) {
       return picked.reason;
     }
-    await uploadArtifact({
+    let uploaded = await uploadArtifact({
       ...keyParts,
       kind: 'v8log',
       body: createReadStream(picked.full),
       contentType: 'text/plain',
     });
-    return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket — symbolize offline with \`node --prof-process\``;
+    if (!uploaded) {
+      // The bytes did NOT land in S3 (sink disabled / budget spent / upload
+      // failed). Keep the local log — it's the only copy — rather than
+      // destroying it for a capture we can't retrieve.
+      return `<v8 --prof log not persisted to artifact bucket (sink disabled/declined/failed); kept local ${path.basename(picked.full)} (${picked.sizeMB}MB)>`;
+    }
+    // Durable in S3, so reclaim the container's local disk: these logs run
+    // tens of MB and `--logfile` accumulates them in the OS temp dir across
+    // the browser's long life. A render timeout evicts the wedged page (its
+    // isolate tears down; the next visit writes a FRESH log), so deleting
+    // this one strands nothing. `await uploadArtifact` has fully drained the
+    // read stream by now, so the unlink is safe; it frees the dirent
+    // immediately and the bytes once the evicted renderer process exits.
+    let cleanup = ' (local log removed)';
+    try {
+      await fs.rm(picked.full, { force: true });
+    } catch (e) {
+      cleanup = ` (local delete failed: ${String((e as { message?: string }).message ?? e).slice(0, 80)})`;
+    }
+    return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket${cleanup} — symbolize offline with \`node --prof-process\``;
   } catch (e) {
     // Never silently null — surface the reason on the timeout line.
     return `<v8 --prof upload error: ${String((e as { message?: string }).message ?? e).slice(0, 160)}>`;

From c6d2697538d395dc4159b4bf8bded479db3b5b78 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 11:29:08 -0400
Subject: [PATCH 12/16] Test the v8log artifact kind, the upload-success
 contract, and --prof selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the artifact-sink unit test for the `v8log` kind suffix and the new
boolean return (disabled sink → false, the contract the --prof delete gates
on). Add a v8-prof unit test covering flag gating, the launch flags, the
stale-log sweep, and how uploadV8ProfLog selects the pegged isolate's log:
largest this-run wins, isolate-prefixed names match, logs from before the
run are excluded, and a capture that can't be persisted is KEPT on disk
rather than deleted. All run standalone (no PG / Chrome / S3).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/realm-server/tests/index.ts          |   1 +
 .../tests/prerender-artifact-sink-test.ts     |  19 +-
 .../tests/prerender-v8-prof-test.ts           | 193 ++++++++++++++++++
 3 files changed, 212 insertions(+), 1 deletion(-)
 create mode 100644 packages/realm-server/tests/prerender-v8-prof-test.ts

diff --git a/packages/realm-server/tests/index.ts b/packages/realm-server/tests/index.ts
index 8f1f93aeb2..5156aafb9f 100644
--- a/packages/realm-server/tests/index.ts
+++ b/packages/realm-server/tests/index.ts
@@ -208,6 +208,7 @@ const ALL_TEST_FILES: string[] = [
   './is-json-content-type-test',
   './prerender-diagnostics-persistence-test',
   './prerender-proxy-test',
+  './prerender-v8-prof-test',
   './queue-test',
   './finalize-orphan-reservations-test',
   './finalize-child-fatal-failure-test',
diff --git a/packages/realm-server/tests/prerender-artifact-sink-test.ts b/packages/realm-server/tests/prerender-artifact-sink-test.ts
index 993ef750ed..587a83b5ee 100644
--- a/packages/realm-server/tests/prerender-artifact-sink-test.ts
+++ b/packages/realm-server/tests/prerender-artifact-sink-test.ts
@@ -7,6 +7,9 @@ import {
   shouldCaptureCpuProfile,
   shouldCaptureTrace,
   shouldCaptureHeap,
+  uploadArtifact,
+  __resetArtifactSinkSessionForTests,
+  type ArtifactKind,
 } from '../prerender/artifact-sink.ts';
 
 // The heavyweight artifact sink is gated by env (a configured bucket plus
@@ -88,11 +91,12 @@ module('prerender artifact-sink key', function (hooks) {
   });
 
   test('each artifact kind maps to its conventional file suffix', function (assert) {
-    let suffix = (kind: 'cpuprofile' | 'trace' | 'heap') =>
+    let suffix = (kind: ArtifactKind) =>
       buildArtifactKey({ kind }, now, 0).split('.').slice(1).join('.');
     assert.strictEqual(suffix('cpuprofile'), 'cpuprofile');
     assert.strictEqual(suffix('trace'), 'trace.json');
     assert.strictEqual(suffix('heap'), 'heapprofile');
+    assert.strictEqual(suffix('v8log'), 'v8log');
   });
 
   test('the seq disambiguates artifacts that share a millisecond', function (assert) {
@@ -159,6 +163,19 @@ module('prerender artifact-sink gates', function (hooks) {
       'a bucket with every mode off → nothing to capture',
     );
   });
+
+  test('uploadArtifact reports false when the sink is disabled, so a caller never destroys its only local copy', async function (assert) {
+    // The V8 --prof path deletes its local log only on a confirmed upload;
+    // this is the contract it gates on. No bucket configured → no upload
+    // attempt, no S3 — a pure, dependency-free assertion of the false return.
+    __resetArtifactSinkSessionForTests();
+    delete process.env.PRERENDER_ARTIFACTS_BUCKET;
+    let uploaded = await uploadArtifact({
+      kind: 'v8log',
+      body: Buffer.from('not going anywhere'),
+    });
+    assert.false(uploaded, 'disabled sink → false');
+  });
 });
 
 module('prerender artifact-sink session budget', function (hooks) {
diff --git a/packages/realm-server/tests/prerender-v8-prof-test.ts b/packages/realm-server/tests/prerender-v8-prof-test.ts
new file mode 100644
index 0000000000..33ff95b6e1
--- /dev/null
+++ b/packages/realm-server/tests/prerender-v8-prof-test.ts
@@ -0,0 +1,193 @@
+import { module, test } from 'qunit';
+import { promises as fs } from 'fs';
+import path from 'path';
+import {
+  v8ProfEnabled,
+  v8ProfJsFlags,
+  prepareV8ProfForLaunch,
+  uploadV8ProfLog,
+  V8_PROF_LOG_DIR,
+  V8_PROF_LOG_PREFIX,
+} from '../prerender/v8-prof.ts';
+
+// The V8 `--prof` capture is gated by env and reads/writes the OS temp dir.
+// These tests pin the dependency-free behavior — flag gating, the launch
+// flags, the stale-log sweep, and how `uploadV8ProfLog` selects the pegged
+// isolate's log and what it does when the upload can't land — without S3 or
+// Chrome. With no bucket configured the sink returns false, which drives the
+// "keep the local log" branch: that lets us assert WHICH file was selected
+// (it appears in the status string) and that a non-persisted capture is never
+// deleted. The env is read at call time, so each case sets what it needs and
+// the hooks restore the process environment and clean up fixtures afterward.
+
+const ENV_KEYS = ['PRERENDER_V8_PROF', 'PRERENDER_ARTIFACTS_BUCKET'];
+
+function snapshotEnv(): Record<string, string | undefined> {
+  let snapshot: Record<string, string | undefined> = {};
+  for (let key of ENV_KEYS) {
+    snapshot[key] = process.env[key];
+  }
+  return snapshot;
+}
+
+function restoreEnv(snapshot: Record<string, string | undefined>): void {
+  for (let key of ENV_KEYS) {
+    if (snapshot[key] === undefined) {
+      delete process.env[key];
+    } else {
+      process.env[key] = snapshot[key];
+    }
+  }
+}
+
+// A `--prof` log fixture. `isolatePrefix` reproduces V8's per-isolate
+// `isolate-<addr>-` prepend so we exercise the `includes` (not `startsWith`)
+// filename match. `ageMs` backdates the mtime to simulate a log from before
+// this browser run (stale, must be excluded from selection).
+async function writeLog(basename: string, sizeBytes: number): Promise<string> {
+  let full = path.join(V8_PROF_LOG_DIR, basename);
+  await fs.writeFile(full, Buffer.alloc(sizeBytes, 'v'));
+  return full;
+}
+
+async function backdate(full: string, ageMs: number): Promise<void> {
+  let when = new Date(Date.now() - ageMs);
+  await fs.utimes(full, when, when);
+}
+
+module('prerender v8-prof', function (hooks) {
+  let saved: Record<string, string | undefined>;
+  let created: string[];
+
+  hooks.beforeEach(async function () {
+    saved = snapshotEnv();
+    created = [];
+    // Arm, point at no bucket (sink disabled), and sweep + stamp the launch
+    // time so each case starts from a clean slate with a known "this run"
+    // boundary.
+    process.env.PRERENDER_V8_PROF = 'true';
+    delete process.env.PRERENDER_ARTIFACTS_BUCKET;
+    await prepareV8ProfForLaunch();
+  });
+
+  hooks.afterEach(async function () {
+    for (let full of created) {
+      await fs.rm(full, { force: true }).catch(() => {});
+    }
+    restoreEnv(saved);
+  });
+
+  test('v8ProfEnabled mirrors the env flag (exact "true")', function (assert) {
+    process.env.PRERENDER_V8_PROF = 'true';
+    assert.true(v8ProfEnabled());
+    process.env.PRERENDER_V8_PROF = 'false';
+    assert.false(v8ProfEnabled());
+    delete process.env.PRERENDER_V8_PROF;
+    assert.false(v8ProfEnabled());
+  });
+
+  test('v8ProfJsFlags arms --prof with a per-pid logfile under the prefix', function (assert) {
+    let flags = v8ProfJsFlags();
+    assert.true(flags.includes('--prof'), 'arms the sampler');
+    assert.true(flags.includes('--logfile='), 'directs the log to a file');
+    assert.true(flags.includes(V8_PROF_LOG_PREFIX), 'uses the known prefix');
+    assert.true(flags.includes('%p'), 'per-pid so concurrent renderers split');
+  });
+
+  test('uploadV8ProfLog is inert (null) when not armed', async function (assert) {
+    process.env.PRERENDER_V8_PROF = 'false';
+    let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
+    assert.strictEqual(status, null, 'disabled → no work, no status');
+  });
+
+  test('armed with no log present → self-diagnosing reason', async function (assert) {
+    let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
+    assert.true(
+      typeof status === 'string' && status.includes('no v8 --prof log'),
+      `reason names the absence, got: ${status}`,
+    );
+  });
+
+  test('selects the largest this-run log and, on a non-persisted upload, KEEPS it', async function (assert) {
+    let small = await writeLog(
+      `isolate-0xaaa-${V8_PROF_LOG_PREFIX}101.log`,
+      4096,
+    );
+    let big = await writeLog(
+      `isolate-0xbbb-${V8_PROF_LOG_PREFIX}102.log`,
+      64 * 1024,
+    );
+    created.push(small, big);
+
+    let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
+
+    assert.true(
+      typeof status === 'string' && status.includes(path.basename(big)),
+      `picks the largest (pegged-isolate) log, got: ${status}`,
+    );
+    assert.true(
+      typeof status === 'string' && status.includes('kept local'),
+      'sink disabled → reports the local copy was retained',
+    );
+    // The safety property: a capture that never reached S3 is not destroyed.
+    assert.true(
+      await fs
+        .stat(big)
+        .then(() => true)
+        .catch(() => false),
+      'the only copy of the log is still on disk',
+    );
+  });
+
+  test('excludes stale logs written before this browser run', async function (assert) {
+    // A big but STALE log (mtime before the stamped launch time) must lose to
+    // a small fresh one — otherwise a previous run's log would be mis-picked.
+    let staleBig = await writeLog(`${V8_PROF_LOG_PREFIX}900.log`, 128 * 1024);
+    await backdate(staleBig, 60_000);
+    let freshSmall = await writeLog(`${V8_PROF_LOG_PREFIX}901.log`, 2048);
+    created.push(staleBig, freshSmall);
+
+    let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
+
+    assert.true(
+      typeof status === 'string' && status.includes(path.basename(freshSmall)),
+      `picks the fresh log over the larger stale one, got: ${status}`,
+    );
+  });
+
+  test('only stale logs present → "no log from this run"', async function (assert) {
+    let stale = await writeLog(`${V8_PROF_LOG_PREFIX}910.log`, 8192);
+    await backdate(stale, 60_000);
+    created.push(stale);
+
+    let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
+
+    assert.true(
+      typeof status === 'string' && status.includes('from this run'),
+      `distinguishes stale-only from absent, got: ${status}`,
+    );
+  });
+
+  test('prepareV8ProfForLaunch sweeps prior --prof logs', async function (assert) {
+    let orphan = await writeLog(
+      `isolate-0xccc-${V8_PROF_LOG_PREFIX}777.log`,
+      1024,
+    );
+    created.push(orphan);
+    assert.true(
+      await fs
+        .stat(orphan)
+        .then(() => true)
+        .catch(() => false),
+      'fixture present before the sweep',
+    );
+    await prepareV8ProfForLaunch();
+    assert.false(
+      await fs
+        .stat(orphan)
+        .then(() => true)
+        .catch(() => false),
+      'launch sweep removed the prior-run log',
+    );
+  });
+});

From 003f1e5afc8506c8244b019b62455580c865a3e1 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 11:30:25 -0400
Subject: [PATCH 13/16] Satisfy qunit/no-assert-logical-expression in the
 v8-prof test

Coerce the nullable status to a string before asserting on it rather than
guarding with '&&' inside the assertion argument.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 packages/realm-server/tests/prerender-v8-prof-test.ts | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/packages/realm-server/tests/prerender-v8-prof-test.ts b/packages/realm-server/tests/prerender-v8-prof-test.ts
index 33ff95b6e1..5291825608 100644
--- a/packages/realm-server/tests/prerender-v8-prof-test.ts
+++ b/packages/realm-server/tests/prerender-v8-prof-test.ts
@@ -103,7 +103,7 @@ module('prerender v8-prof', function (hooks) {
   test('armed with no log present → self-diagnosing reason', async function (assert) {
     let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
     assert.true(
-      typeof status === 'string' && status.includes('no v8 --prof log'),
+      (status ?? '').includes('no v8 --prof log'),
       `reason names the absence, got: ${status}`,
     );
   });
@@ -120,13 +120,14 @@ module('prerender v8-prof', function (hooks) {
     created.push(small, big);
 
     let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
+    let s = status ?? '';
 
     assert.true(
-      typeof status === 'string' && status.includes(path.basename(big)),
+      s.includes(path.basename(big)),
       `picks the largest (pegged-isolate) log, got: ${status}`,
     );
     assert.true(
-      typeof status === 'string' && status.includes('kept local'),
+      s.includes('kept local'),
       'sink disabled → reports the local copy was retained',
     );
     // The safety property: a capture that never reached S3 is not destroyed.
@@ -150,7 +151,7 @@ module('prerender v8-prof', function (hooks) {
     let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
 
     assert.true(
-      typeof status === 'string' && status.includes(path.basename(freshSmall)),
+      (status ?? '').includes(path.basename(freshSmall)),
       `picks the fresh log over the larger stale one, got: ${status}`,
     );
   });
@@ -163,7 +164,7 @@ module('prerender v8-prof', function (hooks) {
     let status = await uploadV8ProfLog({ realm: 'r', card: 'c', step: 's' });
 
     assert.true(
-      typeof status === 'string' && status.includes('from this run'),
+      (status ?? '').includes('from this run'),
       `distinguishes stale-only from absent, got: ${status}`,
     );
   });

From d4ff6731fc5168237a6d73d3070bd7f3f09cd1d8 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 11:41:35 -0400
Subject: [PATCH 14/16] Address review: gate the sandbox, detach late CDP
 sessions, small cleanups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- browser-manager: the renderer executes user-authored card code, so keep
  the Chromium sandbox on where it works and disable it only where it must
  be off — in the container (PUPPETEER_DISABLE_SANDBOX), in CI, and when the
  V8 --prof diagnostic is armed (a sandboxed renderer can't write the log).
- pause-capture: if createCDPSession outlives its timeout, detach the
  late-resolving session so it isn't orphaned on the page.
- utils: compute formatPausedStack once for the timeout log line.
- symbolize-prerender-wedge.sh: match the realm with grep -F so a realm
  containing regex metacharacters can't match unintended keys.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../realm-server/prerender/browser-manager.ts | 20 ++++++++++++-------
 .../realm-server/prerender/pause-capture.ts   |  6 +++++-
 packages/realm-server/prerender/utils.ts      |  5 ++---
 .../scripts/symbolize-prerender-wedge.sh      |  2 +-
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts
index cbabf02bd9..c9e30160ce 100644
--- a/packages/realm-server/prerender/browser-manager.ts
+++ b/packages/realm-server/prerender/browser-manager.ts
@@ -29,13 +29,19 @@ export class BrowserManager {
     await this.cleanupUserDataDirs();
 
     let launchArgs: string[] = [];
-    // Always disable the Chromium sandbox for the prerender renderer. It
-    // only ever renders the realm's own trusted cards (never untrusted web
-    // content) and always runs in a container where the sandbox must be off
-    // for Chrome to launch — and for the renderer to write the V8 `--prof`
-    // diagnostic log. Unconditional rather than gated on CI /
-    // PUPPETEER_DISABLE_SANDBOX so it never silently depends on the env.
-    launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
+    // The Chromium sandbox is a real boundary here — the renderer executes
+    // user-authored card code — so keep it on wherever it works and disable
+    // it only where it must be off: in the container (the sandbox can't
+    // initialize there, so the image sets PUPPETEER_DISABLE_SANDBOX), in CI,
+    // and when the V8 `--prof` diagnostic is armed (a sandboxed renderer
+    // can't write the profile log to disk).
+    let disableSandbox =
+      process.env.CI === 'true' ||
+      process.env.PUPPETEER_DISABLE_SANDBOX === 'true' ||
+      v8ProfEnabled();
+    if (disableSandbox) {
+      launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
+    }
 
     // When the realm-server speaks HTTPS (local dev with a mkcert leaf
     // cert), Chromium needs to be told to accept it. mkcert's root CA
diff --git a/packages/realm-server/prerender/pause-capture.ts b/packages/realm-server/prerender/pause-capture.ts
index ba2abfdcdd..15d2fa3f5a 100644
--- a/packages/realm-server/prerender/pause-capture.ts
+++ b/packages/realm-server/prerender/pause-capture.ts
@@ -96,8 +96,12 @@ export async function capturePausedCallStack(
   }
   let client: CDPSession | undefined;
   try {
-    let session = await raceNodeTimeout(page.createCDPSession(), budgetMs);
+    let sessionPromise = page.createCDPSession();
+    let session = await raceNodeTimeout(sessionPromise, budgetMs);
     if (session === TIMED_OUT) {
+      // We gave up waiting, but the create may still resolve later — detach it
+      // then so a slow CDP session isn't orphaned on the page until it closes.
+      void sessionPromise.then((s) => detachQuietly(s)).catch(() => {});
       return empty(null, 'cdp-session-timeout');
     }
     client = session;
diff --git a/packages/realm-server/prerender/utils.ts b/packages/realm-server/prerender/utils.ts
index 84a4a93f92..8dc04f713b 100644
--- a/packages/realm-server/prerender/utils.ts
+++ b/packages/realm-server/prerender/utils.ts
@@ -1802,6 +1802,7 @@ export async function withTimeout<T>(
     }
 
     let topPending = pendingNetworkRequests?.[0];
+    let pausedStackStr = formatPausedStack(pausedStack);
     log.warn(
       `render of ${id} timed out after ${timeoutMs}ms` +
         ` stage=${richDiagnostics?.renderStage ?? '<unknown>'}` +
@@ -1823,9 +1824,7 @@ export async function withTimeout<T>(
         (pausedStack?.heapUsedMB != null
           ? ` jsHeapUsedMB=${pausedStack.heapUsedMB.toFixed(0)}`
           : '') +
-        (formatPausedStack(pausedStack)
-          ? ` pausedStack: ${formatPausedStack(pausedStack)}`
-          : '') +
+        (pausedStackStr ? ` pausedStack: ${pausedStackStr}` : '') +
         (v8ProfStatus ? ` v8ProfLog: ${v8ProfStatus}` : '') +
         ` DOM:\n${dom?.trim()}`,
     );
diff --git a/packages/realm-server/scripts/symbolize-prerender-wedge.sh b/packages/realm-server/scripts/symbolize-prerender-wedge.sh
index 3c9977cc67..ec393ea319 100755
--- a/packages/realm-server/scripts/symbolize-prerender-wedge.sh
+++ b/packages/realm-server/scripts/symbolize-prerender-wedge.sh
@@ -61,7 +61,7 @@ if [[ -z "$KEY" ]]; then
   mapfile -t MATCHES < <(
     "${AWS[@]}" s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ENV/" \
       --query "reverse(sort_by(Contents[?ends_with(Key, '.v8log')], &LastModified))[].Key" \
-      --output text 2>/dev/null | tr '\t' '\n' | grep -i -- "$REALM" || true
+      --output text 2>/dev/null | tr '\t' '\n' | grep -iF -- "$REALM" || true
   )
   [[ ${#MATCHES[@]} -gt 0 ]] || { echo "no .v8log artifact for realm '$REALM' in $BUCKET" >&2; exit 1; }
   if [[ -n "$LIST_ONLY" ]]; then printf '%s\n' "${MATCHES[@]}"; exit 0; fi

From fa10cf1d3439091d19101de2a77cdd0e0864537b Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 11:48:51 -0400
Subject: [PATCH 15/16] Make --prof log selection transparent and delete only
 on an unambiguous pick
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Selecting the largest this-run isolate log is a heuristic: a 60s CPU peg
samples continuously so its log dwarfs IO-bound renderers', but under
several concurrent renderers a long-lived busy isolate could rival it.
pickV8ProfLog now reports the candidate count + runner-up size, and the
timeout line surfaces them so a mis-pick is visible (re-symbolize another
with the helper's --key).

The post-upload delete now runs only when there's a single this-run log —
then it's unmistakably the wedged, evicted isolate's and the unlink frees
real disk. With multiple candidates the largest may be a still-live
renderer's log, where unlinking frees nothing (V8 holds the fd) and loses
its samples; keep them all for the next browser-launch sweep instead.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .claude/skills/indexing-diagnostics/SKILL.md |  2 +-
 packages/realm-server/prerender/v8-prof.ts   | 62 +++++++++++++-------
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/.claude/skills/indexing-diagnostics/SKILL.md b/.claude/skills/indexing-diagnostics/SKILL.md
index 6f04ec7b4d..bc8662430c 100644
--- a/.claude/skills/indexing-diagnostics/SKILL.md
+++ b/.claude/skills/indexing-diagnostics/SKILL.md
@@ -954,7 +954,7 @@ When `pausedStack` is starved, the kernel-signal sampler is the capture that sti
 
 It is **not** symbolized in-container — the log accumulates every render on the isolate since launch (tens of MB), and `node --prof-process` on that blows the render-timeout budget (that's why an in-container summary kept coming back empty). Instead, on the timeout the prerender server **streams the raw log to the prerender S3 artifacts bucket** as a `v8log` artifact, keyed `env/realm/jobId/card/step/<ts>.v8log` — so the wedging task's log is self-identifying (no shelling into the N prerender tasks to find it). Once the upload is durable the server **deletes the local log** (the wedged page is evicted, so its isolate writes no more), keeping these tens-of-MB files from piling up on the container disk. The timeout log line reports the outcome:
 
-- `v8ProfLog: uploaded … (NMB) to artifact bucket (local log removed) …` — success; fetch the artifact (below).
+- `v8ProfLog: uploaded … (NMB) to artifact bucket (local log removed) …` — success; fetch the artifact (below). If several renderer isolates were active this run the line instead reads `(largest of N this-run logs, runner-up …MB — kept …; re-symbolize another with the helper's --key)` — the largest is a heuristic for the pegged isolate, so if its frames don't look wedged, `--list` the candidates and `--key` a different one.
 - `v8ProfLog: <v8 --prof log not persisted …; kept local …>` — sink disabled / budget spent / upload failed, so the local copy is **retained** (not destroyed for a capture you can't fetch).
 - `v8ProfLog: <no v8 --prof log from this run; seen: …>` — armed but no log written (e.g. the browser didn't relaunch after the flag flip).
 
diff --git a/packages/realm-server/prerender/v8-prof.ts b/packages/realm-server/prerender/v8-prof.ts
index c3b8565e0c..aa85358e1a 100644
--- a/packages/realm-server/prerender/v8-prof.ts
+++ b/packages/realm-server/prerender/v8-prof.ts
@@ -13,8 +13,9 @@
 // RAW log to the prerender S3 artifacts bucket, keyed by realm/card/job, and
 // symbolize it offline (`node --prof-process`) where there's no deadline and
 // the peg dominates the top self-time frames. Once the upload is durable we
-// delete the local log so these tens-of-MB files don't pile up on the
-// container's disk over the browser's long life.
+// delete the local log (when the pegged isolate is unambiguous) so these
+// tens-of-MB files don't pile up on the container's disk over the browser's
+// long life.
 //
 // Off by default (`PRERENDER_V8_PROF=true` to arm) — it samples every render,
 // so it can perturb a timing-sensitive wedge; arm it deliberately.
@@ -77,7 +78,13 @@ export async function prepareV8ProfForLaunch(): Promise<void> {
 // Find the pegged isolate's `--prof` log among this browser run's logs.
 // Returns the chosen file (or a self-diagnosing reason string).
 async function pickV8ProfLog(): Promise<
-  { full: string; sizeMB: string } | { reason: string }
+  | {
+      full: string;
+      sizeMB: string;
+      candidates: number;
+      runnerUpMB: string | null;
+    }
+  | { reason: string }
 > {
   let entries = await fs.readdir(V8_PROF_LOG_DIR);
   // `includes`, not `startsWith`: V8's per-isolate logging prepends
@@ -104,10 +111,13 @@ async function pickV8ProfLog(): Promise<
     }),
   );
   // This browser run's logs only (stale ones were cleared + the launch time
-  // stamped). Pick the LARGEST: the pegged isolate accumulates by far the
-  // most samples over the peg, so its log dwarfs sibling isolates' and
-  // earlier renders' — and the peg dominates its frames regardless of what
-  // else it rendered.
+  // stamped). Pick the LARGEST: a CPU peg samples continuously at the kernel
+  // timer's rate, so over a 60s wedge the pegged isolate's log dwarfs
+  // IO-bound healthy renderers' and earlier renders'. This is a heuristic by
+  // cumulative size — under several concurrent renderers a long-lived busy
+  // isolate's log could rival the peg's — so we return the candidate count
+  // and runner-up size; the caller surfaces them and the helper's `--key`
+  // lets you symbolize a different one if the largest wasn't the wedge.
   let fromThisRun = withStats.filter(
     (s) => s.size > 0 && s.mtimeMs >= v8ProfLaunchAt,
   );
@@ -120,9 +130,12 @@ async function pickV8ProfLog(): Promise<
     };
   }
   fromThisRun.sort((a, b) => b.size - a.size);
+  let toMB = (n: number) => (n / (1024 * 1024)).toFixed(1);
   return {
     full: fromThisRun[0].full,
-    sizeMB: (fromThisRun[0].size / (1024 * 1024)).toFixed(1),
+    sizeMB: toMB(fromThisRun[0].size),
+    candidates: fromThisRun.length,
+    runnerUpMB: fromThisRun[1] ? toMB(fromThisRun[1].size) : null,
   };
 }
 
@@ -156,20 +169,27 @@ export async function uploadV8ProfLog(
       // destroying it for a capture we can't retrieve.
       return `<v8 --prof log not persisted to artifact bucket (sink disabled/declined/failed); kept local ${path.basename(picked.full)} (${picked.sizeMB}MB)>`;
     }
-    // Durable in S3, so reclaim the container's local disk: these logs run
-    // tens of MB and `--logfile` accumulates them in the OS temp dir across
-    // the browser's long life. A render timeout evicts the wedged page (its
-    // isolate tears down; the next visit writes a FRESH log), so deleting
-    // this one strands nothing. `await uploadArtifact` has fully drained the
-    // read stream by now, so the unlink is safe; it frees the dirent
-    // immediately and the bytes once the evicted renderer process exits.
-    let cleanup = ' (local log removed)';
-    try {
-      await fs.rm(picked.full, { force: true });
-    } catch (e) {
-      cleanup = ` (local delete failed: ${String((e as { message?: string }).message ?? e).slice(0, 80)})`;
+    // Reclaim the container's local disk, but only when the pick is
+    // unambiguous. A single this-run log is unmistakably the wedged isolate's,
+    // and a render timeout evicts that page — so its renderer exits and the
+    // unlink frees the bytes (the next visit writes a fresh log, stranding
+    // nothing). With multiple this-run logs the largest may belong to a
+    // still-live concurrent renderer; unlinking that frees no space (V8 holds
+    // the fd open) and loses its future samples — so keep them all and let the
+    // next browser-launch sweep reclaim them. `await uploadArtifact` has
+    // drained the read stream by now, so the unlink is safe.
+    let disposition: string;
+    if (picked.candidates === 1) {
+      try {
+        await fs.rm(picked.full, { force: true });
+        disposition = ' (local log removed)';
+      } catch (e) {
+        disposition = ` (local delete failed: ${String((e as { message?: string }).message ?? e).slice(0, 80)})`;
+      }
+    } else {
+      disposition = ` (largest of ${picked.candidates} this-run logs, runner-up ${picked.runnerUpMB}MB — kept for the next-launch sweep; re-symbolize another with the helper's --key)`;
     }
-    return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket${cleanup} — symbolize offline with \`node --prof-process\``;
+    return `uploaded v8 --prof log ${path.basename(picked.full)} (${picked.sizeMB}MB) to artifact bucket${disposition} — symbolize offline with \`node --prof-process\``;
   } catch (e) {
     // Never silently null — surface the reason on the timeout line.
     return `<v8 --prof upload error: ${String((e as { message?: string }).message ?? e).slice(0, 160)}>`;

From 87ad20da0385c78c2dcdaf60b03a9ce67bde8934 Mon Sep 17 00:00:00 2001
From: Hassan Abdel-Rahman <hassan.abdelrahman@gmail.com>
Date: Tue, 16 Jun 2026 12:24:49 -0400
Subject: [PATCH 16/16] Always launch the prerender renderer with the Chromium
 sandbox off
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prerender task runs only in a container where the sandbox can't
initialize, so Chrome won't start with it on — it has always run sandbox-off
and is required to. Force it unconditionally rather than gating on CI /
PUPPETEER_DISABLE_SANDBOX, so a missing env var can't silently break launch.
The security boundary is the task itself: the prerenderer is a separate,
segregated ECS task isolated from the realm-server. The comment no longer
claims the renderer only runs trusted cards.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../realm-server/prerender/browser-manager.ts | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/packages/realm-server/prerender/browser-manager.ts b/packages/realm-server/prerender/browser-manager.ts
index c9e30160ce..2e71db5037 100644
--- a/packages/realm-server/prerender/browser-manager.ts
+++ b/packages/realm-server/prerender/browser-manager.ts
@@ -29,19 +29,16 @@ export class BrowserManager {
     await this.cleanupUserDataDirs();
 
     let launchArgs: string[] = [];
-    // The Chromium sandbox is a real boundary here — the renderer executes
-    // user-authored card code — so keep it on wherever it works and disable
-    // it only where it must be off: in the container (the sandbox can't
-    // initialize there, so the image sets PUPPETEER_DISABLE_SANDBOX), in CI,
-    // and when the V8 `--prof` diagnostic is armed (a sandboxed renderer
-    // can't write the profile log to disk).
-    let disableSandbox =
-      process.env.CI === 'true' ||
-      process.env.PUPPETEER_DISABLE_SANDBOX === 'true' ||
-      v8ProfEnabled();
-    if (disableSandbox) {
-      launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
-    }
+    // Always launch the prerender renderer with the Chromium sandbox off — it
+    // always has, and we require it to. This task runs only in a container
+    // where the sandbox can't initialize, so Chrome won't start with it on
+    // (and the V8 `--prof` diagnostic needs the renderer able to write its log
+    // to disk, which the sandbox blocks). The security boundary is the task
+    // itself: the prerenderer is a separate, segregated ECS task, isolated
+    // from the realm-server. Forced unconditionally rather than gated on CI /
+    // PUPPETEER_DISABLE_SANDBOX so a missing env var can't silently break the
+    // launch.
+    launchArgs.push('--no-sandbox', '--disable-setuid-sandbox');
 
     // When the realm-server speaks HTTPS (local dev with a mkcert leaf
     // cert), Chromium needs to be told to accept it. mkcert's root CA