triggerdotdev · ericallam · Jun 13, 2026
diff --git a/scenarios/chat-agent-advanced/scenario.ts b/scenarios/chat-agent-advanced/scenario.ts
@@ -13,6 +13,9 @@ export const chatAgentAdvanced: Scenario = {
     "commands; only create and edit files. When finished, briefly list the files you created.",
   ].join(" "),
   fixtureDir: fileURLToPath(new URL("../../fixtures/backend-ts", import.meta.url)),
+  // The hardest skill (sessions/HITL/sub-agents): the full HITL task legitimately needs longer
+  // than the global default (both arms were timing out at 300s), so raise it to 600s here.
+  timeoutMs: 600_000,
   assertions: [
     fileMatches("uses-chat-agent", "defines an agent with chat.agent", /chat\.agent\(/),
     noSdkV3(),

diff --git a/src/runner.ts b/src/runner.ts
@@ -63,7 +63,8 @@ function parseMetrics(stdout: string): RunMetrics | undefined {
 async function runAgent(
   dir: string,
   prompt: string,
-  config: EvalConfig
+  config: EvalConfig,
+  timeoutMs: number
 ): Promise<{ error?: string; metrics?: RunMetrics }> {
   // --output-format json gives us num_turns + token usage + cost as a final JSON line.
   const args = ["-p", prompt, "--permission-mode", config.permissionMode, "--output-format", "json"];
@@ -72,7 +73,7 @@ async function runAgent(
   try {
     const { stdout, stderr } = await exec("claude", args, {
       cwd: dir,
-      timeout: config.timeoutMs,
+      timeout: timeoutMs,
       maxBuffer: 64 * 1024 * 1024,
     });
     await writeFile(log, `${stdout}\n${stderr}`).catch(() => {});
@@ -101,7 +102,12 @@ export async function runArmSample(
 
   if (arm === "withskills") await installSkills(runDir, config);
 
-  const { error: agentError, metrics } = await runAgent(runDir, scenario.prompt, config);
+  const { error: agentError, metrics } = await runAgent(
+    runDir,
+    scenario.prompt,
+    config,
+    scenario.timeoutMs ?? config.timeoutMs
+  );
   const ctx: RunContext = { dir: runDir, arm, sample };
   const result = await gradeRun(scenario, ctx, await relFiles(runDir), agentError);
   result.metrics = metrics;

diff --git a/src/types.ts b/src/types.ts
@@ -65,6 +65,9 @@ export interface Scenario {
   /** Absolute path to the fixture template copied into each run. */
   fixtureDir: string;
   assertions: Assertion[];
+  /** Per-agent-run timeout for this scenario, in milliseconds. When set, it overrides
+   *  config.timeoutMs (e.g. for heavier scenarios that need longer than the global default). */
+  timeoutMs?: number;
 }
 
 /** Cost signal for a single agent run, parsed from `claude -p --output-format json`. */