Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scenarios/chat-agent-advanced/scenario.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ export const chatAgentAdvanced: Scenario = {
"commands; only create and edit files. When finished, briefly list the files you created.",
].join(" "),
fixtureDir: fileURLToPath(new URL("../../fixtures/backend-ts", import.meta.url)),
// The hardest skill (sessions, human-in-the-loop (HITL), sub-agents): the full HITL task
// legitimately needs longer than the global default; both arms were timing out at 300s.
timeoutMs: 600_000,
assertions: [
fileMatches("uses-chat-agent", "defines an agent with chat.agent", /chat\.agent\(/),
noSdkV3(),
Expand Down
12 changes: 9 additions & 3 deletions src/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ function parseMetrics(stdout: string): RunMetrics | undefined {
async function runAgent(
dir: string,
prompt: string,
config: EvalConfig
config: EvalConfig,
timeoutMs: number
): Promise<{ error?: string; metrics?: RunMetrics }> {
// --output-format json gives us num_turns + token usage + cost as a final JSON line.
const args = ["-p", prompt, "--permission-mode", config.permissionMode, "--output-format", "json"];
Expand All @@ -72,7 +73,7 @@ async function runAgent(
try {
const { stdout, stderr } = await exec("claude", args, {
cwd: dir,
timeout: config.timeoutMs,
timeout: timeoutMs,
maxBuffer: 64 * 1024 * 1024,
});
await writeFile(log, `${stdout}\n${stderr}`).catch(() => {});
Expand Down Expand Up @@ -101,7 +102,12 @@ export async function runArmSample(

if (arm === "withskills") await installSkills(runDir, config);

const { error: agentError, metrics } = await runAgent(runDir, scenario.prompt, config);
const { error: agentError, metrics } = await runAgent(
runDir,
scenario.prompt,
config,
scenario.timeoutMs ?? config.timeoutMs
);
const ctx: RunContext = { dir: runDir, arm, sample };
const result = await gradeRun(scenario, ctx, await relFiles(runDir), agentError);
result.metrics = metrics;
Expand Down
3 changes: 3 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ export interface Scenario {
/** Absolute path to the fixture template copied into each run. */
fixtureDir: string;
assertions: Assertion[];
/** Per-agent-run timeout for this scenario, in milliseconds. When set, it overrides
* config.timeoutMs (e.g. for heavier scenarios that legitimately need longer than the
* global default); when omitted, config.timeoutMs is used. */
timeoutMs?: number;
}

/** Cost signal for a single agent run, parsed from `claude -p --output-format json`. */
Expand Down