From 453d40d294e8bbdac351de366583b310937a677e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Apr 2026 11:38:11 +0000 Subject: [PATCH 1/3] chore(core): claim issue #912 Co-Authored-By: Claude Opus 4.6 From e783c008da6d3bc723ad3d5d31a644b448031b5e Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Apr 2026 12:01:04 +0000 Subject: [PATCH 2/3] fix(core): preserve workspace cwd on CLI provider retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix CliProvider.invokeBatch() to use per-request cwd override (request.cwd) instead of always using this.config.cwd. The first request's cwd takes priority, falling back to config.cwd. - Add test verifying orchestrator retry loop preserves workspace cwd across retry attempts via sharedWorkspacePath → request.cwd chain. - Add tests for invoke/invokeBatch cwd override vs config.cwd fallback. Closes #912 Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/providers/cli.ts | 8 ++- .../core/test/evaluation/orchestrator.test.ts | 32 ++++++++++ .../test/evaluation/providers/cli.test.ts | 63 +++++++++++++++++++ 3 files changed, 101 insertions(+), 2 deletions(-) diff --git a/packages/core/src/evaluation/providers/cli.ts b/packages/core/src/evaluation/providers/cli.ts index 2a98a85cb..7f1a1da42 100644 --- a/packages/core/src/evaluation/providers/cli.ts +++ b/packages/core/src/evaluation/providers/cli.ts @@ -352,9 +352,13 @@ export class CliProvider implements Provider { ); const renderedCommand = renderTemplate(this.config.command, templateValues); + // Use per-request cwd override (from workspace) if any request provides one, + // otherwise fall back to the target's configured cwd. + const effectiveCwd = requests[0]?.cwd ?? this.config.cwd; + if (this.verbose) { console.log( - `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ''} command=${renderedCommand}`, + `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ''} command=${renderedCommand}`, ); } @@ -362,7 +366,7 @@ export class CliProvider implements Provider { try { const startTime = Date.now(); const result = await this.runCommand(renderedCommand, { - cwd: this.config.cwd, + cwd: effectiveCwd, env: process.env, timeoutMs: this.config.timeoutMs, signal: controller.signal, diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4249b9fe5..a51340761 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -223,6 +223,38 @@ describe('runTestCase', () => { expect(provider.callIndex).toBe(1); }); + it('preserves workspace cwd across retry attempts', async () => { + const cwdsSeen: (string | undefined)[] = []; + const provider: Provider = { + id: 'mock:cwd-test', + kind: 'mock' as const, + targetName: 'cwd-test', + async invoke(request: ProviderRequest): Promise { + cwdsSeen.push(request.cwd); + if (cwdsSeen.length === 1) { + throw new Error('Transient failure'); + } + return { + output: [{ role: 'assistant', content: 'Success on retry' }], + }; + }, + }; + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + maxRetries: 1, + sharedWorkspacePath: '/fake/workspace/path', + }); + + expect(result.score).toBeGreaterThan(0); + expect(cwdsSeen).toHaveLength(2); + expect(cwdsSeen[0]).toBe('/fake/workspace/path'); + expect(cwdsSeen[1]).toBe('/fake/workspace/path'); + }); + it('retries provider errors up to maxRetries', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Request timeout')], diff --git a/packages/core/test/evaluation/providers/cli.test.ts b/packages/core/test/evaluation/providers/cli.test.ts index ec08d4275..9e0e7dd2c 100644 --- a/packages/core/test/evaluation/providers/cli.test.ts +++ b/packages/core/test/evaluation/providers/cli.test.ts @@ -138,6 +138,69 @@ describe('CliProvider', () => { await expect(provider.invoke(baseRequest)).rejects.toThrow(/timed out/i); }); + it('uses request.cwd as working directory override in invoke', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-case-1-\d+-\w+\.json/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + await writeFile(outputFilePath, 'response', 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + // request.cwd should override config.cwd + await provider.invoke({ ...baseRequest, cwd: '/workspace/path' }); + expect(capturedCwd).toBe('/workspace/path'); + }); + + it('falls back to config.cwd when request.cwd is undefined in invoke', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-case-1-\d+-\w+\.json/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + await writeFile(outputFilePath, 'response', 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + await provider.invoke(baseRequest); // no cwd in request + expect(capturedCwd).toBe('/config/cwd'); + }); + + it('uses request.cwd as working directory override in invokeBatch', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-batch-\d+-\w+\.jsonl/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + const jsonl = `${JSON.stringify({ id: 'case-1', text: 'ok' })}\n`; + await writeFile(outputFilePath, jsonl, 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + // First request's cwd should override config.cwd for the batch + await provider.invokeBatch([{ ...baseRequest, cwd: '/workspace/path' }]); + expect(capturedCwd).toBe('/workspace/path'); + }); + it('supports batch mode by reading JSONL records keyed by id', async () => { const runner = mock(async (command: string): Promise => { const match = command.match(/agentv-batch-\d+-\w+\.jsonl/); From aaa22a84b52be2870085b119d2ca8f27b2ddd432 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 2 Apr 2026 12:23:49 +0000 Subject: [PATCH 3/3] fix(core): use effectiveCwd in invokeBatch response metadata Address code review feedback: - Replace raw.cwd references with effectiveCwd in all 3 invokeBatch response objects (success, error, fallback) - Add batch cwd invariant comment - Add invokeBatch fallback test for undefined request.cwd Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/providers/cli.ts | 8 ++++--- .../test/evaluation/providers/cli.test.ts | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/packages/core/src/evaluation/providers/cli.ts b/packages/core/src/evaluation/providers/cli.ts index 7f1a1da42..49701b2e3 100644 --- a/packages/core/src/evaluation/providers/cli.ts +++ b/packages/core/src/evaluation/providers/cli.ts @@ -354,6 +354,8 @@ export class CliProvider implements Provider { // Use per-request cwd override (from workspace) if any request provides one, // otherwise fall back to the target's configured cwd. + // All requests in a batch share the same workspace, so the first request's cwd + // is representative of the entire batch. const effectiveCwd = requests[0]?.cwd ?? this.config.cwd; if (this.verbose) { @@ -406,7 +408,7 @@ export class CliProvider implements Provider { command: renderedCommand, stderr: result.stderr, exitCode: result.exitCode ?? 0, - cwd: this.config.cwd, + cwd: effectiveCwd, outputFile: outputFilePath, }, }; @@ -427,7 +429,7 @@ export class CliProvider implements Provider { command: renderedCommand, stderr: result.stderr, exitCode: result.exitCode ?? 0, - cwd: this.config.cwd, + cwd: effectiveCwd, outputFile: outputFilePath, error: errorMessage, }, @@ -443,7 +445,7 @@ export class CliProvider implements Provider { command: renderedCommand, stderr: result.stderr, exitCode: result.exitCode ?? 0, - cwd: this.config.cwd, + cwd: effectiveCwd, outputFile: outputFilePath, recordId: evalCaseId, }, diff --git a/packages/core/test/evaluation/providers/cli.test.ts b/packages/core/test/evaluation/providers/cli.test.ts index 9e0e7dd2c..0d2958a84 100644 --- a/packages/core/test/evaluation/providers/cli.test.ts +++ b/packages/core/test/evaluation/providers/cli.test.ts @@ -201,6 +201,28 @@ describe('CliProvider', () => { expect(capturedCwd).toBe('/workspace/path'); }); + it('falls back to config.cwd when request.cwd is undefined in invokeBatch', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-batch-\d+-\w+\.jsonl/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + const jsonl = `${JSON.stringify({ id: 'case-1', text: 'ok' })}\n`; + await writeFile(outputFilePath, jsonl, 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + // No cwd in request — should fall back to config.cwd + await provider.invokeBatch([baseRequest]); + expect(capturedCwd).toBe('/config/cwd'); + }); + it('supports batch mode by reading JSONL records keyed by id', async () => { const runner = mock(async (command: string): Promise => { const match = command.match(/agentv-batch-\d+-\w+\.jsonl/);