diff --git a/packages/core/src/evaluation/providers/cli.ts b/packages/core/src/evaluation/providers/cli.ts index 2a98a85c..49701b2e 100644 --- a/packages/core/src/evaluation/providers/cli.ts +++ b/packages/core/src/evaluation/providers/cli.ts @@ -352,9 +352,15 @@ export class CliProvider implements Provider { ); const renderedCommand = renderTemplate(this.config.command, templateValues); + // Use per-request cwd override (from workspace) if any request provides one, + // otherwise fall back to the target's configured cwd. + // All requests in a batch share the same workspace, so the first request's cwd + // is representative of the entire batch. + const effectiveCwd = requests[0]?.cwd ?? this.config.cwd; + if (this.verbose) { console.log( - `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ''} command=${renderedCommand}`, + `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ''} command=${renderedCommand}`, ); } @@ -362,7 +368,7 @@ export class CliProvider implements Provider { try { const startTime = Date.now(); const result = await this.runCommand(renderedCommand, { - cwd: this.config.cwd, + cwd: effectiveCwd, env: process.env, timeoutMs: this.config.timeoutMs, signal: controller.signal, @@ -402,7 +408,7 @@ export class CliProvider implements Provider { command: renderedCommand, stderr: result.stderr, exitCode: result.exitCode ?? 0, - cwd: this.config.cwd, + cwd: effectiveCwd, outputFile: outputFilePath, }, }; @@ -423,7 +429,7 @@ export class CliProvider implements Provider { command: renderedCommand, stderr: result.stderr, exitCode: result.exitCode ?? 0, - cwd: this.config.cwd, + cwd: effectiveCwd, outputFile: outputFilePath, error: errorMessage, }, @@ -439,7 +445,7 @@ export class CliProvider implements Provider { command: renderedCommand, stderr: result.stderr, exitCode: result.exitCode ?? 0, - cwd: this.config.cwd, + cwd: effectiveCwd, outputFile: outputFilePath, recordId: evalCaseId, }, diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4249b9fe..a5134076 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -223,6 +223,38 @@ describe('runTestCase', () => { expect(provider.callIndex).toBe(1); }); + it('preserves workspace cwd across retry attempts', async () => { + const cwdsSeen: (string | undefined)[] = []; + const provider: Provider = { + id: 'mock:cwd-test', + kind: 'mock' as const, + targetName: 'cwd-test', + async invoke(request: ProviderRequest): Promise { + cwdsSeen.push(request.cwd); + if (cwdsSeen.length === 1) { + throw new Error('Transient failure'); + } + return { + output: [{ role: 'assistant', content: 'Success on retry' }], + }; + }, + }; + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: evaluatorRegistry, + maxRetries: 1, + sharedWorkspacePath: '/fake/workspace/path', + }); + + expect(result.score).toBeGreaterThan(0); + expect(cwdsSeen).toHaveLength(2); + expect(cwdsSeen[0]).toBe('/fake/workspace/path'); + expect(cwdsSeen[1]).toBe('/fake/workspace/path'); + }); + it('retries provider errors up to maxRetries', async () => { const provider = new SequenceProvider('mock', { errors: [new Error('Request timeout')], diff --git a/packages/core/test/evaluation/providers/cli.test.ts b/packages/core/test/evaluation/providers/cli.test.ts index ec08d427..0d2958a8 100644 --- a/packages/core/test/evaluation/providers/cli.test.ts +++ b/packages/core/test/evaluation/providers/cli.test.ts @@ -138,6 +138,91 @@ describe('CliProvider', () => { await expect(provider.invoke(baseRequest)).rejects.toThrow(/timed out/i); }); + it('uses request.cwd as working directory override in invoke', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-case-1-\d+-\w+\.json/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + await writeFile(outputFilePath, 'response', 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + // request.cwd should override config.cwd + await provider.invoke({ ...baseRequest, cwd: '/workspace/path' }); + expect(capturedCwd).toBe('/workspace/path'); + }); + + it('falls back to config.cwd when request.cwd is undefined in invoke', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-case-1-\d+-\w+\.json/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + await writeFile(outputFilePath, 'response', 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + await provider.invoke(baseRequest); // no cwd in request + expect(capturedCwd).toBe('/config/cwd'); + }); + + it('uses request.cwd as working directory override in invokeBatch', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-batch-\d+-\w+\.jsonl/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + const jsonl = `${JSON.stringify({ id: 'case-1', text: 'ok' })}\n`; + await writeFile(outputFilePath, jsonl, 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + // First request's cwd should override config.cwd for the batch + await provider.invokeBatch([{ ...baseRequest, cwd: '/workspace/path' }]); + expect(capturedCwd).toBe('/workspace/path'); + }); + + it('falls back to config.cwd when request.cwd is undefined in invokeBatch', async () => { + let capturedCwd: string | undefined; + const runner = mock(async (command: string, options): Promise => { + capturedCwd = options?.cwd; + const match = command.match(/agentv-batch-\d+-\w+\.jsonl/); + if (match) { + const outputFilePath = path.join(os.tmpdir(), match[0]); + const jsonl = `${JSON.stringify({ id: 'case-1', text: 'ok' })}\n`; + await writeFile(outputFilePath, jsonl, 'utf-8'); + createdFiles.push(outputFilePath); + } + return { stdout: '', stderr: '', exitCode: 0, failed: false }; + }); + + const configWithCwd: CliResolvedConfig = { ...baseConfig, cwd: '/config/cwd' }; + const provider = new CliProvider('cli-target', configWithCwd, runner); + + // No cwd in request — should fall back to config.cwd + await provider.invokeBatch([baseRequest]); + expect(capturedCwd).toBe('/config/cwd'); + }); + it('supports batch mode by reading JSONL records keyed by id', async () => { const runner = mock(async (command: string): Promise => { const match = command.match(/agentv-batch-\d+-\w+\.jsonl/);