Skip to content

Commit 8759551

Browse files
committed
Evals: Add promptWithAgent param. Default is now false!
1 parent a552fbc commit 8759551

File tree

3 files changed

+17
-11
lines changed

3 files changed

+17
-11
lines changed

evals/git-evals/run-eval-set.ts

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,11 @@ class RunEvalSetCommand extends Command {
8484
description: 'Codebuff agent id to use',
8585
default: 'base',
8686
}),
87+
'prompt-with-agent': Flags.boolean({
88+
description: 'Prompt with agent',
89+
default: false,
90+
allowNo: true,
91+
}),
8792
help: Flags.help({ char: 'h' }),
8893
}
8994

@@ -169,6 +174,7 @@ async function runEvalSet(options: {
169174
concurrency?: number
170175
'coding-agent': string
171176
agent: string
177+
'prompt-with-agent': boolean
172178
}): Promise<void> {
173179
const {
174180
sets,
@@ -180,6 +186,7 @@ async function runEvalSet(options: {
180186
title,
181187
'coding-agent': codingAgentstr,
182188
agent,
189+
'prompt-with-agent': promptWithAgent,
183190
} = options
184191

185192
if (!['codebuff', 'claude'].includes(codingAgentstr)) {
@@ -278,6 +285,7 @@ async function runEvalSet(options: {
278285
options.concurrency === 1,
279286
agent,
280287
worktreePath,
288+
promptWithAgent,
281289
)
282290
} catch (error) {
283291
const evalDuration = Date.now() - evalStartTime

evals/git-evals/run-git-evals.ts

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ export async function runSingleEval(
4242
fingerprintId: string,
4343
codingAgent: 'codebuff' | 'claude',
4444
agent?: string,
45-
promptWithSpec: boolean = false,
45+
promptWithAgent: boolean = false,
4646
): Promise<EvalRunJudged> {
4747
const startTime = new Date()
4848
const trace: CodebuffTrace[] = []
@@ -94,7 +94,7 @@ export async function runSingleEval(
9494

9595
let currentDecision: AgentDecision = 'continue'
9696
let attempts = 0
97-
const MAX_ATTEMPTS = promptWithSpec ? 1 : 5
97+
const MAX_ATTEMPTS = promptWithAgent ? 1 : 5
9898

9999
while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) {
100100
// Check for process-level errors
@@ -120,7 +120,7 @@ export async function runSingleEval(
120120
// Get next prompt from prompting agent with timeout
121121
let agentResponse: z.infer<typeof AgentDecisionSchema>
122122
try {
123-
agentResponse = promptWithSpec
123+
agentResponse = !promptWithAgent
124124
? {
125125
decision: 'continue',
126126
reasoning: 'Using spec as sole prompt',
@@ -376,6 +376,7 @@ export async function runGitEvals(
376376
logToStdout: boolean = false,
377377
agent: string = 'base',
378378
worktreePath?: string,
379+
promptWithAgent: boolean = false,
379380
): Promise<FullEvalLog> {
380381
// Set up signal handlers if this is the main module
381382
if (require.main === module) {
@@ -417,12 +418,6 @@ export async function runGitEvals(
417418
const logsDir = path.join(outputDir, 'logs', `${testRepoName}-${traceId}`)
418419
fs.mkdirSync(logsDir, { recursive: true })
419420

420-
// Generate filenames with trace ID (single file that gets overwritten)
421-
const partialOutputPath = path.join(
422-
outputDir,
423-
`eval-partial-${testRepoName}-${traceId}.json`,
424-
)
425-
426421
const commitsToRun = limit
427422
? evalData.evalCommits.slice(0, limit)
428423
: evalData.evalCommits
@@ -496,6 +491,7 @@ export async function runGitEvals(
496491
fingerprintId,
497492
codingAgent,
498493
agent,
494+
promptWithAgent.toString(),
499495
],
500496
{
501497
stdio: ['pipe', 'pipe', 'pipe', 'ipc'],

evals/git-evals/run-single-eval-process.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ async function main() {
3131
fingerprintId,
3232
codingAgent,
3333
agent,
34+
promptWithAgent,
3435
] = process.argv.slice(2)
3536

3637
if (
@@ -39,7 +40,8 @@ async function main() {
3940
!clientSessionId ||
4041
!fingerprintId ||
4142
!codingAgent ||
42-
!agent
43+
!agent ||
44+
!promptWithAgent
4345
) {
4446
console.error('Missing required arguments for single eval process')
4547
process.exit(1)
@@ -74,7 +76,7 @@ async function main() {
7476
fingerprintId,
7577
codingAgent as any,
7678
agent,
77-
false,
79+
promptWithAgent === 'true',
7880
)
7981

8082
// Check again after long-running operation

0 commit comments

Comments
 (0)