Evals: Add promptWithAgent param (--prompt-with-agent flag). Default is now false, so the spec is used as the sole prompt unless the flag is set.

jahooma · jahooma · commit 8759551041e2 · 2025-10-09T21:32:05.000-07:00
diff --git a/evals/git-evals/run-eval-set.ts b/evals/git-evals/run-eval-set.ts
@@ -84,6 +84,11 @@ class RunEvalSetCommand extends Command {
       description: 'Codebuff agent id to use',
       default: 'base',
     }),
+    'prompt-with-agent': Flags.boolean({
+      description: 'Prompt with agent',
+      default: false,
+      allowNo: true,
+    }),
     help: Flags.help({ char: 'h' }),
   }
 
@@ -169,6 +174,7 @@ async function runEvalSet(options: {
   concurrency?: number
   'coding-agent': string
   agent: string
+  'prompt-with-agent': boolean
 }): Promise<void> {
   const {
     sets,
@@ -180,6 +186,7 @@ async function runEvalSet(options: {
     title,
     'coding-agent': codingAgentstr,
     agent,
+    'prompt-with-agent': promptWithAgent,
   } = options
 
   if (!['codebuff', 'claude'].includes(codingAgentstr)) {
@@ -278,6 +285,7 @@ async function runEvalSet(options: {
             options.concurrency === 1,
             agent,
             worktreePath,
+            promptWithAgent,
           )
     } catch (error) {
       const evalDuration = Date.now() - evalStartTime
diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts
@@ -42,7 +42,7 @@ export async function runSingleEval(
   fingerprintId: string,
   codingAgent: 'codebuff' | 'claude',
   agent?: string,
-  promptWithSpec: boolean = false,
+  promptWithAgent: boolean = false,
 ): Promise<EvalRunJudged> {
   const startTime = new Date()
   const trace: CodebuffTrace[] = []
@@ -94,7 +94,7 @@ export async function runSingleEval(
 
     let currentDecision: AgentDecision = 'continue'
     let attempts = 0
-    const MAX_ATTEMPTS = promptWithSpec ? 1 : 5
+    const MAX_ATTEMPTS = promptWithAgent ? 1 : 5
 
     while (currentDecision === 'continue' && attempts < MAX_ATTEMPTS) {
       // Check for process-level errors
@@ -120,7 +120,7 @@ export async function runSingleEval(
       // Get next prompt from prompting agent with timeout
       let agentResponse: z.infer<typeof AgentDecisionSchema>
       try {
-        agentResponse = promptWithSpec
+        agentResponse = !promptWithAgent
           ? {
               decision: 'continue',
               reasoning: 'Using spec as sole prompt',
@@ -376,6 +376,7 @@ export async function runGitEvals(
   logToStdout: boolean = false,
   agent: string = 'base',
   worktreePath?: string,
+  promptWithAgent: boolean = false,
 ): Promise<FullEvalLog> {
   // Set up signal handlers if this is the main module
   if (require.main === module) {
@@ -417,12 +418,6 @@ export async function runGitEvals(
   const logsDir = path.join(outputDir, 'logs', `${testRepoName}-${traceId}`)
   fs.mkdirSync(logsDir, { recursive: true })
 
-  // Generate filenames with trace ID (single file that gets overwritten)
-  const partialOutputPath = path.join(
-    outputDir,
-    `eval-partial-${testRepoName}-${traceId}.json`,
-  )
-
   const commitsToRun = limit
     ? evalData.evalCommits.slice(0, limit)
     : evalData.evalCommits
@@ -496,6 +491,7 @@ export async function runGitEvals(
                 fingerprintId,
                 codingAgent,
                 agent,
+                promptWithAgent.toString(),
               ],
               {
                 stdio: ['pipe', 'pipe', 'pipe', 'ipc'],
diff --git a/evals/git-evals/run-single-eval-process.ts b/evals/git-evals/run-single-eval-process.ts
@@ -31,6 +31,7 @@ async function main() {
     fingerprintId,
     codingAgent,
     agent,
+    promptWithAgent,
   ] = process.argv.slice(2)
 
   if (
@@ -39,7 +40,8 @@ async function main() {
     !clientSessionId ||
     !fingerprintId ||
     !codingAgent ||
-    !agent
+    !agent ||
+    !promptWithAgent
   ) {
     console.error('Missing required arguments for single eval process')
     process.exit(1)
@@ -74,7 +76,7 @@ async function main() {
       fingerprintId,
       codingAgent as any,
       agent,
-      false,
+      promptWithAgent === 'true',
     )
 
     // Check again after long-running operation