CodebuffAI
diff --git a/‎evals/buffbench/agent-runner.ts‎
Lines changed: 156 additions & 0 deletions b/‎evals/buffbench/agent-runner.ts‎
Lines changed: 156 additions & 0 deletions
diff --git a/‎evals/buffbench/eval-codebuff.json‎
Lines changed: 3162 additions & 0 deletions b/‎evals/buffbench/eval-codebuff.json‎
Lines changed: 3162 additions & 0 deletions
diff --git a/‎evals/buffbench/eval-manifold.json‎
Lines changed: 1667 additions & 0 deletions b/‎evals/buffbench/eval-manifold.json‎
Lines changed: 1667 additions & 0 deletions
diff --git a/‎evals/buffbench/eval-plane.json‎
Lines changed: 2028 additions & 0 deletions b/‎evals/buffbench/eval-plane.json‎
Lines changed: 2028 additions & 0 deletions
diff --git a/‎evals/buffbench/eval-saleor.json‎
Lines changed: 1831 additions & 0 deletions b/‎evals/buffbench/eval-saleor.json‎
Lines changed: 1831 additions & 0 deletions
diff --git a/‎evals/buffbench/eval-task-generator.ts‎
Lines changed: 162 additions & 0 deletions b/‎evals/buffbench/eval-task-generator.ts‎
Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,156 @@
+import { execSync } from 'child_process'
+import path from 'path'
+
+import { withTimeout } from '@codebuff/common/util/promise'
+import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
+import { CodebuffClient } from '../../sdk/src/client'
+import { withTestRepo } from '../subagents/test-repo-utils'
+
+import type { EvalCommitV2 } from './types'
+
+export interface AgentStep { // One recorded step of an agent run, as pushed onto the trace by runAgentOnCommit
+  response: string // Concatenated text of the `text` events received during this step
+  toolCalls: any[] // `tool_call` events received during this step (`set_messages` calls are not recorded)
+  toolResults: any[] // `tool_result` events received during this step
+}
+
+/**
+ * Runs one agent against the parent of an eval commit and records the outcome.
+ *
+ * `withTestRepo` supplies a working directory for `repoUrl` at
+ * `commit.parentSha` (optionally prepared with `initCommand`). The agent is
+ * run there with `commit.prompt` under a 10 minute timeout; afterwards every
+ * change is staged and diffed against the parent commit, and the
+ * parent-commit contents of the context files are read with `git show`.
+ *
+ * Failures raised inside the test repo callback, including the timeout, are
+ * not rethrown: they are returned as `error` (message plus stack) together
+ * with whatever was collected up to that point.
+ *
+ * @returns
+ * - `diff`: `git diff` of the staged working tree against `commit.parentSha`.
+ * - `contextFiles`: parent-commit content of `commit.supplementalFiles` and of
+ *   the paths in `commit.fileDiffs`, excluding paths whose status is `added`;
+ *   a file that `git show` cannot read maps to `''`.
+ * - `durationMs`: wall-clock time of the whole call.
+ * - `cost`: `creditsUsed` of the main agent state divided by 100.
+ * - `error`: set only when the run failed.
+ * - `trace`: the steps recorded from the agent's event stream.
+ */
+export async function runAgentOnCommit({
+  client,
+  agentId,
+  commit,
+  repoUrl,
+  initCommand,
+}: {
+  client: CodebuffClient
+  agentId: string
+  commit: EvalCommitV2
+  repoUrl: string
+  initCommand?: string
+}): Promise<{
+  diff: string
+  contextFiles: Record<string, string>
+  durationMs: number
+  cost: number
+  error?: string
+  trace: AgentStep[]
+}> {
+  console.log(`[${commit.id}] Running agent ${agentId}...`)
+  const startTime = Date.now()
+  // Used for every git command whose output is captured. Node's default
+  // `maxBuffer` is far smaller, and exceeding it makes execSync throw.
+  const gitMaxBuffer = 10 * 1024 * 1024
+  let diff = ''
+  const contextFiles: Record<string, string> = {}
+  let error: string | undefined
+  let cost = 0
+  const trace: AgentStep[] = []
+
+  try {
+    await withTestRepo(
+      {
+        repoUrl,
+        parentSha: commit.parentSha,
+        initCommand,
+      },
+      async (repoDir) => {
+        const agentsPath = path.join(__dirname, '../../.agents')
+        const localAgentDefinitions = Object.values(
+          await loadLocalAgents({ agentsPath }),
+        )
+
+        // Accumulators for the step currently being recorded.
+        let responseText = ''
+        let toolCalls: any[] = []
+        let toolResults: any[] = []
+
+        // Moves the current accumulators onto the trace, unless all are empty.
+        function flushStep() {
+          if (
+            responseText.length > 0 ||
+            toolCalls.length > 0 ||
+            toolResults.length > 0
+          ) {
+            trace.push({ response: responseText, toolCalls, toolResults })
+            responseText = ''
+            toolCalls = []
+            toolResults = []
+          }
+        }
+
+        const timeoutMs = 10 * 60 * 1000 // 10 minutes
+        try {
+          const result = await withTimeout(
+            client.run({
+              agent: agentId,
+              prompt: commit.prompt,
+              agentDefinitions: localAgentDefinitions,
+              cwd: repoDir,
+              handleEvent: (event) => {
+                if (event.type === 'text') {
+                  // Text arriving after tool results starts a new step.
+                  if (toolResults.length > 0) {
+                    flushStep()
+                  }
+                  responseText += event.text
+                } else if (event.type === 'tool_call') {
+                  if (event.toolName === 'set_messages') {
+                    return
+                  }
+                  toolCalls.push(event)
+                } else if (event.type === 'tool_result') {
+                  toolResults.push(event)
+                } else if (event.type === 'finish') {
+                  flushStep()
+                } else if (event.type === 'error') {
+                  console.error(`[${agentId}] Error event:`, event.message)
+                }
+              },
+            }),
+            timeoutMs,
+            `Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`,
+          )
+          cost = result.sessionState.mainAgentState.creditsUsed / 100
+        } finally {
+          // Keep the partially recorded step even when the run throws or
+          // times out, so the returned trace shows how far the agent got.
+          flushStep()
+        }
+
+        // Stage everything first so newly created files show up in the diff.
+        execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
+        diff = execSync(`git diff ${commit.parentSha}`, {
+          cwd: repoDir,
+          encoding: 'utf-8',
+          maxBuffer: gitMaxBuffer,
+        })
+
+        const contextFilePaths = new Set<string>([
+          ...commit.supplementalFiles,
+          ...commit.fileDiffs.map((fd) => fd.path),
+        ])
+        // Files added by the commit have no content at the parent commit.
+        for (const { status, path: addedPath } of commit.fileDiffs) {
+          if (status === 'added') {
+            contextFilePaths.delete(addedPath)
+          }
+        }
+
+        for (const filePath of contextFilePaths) {
+          try {
+            const content = execSync(
+              `git show ${commit.parentSha}:${JSON.stringify(filePath)}`,
+              {
+                cwd: repoDir,
+                encoding: 'utf-8',
+                maxBuffer: gitMaxBuffer,
+              },
+            )
+            contextFiles[filePath] = content
+          } catch {
+            // Best effort: a file that cannot be read at the parent commit
+            // is recorded as empty rather than failing the whole run.
+            contextFiles[filePath] = ''
+          }
+        }
+      },
+    )
+  } catch (e) {
+    error = e instanceof Error ? `${e.message}\n${e.stack}` : String(e)
+  }
+
+  const durationMs = Date.now() - startTime
+
+  return {
+    diff,
+    contextFiles,
+    durationMs,
+    cost,
+    error,
+    trace,
+  }
+}
@@ -0,0 +1,162 @@
+import { CodebuffClient } from '../../sdk/src/client'
+import type { AgentDefinition } from '../../sdk/src'
+import fileExplorerDef from '../../.agents/file-explorer/file-explorer'
+import findAllReferencerDef from '../../.agents/file-explorer/find-all-referencer'
+import { PLACEHOLDER } from '../../.agents/types/secret-agent-definition'
+
+// System prompt: describes the inputs the generator receives and appends the
+// file tree and knowledge file placeholders.
+const evalTaskGeneratorSystemPrompt = `You are an expert at analyzing git commits and generating evaluation tasks for AI coding assistants.
+
+You will receive:
+- A git diff showing the changes made
+- The list of files that were edited
+- An optional commit message
+- The repository directory where you can explore the codebase
+
+You must generate both a specification (spec) and a user prompt for the task.
+
+${PLACEHOLDER.FILE_TREE_PROMPT}
+${PLACEHOLDER.KNOWLEDGE_FILES_CONTENTS}`
+
+// Instructions prompt: the step-by-step procedure plus the writing principles
+// for the task id, the spec and the user prompt.
+const evalTaskGeneratorInstructionsPrompt = `Your task:
+1. Analyze the git diff to understand what changed
+2. Spawn the file-explorer and find-all-referencer to explore the codebase and understand context.
+3. Read as many files relevant to the changes as possible.
+4. Generate the output, including:
+- a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
+- a clear specification describing exactly what needs to be implemented
+- a high-level user prompt that describes what needs to be done leaving out details that should be reconstructed by the agent
+- supplemental files that would help a judge understand the change (exclude directly edited files)
+
+Key principles for the task ID:
+- 2-3 words maximum, hyphenated (e.g., "fix-memory-leak", "add-user-profile", "refactor-auth-flow")
+- Descriptive but concise
+- Use action verbs when appropriate (fix, add, remove, refactor, update, implement)
+- Lowercase with hyphens
+
+Key principles for the spec:
+- Prescribe exactly how to make the change with references to the files that need to be changed
+- Not include code
+- Focus on the observable behavior or structure that needs to be implemented
+- Be clear enough that a skilled developer or AI could implement it from scratch
+- Be phrased as what needs to be done, not what was already done
+- Cover all the changes shown across multiple files
+
+Key principles for the prompt:
+- Focus on the high-level functional requirements, not implementation details
+- Use natural language: "add user authentication" not "implement authenticateUser function"
+- Omit details that should be reconstructed by the agent
+- Be clear enough that a skilled developer could implement from scratch
+- Consider the commit message as a hint but don't just copy it
+`
+
+/**
+ * Agent definition that turns a commit (diff, edited paths, optional commit
+ * message) into a structured eval task: an id, reasoning, a spec, a user
+ * prompt and a list of supplemental files.
+ *
+ * It may spawn the `file-explorer` and `find-all-referencer` agents and read
+ * files before reporting its result through `set_output`.
+ */
+const evalTaskGeneratorAgentDef: AgentDefinition = {
+  id: 'git-evals2-eval-task-generator',
+  displayName: 'Git Evals2 Eval Task Generator',
+  model: 'openai/gpt-5',
+  toolNames: ['spawn_agents', 'read_files', 'set_output'],
+  spawnableAgents: ['file-explorer', 'find-all-referencer'],
+  inputSchema: {
+    prompt: {
+      type: 'string',
+      description: 'Instructions to generate the task spec and prompt',
+    },
+  },
+  outputMode: 'structured_output',
+  // Every property below is required in the structured output.
+  outputSchema: {
+    type: 'object',
+    properties: {
+      id: {
+        type: 'string',
+        description:
+          'Short 2-3 word hyphenated task identifier (e.g., "fix-auth-bug", "add-user-profile", "refactor-login-flow")',
+      },
+      reasoning: {
+        type: 'string',
+        description: 'Your thoughts about the task, spec, and prompt',
+      },
+      spec: {
+        type: 'string',
+        description:
+          'Clear specification describing WHAT needs to be implemented (observable behavior/structure, not HOW)',
+      },
+      prompt: {
+        type: 'string',
+        description: 'High-level user prompt describing what needs to be done',
+      },
+      supplementalFiles: {
+        type: 'array',
+        items: { type: 'string' },
+        description: 'List of supplemental file paths',
+      },
+    },
+    required: ['id', 'reasoning', 'spec', 'prompt', 'supplementalFiles'],
+  },
+  systemPrompt: evalTaskGeneratorSystemPrompt,
+
+  instructionsPrompt: evalTaskGeneratorInstructionsPrompt,
+}
+
+/**
+ * Runs the eval task generator agent for one commit and returns its
+ * structured output.
+ *
+ * The agent runs with `input.repoPath` as its working directory and receives
+ * the diff, the edited file paths and the optional commit message as params.
+ * The generator definition and its two explorer agents are always registered;
+ * any `agentDefinitions` supplied by the caller are appended after them.
+ * Progress events (subagent starts, tool calls, text) are logged to the console.
+ *
+ * @throws Error when the run does not end with a non-empty structured output.
+ */
+export async function generateEvalTask({
+  client,
+  input,
+  agentDefinitions,
+}: {
+  client: CodebuffClient
+  input: {
+    commitSha: string
+    parentSha: string
+    diff: string
+    editedFilePaths: string[]
+    commitMessage?: string
+    repoPath: string
+  }
+  agentDefinitions?: any[]
+}): Promise<{
+  id: string
+  reasoning: string
+  spec: string
+  prompt: string
+  supplementalFiles: string[]
+}> {
+  const generatorResult = await client.run({
+    agent: 'git-evals2-eval-task-generator',
+    prompt:
+      'Generate a task specification and user prompt based on the git diff and codebase exploration',
+    params: {
+      diff: input.diff,
+      editedFilePaths: input.editedFilePaths,
+      commitMessage: input.commitMessage,
+    },
+    cwd: input.repoPath,
+    agentDefinitions: [
+      evalTaskGeneratorAgentDef,
+      fileExplorerDef,
+      findAllReferencerDef,
+      ...(agentDefinitions || []),
+    ],
+    handleEvent: (event) => {
+      // Only these three event types are logged; everything else is ignored.
+      switch (event.type) {
+        case 'subagent_start':
+          console.log(`[Agent] Starting: ${event.displayName}`)
+          break
+        case 'tool_call':
+          console.log(`[Tool] ${event.toolName}`)
+          break
+        case 'text':
+          console.log(`[Text] ${event.text}...`)
+          break
+      }
+    },
+  })
+
+  const { output } = generatorResult
+  if (output.type === 'structuredOutput' && output.value) {
+    return output.value as {
+      id: string
+      reasoning: string
+      spec: string
+      prompt: string
+      supplementalFiles: string[]
+    }
+  }
+
+  throw new Error('Failed to generate structured task output')
+}