Skip to content

Commit 369a869

Browse files
authored
BuffBench2 (#342)
1 parent c217174 commit 369a869

18 files changed

+10506
-43
lines changed

evals/buffbench/agent-runner.ts

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import { execSync } from 'child_process'
2+
import path from 'path'
3+
4+
import { withTimeout } from '@codebuff/common/util/promise'
5+
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
6+
import { CodebuffClient } from '../../sdk/src/client'
7+
import { withTestRepo } from '../subagents/test-repo-utils'
8+
9+
import type { EvalCommitV2 } from './types'
10+
11+
/**
 * One recorded step of an agent run: the text the agent streamed plus the
 * tool activity that was captured alongside it.
 */
export interface AgentStep {
  /** Concatenated `text` event payloads received during this step. */
  response: string
  /** `tool_call` events captured during this step, stored as received. */
  toolCalls: any[]
  /** `tool_result` events captured during this step, stored as received. */
  toolResults: any[]
}
16+
17+
/**
 * Runs a single agent against one eval commit inside a test checkout of the
 * repository and collects everything needed to judge the attempt.
 *
 * The agent is given `commit.prompt` and up to 10 minutes to finish. After a
 * successful run the working tree is staged and diffed against
 * `commit.parentSha`, and the parent-commit contents of the context files
 * (supplemental files plus edited files, minus files the commit added) are
 * read via `git show`.
 *
 * This function never rejects: any failure (repo setup, agent error, timeout,
 * git failure) is captured in the returned `error` string, and the other
 * fields hold whatever was gathered before the failure.
 *
 * @param client - Client used to run the agent.
 * @param agentId - Id of the agent to run.
 * @param commit - Eval commit describing the task (prompt, parent SHA, files).
 * @param repoUrl - Repository URL forwarded to `withTestRepo`.
 * @param initCommand - Optional init command forwarded to `withTestRepo`.
 * @returns The diff produced by the agent, parent-commit context files,
 *   wall-clock duration, cost, an optional error description, and the
 *   step-by-step trace of the run.
 */
export async function runAgentOnCommit({
  client,
  agentId,
  commit,
  repoUrl,
  initCommand,
}: {
  client: CodebuffClient
  agentId: string
  commit: EvalCommitV2
  repoUrl: string
  initCommand?: string
}): Promise<{
  diff: string
  contextFiles: Record<string, string>
  durationMs: number
  cost: number
  error?: string
  trace: AgentStep[]
}> {
  console.log(`[${commit.id}] Running agent ${agentId}...`)
  const startTime = Date.now()
  let diff = ''
  const contextFiles: Record<string, string> = {}
  let error: string | undefined
  let cost = 0
  const trace: AgentStep[] = []

  // Shared output cap for git commands. Without it `execSync` falls back to
  // its default limit and fails on large diffs or large files.
  const gitMaxBuffer = 10 * 1024 * 1024

  try {
    await withTestRepo(
      {
        repoUrl,
        parentSha: commit.parentSha,
        initCommand,
      },
      async (repoDir) => {
        const agentsPath = path.join(__dirname, '../../.agents')
        const localAgentDefinitions = Object.values(
          await loadLocalAgents({ agentsPath }),
        )

        // Accumulators for the step currently being assembled.
        let responseText = ''
        let toolCalls: any[] = []
        let toolResults: any[] = []

        // Moves the accumulated step into `trace` (if non-empty) and resets
        // the accumulators.
        function flushStep() {
          if (
            responseText.length > 0 ||
            toolCalls.length > 0 ||
            toolResults.length > 0
          ) {
            trace.push({ response: responseText, toolCalls, toolResults })
            responseText = ''
            toolCalls = []
            toolResults = []
          }
        }

        const timeoutMs = 10 * 60 * 1000 // 10 minutes
        try {
          const result = await withTimeout(
            client.run({
              agent: agentId,
              prompt: commit.prompt,
              agentDefinitions: localAgentDefinitions,
              cwd: repoDir,
              handleEvent: (event) => {
                if (event.type === 'text') {
                  // Text arriving after tool results starts a new step.
                  if (toolResults.length > 0) {
                    flushStep()
                  }
                  responseText += event.text
                } else if (event.type === 'tool_call') {
                  // `set_messages` calls are deliberately left out of the trace.
                  if (event.toolName === 'set_messages') {
                    return
                  }
                  toolCalls.push(event)
                } else if (event.type === 'tool_result') {
                  toolResults.push(event)
                } else if (event.type === 'finish') {
                  flushStep()
                } else if (event.type === 'error') {
                  console.error(`[${agentId}] Error event:`, event.message)
                }
              },
            }),
            timeoutMs,
            `Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`,
          )
          // NOTE(review): credits are divided by 100, presumably to convert
          // credits into dollars - confirm the unit with the SDK.
          cost = result.sessionState.mainAgentState.creditsUsed / 100
        } finally {
          // Flush even when the run throws or times out so the partially
          // assembled step is still reported next to `error`.
          flushStep()
        }

        // Stage everything so files created by the agent show up in the diff.
        execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
        diff = execSync(`git diff ${commit.parentSha}`, {
          cwd: repoDir,
          encoding: 'utf-8',
          maxBuffer: gitMaxBuffer,
        })

        const contextFilePaths = new Set<string>([
          ...commit.supplementalFiles,
          ...commit.fileDiffs.map((fd) => fd.path),
        ])
        // Files added by the commit do not exist at the parent SHA.
        for (const { status, path: changedPath } of commit.fileDiffs) {
          if (status === 'added') {
            contextFilePaths.delete(changedPath)
          }
        }

        for (const filePath of contextFilePaths) {
          try {
            // NOTE(review): JSON.stringify only adds double quotes; a path
            // containing `$` or backticks would still be expanded by the
            // shell. Consider execFileSync with an argument array.
            contextFiles[filePath] = execSync(
              `git show ${commit.parentSha}:${JSON.stringify(filePath)}`,
              {
                cwd: repoDir,
                encoding: 'utf-8',
                maxBuffer: gitMaxBuffer,
              },
            )
          } catch {
            // Best effort: an unreadable file is recorded as empty content.
            contextFiles[filePath] = ''
          }
        }
      },
    )
  } catch (e) {
    error = e instanceof Error ? `${e.message}\n${e.stack}` : String(e)
  }

  const durationMs = Date.now() - startTime

  return {
    diff,
    contextFiles,
    durationMs,
    cost,
    error,
    trace,
  }
}

evals/buffbench/eval-codebuff.json

Lines changed: 3162 additions & 0 deletions
Large diffs are not rendered by default.

evals/buffbench/eval-manifold.json

Lines changed: 1667 additions & 0 deletions
Large diffs are not rendered by default.

evals/buffbench/eval-plane.json

Lines changed: 2028 additions & 0 deletions
Large diffs are not rendered by default.

evals/buffbench/eval-saleor.json

Lines changed: 1831 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
import { CodebuffClient } from '../../sdk/src/client'
2+
import type { AgentDefinition } from '../../sdk/src'
3+
import fileExplorerDef from '../../.agents/file-explorer/file-explorer'
4+
import findAllReferencerDef from '../../.agents/file-explorer/find-all-referencer'
5+
import { PLACEHOLDER } from '../../.agents/types/secret-agent-definition'
6+
7+
/**
 * Agent definition for the eval-task generator: given a commit's diff, it
 * explores the repository (via the file-explorer and find-all-referencer
 * sub-agents) and emits a structured task with an id, reasoning, spec,
 * user prompt, and a list of supplemental files.
 */
const evalTaskGeneratorAgentDef: AgentDefinition = {
  id: 'git-evals2-eval-task-generator',
  displayName: 'Git Evals2 Eval Task Generator',
  model: 'openai/gpt-5',
  toolNames: ['spawn_agents', 'read_files', 'set_output'],
  spawnableAgents: ['file-explorer', 'find-all-referencer'],
  inputSchema: {
    prompt: {
      type: 'string',
      description: 'Instructions to generate the task spec and prompt',
    },
  },
  outputMode: 'structured_output',
  // Shape of the structured result the agent must produce.
  outputSchema: {
    type: 'object',
    properties: {
      id: {
        type: 'string',
        description:
          'Short 2-3 word hyphenated task identifier (e.g., "fix-auth-bug", "add-user-profile", "refactor-login-flow")',
      },
      reasoning: {
        type: 'string',
        description: 'Your thoughts about the task, spec, and prompt',
      },
      spec: {
        type: 'string',
        description:
          'Clear specification describing WHAT needs to be implemented (observable behavior/structure, not HOW)',
      },
      prompt: {
        type: 'string',
        description: 'High-level user prompt describing what needs to be done',
      },
      supplementalFiles: {
        type: 'array',
        items: { type: 'string' },
        description: 'List of supplemental file paths',
      },
    },
    required: ['id', 'reasoning', 'spec', 'prompt', 'supplementalFiles'],
  },
  // The PLACEHOLDER entries are interpolated here as-is from the imported
  // secret-agent-definition module.
  systemPrompt: `You are an expert at analyzing git commits and generating evaluation tasks for AI coding assistants.

You will receive:
- A git diff showing the changes made
- The list of files that were edited
- An optional commit message
- The repository directory where you can explore the codebase

You must generate both a specification (spec) and a user prompt for the task.

${PLACEHOLDER.FILE_TREE_PROMPT}
${PLACEHOLDER.KNOWLEDGE_FILES_CONTENTS}`,

  instructionsPrompt: `Your task:
1. Analyze the git diff to understand what changed
2. Spawn the file-explorer and find-all-referencer to explore the codebase and understand context.
3. Read as many files relevant to the changes as possible.
4. Generate the output, including:
- a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
- a clear specification describing exactly what needs to be implemented
- a high-level user prompt that describes what needs to be done leaving out details that should be reconstructed by the agent
- supplemental files that would help a judge understand the change (exclude directly edited files)

Key principles for the task ID:
- 2-3 words maximum, hyphenated (e.g., "fix-memory-leak", "add-user-profile", "refactor-auth-flow")
- Descriptive but concise
- Use action verbs when appropriate (fix, add, remove, refactor, update, implement)
- Lowercase with hyphens

Key principles for the spec:
- Prescribe exactly how to make the change with references to the files that need to be changed
- Not include code
- Focus on the observable behavior or structure that needs to be implemented
- Be clear enough that a skilled developer or AI could implement it from scratch
- Be phrased as what needs to be done, not what was already done
- Cover all the changes shown across multiple files

Key principles for the prompt:
- Focus on the high-level functional requirements, not implementation details
- Use natural language: "add user authentication" not "implement authenticateUser function"
- Omit details that should be reconstructed by the agent
- Be clear enough that a skilled developer could implement from scratch
- Consider the commit message as a hint but don't just copy it
`,
}
94+
95+
/** Structured result produced by the eval-task generator agent. */
type GeneratedEvalTask = {
  id: string
  reasoning: string
  spec: string
  prompt: string
  supplementalFiles: string[]
}

/**
 * Runtime check that an unknown value has every field of a generated eval
 * task with the expected primitive types.
 */
function isGeneratedEvalTask(value: unknown): value is GeneratedEvalTask {
  if (typeof value !== 'object' || value === null) {
    return false
  }
  const candidate = value as Record<string, unknown>
  return (
    typeof candidate.id === 'string' &&
    typeof candidate.reasoning === 'string' &&
    typeof candidate.spec === 'string' &&
    typeof candidate.prompt === 'string' &&
    Array.isArray(candidate.supplementalFiles) &&
    candidate.supplementalFiles.every((file) => typeof file === 'string')
  )
}

/**
 * Runs the eval-task generator agent on a commit and returns the task it
 * produced.
 *
 * The agent runs with `input.repoPath` as its working directory and receives
 * the diff, the edited file paths, and the commit message as params. Progress
 * (sub-agent starts, tool calls, streamed text) is logged to the console.
 *
 * @param client - Client used to run the generator agent.
 * @param input - Commit data; only `diff`, `editedFilePaths`,
 *   `commitMessage`, and `repoPath` are read here.
 * @param agentDefinitions - Extra agent definitions appended after the
 *   generator, file-explorer, and find-all-referencer definitions.
 * @returns The generated task id, reasoning, spec, prompt, and supplemental
 *   file paths.
 * @throws Error when the run does not end with a non-empty structured
 *   output, or when that output is missing required fields.
 */
export async function generateEvalTask({
  client,
  input,
  agentDefinitions,
}: {
  client: CodebuffClient
  input: {
    commitSha: string
    parentSha: string
    diff: string
    editedFilePaths: string[]
    commitMessage?: string
    repoPath: string
  }
  agentDefinitions?: any[]
}): Promise<GeneratedEvalTask> {
  const { diff, editedFilePaths, commitMessage, repoPath } = input

  const allAgentDefinitions = [
    evalTaskGeneratorAgentDef,
    fileExplorerDef,
    findAllReferencerDef,
    ...(agentDefinitions ?? []),
  ]

  const generatorResult = await client.run({
    agent: 'git-evals2-eval-task-generator',
    prompt:
      'Generate a task specification and user prompt based on the git diff and codebase exploration',
    params: {
      diff,
      editedFilePaths,
      commitMessage,
    },
    cwd: repoPath,
    agentDefinitions: allAgentDefinitions,
    handleEvent: (event) => {
      if (event.type === 'subagent_start') {
        console.log(`[Agent] Starting: ${event.displayName}`)
      } else if (event.type === 'tool_call') {
        console.log(`[Tool] ${event.toolName}`)
      } else if (event.type === 'text') {
        console.log(`[Text] ${event.text}...`)
      }
    },
  })

  if (
    generatorResult.output.type !== 'structuredOutput' ||
    !generatorResult.output.value
  ) {
    throw new Error('Failed to generate structured task output')
  }

  // Validate instead of asserting the type so a malformed payload fails here
  // with a clear message rather than later in a consumer.
  const task: unknown = generatorResult.output.value
  if (!isGeneratedEvalTask(task)) {
    throw new Error(
      'Structured task output is missing required fields (id, reasoning, spec, prompt, supplementalFiles)',
    )
  }

  return task
}

0 commit comments

Comments
 (0)