Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
f963c13
First version of git-evals2
jahooma Oct 11, 2025
265df6d
Use shallow clone
jahooma Oct 11, 2025
1233cd1
Store agent trace
jahooma Oct 11, 2025
a2e06b3
simplify some logs
jahooma Oct 11, 2025
3974b74
trace analyzer v1
jahooma Oct 11, 2025
815129f
Misc refactoring
jahooma Oct 11, 2025
337156e
Update trace analyzer to focus on agent process
jahooma Oct 11, 2025
45de9de
Misc improvements: write a file result log. Use gpt5 for trace analysis
jahooma Oct 11, 2025
a5f3dda
Generate prompt with agent. Migration script
jahooma Oct 12, 2025
59665e4
Generate id per task
jahooma Oct 12, 2025
974edf3
Also generate a spec
jahooma Oct 12, 2025
320fd73
Save partial results to file for generating evals
jahooma Oct 12, 2025
4918fe5
Judge based on prompt not spec, pass in context files
jahooma Oct 12, 2025
9f7877d
Add resume capability for migrate-evals-to-v2
jahooma Oct 12, 2025
84bc373
Move eval file
jahooma Oct 12, 2025
0de25d7
Don't include newly created files as context
jahooma Oct 12, 2025
e5b7fd6
misc improvements
jahooma Oct 12, 2025
5b1b72a
finish migration of eval files to git-evals2
jahooma Oct 12, 2025
5701981
Delete migration script
jahooma Oct 12, 2025
3359013
output dir is this dir
jahooma Oct 12, 2025
f22879e
fix bug
jahooma Oct 12, 2025
5d174dc
Log trace when agent errors
jahooma Oct 12, 2025
1bd32fd
Rename to BuffBench
jahooma Oct 12, 2025
25e990c
Merge branch 'main' into git-evals2
jahooma Oct 12, 2025
8ad2611
Print errors in handleEvent
jahooma Oct 12, 2025
d27e643
Diff against parentSha in case of commit
jahooma Oct 12, 2025
9477b31
tweak
jahooma Oct 12, 2025
9a836d7
Fix fileDiffs status
jahooma Oct 12, 2025
b15c36a
Add concurrency option
jahooma Oct 12, 2025
c93f96e
Agent-specific recommendations
jahooma Oct 12, 2025
74db77b
Give base2 editing tools for small changes
jahooma Oct 12, 2025
b83b0df
Remove trace from agent error
jahooma Oct 12, 2025
315a5b5
prettier output
jahooma Oct 12, 2025
1771a05
Add commander agent
jahooma Oct 12, 2025
35a3a38
Update base2 prompts (and include commander)
jahooma Oct 12, 2025
3dd6318
More output tweaks
jahooma Oct 13, 2025
a463172
Fix to include prompt in log file
jahooma Oct 13, 2025
d76340c
Merge branch 'main' into git-evals2
jahooma Oct 13, 2025
4d95f80
rename example to main. kill output path, just return all results
jahooma Oct 13, 2025
b3cf6d9
Add timeout for running agent
jahooma Oct 13, 2025
22fa6cf
delete readme
jahooma Oct 13, 2025
821f34b
Remove non-existent file from supplemental
jahooma Oct 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions evals/buffbench/agent-runner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import { execSync } from 'child_process'
import path from 'path'

import { withTimeout } from '@codebuff/common/util/promise'
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
import { CodebuffClient } from '../../sdk/src/client'
import { withTestRepo } from '../subagents/test-repo-utils'

import type { EvalCommitV2 } from './types'

/**
 * One recorded step of an agent run: a stretch of response text together with
 * the tool activity that was captured alongside it.
 */
export interface AgentStep {
  /** Concatenated text emitted by the agent during this step. */
  response: string
  /** Tool-call events recorded during this step, stored as received. */
  toolCalls: Array<any>
  /** Tool-result events recorded during this step, stored as received. */
  toolResults: Array<any>
}

/**
 * Runs an agent against the parent of an eval commit and collects the outcome.
 *
 * A test repo is prepared through `withTestRepo` at `commit.parentSha`, the
 * agent is run inside it with `commit.prompt`, and everything the agent changed
 * is staged and captured as a diff against the parent commit.
 *
 * Failures inside the run (including the 10 minute timeout) do not reject the
 * returned promise: they are caught and reported through `error`, next to
 * whatever was collected up to that point.
 *
 * @param client - SDK client used to run the agent.
 * @param agentId - Id of the agent to run; also used in log and timeout messages.
 * @param commit - Eval commit providing the prompt, parent SHA, file diffs and
 *   supplemental files.
 * @param repoUrl - Repository handed to `withTestRepo`.
 * @param initCommand - Optional setup command handed to `withTestRepo`.
 * @returns
 *   - `diff`: output of `git diff <parentSha>` after staging all changes.
 *   - `contextFiles`: parent-commit contents of the supplemental files and of
 *     the files touched by the commit, excluding files the commit added. A file
 *     that cannot be read at the parent commit maps to an empty string.
 *   - `durationMs`: wall-clock time of the whole call.
 *   - `cost`: the main agent's `creditsUsed` divided by 100 (0 if the run failed).
 *   - `error`: message and stack of the caught failure, if any.
 *   - `trace`: recorded agent steps, including the step that was in progress
 *     when a failure or timeout occurred.
 */
export async function runAgentOnCommit({
  client,
  agentId,
  commit,
  repoUrl,
  initCommand,
}: {
  client: CodebuffClient
  agentId: string
  commit: EvalCommitV2
  repoUrl: string
  initCommand?: string
}): Promise<{
  diff: string
  contextFiles: Record<string, string>
  durationMs: number
  cost: number
  error?: string
  trace: AgentStep[]
}> {
  console.log(`[${commit.id}] Running agent ${agentId}...`)
  const startTime = Date.now()

  // One output limit for every git command whose stdout is read, so a large
  // agent diff does not overflow execSync's much smaller default buffer.
  const gitMaxBuffer = 10 * 1024 * 1024

  let diff = ''
  const contextFiles: Record<string, string> = {}
  let error: string | undefined
  let cost = 0
  const trace: AgentStep[] = []

  try {
    await withTestRepo(
      {
        repoUrl,
        parentSha: commit.parentSha,
        initCommand,
      },
      async (repoDir) => {
        const agentsPath = path.join(__dirname, '../../.agents')
        const localAgentDefinitions = Object.values(
          await loadLocalAgents({ agentsPath }),
        )

        // Accumulators for the step currently being recorded.
        let responseText = ''
        let toolCalls: any[] = []
        let toolResults: any[] = []

        // Moves the pending step (if it has any content) into the trace.
        // Safe to call repeatedly: an empty pending step is ignored.
        function flushStep() {
          if (
            responseText.length > 0 ||
            toolCalls.length > 0 ||
            toolResults.length > 0
          ) {
            trace.push({ response: responseText, toolCalls, toolResults })
            responseText = ''
            toolCalls = []
            toolResults = []
          }
        }

        const timeoutMs = 10 * 60 * 1000 // 10 minutes
        try {
          const result = await withTimeout(
            client.run({
              agent: agentId,
              prompt: commit.prompt,
              agentDefinitions: localAgentDefinitions,
              cwd: repoDir,
              handleEvent: (event) => {
                if (event.type === 'text') {
                  // Text arriving after tool results starts a new step.
                  if (toolResults.length > 0) {
                    flushStep()
                  }
                  responseText += event.text
                } else if (event.type === 'tool_call') {
                  // set_messages calls are deliberately left out of the trace.
                  if (event.toolName === 'set_messages') {
                    return
                  }
                  toolCalls.push(event)
                } else if (event.type === 'tool_result') {
                  toolResults.push(event)
                } else if (event.type === 'finish') {
                  flushStep()
                } else if (event.type === 'error') {
                  console.error(`[${agentId}] Error event:`, event.message)
                }
              },
            }),
            timeoutMs,
            `Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`,
          )
          cost = result.sessionState.mainAgentState.creditsUsed / 100
        } finally {
          // Also runs when the agent throws or times out, so the step that was
          // in progress is kept in the returned trace instead of being lost.
          flushStep()
        }

        // Stage everything so newly created files show up in the diff too.
        execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
        diff = execSync(`git diff ${commit.parentSha}`, {
          cwd: repoDir,
          encoding: 'utf-8',
          maxBuffer: gitMaxBuffer,
        })

        const contextFilePaths = new Set<string>([
          ...commit.supplementalFiles,
          ...commit.fileDiffs.map((fd) => fd.path),
        ])
        // Files added by the commit do not exist at the parent commit.
        for (const { status, path: addedPath } of commit.fileDiffs) {
          if (status === 'added') {
            contextFilePaths.delete(addedPath)
          }
        }

        for (const filePath of contextFilePaths) {
          try {
            contextFiles[filePath] = execSync(
              `git show ${commit.parentSha}:${JSON.stringify(filePath)}`,
              {
                cwd: repoDir,
                encoding: 'utf-8',
                maxBuffer: gitMaxBuffer,
              },
            )
          } catch {
            // Best effort: a file that cannot be read at the parent commit is
            // still listed, with empty content.
            contextFiles[filePath] = ''
          }
        }
      },
    )
  } catch (e) {
    error = e instanceof Error ? `${e.message}\n${e.stack}` : String(e)
  }

  const durationMs = Date.now() - startTime

  return {
    diff,
    contextFiles,
    durationMs,
    cost,
    error,
    trace,
  }
}
3,162 changes: 3,162 additions & 0 deletions evals/buffbench/eval-codebuff.json

Large diffs are not rendered by default.

1,667 changes: 1,667 additions & 0 deletions evals/buffbench/eval-manifold.json

Large diffs are not rendered by default.

2,028 changes: 2,028 additions & 0 deletions evals/buffbench/eval-plane.json

Large diffs are not rendered by default.

1,831 changes: 1,831 additions & 0 deletions evals/buffbench/eval-saleor.json

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions evals/buffbench/eval-task-generator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import { CodebuffClient } from '../../sdk/src/client'
import type { AgentDefinition } from '../../sdk/src'
import fileExplorerDef from '../../.agents/file-explorer/file-explorer'
import findAllReferencerDef from '../../.agents/file-explorer/find-all-referencer'
import { PLACEHOLDER } from '../../.agents/types/secret-agent-definition'

/**
 * Agent definition that turns a git commit into an eval task.
 *
 * It is registered under the id that `generateEvalTask` passes to `client.run`,
 * may spawn the file-explorer and find-all-referencer agents, and must answer
 * with structured output containing a task id, reasoning, spec, prompt and a
 * list of supplemental files.
 */
const evalTaskGeneratorAgentDef: AgentDefinition = {
id: 'git-evals2-eval-task-generator',
displayName: 'Git Evals2 Eval Task Generator',
model: 'openai/gpt-5',
// Tools available to the generator itself; exploration is delegated to the
// spawnable agents listed below.
toolNames: ['spawn_agents', 'read_files', 'set_output'],
spawnableAgents: ['file-explorer', 'find-all-referencer'],
inputSchema: {
prompt: {
type: 'string',
description: 'Instructions to generate the task spec and prompt',
},
},
outputMode: 'structured_output',
// Shape of the structured output; every property below is required.
outputSchema: {
type: 'object',
properties: {
id: {
type: 'string',
description:
'Short 2-3 word hyphenated task identifier (e.g., "fix-auth-bug", "add-user-profile", "refactor-login-flow")',
},
reasoning: {
type: 'string',
description: 'Your thoughts about the task, spec, and prompt',
},
spec: {
type: 'string',
description:
'Clear specification describing WHAT needs to be implemented (observable behavior/structure, not HOW)',
},
prompt: {
type: 'string',
description: 'High-level user prompt describing what needs to be done',
},
supplementalFiles: {
type: 'array',
items: { type: 'string' },
description: 'List of supplemental file paths',
},
},
required: ['id', 'reasoning', 'spec', 'prompt', 'supplementalFiles'],
},
// NOTE(review): the PLACEHOLDER values interpolated at the end of the system
// prompt are presumably replaced with the file tree and knowledge-file
// contents at run time — confirm against secret-agent-definition.
systemPrompt: `You are an expert at analyzing git commits and generating evaluation tasks for AI coding assistants.

You will receive:
- A git diff showing the changes made
- The list of files that were edited
- An optional commit message
- The repository directory where you can explore the codebase

You must generate both a specification (spec) and a user prompt for the task.

${PLACEHOLDER.FILE_TREE_PROMPT}
${PLACEHOLDER.KNOWLEDGE_FILES_CONTENTS}`,

// Step-by-step instructions plus the quality criteria for the task id, the
// spec and the prompt.
instructionsPrompt: `Your task:
1. Analyze the git diff to understand what changed
2. Spawn the file-explorer and find-all-referencer to explore the codebase and understand context.
3. Read as many files relevant to the changes as possible.
4. Generate the output, including:
- a short, descriptive task ID (2-3 hyphenated words like "fix-auth-bug" or "refactor-login-flow")
- a clear specification describing exactly what needs to be implemented
- a high-level user prompt that describes what needs to be done leaving out details that should be reconstructed by the agent
- supplemental files that would help a judge understand the change (exclude directly edited files)

Key principles for the task ID:
- 2-3 words maximum, hyphenated (e.g., "fix-memory-leak", "add-user-profile", "refactor-auth-flow")
- Descriptive but concise
- Use action verbs when appropriate (fix, add, remove, refactor, update, implement)
- Lowercase with hyphens

Key principles for the spec:
- Prescribe exactly how to make the change with references to the files that need to be changed
- Not include code
- Focus on the observable behavior or structure that needs to be implemented
- Be clear enough that a skilled developer or AI could implement it from scratch
- Be phrased as what needs to be done, not what was already done
- Cover all the changes shown across multiple files

Key principles for the prompt:
- Focus on the high-level functional requirements, not implementation details
- Use natural language: "add user authentication" not "implement authenticateUser function"
- Omit details that should be reconstructed by the agent
- Be clear enough that a skilled developer could implement from scratch
- Consider the commit message as a hint but don't just copy it
`,
}

/**
 * Asks the eval-task-generator agent to derive an eval task from a commit.
 *
 * The generator is run in `input.repoPath` with the commit's diff, edited file
 * paths and optional commit message as params. Progress events (sub-agent
 * starts, tool calls, text) are logged to the console.
 *
 * @param client - SDK client used to run the generator agent.
 * @param input - Commit data; `diff`, `editedFilePaths`, `commitMessage` and
 *   `repoPath` are the fields read here.
 * @param agentDefinitions - Optional extra agent definitions, appended after
 *   the generator, file-explorer and find-all-referencer definitions.
 * @returns The generator's structured output: task id, reasoning, spec, prompt
 *   and supplemental file paths.
 * @throws Error if the run does not end with a non-empty structured output.
 */
export async function generateEvalTask({
  client,
  input,
  agentDefinitions,
}: {
  client: CodebuffClient
  input: {
    commitSha: string
    parentSha: string
    diff: string
    editedFilePaths: string[]
    commitMessage?: string
    repoPath: string
  }
  agentDefinitions?: any[]
}): Promise<{
  id: string
  reasoning: string
  spec: string
  prompt: string
  supplementalFiles: string[]
}> {
  const extraDefinitions = agentDefinitions || []
  const definitions = [
    evalTaskGeneratorAgentDef,
    fileExplorerDef,
    findAllReferencerDef,
    ...extraDefinitions,
  ]

  const { output } = await client.run({
    agent: 'git-evals2-eval-task-generator',
    prompt:
      'Generate a task specification and user prompt based on the git diff and codebase exploration',
    params: {
      diff: input.diff,
      editedFilePaths: input.editedFilePaths,
      commitMessage: input.commitMessage,
    },
    cwd: input.repoPath,
    agentDefinitions: definitions,
    handleEvent: (event) => {
      // Log the events that show progress; everything else is ignored.
      switch (event.type) {
        case 'subagent_start':
          console.log(`[Agent] Starting: ${event.displayName}`)
          break
        case 'tool_call':
          console.log(`[Tool] ${event.toolName}`)
          break
        case 'text':
          console.log(`[Text] ${event.text}...`)
          break
        default:
          break
      }
    },
  })

  // Anything other than a non-empty structured output counts as a failure.
  if (!(output.type === 'structuredOutput' && output.value)) {
    throw new Error('Failed to generate structured task output')
  }

  return output.value as {
    id: string
    reasoning: string
    spec: string
    prompt: string
    supplementalFiles: string[]
  }
}
Loading