Skip to content

Commit a492cdc

Browse files
committed
buffbench: extract runTask into a separate function; delete onProgress
1 parent 0aba606 commit a492cdc

File tree

3 files changed

+152
-194
lines changed

3 files changed

+152
-194
lines changed

evals/buffbench/main.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@ async function main() {
88
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
99
agents: ['base', 'base2'],
1010
commitConcurrency: 10,
11-
onProgress: (event) => {
12-
if (event.type === 'agent_error') {
13-
console.log(`[${event.agent}] ✗ ${event.evalId} error: ${event.error}`)
14-
}
15-
},
1611
})
1712

1813
const outputPath = path.join(__dirname, 'results.json')

evals/buffbench/run-buffbench.ts

Lines changed: 152 additions & 167 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,154 @@ import { judgeCommitResult } from './judge'
1111
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
1212
import { CodebuffClient } from '../../sdk/src/client'
1313

14-
import type { AgentEvalResults, EvalDataV2, ProgressEvent } from './types'
14+
import type { AgentEvalResults, EvalDataV2 } from './types'
15+
16+
/**
 * Runs one eval task: every agent in `agents` is run against `commit`, each
 * result is judged, and the traces are then analyzed together.
 *
 * Side effects:
 * - writes one `<task>-<agent>-<sha7>.json` trace file per agent into `logsDir`
 * - writes one `<task>-ANALYSIS-<sha7>.json` file into `logsDir`
 * - logs a task header and the formatted task results to the console
 *
 * There is no per-agent error handling here: agents run under `Promise.all`,
 * so a rejection from any agent run or judge call rejects the whole task.
 *
 * @param options.client - forwarded to the agent run, the judge and the trace analysis
 * @param options.commit - the eval commit (id, sha, prompt, fileDiffs) under test
 * @param options.agents - ids of the agents to run concurrently on this commit
 * @param options.repoUrl - forwarded to `runAgentOnCommit`
 * @param options.initCommand - optional, forwarded to `runAgentOnCommit`
 * @param options.logsDir - directory that receives the trace and analysis files
 * @param options.index - zero-based task index (displayed as `index + 1`)
 * @param options.totalTasks - total number of tasks, used for display only
 * @returns the commit, `agentResults` (`{ agentId, evalRun }`, in `agents`
 *   order) and `commitTraces` (in the order the agents finished)
 */
async function runTask(options: {
  client: CodebuffClient
  commit: EvalDataV2['evalCommits'][0]
  agents: string[]
  repoUrl: string
  initCommand?: string
  logsDir: string
  index: number
  totalTasks: number
}) {
  const { client, commit, agents, repoUrl, initCommand, logsDir } = options
  const { index, totalTasks } = options

  // Filename helpers. They are evaluated lazily so each path is built at the
  // point where it is needed.
  const fileSafe = (value: string) => value.replace(/[^a-zA-Z0-9-]/g, '_')
  const shortSha = () => commit.sha.slice(0, 7)
  const traceFilePathFor = (agentId: string) =>
    path.join(
      logsDir,
      `${fileSafe(commit.id)}-${fileSafe(agentId)}-${shortSha()}.json`,
    )

  console.log(
    `\n=== Task ${index + 1}/${totalTasks}: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
  )

  // Traces gathered for this commit; filled as each agent finishes and
  // analyzed together once all of them are done.
  const commitTraces: AgentTraceData[] = []

  const runSingleAgent = async (agentId: string) => {
    const run = await runAgentOnCommit({
      client,
      agentId,
      commit,
      repoUrl,
      initCommand,
    })

    const verdict = await judgeCommitResult({
      client,
      prompt: commit.prompt,
      groundTruthFileDiffs: commit.fileDiffs,
      contextFiles: run.contextFiles,
      agentDiff: run.diff,
      error: run.error,
    })

    const evalRun = {
      commitSha: commit.sha,
      prompt: commit.prompt,
      diff: run.diff,
      judging: verdict,
      cost: run.cost,
      durationMs: run.durationMs,
      error: run.error,
    }

    const tracePath = traceFilePathFor(agentId)

    // Keep the trace (with its judging) for the combined analysis, and also
    // persist it as this agent's own trace file.
    const traceRecord: AgentTraceData = {
      agentId,
      commitSha: commit.sha,
      prompt: commit.prompt,
      trace: run.trace,
      diff: run.diff,
      judgeResult: verdict,
      cost: run.cost,
      durationMs: run.durationMs,
      error: run.error,
      timestamp: new Date().toISOString(),
    }
    commitTraces.push(traceRecord)
    fs.writeFileSync(tracePath, JSON.stringify(traceRecord, null, 2))

    return { agentId, evalRun }
  }

  const agentResults = await Promise.all(
    agents.map((agentId) => runSingleAgent(agentId)),
  )

  // All agents are done for this commit: analyze their traces together.
  const traceAnalysis = await analyzeAgentTraces({
    client,
    traces: commitTraces,
    codingAgentPrompt: commit.prompt,
  })

  const perAgentSummaries = commitTraces.map((entry) => ({
    agentId: entry.agentId,
    ...entry.judgeResult,
    cost: entry.cost,
    durationMs: entry.durationMs,
    error: entry.error,
  }))

  const analysisData = {
    commitSha: commit.sha,
    timestamp: new Date().toISOString(),
    ...traceAnalysis,
    results: perAgentSummaries,
    prompt: commit.prompt,
  }

  // Persist the combined analysis next to the per-agent trace files.
  const analysisPath = path.join(
    logsDir,
    `${fileSafe(commit.id)}-ANALYSIS-${shortSha()}.json`,
  )
  fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))

  // Print every agent's judging followed by the trace analysis in one block.
  const printableResults = commitTraces.map((entry) => ({
    agentId: entry.agentId,
    judging: entry.judgeResult,
    cost: entry.cost,
    durationMs: entry.durationMs,
    error: entry.error,
    traceFilePath: traceFilePathFor(entry.agentId),
  }))

  console.log(
    formatTaskResults({
      commit,
      taskNumber: index + 1,
      totalTasks,
      agentResults: printableResults,
      traceAnalysis,
    }),
  )

  return { commit, agentResults, commitTraces }
}
15154

16155
export async function runBuffBench(options: {
17156
evalDataPath: string
18157
agents: string[]
19158
commitConcurrency?: number
20-
onProgress?: (event: ProgressEvent) => void
21159
client?: CodebuffClient
22160
}) {
23-
const { evalDataPath, agents, commitConcurrency = 1, onProgress } = options
161+
const { evalDataPath, agents, commitConcurrency = 1 } = options
24162

25163
const evalData: EvalDataV2 = JSON.parse(
26164
fs.readFileSync(evalDataPath, 'utf-8'),
@@ -56,171 +194,18 @@ export async function runBuffBench(options: {
56194
const commitLimit = pLimit(commitConcurrency)
57195

58196
const commitPromises = commitsToRun.map((commit, index) =>
59-
commitLimit(async () => {
60-
console.log(
61-
`\n=== Task ${index + 1}/${commitsToRun.length}: ${commit.id} (${commit.sha.slice(0, 7)}) ===`,
62-
)
63-
64-
// Store trace data for this commit to analyze later
65-
const commitTraces: AgentTraceData[] = []
66-
67-
const agentPromises = agents.map(async (agentId) => {
68-
onProgress?.({
69-
type: 'agent_start',
70-
agent: agentId,
71-
commit: commit.sha,
72-
evalId: commit.id,
73-
})
74-
75-
try {
76-
const agentResult = await runAgentOnCommit({
77-
client,
78-
agentId,
79-
commit,
80-
repoUrl: evalData.repoUrl,
81-
initCommand: evalData.initCommand,
82-
})
83-
84-
const judgeResult = await judgeCommitResult({
85-
client,
86-
prompt: commit.prompt,
87-
groundTruthFileDiffs: commit.fileDiffs,
88-
contextFiles: agentResult.contextFiles,
89-
agentDiff: agentResult.diff,
90-
error: agentResult.error,
91-
})
92-
93-
const evalRun = {
94-
commitSha: commit.sha,
95-
prompt: commit.prompt,
96-
diff: agentResult.diff,
97-
judging: judgeResult,
98-
cost: agentResult.cost,
99-
durationMs: agentResult.durationMs,
100-
error: agentResult.error,
101-
}
102-
103-
// Save trace to logs directory
104-
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
105-
const safeAgentId = agentId.replace(/[^a-zA-Z0-9-]/g, '_')
106-
const safeCommitShort = commit.sha.slice(0, 7)
107-
const traceFilename = `${safeTaskId}-${safeAgentId}-${safeCommitShort}.json`
108-
const tracePath = path.join(logsDir, traceFilename)
109-
110-
// Store judging result and trace for combined output later
111-
commitTraces.push({
112-
agentId,
113-
commitSha: commit.sha,
114-
prompt: commit.prompt,
115-
trace: agentResult.trace,
116-
diff: agentResult.diff,
117-
judgeResult,
118-
cost: agentResult.cost,
119-
durationMs: agentResult.durationMs,
120-
error: agentResult.error,
121-
timestamp: new Date().toISOString(),
122-
})
123-
124-
fs.writeFileSync(
125-
tracePath,
126-
JSON.stringify(commitTraces[commitTraces.length - 1], null, 2),
127-
)
128-
129-
onProgress?.({
130-
type: 'agent_complete',
131-
agent: agentId,
132-
commit: commit.sha,
133-
evalId: commit.id,
134-
score: judgeResult.overallScore,
135-
})
136-
137-
return { agentId, evalRun }
138-
} catch (error) {
139-
const errorMessage =
140-
error instanceof Error ? error.message : String(error)
141-
142-
onProgress?.({
143-
type: 'agent_error',
144-
agent: agentId,
145-
commit: commit.sha,
146-
evalId: commit.id,
147-
error: errorMessage,
148-
})
149-
150-
return {
151-
agentId,
152-
evalRun: {
153-
commitSha: commit.sha,
154-
prompt: commit.prompt,
155-
diff: '',
156-
judging: {
157-
analysis: '',
158-
strengths: [],
159-
weaknesses: [],
160-
completionScore: 0,
161-
codeQualityScore: 0,
162-
overallScore: 0,
163-
},
164-
cost: 0,
165-
durationMs: 0,
166-
error: errorMessage,
167-
},
168-
}
169-
}
170-
})
171-
172-
const agentResults = await Promise.all(agentPromises) // After all agents complete for this commit, run trace analysis
173-
174-
const traceAnalysis = await analyzeAgentTraces({
197+
commitLimit(() =>
198+
runTask({
175199
client,
176-
traces: commitTraces,
177-
codingAgentPrompt: commit.prompt,
178-
})
179-
180-
const analysisData = {
181-
commitSha: commit.sha,
182-
timestamp: new Date().toISOString(),
183-
...traceAnalysis,
184-
results: commitTraces.map((t) => ({
185-
agentId: t.agentId,
186-
...t.judgeResult,
187-
cost: t.cost,
188-
durationMs: t.durationMs,
189-
error: t.error,
190-
})),
191-
prompt: commit.prompt,
192-
}
193-
194-
// Save analysis to logs directory
195-
const safeTaskId = commit.id.replace(/[^a-zA-Z0-9-]/g, '_')
196-
const analysisCommitShort = commit.sha.slice(0, 7)
197-
const analysisFilename = `${safeTaskId}-ANALYSIS-${analysisCommitShort}.json`
198-
const analysisPath = path.join(logsDir, analysisFilename)
199-
fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))
200-
201-
// Print all agent results with their judging, then trace analysis together
202-
console.log(
203-
formatTaskResults({
204-
commit,
205-
taskNumber: index + 1,
206-
totalTasks: commitsToRun.length,
207-
agentResults: commitTraces.map((trace) => ({
208-
agentId: trace.agentId,
209-
judging: trace.judgeResult,
210-
cost: trace.cost,
211-
durationMs: trace.durationMs,
212-
error: trace.error,
213-
traceFilePath: path.join(
214-
logsDir,
215-
`${commit.id.replace(/[^a-zA-Z0-9-]/g, '_')}-${trace.agentId.replace(/[^a-zA-Z0-9-]/g, '_')}-${commit.sha.slice(0, 7)}.json`,
216-
),
217-
})),
218-
traceAnalysis,
219-
}),
220-
)
221-
222-
return { commit, agentResults }
223-
}),
200+
commit,
201+
agents,
202+
repoUrl: evalData.repoUrl,
203+
initCommand: evalData.initCommand,
204+
logsDir,
205+
index,
206+
totalTasks: commitsToRun.length,
207+
}),
208+
),
224209
)
225210

226211
const commitResults = await Promise.allSettled(commitPromises)

evals/buffbench/types.ts

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,3 @@ export interface AgentEvalResults {
6363
averageCost: number
6464
averageDuration: number
6565
}
66-
67-
export type ProgressEvent =
68-
| {
69-
type: 'agent_start'
70-
agent: string
71-
commit: string
72-
evalId: string
73-
}
74-
| {
75-
type: 'agent_complete'
76-
agent: string
77-
commit: string
78-
evalId: string
79-
score: number
80-
}
81-
| {
82-
type: 'agent_error'
83-
agent: string
84-
commit: string
85-
evalId: string
86-
error: string
87-
}

0 commit comments

Comments
 (0)