@@ -11,16 +11,154 @@ import { judgeCommitResult } from './judge'
1111import { analyzeAgentTraces , type AgentTraceData } from './trace-analyzer'
1212import { CodebuffClient } from '../../sdk/src/client'
1313
14- import type { AgentEvalResults , EvalDataV2 , ProgressEvent } from './types'
14+ import type { AgentEvalResults , EvalDataV2 } from './types'
15+
/** Matches every character that is not an ASCII letter, a digit or `-`. */
const UNSAFE_FILENAME_CHARS = /[^a-zA-Z0-9-]/g

/** Replaces each character matched by UNSAFE_FILENAME_CHARS with `_`. */
function toSafeFilenamePart(value: string): string {
  return value.replace(UNSAFE_FILENAME_CHARS, '_')
}

/**
 * Runs every agent against a single eval commit, judges each result, writes
 * one trace file per agent plus one combined analysis file into `logsDir`,
 * and prints the formatted task summary.
 *
 * All agents are started together and awaited with `Promise.all`, so a
 * rejection from any agent run or judge call rejects the whole task before
 * the trace analysis runs; trace files already written by agents that
 * finished earlier stay on disk.
 *
 * @param options.client - Client passed through to the agent run, the judge
 *   and the trace analysis.
 * @param options.commit - The eval commit to run; its `id`, `sha`, `prompt`
 *   and `fileDiffs` are read here.
 * @param options.agents - Ids of the agents to run on this commit.
 * @param options.repoUrl - Forwarded to `runAgentOnCommit`.
 * @param options.initCommand - Forwarded to `runAgentOnCommit`.
 * @param options.logsDir - Directory the trace and analysis JSON files are
 *   written to.
 * @param options.index - Zero-based position of this task; shown one-based.
 * @param options.totalTasks - Total number of tasks, used for display only.
 * @returns The commit, the per-agent results in the same order as `agents`,
 *   and the collected traces in the order the agents finished.
 */
async function runTask(options: {
  client: CodebuffClient
  commit: EvalDataV2['evalCommits'][0]
  agents: string[]
  repoUrl: string
  initCommand?: string
  logsDir: string
  index: number
  totalTasks: number
}) {
  const {
    client,
    commit,
    agents,
    repoUrl,
    initCommand,
    logsDir,
    index,
    totalTasks,
  } = options

  // Filename fragments shared by every file written for this commit.
  const safeTaskId = toSafeFilenamePart(commit.id)
  const shortSha = commit.sha.slice(0, 7)

  // Single source of truth for a trace file location, so the path that is
  // written and the path that is reported cannot diverge.
  const traceFilePathFor = (agentId: string): string =>
    path.join(
      logsDir,
      `${safeTaskId}-${toSafeFilenamePart(agentId)}-${shortSha}.json`,
    )

  console.log(
    `\n=== Task ${index + 1}/${totalTasks}: ${commit.id} (${shortSha}) ===`,
  )

  // Store trace data for this commit to analyze later. Entries are appended
  // as each agent finishes, so the order is completion order.
  const commitTraces: AgentTraceData[] = []

  const agentPromises = agents.map(async (agentId) => {
    const agentResult = await runAgentOnCommit({
      client,
      agentId,
      commit,
      repoUrl,
      initCommand,
    })

    const judgeResult = await judgeCommitResult({
      client,
      prompt: commit.prompt,
      groundTruthFileDiffs: commit.fileDiffs,
      contextFiles: agentResult.contextFiles,
      agentDiff: agentResult.diff,
      error: agentResult.error,
    })

    const evalRun = {
      commitSha: commit.sha,
      prompt: commit.prompt,
      diff: agentResult.diff,
      judging: judgeResult,
      cost: agentResult.cost,
      durationMs: agentResult.durationMs,
      error: agentResult.error,
    }

    // Store judging result and trace for combined output later
    const traceData: AgentTraceData = {
      agentId,
      commitSha: commit.sha,
      prompt: commit.prompt,
      trace: agentResult.trace,
      diff: agentResult.diff,
      judgeResult,
      cost: agentResult.cost,
      durationMs: agentResult.durationMs,
      error: agentResult.error,
      timestamp: new Date().toISOString(),
    }
    commitTraces.push(traceData)

    // Save trace to logs directory
    fs.writeFileSync(
      traceFilePathFor(agentId),
      JSON.stringify(traceData, null, 2),
    )

    return { agentId, evalRun }
  })

  const agentResults = await Promise.all(agentPromises)

  // After all agents complete for this commit, run trace analysis
  const traceAnalysis = await analyzeAgentTraces({
    client,
    traces: commitTraces,
    codingAgentPrompt: commit.prompt,
  })

  // Key order matters: fields spread from traceAnalysis may override
  // commitSha/timestamp, while results and prompt always win.
  const analysisData = {
    commitSha: commit.sha,
    timestamp: new Date().toISOString(),
    ...traceAnalysis,
    results: commitTraces.map((t) => ({
      agentId: t.agentId,
      ...t.judgeResult,
      cost: t.cost,
      durationMs: t.durationMs,
      error: t.error,
    })),
    prompt: commit.prompt,
  }

  // Save analysis to logs directory
  const analysisPath = path.join(
    logsDir,
    `${safeTaskId}-ANALYSIS-${shortSha}.json`,
  )
  fs.writeFileSync(analysisPath, JSON.stringify(analysisData, null, 2))

  // Print all agent results with their judging, then trace analysis together
  console.log(
    formatTaskResults({
      commit,
      taskNumber: index + 1,
      totalTasks,
      agentResults: commitTraces.map((trace) => ({
        agentId: trace.agentId,
        judging: trace.judgeResult,
        cost: trace.cost,
        durationMs: trace.durationMs,
        error: trace.error,
        traceFilePath: traceFilePathFor(trace.agentId),
      })),
      traceAnalysis,
    }),
  )

  return { commit, agentResults, commitTraces }
}
15154
16155export async function runBuffBench ( options : {
17156 evalDataPath : string
18157 agents : string [ ]
19158 commitConcurrency ?: number
20- onProgress ?: ( event : ProgressEvent ) => void
21159 client ?: CodebuffClient
22160} ) {
23- const { evalDataPath, agents, commitConcurrency = 1 , onProgress } = options
161+ const { evalDataPath, agents, commitConcurrency = 1 } = options
24162
25163 const evalData : EvalDataV2 = JSON . parse (
26164 fs . readFileSync ( evalDataPath , 'utf-8' ) ,
@@ -56,171 +194,18 @@ export async function runBuffBench(options: {
56194 const commitLimit = pLimit ( commitConcurrency )
57195
58196 const commitPromises = commitsToRun . map ( ( commit , index ) =>
59- commitLimit ( async ( ) => {
60- console . log (
61- `\n=== Task ${ index + 1 } /${ commitsToRun . length } : ${ commit . id } (${ commit . sha . slice ( 0 , 7 ) } ) ===` ,
62- )
63-
64- // Store trace data for this commit to analyze later
65- const commitTraces : AgentTraceData [ ] = [ ]
66-
67- const agentPromises = agents . map ( async ( agentId ) => {
68- onProgress ?.( {
69- type : 'agent_start' ,
70- agent : agentId ,
71- commit : commit . sha ,
72- evalId : commit . id ,
73- } )
74-
75- try {
76- const agentResult = await runAgentOnCommit ( {
77- client,
78- agentId,
79- commit,
80- repoUrl : evalData . repoUrl ,
81- initCommand : evalData . initCommand ,
82- } )
83-
84- const judgeResult = await judgeCommitResult ( {
85- client,
86- prompt : commit . prompt ,
87- groundTruthFileDiffs : commit . fileDiffs ,
88- contextFiles : agentResult . contextFiles ,
89- agentDiff : agentResult . diff ,
90- error : agentResult . error ,
91- } )
92-
93- const evalRun = {
94- commitSha : commit . sha ,
95- prompt : commit . prompt ,
96- diff : agentResult . diff ,
97- judging : judgeResult ,
98- cost : agentResult . cost ,
99- durationMs : agentResult . durationMs ,
100- error : agentResult . error ,
101- }
102-
103- // Save trace to logs directory
104- const safeTaskId = commit . id . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' )
105- const safeAgentId = agentId . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' )
106- const safeCommitShort = commit . sha . slice ( 0 , 7 )
107- const traceFilename = `${ safeTaskId } -${ safeAgentId } -${ safeCommitShort } .json`
108- const tracePath = path . join ( logsDir , traceFilename )
109-
110- // Store judging result and trace for combined output later
111- commitTraces . push ( {
112- agentId,
113- commitSha : commit . sha ,
114- prompt : commit . prompt ,
115- trace : agentResult . trace ,
116- diff : agentResult . diff ,
117- judgeResult,
118- cost : agentResult . cost ,
119- durationMs : agentResult . durationMs ,
120- error : agentResult . error ,
121- timestamp : new Date ( ) . toISOString ( ) ,
122- } )
123-
124- fs . writeFileSync (
125- tracePath ,
126- JSON . stringify ( commitTraces [ commitTraces . length - 1 ] , null , 2 ) ,
127- )
128-
129- onProgress ?.( {
130- type : 'agent_complete' ,
131- agent : agentId ,
132- commit : commit . sha ,
133- evalId : commit . id ,
134- score : judgeResult . overallScore ,
135- } )
136-
137- return { agentId, evalRun }
138- } catch ( error ) {
139- const errorMessage =
140- error instanceof Error ? error . message : String ( error )
141-
142- onProgress ?.( {
143- type : 'agent_error' ,
144- agent : agentId ,
145- commit : commit . sha ,
146- evalId : commit . id ,
147- error : errorMessage ,
148- } )
149-
150- return {
151- agentId,
152- evalRun : {
153- commitSha : commit . sha ,
154- prompt : commit . prompt ,
155- diff : '' ,
156- judging : {
157- analysis : '' ,
158- strengths : [ ] ,
159- weaknesses : [ ] ,
160- completionScore : 0 ,
161- codeQualityScore : 0 ,
162- overallScore : 0 ,
163- } ,
164- cost : 0 ,
165- durationMs : 0 ,
166- error : errorMessage ,
167- } ,
168- }
169- }
170- } )
171-
172- const agentResults = await Promise . all ( agentPromises ) // After all agents complete for this commit, run trace analysis
173-
174- const traceAnalysis = await analyzeAgentTraces ( {
197+ commitLimit ( ( ) =>
198+ runTask ( {
175199 client,
176- traces : commitTraces ,
177- codingAgentPrompt : commit . prompt ,
178- } )
179-
180- const analysisData = {
181- commitSha : commit . sha ,
182- timestamp : new Date ( ) . toISOString ( ) ,
183- ...traceAnalysis ,
184- results : commitTraces . map ( ( t ) => ( {
185- agentId : t . agentId ,
186- ...t . judgeResult ,
187- cost : t . cost ,
188- durationMs : t . durationMs ,
189- error : t . error ,
190- } ) ) ,
191- prompt : commit . prompt ,
192- }
193-
194- // Save analysis to logs directory
195- const safeTaskId = commit . id . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' )
196- const analysisCommitShort = commit . sha . slice ( 0 , 7 )
197- const analysisFilename = `${ safeTaskId } -ANALYSIS-${ analysisCommitShort } .json`
198- const analysisPath = path . join ( logsDir , analysisFilename )
199- fs . writeFileSync ( analysisPath , JSON . stringify ( analysisData , null , 2 ) )
200-
201- // Print all agent results with their judging, then trace analysis together
202- console . log (
203- formatTaskResults ( {
204- commit,
205- taskNumber : index + 1 ,
206- totalTasks : commitsToRun . length ,
207- agentResults : commitTraces . map ( ( trace ) => ( {
208- agentId : trace . agentId ,
209- judging : trace . judgeResult ,
210- cost : trace . cost ,
211- durationMs : trace . durationMs ,
212- error : trace . error ,
213- traceFilePath : path . join (
214- logsDir ,
215- `${ commit . id . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ trace . agentId . replace ( / [ ^ a - z A - Z 0 - 9 - ] / g, '_' ) } -${ commit . sha . slice ( 0 , 7 ) } .json` ,
216- ) ,
217- } ) ) ,
218- traceAnalysis,
219- } ) ,
220- )
221-
222- return { commit, agentResults }
223- } ) ,
200+ commit,
201+ agents,
202+ repoUrl : evalData . repoUrl ,
203+ initCommand : evalData . initCommand ,
204+ logsDir,
205+ index,
206+ totalTasks : commitsToRun . length ,
207+ } ) ,
208+ ) ,
224209 )
225210
226211 const commitResults = await Promise . allSettled ( commitPromises )
0 commit comments