Skip to content

Commit 85a591e

Browse files
committed
Meta analyzer agent for evals
1 parent a512ec8 commit 85a591e

File tree

4 files changed

+377
-5
lines changed

4 files changed

+377
-5
lines changed

evals/buffbench/main-nightly.ts

Lines changed: 82 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import path from 'path'
33
import { sendBasicEmail } from '@codebuff/internal/loops'
44

55
import { runBuffBench } from './run-buffbench'
6+
import type { MetaAnalysisResult } from './meta-analyzer'
67
import type { AgentEvalResults } from './types'
78

89
async function main() {
@@ -23,8 +24,8 @@ async function main() {
2324
const recipientEmail = process.env.EVAL_RESULTS_EMAIL || 'team@codebuff.com'
2425
console.log(`\n📧 Sending buffbench results email to ${recipientEmail}...`)
2526

26-
const { metadata, ...agentResults } = results
27-
const emailContent = formatBuffBenchEmailContent(agentResults, metadata)
27+
const { metadata, metaAnalysis, ...agentResults } = results
28+
const emailContent = formatBuffBenchEmailContent(agentResults, metadata, metaAnalysis)
2829

2930
try {
3031
const emailResult = await sendBasicEmail({
@@ -48,6 +49,7 @@ async function main() {
4849
function formatBuffBenchEmailContent(
4950
results: Record<string, AgentEvalResults>,
5051
metadata: any,
52+
metaAnalysis: MetaAnalysisResult,
5153
) {
5254
const agents = Object.keys(results)
5355
const date = new Date().toLocaleDateString()
@@ -69,7 +71,7 @@ function formatBuffBenchEmailContent(
6971
)
7072
.join('\n\n')
7173

72-
const message = `📊 NIGHTLY BUFFBENCH RESULTS
74+
let message = `📊 NIGHTLY BUFFBENCH RESULTS
7375
7476
📈 AGENT RESULTS:
7577
${agentComparison}
@@ -82,9 +84,86 @@ ${agentComparison}
8284
Generated on: ${metadata.timestamp}
8385
Repository: ${metadata.repoUrl}`
8486

87+
// Add meta-analysis if available
88+
if (metaAnalysis) {
89+
message += '\n\n' + formatMetaAnalysis(metaAnalysis)
90+
}
91+
8592
return { subject, message }
8693
}
8794

95+
function formatMetaAnalysis(metaAnalysis: MetaAnalysisResult): string {
96+
const lines: string[] = []
97+
const separator = '═'.repeat(60)
98+
const minorSeparator = '─'.repeat(60)
99+
100+
lines.push(separator)
101+
lines.push('🔍 META-ANALYSIS: PATTERNS ACROSS ALL TASKS')
102+
lines.push(separator)
103+
lines.push('')
104+
105+
// Overall comparison
106+
lines.push('📊 OVERALL COMPARISON:')
107+
lines.push(minorSeparator)
108+
lines.push(metaAnalysis.overallComparison)
109+
lines.push('')
110+
111+
// Agent-specific insights
112+
if (metaAnalysis.agentInsights && metaAnalysis.agentInsights.length > 0) {
113+
lines.push('🤖 AGENT-SPECIFIC INSIGHTS:')
114+
lines.push(minorSeparator)
115+
116+
for (const insight of metaAnalysis.agentInsights) {
117+
lines.push('')
118+
lines.push(`[${insight.agentId}]`)
119+
lines.push('')
120+
121+
if (insight.performanceSummary) {
122+
lines.push(`Performance: ${insight.performanceSummary}`)
123+
lines.push('')
124+
}
125+
126+
if (insight.consistentStrengths?.length > 0) {
127+
lines.push('✅ Consistent Strengths:')
128+
insight.consistentStrengths.forEach((s: string) => {
129+
lines.push(` • ${s}`)
130+
})
131+
lines.push('')
132+
}
133+
134+
if (insight.consistentWeaknesses?.length > 0) {
135+
lines.push('⚠️ Consistent Weaknesses:')
136+
insight.consistentWeaknesses.forEach((w: string) => {
137+
lines.push(` • ${w}`)
138+
})
139+
lines.push('')
140+
}
141+
142+
if (insight.recommendations?.length > 0) {
143+
lines.push('💡 Recommendations:')
144+
insight.recommendations.forEach((r: string) => {
145+
lines.push(` • ${r}`)
146+
})
147+
lines.push('')
148+
}
149+
}
150+
}
151+
152+
// Key findings
153+
if (metaAnalysis.keyFindings && metaAnalysis.keyFindings.length > 0) {
154+
lines.push('🎯 KEY FINDINGS & PRIORITIES:')
155+
lines.push(minorSeparator)
156+
metaAnalysis.keyFindings.forEach((finding: string, index: number) => {
157+
lines.push(`${index + 1}. ${finding}`)
158+
})
159+
lines.push('')
160+
}
161+
162+
lines.push(separator)
163+
164+
return lines.join('\n')
165+
}
166+
88167
if (import.meta.main) {
89168
main().catch((error) => {
90169
console.error('Error running nightly buffbench:', error)

evals/buffbench/meta-analyzer.ts

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
import type { CodebuffClient } from '../../sdk/src/client'
2+
import type { AgentDefinition } from '../../sdk/src'
3+
import { withTimeout } from '@codebuff/common/util/promise'
4+
import { getErrorObject } from '@codebuff/common/util/error'
5+
import fs from 'fs'
6+
import path from 'path'
7+
8+
/**
 * Shape of one per-task ANALYSIS JSON file read from the logs directory
 * by `analyzeAllTasks` (one file per evaluated task).
 */
export interface TaskAnalysisData {
  // Commit SHA the task was evaluated against.
  commitSha: string
  // The task prompt the agents were given.
  prompt: string
  // When the analysis was produced (serialized timestamp string —
  // format presumably ISO; confirm against the writer of these files).
  timestamp: string
  // Free-form trace analysis covering all agents on this task.
  overallAnalysis: string
  // Per-agent qualitative feedback from the trace analysis.
  agentFeedback: Array<{
    agentId: string
    strengths: string[]
    weaknesses: string[]
    recommendations: string[]
  }>
  // Per-agent judge results: scores plus run metrics.
  results: Array<{
    agentId: string
    analysis: string
    strengths: string[]
    weaknesses: string[]
    completionScore: number
    codeQualityScore: number
    overallScore: number
    cost: number
    durationMs: number
    // Present when the agent run failed.
    error?: string
  }>
}
32+
33+
/**
 * Structured output produced by the meta-analyzer agent: cross-task
 * patterns per agent plus overall findings. Mirrors the agent's
 * `outputSchema` below.
 */
export interface MetaAnalysisResult {
  // High-level comparison of all agents across all tasks.
  overallComparison: string
  // Cross-task insights for each evaluated agent.
  agentInsights: Array<{
    agentId: string
    // Strength patterns recurring across multiple tasks.
    consistentStrengths: string[]
    // Weakness patterns recurring across multiple tasks.
    consistentWeaknesses: string[]
    // Summary of overall scores, cost, and time.
    performanceSummary: string
    // High-level suggestions for improving this agent.
    recommendations: string[]
  }>
  // Most important insights that should guide development priorities.
  keyFindings: string[]
}
44+
45+
// Definition of the meta-analyzer agent. It receives a single prompt
// aggregating all per-task analyses and must return a structured
// MetaAnalysisResult via set_output (its only tool).
const metaAnalyzerAgent: AgentDefinition = {
  id: 'buffbench-meta-analyzer',
  displayName: 'Buffbench Meta Analyzer',
  model: 'openai/gpt-5',
  toolNames: ['set_output'],
  inputSchema: {
    prompt: { type: 'string', description: 'The meta-analysis prompt' },
  },
  outputMode: 'structured_output',
  // JSON schema for the structured output; mirrors the
  // MetaAnalysisResult interface above — keep the two in sync.
  outputSchema: {
    type: 'object',
    properties: {
      overallComparison: {
        type: 'string',
        description: 'High-level comparison of all agents across all tasks',
      },
      agentInsights: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            agentId: { type: 'string' },
            consistentStrengths: {
              type: 'array',
              items: { type: 'string' },
              description: 'Patterns of strengths across multiple tasks',
            },
            consistentWeaknesses: {
              type: 'array',
              items: { type: 'string' },
              description: 'Patterns of weaknesses across multiple tasks',
            },
            performanceSummary: {
              type: 'string',
              description:
                'Summary of overall performance including scores, cost, and time',
            },
            recommendations: {
              type: 'array',
              items: { type: 'string' },
              description:
                'High-level recommendations for improving this agent based on patterns observed',
            },
          },
          required: [
            'agentId',
            'consistentStrengths',
            'consistentWeaknesses',
            'performanceSummary',
            'recommendations',
          ],
        },
      },
      keyFindings: {
        type: 'array',
        items: { type: 'string' },
        description:
          'Most important insights from the evaluation that should guide development priorities',
      },
    },
    required: ['overallComparison', 'agentInsights', 'keyFindings'],
  },
  // System prompt steering the agent toward cross-task patterns rather
  // than individual task details.
  systemPrompt: `You are an expert AI system evaluator analyzing patterns across multiple coding tasks and agents.

## Your Role

You will receive:
1. Trace analyses from multiple tasks showing how agents approached different problems
2. Judge analyses showing the quality of their implementations
3. Performance metrics (scores, costs, times) across all tasks

## Focus on Patterns and Trends

Your analysis should identify consistent patterns across multiple tasks:

Key Analysis Areas:
- **Consistent Behaviors**: What patterns emerge in how each agent approaches problems?
- **Performance Trends**: Which agents consistently score higher/lower? Why?
- **Cost vs Quality Trade-offs**: How do agents balance thoroughness with efficiency?
- **Reliability**: Which agents are more consistent vs variable in their performance?
- **Comparative Analysis**: What are the key differentiators between agents?

## Output Format

Provide:
- **Overall Comparison**: High-level assessment comparing all agents' general approaches and performance
- **Agent Insights**: For each agent:
- Consistent Strengths: Patterns that work well across multiple tasks
- Consistent Weaknesses: Recurring issues or limitations
- Performance Summary: Overall scores, costs, times, and reliability
- Recommendations: What changes would most improve this agent?
- **Key Findings**: 3-5 most actionable insights that should guide development priorities

Focus on actionable patterns that can inform agent improvements, not individual task details.`,
}
140+
141+
export async function analyzeAllTasks(params: {
142+
client: CodebuffClient
143+
logsDir: string
144+
agents: string[]
145+
}): Promise<MetaAnalysisResult> {
146+
const { client, logsDir, agents } = params
147+
148+
try {
149+
// Read all ANALYSIS files from logs directory
150+
const files = fs.readdirSync(logsDir)
151+
const analysisFiles = files.filter((f) => f.includes('ANALYSIS'))
152+
153+
const allTaskAnalyses: TaskAnalysisData[] = []
154+
for (const file of analysisFiles) {
155+
const filePath = path.join(logsDir, file)
156+
const content = fs.readFileSync(filePath, 'utf-8')
157+
const data: TaskAnalysisData = JSON.parse(content)
158+
allTaskAnalyses.push(data)
159+
}
160+
161+
if (allTaskAnalyses.length === 0) {
162+
console.warn('No analysis files found in logs directory')
163+
return {
164+
overallComparison: 'No analysis data available',
165+
agentInsights: [],
166+
keyFindings: [],
167+
}
168+
}
169+
170+
// Create a concise summary for each task (without full agent traces)
171+
const taskSummaries = allTaskAnalyses.map((task) => ({
172+
prompt: task.prompt,
173+
traceAnalysis: {
174+
overallAnalysis: task.overallAnalysis,
175+
agentFeedback: task.agentFeedback,
176+
},
177+
judgeResults: task.results.map((r) => ({
178+
agentId: r.agentId,
179+
overallScore: r.overallScore,
180+
completionScore: r.completionScore,
181+
codeQualityScore: r.codeQualityScore,
182+
cost: r.cost,
183+
durationMs: r.durationMs,
184+
strengths: r.strengths,
185+
weaknesses: r.weaknesses,
186+
error: r.error,
187+
})),
188+
}))
189+
190+
const prompt = `## All Task Analyses
191+
192+
You are analyzing ${allTaskAnalyses.length} tasks evaluated across ${agents.length} agent(s): ${agents.join(', ')}
193+
194+
${JSON.stringify(taskSummaries, null, 2)}
195+
196+
Analyze these results to identify:
197+
198+
1. **Overall Comparison**: How do the agents compare in general? What are the key differentiators?
199+
200+
2. **Per-Agent Patterns**: For each agent, identify:
201+
- What strengths appear consistently across tasks?
202+
- What weaknesses or issues recur?
203+
- How does their performance (scores, cost, time) compare?
204+
- What patterns emerge in how they approach problems?
205+
206+
3. **Actionable Insights**: What are the 3-5 most important findings that should guide development?
207+
- Which improvements would have the biggest impact?
208+
- What trade-offs are agents making?
209+
- Are there reliability concerns?
210+
211+
Focus on patterns across multiple tasks, not individual task details.`
212+
213+
const agentOutput: string[] = []
214+
const analyzerResult = await withTimeout(
215+
client.run({
216+
agent: 'buffbench-meta-analyzer',
217+
prompt,
218+
agentDefinitions: [metaAnalyzerAgent],
219+
handleEvent: (event) => {
220+
if (event.type === 'text') {
221+
agentOutput.push(event.text)
222+
} else if (event.type === 'tool_call') {
223+
agentOutput.push(JSON.stringify(event, null, 2))
224+
} else if (event.type === 'error') {
225+
console.warn('[Meta Analyzer] Error event:', event.message)
226+
}
227+
},
228+
}),
229+
30 * 60 * 1000,
230+
'Meta analyzer agent timed out after 30 minutes',
231+
)
232+
233+
const { output } = analyzerResult
234+
235+
if (output.type !== 'structuredOutput' || output.value === null) {
236+
console.error(
237+
'Error running meta analyzer - not structured output',
238+
JSON.stringify(output, null, 2),
239+
)
240+
console.error('Meta analyzer output trace:', agentOutput.join(''))
241+
return {
242+
overallComparison:
243+
'Error running meta analyzer - not structured output',
244+
agentInsights: [],
245+
keyFindings: [],
246+
}
247+
}
248+
249+
return output.value as MetaAnalysisResult
250+
} catch (error) {
251+
console.error(`Failed to analyze all tasks:`, getErrorObject(error))
252+
return {
253+
overallComparison: `Error running meta analyzer: ${getErrorObject(error).message}`,
254+
agentInsights: [],
255+
keyFindings: [],
256+
}
257+
}
258+
}

0 commit comments

Comments
 (0)