Skip to content

Commit ce97ebe

Browse files
committed
run-eval-set: default to running only the codebuff eval set; pass `--sets all` to run every set
1 parent 548e223 commit ce97ebe

File tree

1 file changed

+30
-3
lines changed

1 file changed

+30
-3
lines changed

evals/git-evals/run-eval-set.ts

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,21 @@ class RunEvalSetCommand extends Command {
2525

2626
static examples = [
2727
'$ bun run run-eval-set',
28-
'$ bun run run-eval-set --output-dir custom-output',
28+
'$ bun run run-eval-set --sets codebuff,manifold',
29+
'$ bun run run-eval-set --sets all',
30+
'$ bun run run-eval-set --sets plane --output-dir custom-output',
2931
'$ bun run run-eval-set --email --no-analysis',
3032
'$ bun run run-eval-set --mock --no-insert',
3133
'$ bun run run-eval-set --title "Weekly Performance Test"',
3234
]
3335

3436
static flags = {
37+
sets: Flags.string({
38+
char: 's',
39+
description:
40+
'Comma-separated list of eval sets to run (codebuff, manifold, plane, saleor) or "all" for all sets',
41+
default: 'codebuff',
42+
}),
3543
'output-dir': Flags.string({
3644
char: 'o',
3745
description: 'Output directory for evaluation results',
@@ -85,6 +93,7 @@ class RunEvalSetCommand extends Command {
8593
}
8694

8795
async function runEvalSet(options: {
96+
sets: string
8897
'output-dir': string
8998
email: boolean
9099
analysis: boolean
@@ -96,6 +105,7 @@ async function runEvalSet(options: {
96105
agent: string
97106
}): Promise<void> {
98107
const {
108+
sets,
99109
'output-dir': outputDir,
100110
email: sendEmail,
101111
analysis: postEvalAnalysis,
@@ -127,8 +137,21 @@ async function runEvalSet(options: {
127137

128138
setGlobalConcurrencyLimit(options.concurrency ?? 5)
129139

130-
// Define the eval configurations
131-
const evalConfigs: EvalConfig[] = [
140+
const validSets = ['codebuff', 'manifold', 'plane', 'saleor']
141+
const requestedSets =
142+
sets.trim().toLowerCase() === 'all'
143+
? validSets
144+
: sets.split(',').map((s) => s.trim())
145+
146+
const invalidSets = requestedSets.filter((s) => !validSets.includes(s))
147+
148+
if (invalidSets.length > 0) {
149+
throw new Error(
150+
`Invalid eval sets: ${invalidSets.join(', ')}. Valid sets are: ${validSets.join(', ')} or "all"`,
151+
)
152+
}
153+
154+
const allEvalConfigs: EvalConfig[] = [
132155
{
133156
name: 'codebuff',
134157
evalDataPath: path.join(__dirname, 'eval-codebuff2.json'),
@@ -151,6 +174,10 @@ async function runEvalSet(options: {
151174
},
152175
]
153176

177+
const evalConfigs = allEvalConfigs.filter((config) =>
178+
requestedSets.includes(config.name),
179+
)
180+
154181
console.log(`Running ${evalConfigs.length} evaluations:`)
155182
evalConfigs.forEach((config) => {
156183
console.log(

0 commit comments

Comments
 (0)