@@ -25,13 +25,21 @@ class RunEvalSetCommand extends Command {
2525
2626 static examples = [
2727 '$ bun run run-eval-set' ,
28- '$ bun run run-eval-set --output-dir custom-output' ,
28+ '$ bun run run-eval-set --sets codebuff,manifold' ,
29+ '$ bun run run-eval-set --sets all' ,
30+ '$ bun run run-eval-set --sets plane --output-dir custom-output' ,
2931 '$ bun run run-eval-set --email --no-analysis' ,
3032 '$ bun run run-eval-set --mock --no-insert' ,
3133 '$ bun run run-eval-set --title "Weekly Performance Test"' ,
3234 ]
3335
3436 static flags = {
37+ sets : Flags . string ( {
38+ char : 's' ,
39+ description :
40+ 'Comma-separated list of eval sets to run (codebuff, manifold, plane, saleor) or "all" for all sets' ,
41+ default : 'codebuff' ,
42+ } ) ,
3543 'output-dir' : Flags . string ( {
3644 char : 'o' ,
3745 description : 'Output directory for evaluation results' ,
@@ -85,6 +93,7 @@ class RunEvalSetCommand extends Command {
8593}
8694
8795async function runEvalSet ( options : {
96+ sets : string
8897 'output-dir' : string
8998 email : boolean
9099 analysis : boolean
@@ -96,6 +105,7 @@ async function runEvalSet(options: {
96105 agent : string
97106} ) : Promise < void > {
98107 const {
108+ sets,
99109 'output-dir' : outputDir ,
100110 email : sendEmail ,
101111 analysis : postEvalAnalysis ,
@@ -127,8 +137,21 @@ async function runEvalSet(options: {
127137
128138 setGlobalConcurrencyLimit ( options . concurrency ?? 5 )
129139
130- // Define the eval configurations
131- const evalConfigs : EvalConfig [ ] = [
140+ const validSets = [ 'codebuff' , 'manifold' , 'plane' , 'saleor' ]
141+ const requestedSets =
142+ sets . trim ( ) . toLowerCase ( ) === 'all'
143+ ? validSets
144+ : sets . split ( ',' ) . map ( ( s ) => s . trim ( ) )
145+
146+ const invalidSets = requestedSets . filter ( ( s ) => ! validSets . includes ( s ) )
147+
148+ if ( invalidSets . length > 0 ) {
149+ throw new Error (
150+ `Invalid eval sets: ${ invalidSets . join ( ', ' ) } . Valid sets are: ${ validSets . join ( ', ' ) } or "all"` ,
151+ )
152+ }
153+
154+ const allEvalConfigs : EvalConfig [ ] = [
132155 {
133156 name : 'codebuff' ,
134157 evalDataPath : path . join ( __dirname , 'eval-codebuff2.json' ) ,
@@ -151,6 +174,10 @@ async function runEvalSet(options: {
151174 } ,
152175 ]
153176
177+ const evalConfigs = allEvalConfigs . filter ( ( config ) =>
178+ requestedSets . includes ( config . name ) ,
179+ )
180+
154181 console . log ( `Running ${ evalConfigs . length } evaluations:` )
155182 evalConfigs . forEach ( ( config ) => {
156183 console . log (