diff --git a/.github/workflows/webhook-dispatch.yml b/.github/workflows/webhook-dispatch.yml new file mode 100644 index 0000000..ea2624b --- /dev/null +++ b/.github/workflows/webhook-dispatch.yml @@ -0,0 +1,31 @@ +name: trigger-webhook + +on: + push: + branches: [main] + tags: ['v*'] + +jobs: + dispatch: + runs-on: ubuntu-latest + steps: + - name: Trigger webhook + env: + WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }} + WEBHOOK_USER: ${{ secrets.WEBHOOK_USER }} + WEBHOOK_PASSWORD: ${{ secrets.WEBHOOK_PASSWORD }} + run: | + if [ -z "$WEBHOOK_URL" ]; then + echo "WEBHOOK_URL secret unset; skip WEBHOOK dispatch" + exit 0 + fi + basic=$(printf '%s:%s' "$WEBHOOK_USER" "$WEBHOOK_PASSWORD" | base64 -w0) + body=$(jq -nc \ + --arg sha "${{ github.sha }}" \ + --arg ref "${{ github.ref_name }}" \ + --arg url "https://github.com/${{ github.repository }}.git" \ + '{conf: {commit_sha: $sha, git_ref: $ref, git_repo_url: $url}}') + curl -sfS -X POST "$WEBHOOK_URL" \ + -H "Authorization: Basic $basic" \ + -H "Content-Type: application/json" \ + -d "$body" diff --git a/paper/benchmark-methodology-whitepaper.tex b/paper/benchmark-methodology-whitepaper.tex index f5db197..3a77fe1 100644 --- a/paper/benchmark-methodology-whitepaper.tex +++ b/paper/benchmark-methodology-whitepaper.tex @@ -314,6 +314,8 @@ \subsection{OpenCode integration in the judgment stage.} The judge CLI also supports targeted re-judging over existing judge outputs using \\\texttt{----rerun-requirements-file} with \texttt{----output}; optionally pass \texttt{----rerun-requirement-id} to refresh only one requirement. Without \texttt{----rerun-requirement-id}, all requirements for the targeted eval are re-judged and replaced in the per-eval JSON. It also supports \texttt{----rerun-missing-judgements} with \texttt{----output} to scan for and judge all evals missing per-eval judge JSON outputs (the same missing rows counted by \texttt{evalsErrored} in the rebuilt summary). 
In rerun modes, the previous \texttt{summary.json} is backed up as \texttt{summary.backup..json}, and a new aggregate summary is generated from the current per-eval result set. +Both CLIs accept repeatable \texttt{----skip-eval-id}. Generation omits those eval IDs from the discovered eval set before processing. Judging skips LLM calls for those IDs and instead requires an existing per-eval judge JSON already present under the configured judge output directory (used when orchestration merges prior verdict artifacts). Skipped evals still contribute to the aggregate summary when their per-eval JSON is present. +
Repeatable \texttt{----skip-eval-id} removes matching eval IDs from the generation pass after glob discovery (orchestration may merge outputs from a prior run for those IDs before judging). \section{LLM Judge Methodology} @@ -405,7 +407,7 @@ \subsection{Requirement Mapping and Failure Policy} \item Requirement weights are normalized before scoring (\cref{sec:scoring}). \end{itemize} -The judge CLI requires \texttt{----model}; there is no judge skip/noop path. +The judge CLI requires \texttt{----model}. Aside from repeatable \texttt{----skip-eval-id} (which reuses existing per-eval judge JSON on disk rather than calling the judge model), there is no judge noop path. \section{Scoring Methodology} \label{sec:scoring} @@ -545,7 +547,7 @@ \section{Recommended Reporting Protocol} For comparative studies, report at minimum: \begin{enumerate} \item repository commit hash (dataset and runner version) - \item CLI options (\texttt{run: ----pattern, ----model, ----timeout, ----concurrency, ----output}; \\\texttt{judge: ----model, ----timeout, ----concurrency, ----input} \\and optional rerun flags \texttt{----rerun-requirements-file}, \texttt{----output}, \\scope optional \texttt{----rerun-requirement-id}, scope optional \texttt{----rerun-missing-judgements}) + \item CLI options (\texttt{run: ----pattern, ----model, ----timeout, ----concurrency, ----output}, \\optional repeatable \texttt{----skip-eval-id}; \\\texttt{judge: ----model, ----timeout, ----concurrency, ----input}, \\optional repeatable \texttt{----skip-eval-id}, \\and optional rerun flags \texttt{----rerun-requirements-file}, \texttt{----output}, \\with optional scope flag \texttt{----rerun-requirement-id} and optional mode flag \texttt{----rerun-missing-judgements}) \item execution date and time \item counts of discovered, processed, and errored evals \item \texttt{weightedAverageScore} and \texttt{requirementsPassed/Total} diff --git a/runner/config.ts b/runner/config.ts index ae87103..cb8d47f 100644 --- a/runner/config.ts +++ 
b/runner/config.ts @@ -1,5 +1,15 @@ import { parseArgs as parseArgv } from 'node:util' +function normalizeSkipEvalIds(value: string | string[] | undefined) { + if (!value) { + return [] + } + + return (Array.isArray(value) ? value : [value]).filter( + (item) => item.length > 0 + ) +} + function parsePositiveInteger(rawValue: string, flagName: string) { const parsedValue = Number.parseInt(rawValue, 10) if (!Number.isInteger(parsedValue) || parsedValue <= 0) { @@ -25,6 +35,7 @@ export function parseRunCliArgs(argv: string[] = Bun.argv.slice(2)) { 'max-retries': { type: 'string', default: '1' }, 'model': { type: 'string' }, 'pattern': { type: 'string', default: 'evals/**/*' }, + 'skip-eval-id': { type: 'string', multiple: true }, 'timeout': { type: 'string', default: '120000' }, 'port': { type: 'string' }, 'output': { type: 'string' }, @@ -43,6 +54,7 @@ export function parseRunCliArgs(argv: string[] = Bun.argv.slice(2)) { maxRetries: parsePositiveInteger(values['max-retries'], '--max-retries'), model: values.model, pattern: values.pattern, + skipEvalIds: normalizeSkipEvalIds(values['skip-eval-id']), timeout: parsePositiveInteger(values.timeout, '--timeout'), port: parsePort(values.port), output: values.output, @@ -64,6 +76,7 @@ export function parseJudgeCliArgs(argv: string[] = Bun.argv.slice(2)) { 'rerun-missing-judgements': { type: 'boolean', default: false }, 'rerun-requirement-id': { type: 'string' }, 'rerun-requirements-file': { type: 'string' }, + 'skip-eval-id': { type: 'string', multiple: true }, 'timeout': { type: 'string', default: '120000' }, 'port': { type: 'string' }, 'input': { type: 'string' }, @@ -106,6 +119,7 @@ export function parseJudgeCliArgs(argv: string[] = Bun.argv.slice(2)) { failFast: values['fail-fast'] ?? false, maxRetries: parsePositiveInteger(values['max-retries'], '--max-retries'), model: values.model, + skipEvalIds: normalizeSkipEvalIds(values['skip-eval-id']), rerunMissingJudgements: values['rerun-missing-judgements'] ?? 
false, rerunRequirementId: values['rerun-requirement-id'], rerunRequirementsFile: values['rerun-requirements-file'], diff --git a/runner/judge.ts b/runner/judge.ts index 31e8523..e305f12 100644 --- a/runner/judge.ts +++ b/runner/judge.ts @@ -326,6 +326,7 @@ async function runWithRetries( */ export async function runJudgeEntry(argv: string[] = Bun.argv.slice(2)) { const cliOptions = parseJudgeCliArgs(argv) + const skipEvalIdSet = new Set(cliOptions.skipEvalIds) const inputDirectory = path.resolve(process.cwd(), cliOptions.input) const outputDirectory = cliOptions.output ?? path.dirname(inputDirectory) const outputDirectories = await createRunOutputDirectories(outputDirectory) @@ -610,6 +611,35 @@ export async function runJudgeEntry(argv: string[] = Bun.argv.slice(2)) { cliOptions.concurrency, async (manifestEval, index) => { try { + if (skipEvalIdSet.has(manifestEval.evalId)) { + const resultFilePath = getResultFilePath( + outputDirectories.runDirectory, + manifestEval.generatedPath, + manifestEval.evalId + ) + + try { + const raw = await readFile(resultFilePath, 'utf8') + const parsed = parsePersistedEvalResult(raw, resultFilePath) + const position = index + 1 + console.log( + `[${position}/${manifestEvals.length}] ${manifestEval.evalId} ` + + `-> llm:${parsed.score.ratio} (reused prior judge output)` + ) + + return { kind: 'success' as const, index, result: parsed } + } catch (error) { + if (isNotFoundError(error)) { + throw new Error( + `missing judge output for skipped eval ${manifestEval.evalId} at ` + + `${toRelativePath(resultFilePath)}` + ) + } + + throw error + } + } + const stageResult = await runJudgeForManifestEval({ manifestEval, index, diff --git a/runner/run.ts b/runner/run.ts index 26e4860..54ddbdf 100644 --- a/runner/run.ts +++ b/runner/run.ts @@ -57,7 +57,10 @@ async function runWithRetries( */ export async function runGenerationEntry(argv: string[] = Bun.argv.slice(2)) { const cliOptions = parseRunCliArgs(argv) - const discoveredEvals = await 
discoverEvals(cliOptions.pattern) + const skipEvalIdSet = new Set(cliOptions.skipEvalIds) + const discoveredEvals = (await discoverEvals(cliOptions.pattern)).filter( + (item) => !skipEvalIdSet.has(item.evalId) + ) const runId = new Date().toISOString().replace(/[:.]/g, '-') const startedAt = new Date().toISOString() const outputDirectory = path.resolve(