From b69d2e968d6ba0909e8f855c9ff79d7a25d79f1f Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 5 Feb 2026 13:50:31 -0800 Subject: [PATCH 1/4] Trigger buffbench remotely --- .github/workflows/buffbench.yml | 50 +++++++++++++++++++ evals/buffbench/main.ts | 4 +- evals/package.json | 1 + evals/scripts/trigger-buffbench.ts | 78 ++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/buffbench.yml create mode 100644 evals/scripts/trigger-buffbench.ts diff --git a/.github/workflows/buffbench.yml b/.github/workflows/buffbench.yml new file mode 100644 index 0000000000..c5340994ae --- /dev/null +++ b/.github/workflows/buffbench.yml @@ -0,0 +1,50 @@ +name: BuffBench + +on: + workflow_dispatch: # Manual triggering only + +jobs: + run-buffbench: + runs-on: ubuntu-latest + timeout-minutes: 360 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: '1.3.5' + + - name: Cache dependencies + uses: actions/cache@v4 + with: + path: | + node_modules + */node_modules + packages/*/node_modules + key: ${{ runner.os }}-deps-${{ hashFiles('**/bun.lock*') }} + restore-keys: | + ${{ runner.os }}-deps- + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Set environment variables + env: + SECRETS_CONTEXT: ${{ toJSON(secrets) }} + run: | + VAR_NAMES=$(bun scripts/generate-ci-env.ts) + echo "$SECRETS_CONTEXT" | jq -r --argjson vars "$VAR_NAMES" ' + to_entries | .[] | select(.key as $k | $vars | index($k)) | .key + "=" + .value + ' >> $GITHUB_ENV + echo "CODEBUFF_GITHUB_ACTIONS=true" >> $GITHUB_ENV + echo "NEXT_PUBLIC_CB_ENVIRONMENT=test" >> $GITHUB_ENV + echo "NEXT_PUBLIC_INFISICAL_UP=true" >> $GITHUB_ENV + echo "CODEBUFF_GITHUB_TOKEN=${{ secrets.CODEBUFF_GITHUB_TOKEN }}" >> $GITHUB_ENV + + - name: Run buffbench + run: cd evals && bun run-buffbench + + - name: Workflow completed + run: echo "BuffBench workflow completed successfully" diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index a1739f50b1..78e28fdfba 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -8,8 +8,8 @@ async function main() { // Use 'external:codex' for OpenAI Codex CLI await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base2', 'external:claude', 'external:codex'], - taskConcurrency: 1, + agents: ['base2'], + taskConcurrency: 5, }) process.exit(0) diff --git a/evals/package.json b/evals/package.json index 9f14702943..d29673f4c5 100644 --- a/evals/package.json +++ b/evals/package.json @@ -23,6 +23,7 @@ "run-eval-set": "bun run git-evals/run-eval-set.ts", "run-buffbench": "bun run buffbench/main.ts", "run-buffbench-nightly": "bun run buffbench/main-nightly.ts", + "trigger-buffbench": "bun run scripts/trigger-buffbench.ts", "setup-codebuff-repo": "bun run setup-codebuff-repo.ts" }, "sideEffects": false, diff --git a/evals/scripts/trigger-buffbench.ts b/evals/scripts/trigger-buffbench.ts new file mode 100644 index 0000000000..65f7176084 --- /dev/null +++ b/evals/scripts/trigger-buffbench.ts @@ -0,0 +1,78 @@ +#!/usr/bin/env node + +const { execSync } = require('child_process') + +function log(message: string) { + console.log(`${message}`) +} + +function error(message: string) { + console.error(`โŒ ${message}`) + process.exit(1) +} + +function checkGitHubToken() { + const token = process.env.CODEBUFF_GITHUB_TOKEN + if (!token) { + error( + 'CODEBUFF_GITHUB_TOKEN environment variable is required but not set.\n' + + 'Please set it with your GitHub personal access token or use the infisical setup.' + ) + } + return token +} + +function getCurrentBranch(): string { + try { + return execSync('git rev-parse --abbrev-ref HEAD', { encoding: 'utf8' }).trim() + } catch { + return 'main' + } +} + +async function triggerWorkflow(token: string, branch: string) { + try { + const triggerCmd = `curl -s -w "HTTP Status: %{http_code}" -X POST \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: token ${token}" \ + -H "Content-Type: application/json" \ + https://api.github.com/repos/CodebuffAI/codebuff/actions/workflows/buffbench.yml/dispatches \ + -d '{"ref":"${branch}"}'` + + const response = execSync(triggerCmd, { encoding: 'utf8' }) + + if (response.includes('workflow_dispatch')) { + log(`โš ๏ธ Workflow dispatch failed: ${response}`) + log( + 'Please manually trigger the workflow at: https://github.com/CodebuffAI/codebuff/actions/workflows/buffbench.yml', + ) + } else { + log('๐ŸŽ‰ BuffBench workflow triggered!') + } + } catch (err: any) { + log(`โš ๏ธ Failed to trigger workflow automatically: ${err.message}`) + log( + 'You may need to trigger it manually at: https://github.com/CodebuffAI/codebuff/actions/workflows/buffbench.yml', + ) + } +} + +async function main() { + const branch = process.argv[2] || getCurrentBranch() + + log('๐Ÿงช Triggering BuffBench workflow...') + log(`Branch: ${branch}`) + + const token = checkGitHubToken() + if (!token) return + log('โœ… Using CODEBUFF_GITHUB_TOKEN') + + await triggerWorkflow(token, branch) + + log('') + log('Monitor progress at: https://github.com/CodebuffAI/codebuff/actions/workflows/buffbench.yml') +} + +main().catch((err) => { + error(`Failed to trigger BuffBench: ${err.message}`) +}) From 9d433a6e1378be207aaac8bb18ccdcab8d0e255e Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 5 Feb 2026 13:52:39 -0800 Subject: [PATCH 2/4] Update agents to opus 4.6 --- agents/__tests__/editor.test.ts | 44 +++++----- agents/__tests__/thinker.test.ts | 82 +++++++++---------- agents/base2/base2.ts | 2 +- .../editor/best-of-n/best-of-n-selector2.ts | 11 ++- agents/editor/best-of-n/editor-implementor.ts | 11 ++- .../editor/best-of-n/editor-multi-prompt.ts | 2 +- agents/editor/editor.ts | 2 +- agents/general-agent/general-agent.ts | 2 +- agents/reviewer/code-reviewer.ts | 2 +- .../code-reviewer-multi-prompt.ts | 2 +- agents/thinker/best-of-n/thinker-best-of-n.ts | 14 ++-- agents/thinker/best-of-n/thinker-selector.ts | 60 +++++++------- agents/thinker/thinker.ts | 2 +- 13 files changed, 117 insertions(+), 119 deletions(-) diff --git a/agents/__tests__/editor.test.ts b/agents/__tests__/editor.test.ts index e87f4f18c1..3e516c3976 100644 --- a/agents/__tests__/editor.test.ts +++ b/agents/__tests__/editor.test.ts @@ -28,7 +28,7 @@ describe('editor agent', () => { }) test('uses opus model by default', () => { - expect(editor.model).toBe('anthropic/claude-opus-4.5') + expect(editor.model).toBe('anthropic/claude-opus-4.6') }) test('has output mode set to structured_output', () => { @@ -54,7 +54,7 @@ describe('editor agent', () => { describe('createCodeEditor', () => { test('creates opus editor by default', () => { const opusEditor = createCodeEditor({ model: 'opus' }) - expect(opusEditor.model).toBe('anthropic/claude-opus-4.5') + expect(opusEditor.model).toBe('anthropic/claude-opus-4.6') }) test('creates gpt-5 editor', () => { @@ -160,10 +160,10 @@ describe('editor agent', () => { ] const mockAgentState = createMockAgentState(initialMessages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = editor.handleSteps!({ @@ -183,10 +183,10 @@ describe('editor agent', () => { ] const mockAgentState = createMockAgentState(initialMessages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = editor.handleSteps!({ @@ -227,10 +227,10 @@ describe('editor agent', () => { ] const mockAgentState = createMockAgentState(initialMessages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = editor.handleSteps!({ @@ -278,10 +278,10 @@ describe('editor agent', () => { const initialMessages: any[] = [] const mockAgentState = createMockAgentState(initialMessages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = editor.handleSteps!({ @@ -315,10 +315,10 @@ describe('editor agent', () => { test('works with empty initial message history', () => { const mockAgentState = createMockAgentState([]) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = editor.handleSteps!({ diff --git a/agents/__tests__/thinker.test.ts b/agents/__tests__/thinker.test.ts index 3edd281f78..ac36c12dbe 100644 --- a/agents/__tests__/thinker.test.ts +++ b/agents/__tests__/thinker.test.ts @@ -29,7 +29,7 @@ describe('thinker agent', () => { }) test('uses opus model', () => { - expect(thinker.model).toBe('anthropic/claude-opus-4.5') + expect(thinker.model).toBe('anthropic/claude-opus-4.6') }) test('has output mode set to structured_output', () => { @@ -94,10 +94,10 @@ describe('thinker agent', () => { test('yields STEP to get agent state', () => { const mockAgentState = createMockAgentState() const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -125,10 +125,10 @@ describe('thinker agent', () => { const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -170,10 +170,10 @@ describe('thinker agent', () => { const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -220,10 +220,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -255,10 +255,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -298,10 +298,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -337,10 +337,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -389,10 +389,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -442,10 +442,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ @@ -477,10 +477,10 @@ Actual response here`, const mockAgentState = createMockAgentState(messages) const mockLogger = { - debug: () => {}, - info: () => {}, - warn: () => {}, - error: () => {}, + debug: () => { }, + info: () => { }, + warn: () => { }, + error: () => { }, } const generator = thinker.handleSteps!({ diff --git a/agents/base2/base2.ts b/agents/base2/base2.ts index a654652032..2de7e716d8 100644 --- a/agents/base2/base2.ts +++ b/agents/base2/base2.ts @@ -28,7 +28,7 @@ export function createBase2( return { publisher, - model: isFree ? 'x-ai/grok-4.1-fast' : 'anthropic/claude-opus-4.5', + model: isFree ? 'x-ai/grok-4.1-fast' : 'anthropic/claude-opus-4.6', displayName: 'Buffy the Orchestrator', spawnerPrompt: 'Advanced base agent that orchestrates planning, editing, and reviewing for complex coding tasks', diff --git a/agents/editor/best-of-n/best-of-n-selector2.ts b/agents/editor/best-of-n/best-of-n-selector2.ts index 651296629e..852c268783 100644 --- a/agents/editor/best-of-n/best-of-n-selector2.ts +++ b/agents/editor/best-of-n/best-of-n-selector2.ts @@ -16,7 +16,7 @@ export const createBestOfNSelector2 = (options: { model: isSonnet ? 'anthropic/claude-sonnet-4.5' : isOpus - ? 'anthropic/claude-opus-4.5' + ? 'anthropic/claude-opus-4.6' : 'openai/gpt-5.2', ...(isGpt5 && { reasoningOptions: { @@ -126,13 +126,12 @@ Try to select an implementation that fulfills all the requirements in the user's ## Response Format -${ - isSonnet || isOpus - ? `Use tags to write out your thoughts about the implementations as needed to pick the best implementation. IMPORTANT: You should think really really hard to make sure you pick the absolute best implementation! Also analyze the non-chosen implementations for any valuable techniques or approaches that could improve the selected one. +${isSonnet || isOpus + ? `Use tags to write out your thoughts about the implementations as needed to pick the best implementation. IMPORTANT: You should think really really hard to make sure you pick the absolute best implementation! Also analyze the non-chosen implementations for any valuable techniques or approaches that could improve the selected one. Then, do not write any other explanations AT ALL. You should directly output a single tool call to set_output with the selected implementationId, short reason, and suggestedImprovements array.` - : `Output a single tool call to set_output with the selected implementationId, reason, and suggestedImprovements. Do not write anything else.` -}`, + : `Output a single tool call to set_output with the selected implementationId, reason, and suggestedImprovements. Do not write anything else.` + }`, } } diff --git a/agents/editor/best-of-n/editor-implementor.ts b/agents/editor/best-of-n/editor-implementor.ts index d04eff4617..f4852fd06d 100644 --- a/agents/editor/best-of-n/editor-implementor.ts +++ b/agents/editor/best-of-n/editor-implementor.ts @@ -16,7 +16,7 @@ export const createBestOfNImplementor = (options: { model: isSonnet ? 'anthropic/claude-sonnet-4.5' : isOpus - ? 'anthropic/claude-opus-4.5' + ? 'anthropic/claude-opus-4.6' : isGemini ? 'google/gemini-3-pro-preview' : 'openai/gpt-5.1', @@ -67,10 +67,9 @@ OR for new files or major rewrites: "content": "Complete file content or edit snippet" } -${ - isGpt5 || isGemini - ? `` - : ` +${isGpt5 || isGemini + ? `` + : ` IMPORTANT: Before you start writing your implementation, you should use tags to think about the best way to implement the changes. You should think really really hard to make sure you implement the changes in the best way possible. Take as much time as you to think through all the cases to produce the best changes. You can also use tags interspersed between tool calls to think about the best way to implement the changes. @@ -98,7 +97,7 @@ You can also use tags interspersed between tool calls to think about the ` -} + } After the edit tool calls, you can optionally mention any follow-up steps to take, like deleting a file, or a specific way to validate the changes. There's no need to use the set_output tool as your entire response will be included in the output. diff --git a/agents/editor/best-of-n/editor-multi-prompt.ts b/agents/editor/best-of-n/editor-multi-prompt.ts index 61de31eda0..2d101ea8a6 100644 --- a/agents/editor/best-of-n/editor-multi-prompt.ts +++ b/agents/editor/best-of-n/editor-multi-prompt.ts @@ -11,7 +11,7 @@ import type { SecretAgentDefinition } from '../../types/secret-agent-definition' export function createMultiPromptEditor(): Omit { return { publisher, - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', displayName: 'Multi-Prompt Editor', spawnerPrompt: 'Edits code by spawning multiple implementor agents with different strategy prompts, selects the best implementation, and applies the changes. It also returns further suggested improvements which you should take seriously and act on. Pass as input an array of short prompts specifying different implementation approaches or strategies. Make sure to read any files intended to be edited before spawning this agent.', diff --git a/agents/editor/editor.ts b/agents/editor/editor.ts index 483724c70b..a22c18f0e3 100644 --- a/agents/editor/editor.ts +++ b/agents/editor/editor.ts @@ -14,7 +14,7 @@ export const createCodeEditor = (options: { ? 'openai/gpt-5.1' : options.model === 'glm' ? 'z-ai/glm-4.7' - : 'anthropic/claude-opus-4.5', + : 'anthropic/claude-opus-4.6', ...(model === 'glm' && { reasoningOptions: { effort: 'high', diff --git a/agents/general-agent/general-agent.ts b/agents/general-agent/general-agent.ts index 8c542e0e5f..37d92beacd 100644 --- a/agents/general-agent/general-agent.ts +++ b/agents/general-agent/general-agent.ts @@ -12,7 +12,7 @@ export const createGeneralAgent = (options: { return { publisher, - model: isGpt5 ? 'openai/gpt-5.2' : 'anthropic/claude-opus-4.5', + model: isGpt5 ? 'openai/gpt-5.2' : 'anthropic/claude-opus-4.6', ...(isGpt5 && { reasoningOptions: { effort: 'high' as const, diff --git a/agents/reviewer/code-reviewer.ts b/agents/reviewer/code-reviewer.ts index 0d4b6d4375..c22d2d6c40 100644 --- a/agents/reviewer/code-reviewer.ts +++ b/agents/reviewer/code-reviewer.ts @@ -64,7 +64,7 @@ Be extremely concise.`, const definition: SecretAgentDefinition = { id: 'code-reviewer', publisher, - ...createReviewer('anthropic/claude-opus-4.5'), + ...createReviewer('anthropic/claude-opus-4.6'), } export default definition diff --git a/agents/reviewer/multi-prompt/code-reviewer-multi-prompt.ts b/agents/reviewer/multi-prompt/code-reviewer-multi-prompt.ts index 126c2c6215..134862a57b 100644 --- a/agents/reviewer/multi-prompt/code-reviewer-multi-prompt.ts +++ b/agents/reviewer/multi-prompt/code-reviewer-multi-prompt.ts @@ -14,7 +14,7 @@ export function createCodeReviewerMultiPrompt(): Omit< > { return { publisher, - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', displayName: 'Multi-Prompt Code Reviewer', spawnerPrompt: 'Reviews code by spawning multiple code-reviewer agents with different focus prompts, then combines all review outputs into a comprehensive review. Make sure to read relevant files before spawning this agent. Pass an input array of short prompts specifying several different review focuses or perspectives.', diff --git a/agents/thinker/best-of-n/thinker-best-of-n.ts b/agents/thinker/best-of-n/thinker-best-of-n.ts index cd11877899..66530a9269 100644 --- a/agents/thinker/best-of-n/thinker-best-of-n.ts +++ b/agents/thinker/best-of-n/thinker-best-of-n.ts @@ -18,7 +18,7 @@ export function createThinkerBestOfN( model: isGpt5 ? 'openai/gpt-5.1' : isOpus - ? 'anthropic/claude-opus-4.5' + ? 'anthropic/claude-opus-4.6' : 'anthropic/claude-sonnet-4.5', displayName: isGpt5 ? 'Best-of-N GPT-5 Thinker' @@ -133,9 +133,9 @@ function* handleStepsDefault({ .filter((result) => result.type === 'json') .map((result) => result.value) .flat() as { - agentType: string - value: { value?: T; errorMessage?: string } - }[] + agentType: string + value: { value?: T; errorMessage?: string } + }[] return spawnedResults.map( (result) => result.value.value ?? @@ -218,9 +218,9 @@ function* handleStepsOpus({ .filter((result) => result.type === 'json') .map((result) => result.value) .flat() as { - agentType: string - value: { value?: T; errorMessage?: string } - }[] + agentType: string + value: { value?: T; errorMessage?: string } + }[] return spawnedResults.map( (result) => result.value.value ?? diff --git a/agents/thinker/best-of-n/thinker-selector.ts b/agents/thinker/best-of-n/thinker-selector.ts index c961831620..a5c302bb96 100644 --- a/agents/thinker/best-of-n/thinker-selector.ts +++ b/agents/thinker/best-of-n/thinker-selector.ts @@ -9,49 +9,49 @@ export function createThinkerSelector( return { publisher, model: isOpus - ? 'anthropic/claude-opus-4.5' + ? 'anthropic/claude-opus-4.6' : 'anthropic/claude-sonnet-4.5', displayName: isOpus ? 'Opus Thinker Output Selector' : 'Thinker Output Selector', spawnerPrompt: 'Analyzes multiple thinking outputs and selects the best one', - includeMessageHistory: true, - inheritParentSystemPrompt: true, - - toolNames: ['set_output'], - spawnableAgents: [], - - inputSchema: { - params: { - type: 'object', - properties: { - thoughts: { - type: 'array', - items: { - type: 'object', - properties: { - id: { type: 'string' }, - content: { type: 'string' }, + includeMessageHistory: true, + inheritParentSystemPrompt: true, + + toolNames: ['set_output'], + spawnableAgents: [], + + inputSchema: { + params: { + type: 'object', + properties: { + thoughts: { + type: 'array', + items: { + type: 'object', + properties: { + id: { type: 'string' }, + content: { type: 'string' }, + }, + required: ['id', 'content'], }, - required: ['id', 'content'], }, }, + required: ['thoughts'], }, - required: ['thoughts'], }, - }, - outputMode: 'structured_output', - outputSchema: { - type: 'object', - properties: { - thoughtId: { - type: 'string', - description: 'The id of the chosen thinking output', + outputMode: 'structured_output', + outputSchema: { + type: 'object', + properties: { + thoughtId: { + type: 'string', + description: 'The id of the chosen thinking output', + }, }, + required: ['thoughtId'], }, - required: ['thoughtId'], - }, instructionsPrompt: `As part of the best-of-n workflow for thinking agents, you are the thinking selector agent. diff --git a/agents/thinker/thinker.ts b/agents/thinker/thinker.ts index 036d493753..83d45a0f8a 100644 --- a/agents/thinker/thinker.ts +++ b/agents/thinker/thinker.ts @@ -5,7 +5,7 @@ import type { SecretAgentDefinition } from '../types/secret-agent-definition' const definition: SecretAgentDefinition = { id: 'thinker', publisher, - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', displayName: 'Theo the Theorizer', spawnerPrompt: 'Does deep thinking given the current conversation history and a specific prompt to focus on. Use this to help you solve a specific problem. It is better to gather any relevant context before spawning this agent.', From 4b76bc0c6a07cbb89a69eebc5535c0966dfb7efd Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 5 Feb 2026 16:59:08 -0800 Subject: [PATCH 3/4] Update .agents and agents-graveyard with new opus --- .agents/claude-code-cli.ts | 2 +- .agents/codebuff-local-cli.ts | 2 +- .agents/codex-cli.ts | 2 +- .agents/gemini-cli.ts | 2 +- .../editor/best-of-n/best-of-n-selector.ts | 11 +++++------ .../editor/best-of-n/editor-implementor2.ts | 2 +- .../editor/best-of-n/editor-multi-prompt2.ts | 2 +- agents-graveyard/editor/reviewer-editor.ts | 11 +++++------ 8 files changed, 16 insertions(+), 18 deletions(-) diff --git a/.agents/claude-code-cli.ts b/.agents/claude-code-cli.ts index a1bce0a223..b306968d98 100644 --- a/.agents/claude-code-cli.ts +++ b/.agents/claude-code-cli.ts @@ -10,7 +10,7 @@ const baseDefinition = createCliAgent({ startCommand: 'claude --dangerously-skip-permissions', permissionNote: 'Always use `--dangerously-skip-permissions` when testing to avoid permission prompts that would block automated tests.', - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', }) // Constants must be inside handleSteps since it gets serialized via .toString() diff --git a/.agents/codebuff-local-cli.ts b/.agents/codebuff-local-cli.ts index 338cb35aef..2fee1813a4 100644 --- a/.agents/codebuff-local-cli.ts +++ b/.agents/codebuff-local-cli.ts @@ -10,7 +10,7 @@ const baseDefinition = createCliAgent({ startCommand: 'bun --cwd=cli run dev', permissionNote: 'No permission flags needed for Codebuff local dev server.', - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', skipPrepPhase: true, spawnerPromptExtras: `**Purpose:** E2E visual testing of the Codebuff CLI itself. This agent starts a local dev Codebuff CLI instance and interacts with it to verify UI behavior. diff --git a/.agents/codex-cli.ts b/.agents/codex-cli.ts index 48570ff4c8..a3a7aec3a7 100644 --- a/.agents/codex-cli.ts +++ b/.agents/codex-cli.ts @@ -81,7 +81,7 @@ const baseDefinition = createCliAgent({ startCommand: 'codex -a never -s danger-full-access', permissionNote: 'Always use `-a never -s danger-full-access` when testing to avoid approval prompts that would block automated tests.', - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', extraInputParams: { reviewType: { type: 'string', diff --git a/.agents/gemini-cli.ts b/.agents/gemini-cli.ts index 9117f87e53..6f8f5f2956 100644 --- a/.agents/gemini-cli.ts +++ b/.agents/gemini-cli.ts @@ -10,7 +10,7 @@ const baseDefinition = createCliAgent({ startCommand: 'gemini --yolo', permissionNote: 'Always use `--yolo` (or `--approval-mode yolo`) when testing to auto-approve all tool actions and avoid prompts that would block automated tests.', - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', cliSpecificDocs: `## Gemini CLI Commands Gemini CLI uses slash commands for navigation: diff --git a/agents-graveyard/editor/best-of-n/best-of-n-selector.ts b/agents-graveyard/editor/best-of-n/best-of-n-selector.ts index 67945dc442..27d9dd8993 100644 --- a/agents-graveyard/editor/best-of-n/best-of-n-selector.ts +++ b/agents-graveyard/editor/best-of-n/best-of-n-selector.ts @@ -17,7 +17,7 @@ export const createBestOfNSelector = (options: { model: isSonnet ? 'anthropic/claude-sonnet-4.5' : isOpus - ? 'anthropic/claude-opus-4.5' + ? 'anthropic/claude-opus-4.6' : isGemini ? 'google/gemini-3-pro-preview' : 'openai/gpt-5.1', @@ -108,13 +108,12 @@ Try to select an implementation that fulfills all the requirements in the user's ## Response Format -${ - isSonnet || isOpus - ? `Use tags to write out your thoughts about the implementations as needed to pick the best implementation. IMPORTANT: You should think really really hard to make sure you pick the absolute best implementation! As soon as you know for sure which implementation is the best, you should output your choice. +${isSonnet || isOpus + ? `Use tags to write out your thoughts about the implementations as needed to pick the best implementation. IMPORTANT: You should think really really hard to make sure you pick the absolute best implementation! As soon as you know for sure which implementation is the best, you should output your choice. Then, do not write any other explanations AT ALL. You should directly output a single tool call to set_output with the selected implementationId and short reason.` - : `Output a single tool call to set_output with the selected implementationId. Do not write anything else.` -}`, + : `Output a single tool call to set_output with the selected implementationId. Do not write anything else.` + }`, } } diff --git a/agents-graveyard/editor/best-of-n/editor-implementor2.ts b/agents-graveyard/editor/best-of-n/editor-implementor2.ts index b0a4942c00..9447693177 100644 --- a/agents-graveyard/editor/best-of-n/editor-implementor2.ts +++ b/agents-graveyard/editor/best-of-n/editor-implementor2.ts @@ -13,7 +13,7 @@ export const createBestOfNImplementor2 = (options: { model: isGpt5 ? 'openai/gpt-5.2' : isOpus - ? 'anthropic/claude-opus-4.5' + ? 'anthropic/claude-opus-4.6' : 'anthropic/claude-sonnet-4.5', displayName: isGpt5 ? 'GPT-5 Implementation Generator v2' diff --git a/agents-graveyard/editor/best-of-n/editor-multi-prompt2.ts b/agents-graveyard/editor/best-of-n/editor-multi-prompt2.ts index 4af163cddd..be9722b5ef 100644 --- a/agents-graveyard/editor/best-of-n/editor-multi-prompt2.ts +++ b/agents-graveyard/editor/best-of-n/editor-multi-prompt2.ts @@ -10,7 +10,7 @@ import type { SecretAgentDefinition } from '../../types/secret-agent-definition' export function createMultiPromptEditor(): Omit { return { publisher, - model: 'anthropic/claude-opus-4.5', + model: 'anthropic/claude-opus-4.6', displayName: 'Multi-Prompt Editor', spawnerPrompt: 'Edits code by spawning multiple implementor agents with different strategy prompts, selects the best implementation, and applies the changes. It also returns further suggested improvements which you should take seriously and act on. Pass as input an array of short prompts specifying different implementation approaches or strategies. Make sure to read any files intended to be edited before spawning this agent.', diff --git a/agents-graveyard/editor/reviewer-editor.ts b/agents-graveyard/editor/reviewer-editor.ts index 80b43b228c..4049cb0c68 100644 --- a/agents-graveyard/editor/reviewer-editor.ts +++ b/agents-graveyard/editor/reviewer-editor.ts @@ -12,7 +12,7 @@ export const createCodeEditor = (options: { model: options.model === 'gpt-5' ? 'openai/gpt-5.1' - : 'anthropic/claude-opus-4.5', + : 'anthropic/claude-opus-4.6', displayName: 'Code Editor', spawnerPrompt: 'Expert code reviewer that reviews recent code changes and makes improvements.', @@ -58,10 +58,9 @@ OR for new files or major rewrites: } -${ - model === 'gpt-5' - ? '' - : `Before you start writing your implementation, you should use tags to think about the best way to implement the changes. +${model === 'gpt-5' + ? '' + : `Before you start writing your implementation, you should use tags to think about the best way to implement the changes. You can also use tags interspersed between tool calls to think about the best way to implement the changes. @@ -88,7 +87,7 @@ You can also use tags interspersed between tool calls to think about the ` -} + } ### Simplify the code. From 576bd20cc11f473617bc8d7c2dc451ef3176c498 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Thu, 5 Feb 2026 17:04:27 -0800 Subject: [PATCH 4/4] Update default model for token counter --- web/src/app/api/v1/token-count/_post.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/web/src/app/api/v1/token-count/_post.ts b/web/src/app/api/v1/token-count/_post.ts index 4a90a41955..9e2ce09cb1 100644 --- a/web/src/app/api/v1/token-count/_post.ts +++ b/web/src/app/api/v1/token-count/_post.ts @@ -26,6 +26,8 @@ const tokenCountRequestSchema = z.object({ type TokenCountRequest = z.infer +const DEFAULT_ANTHROPIC_MODEL = 'claude-opus-4-6' + export async function postTokenCount(params: { req: NextRequest getUserInfoFromApiKey: GetUserInfoFromApiKeyFn @@ -87,7 +89,7 @@ export async function postTokenCount(params: { userId, messageCount: messages.length, hasSystem: !!system, - model: model ?? 'claude-opus-4-5-20251101', + model: model ?? DEFAULT_ANTHROPIC_MODEL, tokenCount: inputTokens, }, `Token count: ${inputTokens}` @@ -124,7 +126,6 @@ async function countTokensViaAnthropic(params: { // Convert model from OpenRouter format (e.g. "anthropic/claude-opus-4.5") to Anthropic format (e.g. "claude-opus-4-5-20251101") // For non-Anthropic models, use the default Anthropic model for token counting - const DEFAULT_ANTHROPIC_MODEL = 'claude-opus-4-5-20251101' const isNonAnthropicModel = !model || !isClaudeModel(model) const anthropicModelId = isNonAnthropicModel ? DEFAULT_ANTHROPIC_MODEL