diff --git a/.agents/base.ts b/.agents/base.ts index eddc9d3162..394d66a79f 100644 --- a/.agents/base.ts +++ b/.agents/base.ts @@ -6,7 +6,7 @@ import type { SecretAgentDefinition } from './types/secret-agent-definition' const definition: SecretAgentDefinition = { id: 'base', publisher, - ...base('anthropic/claude-4-sonnet-20250522', 'normal'), + ...base('anthropic/claude-4.5-sonnet', 'normal'), } export default definition diff --git a/backend/src/llm-apis/openrouter.ts b/backend/src/llm-apis/openrouter.ts index b2c72c6100..4e0d296aa0 100644 --- a/backend/src/llm-apis/openrouter.ts +++ b/backend/src/llm-apis/openrouter.ts @@ -12,6 +12,11 @@ const providerOrder = { 'Anthropic', 'Amazon Bedrock', ], + [models.openrouter_claude_sonnet_4_5]: [ + 'Google', + 'Anthropic', + 'Amazon Bedrock', + ], [models.openrouter_claude_opus_4]: ['Google', 'Anthropic'], } as const diff --git a/backend/src/tools/definitions/tool/end-turn.ts b/backend/src/tools/definitions/tool/end-turn.ts index 7b4c3a2656..86b18938b2 100644 --- a/backend/src/tools/definitions/tool/end-turn.ts +++ b/backend/src/tools/definitions/tool/end-turn.ts @@ -13,7 +13,15 @@ Only use this tool to hand control back to the user. - Before calling: finish all pending steps, resolve tool results, and include any outputs the user needs to review. - Effect: Signals the UI to wait for the user's reply; any pending tool results will be ignored. -Correct usage: +*INCORRECT USAGE*: +${getToolCallString('some_tool_that_produces_results', { query: 'some example search term' }, false)} + ${getToolCallString(toolName, {})} + +*CORRECT USAGE*: +All done! Would you like some more help with xyz? + +${getToolCallString(toolName, {})} + `.trim(), } satisfies ToolDescription diff --git a/common/src/old-constants.ts b/common/src/old-constants.ts index 6bd48bf16b..41d49af0d1 100644 --- a/common/src/old-constants.ts +++ b/common/src/old-constants.ts @@ -194,6 +194,7 @@ export const geminiModels = { export type GeminiModel = (typeof geminiModels)[keyof typeof geminiModels] export const openrouterModels = { + openrouter_claude_sonnet_4_5: 'anthropic/claude-4.5-sonnet', openrouter_claude_sonnet_4: 'anthropic/claude-4-sonnet-20250522', openrouter_claude_opus_4: 'anthropic/claude-opus-4.1', openrouter_claude_3_5_haiku: 'anthropic/claude-3.5-haiku-20241022', @@ -259,6 +260,7 @@ export const shortModelNames = { 'gemini-2.5-pro': models.openrouter_gemini2_5_pro_preview, 'flash-2.5': models.openrouter_gemini2_5_flash, 'opus-4': models.openrouter_claude_opus_4, + 'sonnet-4.5': models.openrouter_claude_sonnet_4_5, 'sonnet-4': models.openrouter_claude_sonnet_4, 'sonnet-3.7': models.openrouter_claude_sonnet_4, 'sonnet-3.6': models.openrouter_claude_3_5_sonnet, diff --git a/evals/git-evals/run-git-evals.ts b/evals/git-evals/run-git-evals.ts index 696deb4181..06e401b423 100644 --- a/evals/git-evals/run-git-evals.ts +++ b/evals/git-evals/run-git-evals.ts @@ -4,6 +4,7 @@ import path from 'path' import { disableLiveUserInputCheck } from '@codebuff/backend/live-user-inputs' import { promptAiSdkStructured } from '@codebuff/backend/llm-apis/vercel-ai-sdk/ai-sdk' +import { errorToObject } from '@codebuff/common/util/object' import { withTimeout } from '@codebuff/common/util/promise' import { generateCompactId } from '@codebuff/common/util/string' import { cloneDeep } from 'lodash' @@ -247,7 +248,11 @@ Explain your reasoning in detail.`, return { ...evalRun, judging_results: { - analysis: 'Judging failed due to error', + analysis: `Judging failed due to error:\n${JSON.stringify( + judgingError instanceof Error + ? errorToObject(judgingError) + : judgingError, + )}`, strengths: [], weaknesses: ['Judging process encountered an error'], metrics: {