base-deep-evals

jahooma · jahooma · commit 84166f379d08 · 2026-03-03T09:24:26.000-08:00
diff --git a/agents/base2/base-deep-evals.ts b/agents/base2/base-deep-evals.ts
@@ -0,0 +1,8 @@
+import { createBaseDeep } from './base-deep'
+
+const definition = {
+  ...createBaseDeep({ noAskUser: true }),
+  id: 'base-deep-evals',
+  displayName: 'Buffy the Codex Evals Orchestrator',
+}
+export default definition
diff --git a/agents/base2/base-deep.ts b/agents/base2/base-deep.ts
@@ -1,10 +1,13 @@
+import { buildArray } from '@codebuff/common/util/array'
+
 import { publisher } from '../constants'
 import {
   PLACEHOLDER,
   type SecretAgentDefinition,
 } from '../types/secret-agent-definition'
 
-const SYSTEM_PROMPT = `You are Buffy, a strategic assistant that orchestrates complex coding tasks through specialized sub-agents. You are the AI agent behind the product, Codebuff, a CLI tool where users can chat with you to code with AI.
+function buildDeepSystemPrompt(noAskUser: boolean): string {
+  return `You are Buffy, a strategic assistant that orchestrates complex coding tasks through specialized sub-agents. You are the AI agent behind the product, Codebuff, a CLI tool where users can chat with you to code with AI.
 
 # Core Mandates
 
@@ -14,8 +17,8 @@ const SYSTEM_PROMPT = `You are Buffy, a strategic assistant that orchestrates co
 - **Spawn mentioned agents:** If the user uses "@AgentName" in their message, you must spawn that agent.
 - **Validate assumptions:** Use researchers, file pickers, and the read_files tool to verify assumptions about libraries and APIs before implementing.
 - **Proactiveness:** Fulfill the user's request thoroughly, including reasonable, directly implied follow-up actions.
-- **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it.
-- **Ask the user about important decisions or guidance using the ask_user tool:** You should feel free to stop and ask the user for guidance if there's a an important decision to make or you need an important clarification or you're stuck and don't know what to try next. Use the ask_user tool to collaborate with the user to acheive the best possible result! Prefer to gather context first before asking questions in case you end up answering your own question.
+- **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it.${noAskUser ? '' : `
+- **Ask the user about important decisions or guidance using the ask_user tool:** You should feel free to stop and ask the user for guidance if there's a an important decision to make or you need an important clarification or you're stuck and don't know what to try next. Use the ask_user tool to collaborate with the user to acheive the best possible result! Prefer to gather context first before asking questions in case you end up answering your own question.`}
 - **Be careful about terminal commands:** Be careful about instructing subagents to run terminal commands that could be destructive or have effects that are hard to undo (e.g. git push, git commit, running any scripts -- especially ones that could alter production environments (!), installing packages globally, etc). Don't run any of these effectful commands unless the user explicitly asks you to.
 - **Do what the user asks:** If the user asks you to do something, even running a risky terminal command, do it.
 
@@ -96,8 +99,10 @@ The following is the state of the git repository at the start of the conversatio
 
 ${PLACEHOLDER.GIT_CHANGES_PROMPT}
 `
+}
 
-const INSTRUCTIONS_PROMPT = `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly.
+function buildDeepInstructionsPrompt(noAskUser: boolean): string {
+  return `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly.
 
 Follow this 7-phase workflow for implementation tasks. For simple questions or explanations, answer directly without going through all phases.
 
@@ -138,7 +143,7 @@ Draft a spec first, then refine it with the user:
    - **Technical Approach**: How the implementation will work at a high level
    - **Files to Create/Modify**: List of files that will be touched
    - **Out of Scope**: Anything explicitly excluded
-   - The spec defines WHAT to build and WHY — it should NOT include detailed implementation steps or a plan. That belongs in Phase 3.
+   - The spec defines WHAT to build and WHY — it should NOT include detailed implementation steps or a plan. That belongs in Phase 3.${noAskUser ? '' : `
 3. Use the ask_user tool iteratively over MULTIPLE ROUNDS to refine the spec and clarify all aspects of the request. Ask ~2-5 focused questions per round. Continue until you have clarity on:
    - The exact scope and boundaries of the task
    - Key requirements and acceptance criteria
@@ -148,13 +153,13 @@ Draft a spec first, then refine it with the user:
    - Any constraints or preferences on implementation approach
 4. Between rounds, update SPEC.md with new information and gather additional codebase context as needed.
 5. **Do NOT ask obvious questions.** If you are >80% confident you know what the user would choose, just make that choice and move on. Only ask questions where the user's input would genuinely change the outcome.
-6. As the LAST question before finishing this phase, ask one open-ended question giving the user a chance to share any final feedback, concerns, or changes to the spec. For example: "Before I finalize the spec, is there anything else you'd like to add, change, or flag about the requirements?"
-7. Iteratively critique the spec:
+6. As the LAST question before finishing this phase, ask one open-ended question giving the user a chance to share any final feedback, concerns, or changes to the spec. For example: "Before I finalize the spec, is there anything else you'd like to add, change, or flag about the requirements?"`}
+${noAskUser ? '3' : '7'}. Iteratively critique the spec:
    a. Spawn thinker-codex to critique the spec — ask it to identify missing requirements, ambiguities, contradictions, overlooked edge cases, or technical approach issues.
    b. If the thinker raises valid critiques, update SPEC.md to address them.
    c. After updating, you MUST spawn thinker-codex again to re-critique the revised spec.
    d. Repeat until the thinker finds no new substantive critiques. Do NOT skip the re-critique — every revision must be verified.
-8. Do NOT proceed until you are confident the spec captures the full picture.
+${noAskUser ? '4' : '8'}. Do NOT proceed until you are confident the spec captures the full picture.
 
 ## Phase 3 — Plan
 
@@ -231,19 +236,22 @@ Capture learnings for future sessions:
    a. Spawn thinker-codex to critique your LESSONS.md and skill file edits — ask it to identify missing insights, improvements to existing entries, and brainstorm additional skills that could be created or updated based on the work done in this session.
    b. If the thinker suggests valid improvements or new skill ideas, update the relevant files accordingly.
    c. After updating, you MUST spawn thinker-codex again to re-critique and brainstorm further.
-   d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.
-4. Use suggest_followups to suggest ~3 next steps the user might want to take.
+   d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.${noAskUser ? '' : `
+4. Use suggest_followups to suggest ~3 next steps the user might want to take.`}
 
 Make sure to narrate to the user what you are doing and why you are doing it as you go along. Give a very short summary of what you accomplished at the end of your turn.
 
 ## Followup Requests
 
 If the full 7-phase workflow has already been completed in this conversation and the user is asking for a followup change (e.g. "also add X" or "tweak Y"), you do NOT need to repeat the entire workflow. Use your judgement to run only the phases that are relevant — for example, directly make the requested changes (Phase 4), do a light review (Phase 5), and run validation (Phase 6). Skip the spec, and plan phases if the request is a straightforward extension of the work already done. Still update LESSONS.md and skills if you learn anything new.
 `
+}
 
-export function createBaseDeep(): SecretAgentDefinition {
+export function createBaseDeep(options?: {
+  noAskUser?: boolean
+}): Omit<SecretAgentDefinition, 'id'> {
+  const { noAskUser = false } = options ?? {}
   return {
-    id: 'base-deep',
     publisher,
     model: 'openai/gpt-5.3-codex',
     displayName: 'Buffy the Codex Orchestrator',
@@ -266,18 +274,18 @@ export function createBaseDeep(): SecretAgentDefinition {
     },
     outputMode: 'last_message',
     includeMessageHistory: true,
-    toolNames: [
+    toolNames: buildArray(
       'spawn_agents',
       'read_files',
       'read_subtree',
-      'suggest_followups',
+      !noAskUser && 'suggest_followups',
       'apply_patch',
       'write_file',
       'write_todos',
-      'ask_user',
+      !noAskUser && 'ask_user',
       'skill',
       'set_output',
-    ],
+    ),
     spawnableAgents: [
       'file-picker',
       'code-searcher',
@@ -291,13 +299,13 @@ export function createBaseDeep(): SecretAgentDefinition {
       'gpt-5-agent',
       'context-pruner',
     ],
-    systemPrompt: SYSTEM_PROMPT,
-    instructionsPrompt: INSTRUCTIONS_PROMPT,
+    systemPrompt: buildDeepSystemPrompt(noAskUser),
+    instructionsPrompt: buildDeepInstructionsPrompt(noAskUser),
     stepPrompt: `Workflow phases reminder (7 phases):
 
 **Planning todos** (write at start): Phase 1 → Phase 2 → Phase 3
 1. Context & Research — file-pickers + code-searchers + researchers in parallel, read results
-2. Spec — draft SPEC.md, iterative ask_user to refine (skip obvious Qs), open-ended final Q, thinker-codex critique loop
+2. Spec — draft SPEC.md, ${noAskUser ? '' : 'iterative ask_user to refine (skip obvious Qs), open-ended final Q, '}thinker-codex critique loop
 3. Plan — write PLAN.md, thinker-codex critique loop
 
 **Implementation todos** (write after Plan): one todo per plan step + phases 5-7
@@ -326,5 +334,5 @@ export function createBaseDeep(): SecretAgentDefinition {
   }
 }
 
-const definition = createBaseDeep()
+const definition = { ...createBaseDeep(), id: 'base-deep' }
 export default definition
diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts
@@ -8,7 +8,7 @@ async function main() {
   // Use 'external:codex' for OpenAI Codex CLI
   await runBuffBench({
     evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
-    agents: ['base-deep'],
+    agents: ['base-deep-evals'],
     taskConcurrency: 5,
   })