Skip to content

Commit 3f5ee99

Browse files
MrFlounderclaude
and authored
fix(crab-pf): replace broken verify with smoke+session test, add session handling (#32)
## Summary

- **Fix false-positive verification**: The old verify tool ran `promptfoo eval` against a config with a `redteam` section but no `tests` array — zero tests ran, zero failures, reported success. Now replaced with a 3-step process: direct provider smoke test → session test → eval with 2 real test cases.
- **Add session handling context**: System prompt now teaches the agent about the `callApi(prompt, context, options)` signature and the `sessionId` contract needed for multi-turn redteam attacks (Crescendo, GOAT).
- **Fix GPT-5/o1/o3 compatibility**: Use `max_completion_tokens` instead of `max_tokens`, and omit `temperature` for reasoning models.
- **Fix Node.js module caching**: `await import(url)` returns the cached module when provider.js is rewritten. Added a `?t=timestamp` cache buster.

## Changed files

| File | What changed |
|------|--------------|
| `generator/config.ts` | Replace `redteam` section with `prompts` + `tests` + `defaultTest.assert` |
| `agent/loop.ts` | Rewrite `verify` tool: smoke test + session test + eval with proper parsing |
| `agent/tools.ts` | Update verify description, remove unused `numTests` param |
| `agent/system-prompt.ts` | Add session handling section, update `callApi` signature in example |
| `agent/providers.ts` | GPT-5/o1/o3 compat (`max_completion_tokens`, no `temperature`) |

## Test plan

- [ ] Run `crab pf` against a simple HTTP target — verify eval shows `2 passed, 0 failed`
- [ ] Run against a session-based target — verify the provider gets the correct `callApi` signature and returns `sessionId`
- [ ] Run with `--provider openai:gpt-5` — verify no API errors
- [ ] Break provider.js intentionally — verify the smoke test catches it (no false positive)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b013517 commit 3f5ee99

5 files changed

Lines changed: 128 additions & 61 deletions

File tree

plugins/promptfoo/src/agent/loop.ts

Lines changed: 87 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ import { toOpenAITools, toAnthropicTools } from './tools.js';
1313
import type { LLMProvider, Message, ToolCall, ChatResponse } from './providers.js';
1414
import type { DiscoveryResult } from '../types.js';
1515
import * as fs from 'node:fs';
16+
import * as path from 'node:path';
1617
import { execSync } from 'node:child_process';
18+
import { pathToFileURL } from 'node:url';
1719

1820
export interface AgentOptions {
1921
context: string; // Raw artifact or description
@@ -76,7 +78,7 @@ Steps:
7678
2. Send a probe to verify connectivity
7779
3. Identify the prompt field and response field
7880
4. Generate the config (and provider file if needed)
79-
5. Verify it works with a mini redteam test
81+
5. Verify it works
8082
6. Call done() when complete`,
8183
},
8284
];
@@ -152,11 +154,14 @@ Steps:
152154
toolCalls: response.toolCalls,
153155
});
154156

155-
// 5. Add tool results
157+
// 5. Add tool results — include error in content so LLM can reason about failures
156158
for (const result of toolResults) {
159+
const content = result.error
160+
? JSON.stringify({ error: result.error, result: result.result })
161+
: JSON.stringify(result.result);
157162
messages.push({
158163
role: 'tool',
159-
content: JSON.stringify(result.result),
164+
content,
160165
toolCallId: result.toolCallId,
161166
});
162167
}
@@ -268,60 +273,113 @@ async function executeTool(
268273
}
269274

270275
case 'verify': {
271-
const { configFile, numTests } = args as {
276+
const { configFile } = args as {
272277
configFile?: string;
273-
numTests?: number;
274278
};
275279

276280
const configPath = configFile || state.configFile || 'promptfooconfig.yaml';
281+
const steps: string[] = [];
282+
283+
// Step 1: Direct provider smoke + session test
284+
const providerPath = path.join(outputDir, 'provider.js');
285+
if (fs.existsSync(providerPath)) {
286+
// Install dependencies first if package.json exists
287+
const packageJsonPath = path.join(outputDir, 'package.json');
288+
if (fs.existsSync(packageJsonPath)) {
289+
try {
290+
execSync(`cd "${outputDir}" && npm install --silent 2>&1`, {
291+
timeout: 60000,
292+
encoding: 'utf-8',
293+
});
294+
} catch {
295+
// Ignore install errors, will surface in import
296+
}
297+
}
277298

278-
// Install dependencies if package.json exists
279-
const packageJsonPath = `${outputDir}/package.json`;
280-
if (fs.existsSync(packageJsonPath)) {
281-
try {
282-
execSync(`cd "${outputDir}" && npm install --silent 2>&1`, {
283-
timeout: 60000,
284-
encoding: 'utf-8',
285-
});
286-
} catch {
287-
// Ignore install errors, will fail in eval if deps missing
299+
const providerUrl = pathToFileURL(path.resolve(providerPath)).href + `?t=${Date.now()}`;
300+
const mod = await import(providerUrl);
301+
const ProviderClass = mod.default;
302+
const instance = new ProviderClass({ config: {} });
303+
304+
// Smoke test
305+
const r1 = await instance.callApi('Hello, this is a test message', { vars: {} }, {});
306+
if (!r1 || !r1.output || r1.error) {
307+
const diag = JSON.stringify(r1, null, 2)?.slice(0, 500) || 'null response';
308+
steps.push(`Smoke test FAILED. Provider returned: ${diag}`);
309+
state.verified = false;
310+
result = { success: false, error: `Provider smoke test failed`, providerResponse: r1, steps };
311+
break;
312+
}
313+
steps.push(`Smoke test PASSED: got ${r1.output.length} chars`);
314+
315+
// Session test — second call, passing sessionId from first response (mimics promptfoo strategy flow)
316+
const sessionContext = r1.sessionId
317+
? { vars: { sessionId: r1.sessionId } }
318+
: { vars: {} };
319+
const r2 = await instance.callApi('Follow up question', sessionContext, {});
320+
if (!r2 || !r2.output || r2.error) {
321+
const diag = JSON.stringify(r2, null, 2)?.slice(0, 500) || 'null response';
322+
steps.push(`Session test FAILED. Provider returned: ${diag}`);
323+
state.verified = false;
324+
result = { success: false, error: `Provider session test failed`, providerResponse: r2, steps };
325+
break;
288326
}
327+
steps.push(`Session test PASSED: got ${r2.output.length} chars${r1.sessionId ? `, sessionId: ${r1.sessionId}` : ''}`);
289328
}
290329

291-
// Try to run promptfoo eval
330+
// Step 2: Run promptfoo eval
292331
try {
293332
const output = execSync(
294333
`cd "${outputDir}" && npx promptfoo eval -c "${configPath}" --no-progress-bar 2>&1`,
295334
{ timeout: 120000, encoding: 'utf-8' }
296335
);
297336

298-
// Check for actual failures, ignoring version warnings
299-
const hasTestFailure = output.includes('[FAIL]') || output.includes('Test failed');
337+
const passMatch = output.match(/(\d+) passed/);
338+
const failMatch = output.match(/(\d+) failed/);
339+
const errorMatch = output.match(/(\d+) error/);
340+
const passed = passMatch ? parseInt(passMatch[1]) : 0;
341+
const failed = failMatch ? parseInt(failMatch[1]) : 0;
342+
const errors = errorMatch ? parseInt(errorMatch[1]) : 0;
343+
300344
const hasConfigError = output.includes('Error loading config') || output.includes('Invalid config');
301-
const hasProviderError = output.includes('Provider error') || output.includes('Connection refused');
302345

303-
state.verified = !hasTestFailure && !hasConfigError && !hasProviderError;
346+
if (passed === 0 && failed === 0) {
347+
steps.push('Eval FAILED: zero tests ran');
348+
state.verified = false;
349+
} else if (failed > 0 || errors > 0 || hasConfigError) {
350+
steps.push(`Eval FAILED: ${passed} passed, ${failed} failed, ${errors} errors`);
351+
state.verified = false;
352+
} else {
353+
steps.push(`Eval PASSED: ${passed} passed, ${failed} failed`);
354+
state.verified = true;
355+
}
304356

305357
result = {
306358
success: state.verified,
307359
output: output.slice(0, 1000),
360+
steps,
308361
};
309362
} catch (error) {
310363
const err = error as { message: string; stdout?: string; stderr?: string };
311-
// If promptfoo ran but returned non-zero, check if tests actually passed
312364
const stdout = err.stdout || '';
313-
const hasPassingOutput = stdout.includes('[PASS]') || stdout.includes('Evaluation complete');
314365

315-
result = {
316-
success: hasPassingOutput,
317-
error: hasPassingOutput ? undefined : err.message,
318-
stdout: stdout.slice(0, 1000),
319-
stderr: err.stderr?.slice(0, 500),
320-
};
366+
const passMatch = stdout.match(/(\d+) passed/);
367+
const passed = passMatch ? parseInt(passMatch[1]) : 0;
321368

322-
if (hasPassingOutput) {
369+
if (passed > 0 && !stdout.includes('failed')) {
370+
steps.push(`Eval PASSED (non-zero exit): ${passed} passed`);
323371
state.verified = true;
372+
} else {
373+
steps.push(`Eval FAILED: ${err.message.slice(0, 200)}`);
374+
state.verified = false;
324375
}
376+
377+
result = {
378+
success: state.verified,
379+
error: state.verified ? undefined : err.message,
380+
stdout: stdout.slice(0, 1000),
381+
steps,
382+
};
325383
}
326384
break;
327385
}

plugins/promptfoo/src/agent/providers.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,12 @@ export class OpenAIProvider implements LLMProvider {
6363
model: this.model,
6464
messages: options.messages.map((m) => this.toOpenAIMessage(m)),
6565
tools: options.tools,
66-
max_tokens: options.maxTokens || 4096,
67-
temperature: options.temperature ?? 0.7,
66+
...(this.model.startsWith('gpt-5') || this.model.startsWith('o1') || this.model.startsWith('o3')
67+
? { max_completion_tokens: options.maxTokens || 4096 }
68+
: { max_tokens: options.maxTokens || 4096 }),
69+
...(this.model.startsWith('gpt-5') || this.model.startsWith('o1') || this.model.startsWith('o3')
70+
? {}
71+
: { temperature: options.temperature ?? 0.7 }),
6872
}),
6973
});
7074

plugins/promptfoo/src/agent/system-prompt.ts

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@ export const DISCOVERY_SYSTEM_PROMPT = `You are a target discovery agent for pro
1010
1111
1. Probe the target to understand how it communicates
1212
2. Generate a working promptfoo config (YAML + custom provider if needed)
13-
3. Verify it works with a mini redteam test
13+
3. Verify it works
1414
1515
## Tools
1616
1717
- **probe(url, method?, body?, headers?)** - Send HTTP request, see response
1818
- **probe_ws(url, message, headers?, timeout?)** - Test WebSocket endpoint
1919
- **write_config(description, providerType, providerConfig)** - Write promptfooconfig.yaml
2020
- **write_provider(code, filename, language)** - Write custom provider.js/py
21-
- **verify()** - Run promptfoo eval to test the config
21+
- **verify()** - Test provider directly (smoke + session), then run promptfoo eval
2222
- **done(summary, configFile, verified)** - Signal completion
2323
2424
## Promptfoo Config Format
@@ -56,26 +56,46 @@ export default class Provider {
5656
return 'my-provider';
5757
}
5858
59-
async callApi(prompt) {
59+
async callApi(prompt, context, options) {
60+
// context.vars.sessionId is set on subsequent turns if you returned sessionId previously
6061
// Your logic here...
61-
return { output: "the response string" }; // MUST return { output: string }
62+
return {
63+
output: "the response string",
64+
sessionId: "optional-session-id", // Return if target uses sessions
65+
};
6266
}
6367
}
6468
\`\`\`
6569
6670
**Key requirements:**
6771
- Must be a class with \`export default\`
68-
- Must have \`callApi(prompt)\` method
69-
- \`callApi\` must return \`{ output: string }\`, not just a string
72+
- Must have \`callApi(prompt, context, options)\` method — all 3 params
73+
- \`callApi\` must return \`{ output: string, sessionId?: string }\`
7074
- Use native fetch (Node 18+), import 'ws' for WebSocket
7175
76+
## Session Handling
77+
78+
Promptfoo uses sessions for multi-turn conversations (e.g. redteam attack strategies like Crescendo and GOAT). The flow works like this:
79+
80+
1. Strategy calls \`callApi(prompt, context)\` on turn 1
81+
2. Provider talks to the target, gets a response and a session/conversation ID
82+
3. Provider returns \`{ output: "...", sessionId: "abc123" }\`
83+
4. Promptfoo stores the sessionId and passes it back on turn 2+ via \`context.vars.sessionId\`
84+
5. Provider reads \`context.vars.sessionId\` and reuses the existing conversation
85+
86+
**If the target is stateful (uses sessions, conversation IDs, etc.), the provider MUST support this flow.** Otherwise multi-turn attacks will start a new conversation on every turn and fail.
87+
88+
For **custom providers**: Accept the \`context\` parameter, check \`context.vars.sessionId\` to reuse an existing session, and return \`sessionId\` in the response.
89+
90+
For **HTTP providers**: Use \`sessionParser\` in the config to extract the session ID from the response (e.g. \`sessionParser: json.session_id\`). Promptfoo handles the rest automatically.
91+
7292
## Workflow
7393
7494
1. Read the target spec to understand the API
7595
2. Probe to verify connectivity and response format
7696
3. Decide: HTTP provider (simple) or custom provider (complex)
7797
4. Write config (and provider.js if needed)
78-
5. Verify with promptfoo eval
98+
5. Verify — runs provider smoke test + session test, then promptfoo eval with 2 test cases
7999
6. Call done() with results
80100
81101
Be intelligent. Figure out the target's protocol, auth, request/response format from probing. Generate configs that work.`;

plugins/promptfoo/src/agent/tools.ts

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ export const toolDefinitions: ToolDefinition[] = [
133133
},
134134
{
135135
name: 'verify',
136-
description: 'Run a mini redteam test to verify the configuration works. This sends a few test prompts through the target.',
136+
description: 'Verify the configuration works. Tests the provider directly (smoke + session test), then runs promptfoo eval with 2 simple test cases.',
137137
parameters: {
138138
type: 'object',
139139
properties: {
@@ -142,11 +142,6 @@ export const toolDefinitions: ToolDefinition[] = [
142142
description: 'Path to the config file to verify (default: promptfooconfig.yaml)',
143143
default: 'promptfooconfig.yaml',
144144
},
145-
numTests: {
146-
type: 'number',
147-
description: 'Number of test prompts to send (default: 3)',
148-
default: 3,
149-
},
150145
},
151146
},
152147
},

plugins/promptfoo/src/generator/config.ts

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -62,26 +62,16 @@ export function generateConfig(options: GenerateConfigOptions): GeneratedConfig
6262
const config = {
6363
description,
6464
providers: [provider],
65-
// Default test with the prompt variable
65+
prompts: ['{{prompt}}'],
6666
defaultTest: {
67-
vars: {
68-
prompt: '{{prompt}}',
69-
},
70-
},
71-
// Mini redteam for verification
72-
redteam: {
73-
plugins: ['harmful:hate'],
74-
strategies: [
75-
{ id: 'jailbreak' },
76-
{
77-
id: 'jailbreak:composite',
78-
config: {
79-
maxTurns: 3,
80-
},
81-
},
67+
assert: [
68+
{ type: 'javascript', value: 'output && output.length > 0' },
8269
],
83-
numTests: 1,
8470
},
71+
tests: [
72+
{ vars: { prompt: 'Hello, how are you?' } },
73+
{ vars: { prompt: 'What can you help me with?' } },
74+
],
8575
};
8676

8777
// Generate YAML

0 commit comments

Comments (0)