Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 37 additions & 15 deletions .agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,21 @@ targets:
- name: default
use_target: ${{ AGENT_TARGET }}

- name: agent
use_target: ${{ AGENT_TARGET }}

# ── LLM target (text generation, no agent binary needed) ────────────
# Delegates to GRADER_TARGET — same provider used for grading and LLM evals.
# Delegates to LLM_TARGET — same provider used for grading and LLM evals.
- name: llm
use_target: ${{ GRADER_TARGET }}
use_target: ${{ LLM_TARGET }}

# ── Grader (LLM-as-judge) ──────────────────────────────────────────
# Used by agent targets via grader_target. Switch provider via GRADER_TARGET.
- name: grader
use_target: ${{ GRADER_TARGET }}

# ── Named agent targets ───────────────────────────────────────────
- name: copilot-cli
- name: copilot
provider: copilot-cli
model: ${{ COPILOT_MODEL }}
grader_target: grader
Expand All @@ -38,7 +41,7 @@ targets:
log_format: json

- name: claude
provider: claude
provider: claude-cli
grader_target: grader
log_format: json

Expand All @@ -48,6 +51,13 @@ targets:
log_format: json

- name: pi
provider: pi-cli
subprovider: openrouter
model: ${{ OPENROUTER_MODEL }}
api_key: ${{ OPENROUTER_API_KEY }}
grader_target: grader

- name: pi-sdk
provider: pi-coding-agent
subprovider: openrouter
model: ${{ OPENROUTER_MODEL }}
Expand All @@ -56,13 +66,24 @@ targets:
tools: read,bash,edit,write
log_format: json

- name: pi-cli
- name: pi-azure
provider: pi-cli
subprovider: openrouter
model: ${{ OPENROUTER_MODEL }}
api_key: ${{ OPENROUTER_API_KEY }}
subprovider: azure
base_url: ${{ AZURE_OPENAI_ENDPOINT }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
grader_target: grader

- name: pi-sdk-azure
provider: pi-coding-agent
subprovider: azure
base_url: ${{ AZURE_OPENAI_ENDPOINT }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
grader_target: grader
tools: read,bash,edit,write
log_format: json

- name: codex
provider: codex
grader_target: grader
Expand All @@ -71,23 +92,24 @@ targets:
log_format: json

# ── LLM targets (direct model access) ─────────────────────────────
- name: azure-llm
- name: gh-models
provider: openai
base_url: https://models.github.ai/inference
api_key: ${{ GH_MODELS_TOKEN }}
model: ${{ GH_MODELS_MODEL }}

- name: azure
provider: azure
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
version: ${{ AZURE_OPENAI_API_VERSION }}

- name: gemini-llm
- name: gemini
provider: gemini
api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
model: ${{ GEMINI_MODEL_NAME }}

- name: gemini-flash
provider: gemini
model: gemini-3-flash-preview
api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}

- name: openai
provider: openai
endpoint: ${{ OPENAI_ENDPOINT }}
Expand Down
12 changes: 12 additions & 0 deletions evals/self/azure-smoke.eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Minimal end-to-end smoke test: verifies the configured agent target can
# reach its model provider and return a graded completion. Both questions are
# trivially answerable so any failure points at connectivity/config, not the
# model's ability.
description: Smoke test for Azure OpenAI connectivity via pi agent

tests:
  # Single-word factual question — exercises the full request/grade round trip.
  - id: capital-of-france
    criteria: The answer correctly states that Paris is the capital of France.
    input: What is the capital of France? Answer in one word.
    expected_output: Paris

  # Trivial arithmetic — a second independent probe of the same pipeline.
  # expected_output is quoted so YAML keeps it a string, not the integer 4.
  - id: simple-math
    criteria: The answer correctly states that 2 + 2 = 4.
    input: What is 2 + 2? Answer with just the number.
    expected_output: "4"
129 changes: 104 additions & 25 deletions packages/core/src/evaluation/providers/pi-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,20 @@
* For the SDK-based approach (no subprocess), use the `pi-coding-agent` provider instead.
*/

import { spawn } from 'node:child_process';
import { execSync, spawn } from 'node:child_process';
import { randomUUID } from 'node:crypto';
import { createWriteStream } from 'node:fs';
import { accessSync, createWriteStream, readFileSync } from 'node:fs';
import type { WriteStream } from 'node:fs';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { recordPiLogEntry } from './pi-log-tracker.js';
import {
extractAzureResourceName,
resolveCliProvider,
resolveEnvKeyName,
} from './pi-provider-aliases.js';
import { extractPiTextContent, toFiniteNumber } from './pi-utils.js';
import { normalizeInputFiles } from './preread.js';
import type { PiCliResolvedConfig } from './targets.js';
Expand Down Expand Up @@ -174,12 +179,14 @@ export class PiCliProvider implements Provider {
const args: string[] = [];

if (this.config.subprovider) {
args.push('--provider', this.config.subprovider);
args.push('--provider', resolveCliProvider(this.config.subprovider));
}
if (this.config.model) {
args.push('--model', this.config.model);
}
if (this.config.apiKey) {
// For azure, the API key is passed via AZURE_OPENAI_API_KEY env var in
// buildEnv(). The --api-key flag would set the wrong provider's key.
if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== 'azure') {
args.push('--api-key', this.config.apiKey);
}

Expand Down Expand Up @@ -242,20 +249,23 @@ export class PiCliProvider implements Provider {
private buildEnv(): NodeJS.ProcessEnv {
const env = { ...process.env };

if (this.config.apiKey) {
const provider = this.config.subprovider?.toLowerCase() ?? 'google';
const ENV_KEY_MAP: Record<string, string> = {
google: 'GEMINI_API_KEY',
gemini: 'GEMINI_API_KEY',
anthropic: 'ANTHROPIC_API_KEY',
openai: 'OPENAI_API_KEY',
groq: 'GROQ_API_KEY',
xai: 'XAI_API_KEY',
openrouter: 'OPENROUTER_API_KEY',
};
const envKey = ENV_KEY_MAP[provider];
if (envKey) {
env[envKey] = this.config.apiKey;
const provider = this.config.subprovider?.toLowerCase() ?? 'google';

if (provider === 'azure') {
// Pi CLI uses azure-openai-responses with AZURE_OPENAI_RESOURCE_NAME.
// Extract the resource name from base_url (or use it as-is if already a name).
if (this.config.apiKey) {
env.AZURE_OPENAI_API_KEY = this.config.apiKey;
}
if (this.config.baseUrl) {
env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
}
} else {
if (this.config.apiKey) {
const envKey = resolveEnvKeyName(provider);
if (envKey) {
env[envKey] = this.config.apiKey;
}
}
}

Expand All @@ -267,18 +277,18 @@ export class PiCliProvider implements Provider {
// var prefixes that provider uses. All other providers' vars are stripped
// automatically when that provider is selected.
if (this.config.subprovider) {
const provider = this.config.subprovider.toLowerCase();
const resolvedProvider = resolveCliProvider(this.config.subprovider);
const PROVIDER_OWN_PREFIXES: Record<string, readonly string[]> = {
openrouter: ['OPENROUTER_'],
anthropic: ['ANTHROPIC_'],
openai: ['OPENAI_'],
azure: ['AZURE_OPENAI_'],
'azure-openai-responses': ['AZURE_OPENAI_'],
google: ['GEMINI_', 'GOOGLE_GENERATIVE_AI_'],
gemini: ['GEMINI_', 'GOOGLE_GENERATIVE_AI_'],
groq: ['GROQ_'],
xai: ['XAI_'],
};
const ownPrefixes = PROVIDER_OWN_PREFIXES[provider] ?? [];
const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES)
.filter(([key]) => key !== provider)
.flatMap(([, prefixes]) => prefixes);
Expand Down Expand Up @@ -622,6 +632,29 @@ function extractMessages(events: unknown[]): readonly Message[] {
}
}

// Some providers (e.g. azure-openai-responses) emit text content only in
// message_update events, leaving the agent_end assistant message with empty
// content. Fall back to the last message_end with non-empty content.
if (messages) {
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'assistant' && !messages[i].content) {
// Try to find content from the last message_end event
for (let j = events.length - 1; j >= 0; j--) {
const evt = events[j] as Record<string, unknown> | null;
if (!evt || evt.type !== 'message_end') continue;
const msg = evt.message as Record<string, unknown> | undefined;
if (msg?.role !== 'assistant') continue;
const text = extractPiTextContent(msg.content);
if (text) {
messages[i] = { ...messages[i], content: text };
break;
}
}
break;
}
}
}

// Pi CLI may emit tool_execution_start/tool_execution_end events whose tool
// calls are absent from the final agent_end messages. Reconstruct them and
// inject into the last assistant message so evaluators (e.g. skill-trigger)
Expand Down Expand Up @@ -859,18 +892,64 @@ function formatTimeoutSuffix(timeoutMs: number | undefined): string {
return ` after ${Math.ceil(timeoutMs / 1000)}s`;
}

/**
 * On Windows, npm/bun global installs create `.cmd` and `.sh` wrappers.
 * Bun's spawn can't capture stdout from sh-script wrappers (the forked
 * node process writes to a different stdout). Resolve to the underlying
 * node script so we can spawn `node script.js` directly.
 *
 * @param executable - Executable name or path (first token of the configured
 *   command line).
 * @returns `[resolvedExecutable, prefixArgs]`: either the input unchanged
 *   with no prefix args, or `['node', [scriptPath]]` when an npm-style
 *   `.cmd` wrapper was successfully unwrapped. Best-effort: any lookup or
 *   read failure falls back to the original executable.
 */
function resolveWindowsCmd(executable: string): [string, string[]] {
  if (process.platform !== 'win32') return [executable, []];

  // Already a runtime binary, a .js script, or a native .exe — nothing to
  // unwrap. Checking the bare names 'node'/'bun' here (as the doc comment
  // promises) avoids a pointless `where` subprocess for the common case;
  // previously those names fell through to the lookup below.
  const lower = executable.toLowerCase();
  if (
    lower === 'node' ||
    lower === 'bun' ||
    lower.endsWith('.js') ||
    lower.endsWith('.exe')
  ) {
    return [executable, []];
  }

  // Find the executable's full path using `where`
  let fullPath: string;
  try {
    fullPath = execSync(`where ${executable}`, { encoding: 'utf-8' })
      .trim()
      .split(/\r?\n/)[0]
      .trim();
  } catch {
    // Not on PATH — return as-is and let spawn() surface the real error.
    return [executable, []];
  }

  // Try .cmd wrapper first (has the script path embedded)
  const cmdPath = fullPath.endsWith('.cmd') ? fullPath : `${fullPath}.cmd`;
  try {
    const content = readFileSync(cmdPath, 'utf-8');
    // npm .cmd wrappers end with: "%_prog%" "%dp0%\path\to\script.js" %*
    const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
    if (match) {
      // %dp0% expands to the wrapper's own directory; substitute it to get
      // an absolute script path.
      const dp0 = path.dirname(path.resolve(cmdPath));
      const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${path.sep}`);
      try {
        accessSync(scriptPath);
        return ['node', [scriptPath]];
      } catch {
        // Script not found at resolved path, fall through
      }
    }
  } catch {
    // No .cmd wrapper, fall through
  }

  return [executable, []];
}

async function defaultPiRunner(options: PiRunOptions): Promise<PiRunResult> {
return await new Promise<PiRunResult>((resolve, reject) => {
const parts = options.executable.split(/\s+/);
const executable = parts[0];
const executableArgs = parts.slice(1);
const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
const executableArgs = [...prefixArgs, ...parts.slice(1)];
const allArgs = [...executableArgs, ...options.args];

const child = spawn(executable, allArgs, {
const child = spawn(resolvedExe, allArgs, {
cwd: options.cwd,
env: options.env,
stdio: ['pipe', 'pipe', 'pipe'],
shell: false,
});

let stdout = '';
Expand Down
Loading
Loading