From f59d9cde4beb32bc7796688c1e2059bd2f352b2f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 11:58:03 +0000 Subject: [PATCH 01/16] feat(core): add pi-coding-agent provider Add dedicated provider for pi-coding-agent CLI from pi-mono repository. This enables AgentV to evaluate the Pi autonomous coding agent with full JSONL output parsing, tool call extraction, and stream logging. - Add PiCodingAgentProvider class with JSONL event parsing - Add 'pi-coding-agent' to ProviderKind with 'pi' alias - Add PiCodingAgentResolvedConfig type and resolver - Support provider, model, tools, thinking level configuration - Add example target configuration and test eval --- examples/features/.agentv/targets.yaml | 11 + examples/features/evals/pi-agent/hello.yaml | 14 + .../core/src/evaluation/providers/index.ts | 4 + .../evaluation/providers/pi-coding-agent.ts | 773 ++++++++++++++++++ .../core/src/evaluation/providers/targets.ts | 106 +++ .../core/src/evaluation/providers/types.ts | 4 + 6 files changed, 912 insertions(+) create mode 100644 examples/features/evals/pi-agent/hello.yaml create mode 100644 packages/core/src/evaluation/providers/pi-coding-agent.ts diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index bab6ea4c..fe8d1e2b 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -55,6 +55,17 @@ targets: api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} model: ${{ GEMINI_MODEL_NAME }} + # Pi Coding Agent - autonomous coding CLI from pi-mono + - name: pi_coding_agent + provider: pi-coding-agent + executable: node /root/projects/pi-mono/packages/coding-agent/dist/cli.js + pi_provider: google + model: ${{ GEMINI_MODEL_NAME }} + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + judge_target: gemini_base + timeout_seconds: 180 + tools: read,bash,edit,write # Default tools for coding tasks + - name: local_cli provider: cli judge_target: azure_base diff --git 
a/examples/features/evals/pi-agent/hello.yaml b/examples/features/evals/pi-agent/hello.yaml new file mode 100644 index 00000000..8182b439 --- /dev/null +++ b/examples/features/evals/pi-agent/hello.yaml @@ -0,0 +1,14 @@ +$schema: agentv-eval-v2 +description: Test Pi Coding Agent provider +target: pi_coding_agent + +evalcases: + - id: hello-world + expected_outcome: | + The agent should respond with a greeting like "Hello" or "Hi". + + input_messages: + - role: user + content: + - type: text + value: Say hello in one word. Only output the word, nothing else. diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index c95120ae..85c29dad 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -2,6 +2,7 @@ import { AnthropicProvider, AzureProvider, GeminiProvider } from './ai-sdk.js'; import { CliProvider } from './cli.js'; import { CodexProvider } from './codex.js'; import { MockProvider } from './mock.js'; +import { PiCodingAgentProvider } from './pi-coding-agent.js'; import type { ResolvedTarget } from './targets.js'; import { resolveTargetDefinition } from './targets.js'; import type { EnvLookup, Provider, TargetDefinition } from './types.js'; @@ -22,6 +23,7 @@ export type { CliResolvedConfig, GeminiResolvedConfig, MockResolvedConfig, + PiCodingAgentResolvedConfig, ResolvedTarget, VSCodeResolvedConfig, } from './targets.js'; @@ -47,6 +49,8 @@ export function createProvider(target: ResolvedTarget): Provider { return new CliProvider(target.name, target.config); case 'codex': return new CodexProvider(target.name, target.config); + case 'pi-coding-agent': + return new PiCodingAgentProvider(target.name, target.config); case 'mock': return new MockProvider(target.name, target.config); case 'vscode': diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts new file mode 100644 index 
const WORKSPACE_PREFIX = 'agentv-pi-';
const PROMPT_FILENAME = 'prompt.md';
/** Placeholder substituted for secret values in diagnostics returned to callers. */
const REDACTED_VALUE = '[REDACTED]';

/** Everything a runner needs to launch one Pi CLI process. */
interface PiRunOptions {
  readonly executable: string;
  readonly args: readonly string[];
  readonly cwd: string;
  readonly timeoutMs?: number;
  readonly env: NodeJS.ProcessEnv;
  readonly signal?: AbortSignal;
  readonly onStdoutChunk?: (chunk: string) => void;
  readonly onStderrChunk?: (chunk: string) => void;
}

/** Captured outcome of one Pi CLI process. */
interface PiRunResult {
  readonly stdout: string;
  readonly stderr: string;
  readonly exitCode: number;
  readonly timedOut?: boolean;
}

/** Injectable process runner; the provider defaults to `defaultPiRunner`. */
type PiRunner = (options: PiRunOptions) => Promise<PiRunResult>;

/**
 * Provider that runs the Pi coding agent CLI in JSON mode inside a temporary
 * workspace and converts the JSONL events it prints into output messages.
 */
export class PiCodingAgentProvider implements Provider {
  readonly id: string;
  readonly kind = 'pi-coding-agent' as const;
  readonly targetName: string;
  readonly supportsBatch = false;

  private readonly config: PiCodingAgentResolvedConfig;
  private readonly runPi: PiRunner;

  /**
   * @param targetName Name of the configured target; also used in `id` and log file names.
   * @param config Resolved target configuration.
   * @param runner Process runner; replaceable so the CLI does not have to be spawned.
   */
  constructor(
    targetName: string,
    config: PiCodingAgentResolvedConfig,
    runner: PiRunner = defaultPiRunner,
  ) {
    this.id = `pi-coding-agent:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runPi = runner;
  }

  /**
   * Runs the Pi CLI once for `request` and returns the parsed conversation.
   *
   * @throws Error if the request was already aborted, the process timed out,
   *   exited with a non-zero code, or printed no parseable JSON.
   */
  async invoke(request: ProviderRequest): Promise<ProviderResponse> {
    if (request.signal?.aborted) {
      throw new Error('Pi coding agent request was aborted before execution');
    }

    const inputFiles = normalizeInputFiles(request.inputFiles);

    const workspaceRoot = await this.createWorkspace();
    // Stream logging is best-effort: a failure to set it up must not fail the run.
    const logger = await this.createStreamLogger(request).catch(() => undefined);
    try {
      const promptContent = buildPromptDocument(request, inputFiles);
      const promptFile = path.join(workspaceRoot, PROMPT_FILENAME);
      await writeFile(promptFile, promptContent, 'utf8');

      const args = this.buildPiArgs(promptContent);
      const cwd = this.resolveCwd(workspaceRoot);

      const result = await this.executePi(args, cwd, request.signal, logger);

      if (result.timedOut) {
        throw new Error(`Pi coding agent timed out${formatTimeoutSuffix(this.config.timeoutMs)}`);
      }

      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Pi coding agent exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }

      const parsed = parsePiJsonl(result.stdout);
      const outputMessages = extractOutputMessages(parsed);

      return {
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          // The argument list carries the API key; never hand the secret back to callers.
          args: this.redactArgs(args),
          executable: this.config.executable,
          // NOTE: promptFile and workspace are removed by the cleanup below, so these
          // paths are informational only by the time the caller sees them.
          promptFile,
          workspace: workspaceRoot,
          inputFiles,
          logFile: logger?.filePath,
        },
        outputMessages,
      };
    } finally {
      try {
        await logger?.close();
      } catch (error) {
        // A failing log file must neither mask the run's outcome nor skip cleanup.
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to close Pi stream log: ${message}`);
      } finally {
        await this.cleanupWorkspace(workspaceRoot);
      }
    }
  }

  /** Returns a copy of `args` with the value of every `--api-key` option masked. */
  private redactArgs(args: readonly string[]): string[] {
    return args.map((arg, index) => {
      if (index > 0 && args[index - 1] === '--api-key') {
        return REDACTED_VALUE;
      }
      if (arg.startsWith('--api-key=')) {
        return `--api-key=${REDACTED_VALUE}`;
      }
      return arg;
    });
  }

  /** Uses the configured `cwd` when set, otherwise the temporary workspace. */
  private resolveCwd(workspaceRoot: string): string {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return path.resolve(this.config.cwd);
  }

  /** Builds the CLI argument list; the prompt is always the final argument. */
  private buildPiArgs(prompt: string): string[] {
    const args: string[] = [];

    // Provider and model configuration
    if (this.config.provider) {
      args.push('--provider', this.config.provider);
    }
    if (this.config.model) {
      args.push('--model', this.config.model);
    }
    if (this.config.apiKey) {
      args.push('--api-key', this.config.apiKey);
    }

    // Structured output, non-interactive, and no session storage for eval runs
    args.push('--mode', 'json');
    args.push('--print');
    args.push('--no-session');

    if (this.config.tools) {
      args.push('--tools', this.config.tools);
    }
    if (this.config.thinking) {
      args.push('--thinking', this.config.thinking);
    }

    // Custom args go before the prompt so the prompt stays last
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }

    args.push(prompt);

    return args;
  }

  /**
   * Invokes the runner and translates a missing executable (`ENOENT`) into an
   * actionable error. Every other failure is rethrown unchanged.
   */
  private async executePi(
    args: readonly string[],
    cwd: string,
    signal: AbortSignal | undefined,
    logger: PiStreamLogger | undefined,
  ): Promise<PiRunResult> {
    try {
      return await this.runPi({
        executable: this.config.executable,
        args,
        cwd,
        timeoutMs: this.config.timeoutMs,
        env: this.buildEnv(),
        signal,
        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : undefined,
        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : undefined,
      });
    } catch (error) {
      // Narrow before reading `.code`: a rejection value is not guaranteed to be an object.
      const code =
        typeof error === 'object' && error !== null
          ? (error as NodeJS.ErrnoException).code
          : undefined;
      if (code === 'ENOENT') {
        throw new Error(
          `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`,
        );
      }
      throw error;
    }
  }

  /**
   * Copies the current process environment and, when an API key is configured,
   * also exports it under the variable matching the configured provider
   * (treated as `google` when no provider is set). Providers not listed here
   * get no variable and rely on the `--api-key` argument alone.
   */
  private buildEnv(): NodeJS.ProcessEnv {
    const env = { ...process.env };

    if (this.config.apiKey) {
      const provider = this.config.provider?.toLowerCase() ?? 'google';
      switch (provider) {
        case 'google':
        case 'gemini':
          env.GEMINI_API_KEY = this.config.apiKey;
          break;
        case 'anthropic':
          env.ANTHROPIC_API_KEY = this.config.apiKey;
          break;
        case 'openai':
          env.OPENAI_API_KEY = this.config.apiKey;
          break;
        case 'groq':
          env.GROQ_API_KEY = this.config.apiKey;
          break;
        case 'xai':
          env.XAI_API_KEY = this.config.apiKey;
          break;
        case 'openrouter':
          env.OPENROUTER_API_KEY = this.config.apiKey;
          break;
      }
    }

    return env;
  }

  /** Creates a fresh temporary directory for this invocation. */
  private async createWorkspace(): Promise<string> {
    return await mkdtemp(path.join(tmpdir(), WORKSPACE_PREFIX));
  }

  /** Removes the temporary workspace; failures are deliberately ignored. */
  private async cleanupWorkspace(workspaceRoot: string): Promise<void> {
    try {
      await rm(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup
    }
  }

  /** Configured log directory, or `.agentv/logs/pi-coding-agent` under the current directory. */
  private resolveLogDirectory(): string | undefined {
    if (this.config.logDir) {
      return path.resolve(this.config.logDir);
    }
    return path.join(process.cwd(), '.agentv', 'logs', 'pi-coding-agent');
  }

  /**
   * Opens a stream log for this request. Returns `undefined` (after a console
   * warning) when the directory or the logger cannot be created.
   */
  private async createStreamLogger(request: ProviderRequest): Promise<PiStreamLogger | undefined> {
    const logDir = this.resolveLogDirectory();
    if (!logDir) {
      return undefined;
    }
    try {
      await mkdir(logDir, { recursive: true });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
      return undefined;
    }

    const filePath = path.join(logDir, buildLogFilename(request, this.targetName));

    try {
      const logger = await PiStreamLogger.create({
        filePath,
        targetName: this.targetName,
        evalCaseId: request.evalCaseId,
        attempt: request.attempt,
      });
      return logger;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
      return undefined;
    }
  }
}

/**
 * Appends a line-oriented, timestamped copy of the Pi process's stdout and
 * stderr to a log file. Logging is best-effort: once the underlying stream
 * reports an error, a single warning is printed and later writes are dropped.
 */
class PiStreamLogger {
  readonly filePath: string;
  private readonly stream: WriteStream;
  private readonly startedAt = Date.now();
  private stdoutBuffer = '';
  private stderrBuffer = '';
  private failed = false;

  private constructor(filePath: string) {
    this.filePath = filePath;
    this.stream = createWriteStream(filePath, { flags: 'a' });
    // Open and write failures are reported asynchronously. Without a listener an
    // 'error' event is rethrown as an uncaught exception and takes the process down.
    this.stream.on('error', (error: Error) => {
      if (this.failed) {
        return;
      }
      this.failed = true;
      console.warn(`Pi stream logging disabled for ${filePath}: ${error.message}`);
    });
  }

  /** Opens the log file in append mode and writes the header block. */
  static async create(options: {
    readonly filePath: string;
    readonly targetName: string;
    readonly evalCaseId?: string;
    readonly attempt?: number;
  }): Promise<PiStreamLogger> {
    const logger = new PiStreamLogger(options.filePath);
    const header = [
      '# Pi Coding Agent stream log',
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : undefined,
      options.attempt !== undefined ? `# attempt: ${options.attempt + 1}` : undefined,
      `# started: ${new Date().toISOString()}`,
      '',
      // Drop only the omitted optional entries; the trailing '' is the blank
      // separator line between the header and the log body.
    ].filter((line): line is string => line !== undefined);
    logger.writeLines(header);
    return logger;
  }

  handleStdoutChunk(chunk: string): void {
    this.stdoutBuffer += chunk;
    this.flushBuffer('stdout');
  }

  handleStderrChunk(chunk: string): void {
    this.stderrBuffer += chunk;
    this.flushBuffer('stderr');
  }

  /** Writes any buffered partial lines and closes the file. Never rejects. */
  async close(): Promise<void> {
    this.flushBuffer('stdout');
    this.flushBuffer('stderr');
    this.flushRemainder();
    if (this.failed || this.stream.destroyed) {
      return;
    }
    await new Promise<void>((resolve) => {
      // Settle on either outcome; the constructor's listener reports the error.
      this.stream.once('error', () => resolve());
      this.stream.end(() => resolve());
    });
  }

  private write(text: string): void {
    if (this.failed) {
      return;
    }
    this.stream.write(text);
  }

  private writeLines(lines: readonly string[]): void {
    for (const line of lines) {
      this.write(`${line}\n`);
    }
  }

  /** Emits every complete line in the source's buffer and keeps the trailing partial line. */
  private flushBuffer(source: 'stdout' | 'stderr'): void {
    const buffer = source === 'stdout' ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? '';
    if (source === 'stdout') {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }

  /** Formats and writes one line; blank lines are skipped. */
  private writeFormatted(rawLine: string, source: 'stdout' | 'stderr'): void {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.write(`${formatted}\n`);
    }
  }

  private formatLine(rawLine: string, source: 'stdout' | 'stderr'): string | undefined {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return undefined;
    }
    const message = formatPiLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }

  /** Writes whatever unterminated text is left in either buffer, then clears both. */
  private flushRemainder(): void {
    this.writeFormatted(this.stdoutBuffer, 'stdout');
    this.writeFormatted(this.stderrBuffer, 'stderr');
    this.stdoutBuffer = '';
    this.stderrBuffer = '';
  }
}
/**
 * Produces a short, single-entry description of a parsed Pi JSONL event for
 * the stream log.
 *
 * @param event A value parsed from one output line.
 * @returns `undefined` when `event` is not an object or has no non-empty
 *   string `type`; otherwise the type, extended with the message role
 *   (`message_start` / `message_end`) or the assistant sub-event
 *   (`message_update`) when that detail is present as a string. Text deltas
 *   are previewed, truncated to 50 characters.
 */
function summarizePiEvent(event: unknown): string | undefined {
  if (!event || typeof event !== 'object') {
    return undefined;
  }
  const record = event as Record<string, unknown>;
  const type = typeof record.type === 'string' ? record.type : undefined;

  if (!type) {
    return undefined;
  }

  switch (type) {
    case 'message_start':
    case 'message_end': {
      const message = record.message as Record<string, unknown> | undefined;
      const role = message?.role;
      // Only append the role when it is actually known, so the log never
      // contains the literal text "undefined".
      return typeof role === 'string' ? `${type}: ${role}` : type;
    }
    case 'message_update': {
      const update = record.assistantMessageEvent as Record<string, unknown> | undefined;
      const updateType = update?.type;
      if (updateType === 'text_delta') {
        const delta = update?.delta;
        if (typeof delta === 'string') {
          const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
          return `text_delta: ${preview}`;
        }
      }
      return typeof updateType === 'string' ? `message_update: ${updateType}` : 'message_update';
    }
    default:
      // agent_start, agent_end, turn_start, turn_end and any other event type
      // are logged by name alone.
      return type;
  }
}
/**
 * Normalizes a Pi message timestamp to a string.
 * Strings are passed through unchanged. Numbers are handed to `Date` (so they
 * are read as epoch milliseconds) and rendered as ISO-8601. A number that does
 * not form a valid date yields `undefined` instead of letting `toISOString()`
 * throw a RangeError.
 */
function toIsoTimestamp(value: unknown): string | undefined {
  if (typeof value === 'string') {
    return value;
  }
  if (typeof value !== 'number') {
    return undefined;
  }
  const date = new Date(value);
  return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
}

/**
 * Convert a Pi message to AgentV OutputMessage format.
 *
 * @param message One entry of a Pi event's message list, as parsed from JSON.
 * @returns The converted message, or `undefined` when `message` is not an
 *   object or has no string `role`.
 */
function convertPiMessage(message: unknown): OutputMessage | undefined {
  if (!message || typeof message !== 'object') {
    return undefined;
  }

  const msg = message as Record<string, unknown>;
  const role = msg.role;
  if (typeof role !== 'string') {
    return undefined;
  }

  // Text and tool calls are both read from the same `content` value.
  const content = extractTextContent(msg.content);
  const toolCalls = extractToolCalls(msg.content);

  // A malformed timestamp must not abort conversion of the whole run.
  const timestamp = toIsoTimestamp(msg.timestamp);

  // Copy through the descriptive fields that are present (and truthy).
  const metadata: Record<string, unknown> = {};
  if (msg.api) metadata.api = msg.api;
  if (msg.provider) metadata.provider = msg.provider;
  if (msg.model) metadata.model = msg.model;
  if (msg.usage) metadata.usage = msg.usage;
  if (msg.stopReason) metadata.stopReason = msg.stopReason;

  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : undefined,
  };
}
p.id : undefined, + }); + } + // Also handle tool_result for output + if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') { + // Find matching tool call and add output + const existing = toolCalls.find((tc) => tc.id === p.tool_use_id); + if (existing) { + // Create new object with output added + const idx = toolCalls.indexOf(existing); + toolCalls[idx] = { + ...existing, + output: p.content, + }; + } + } + } + + return toolCalls; +} + +/** + * Extract the final assistant text from output messages. + */ +function extractAssistantText(messages: readonly OutputMessage[]): string { + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (msg.role === 'assistant' && msg.content) { + if (typeof msg.content === 'string') { + return msg.content; + } + return JSON.stringify(msg.content); + } + } + return ''; +} + +function pickDetail(stderr: string, stdout: string): string | undefined { + const errorText = stderr.trim(); + if (errorText.length > 0) { + return errorText; + } + const stdoutText = stdout.trim(); + return stdoutText.length > 0 ? 
/**
 * Default runner: spawns the Pi CLI without a shell and collects its output.
 *
 * `options.executable` is split on whitespace so that values such as
 * `node /path/to/cli.js` work; as a consequence, paths containing spaces are
 * not supported. stdin is closed immediately because the prompt travels as an
 * argument.
 *
 * On timeout or abort the child receives SIGTERM, followed by SIGKILL if it
 * has not exited after a short grace period, so the returned promise cannot
 * hang on a child that ignores SIGTERM.
 *
 * @returns The captured stdout/stderr, the exit code (`-1` when the process
 *   ended without a numeric code, e.g. when killed by a signal) and whether
 *   the timeout fired.
 * @throws Error when the executable string is blank; rejects with the spawn
 *   error (e.g. `ENOENT`) when the process cannot be started.
 */
async function defaultPiRunner(options: PiRunOptions): Promise<PiRunResult> {
  // Trim first: leading whitespace would otherwise produce an empty command name.
  const [executable, ...executableArgs] = options.executable.trim().split(/\s+/);
  if (!executable) {
    throw new Error('Pi coding agent executable is empty');
  }
  const killGraceMs = 5000;

  return await new Promise<PiRunResult>((resolve, reject) => {
    const child = spawn(executable, [...executableArgs, ...options.args], {
      cwd: options.cwd,
      env: options.env,
      stdio: ['pipe', 'pipe', 'pipe'],
      shell: false,
    });

    let stdout = '';
    let stderr = '';
    let timedOut = false;
    let timeoutHandle: NodeJS.Timeout | undefined;
    let killHandle: NodeJS.Timeout | undefined;

    const terminate = (): void => {
      child.kill('SIGTERM');
      if (!killHandle) {
        killHandle = setTimeout(() => {
          child.kill('SIGKILL');
        }, killGraceMs);
        killHandle.unref?.();
      }
    };

    const onAbort = (): void => {
      terminate();
    };

    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener('abort', onAbort, { once: true });
      }
    }

    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        terminate();
      }, options.timeoutMs);
      timeoutHandle.unref?.();
    }

    child.stdout.setEncoding('utf8');
    child.stdout.on('data', (chunk: string) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });

    child.stderr.setEncoding('utf8');
    child.stderr.on('data', (chunk: string) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });

    // Close stdin immediately since prompt is passed as argument
    child.stdin.end();

    const cleanup = (): void => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (killHandle) {
        clearTimeout(killHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener('abort', onAbort);
      }
    };

    child.on('error', (error) => {
      cleanup();
      reject(error);
    });

    child.on('close', (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === 'number' ? code : -1,
        timedOut,
      });
    });
  });
}
target.binary; + const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider; + const modelSource = target.model ?? target.pi_model ?? target.piModel; + const apiKeySource = target.api_key ?? target.apiKey; + const toolsSource = target.tools ?? target.pi_tools ?? target.piTools; + const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking; + const argsSource = target.args ?? target.arguments; + const cwdSource = target.cwd; + const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds; + const logDirSource = + target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory; + + const executable = + resolveOptionalString(executableSource, env, `${target.name} pi executable`, { + allowLiteral: true, + optionalEnv: true, + }) ?? 'node /root/projects/pi-mono/packages/coding-agent/dist/cli.js'; + + const provider = resolveOptionalString(providerSource, env, `${target.name} pi provider`, { + allowLiteral: true, + optionalEnv: true, + }); + + const model = resolveOptionalString(modelSource, env, `${target.name} pi model`, { + allowLiteral: true, + optionalEnv: true, + }); + + const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi api key`, { + allowLiteral: false, + optionalEnv: true, + }); + + const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, { + allowLiteral: true, + optionalEnv: true, + }); + + const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, { + allowLiteral: true, + optionalEnv: true, + }); + + const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`); + + const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, { + allowLiteral: true, + optionalEnv: true, + }); + + const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`); + + const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, { + allowLiteral: true, 
+ optionalEnv: true, + }); + + return { + executable, + provider, + model, + apiKey, + tools, + thinking, + args, + cwd, + timeoutMs, + logDir, + }; +} + function resolveMockConfig(target: z.infer): MockResolvedConfig { const response = typeof target.response === 'string' ? target.response : undefined; return { response }; diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 4634e043..5515d8ba 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -15,6 +15,7 @@ export type ProviderKind = | 'anthropic' | 'gemini' | 'codex' + | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' @@ -26,6 +27,7 @@ export type ProviderKind = */ export const AGENT_PROVIDER_KINDS: readonly ProviderKind[] = [ 'codex', + 'pi-coding-agent', 'vscode', 'vscode-insiders', ] as const; @@ -39,6 +41,7 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'anthropic', 'gemini', 'codex', + 'pi-coding-agent', 'cli', 'mock', 'vscode', @@ -54,6 +57,7 @@ export const PROVIDER_ALIASES: readonly string[] = [ 'google', // alias for "gemini" 'google-gemini', // alias for "gemini" 'codex-cli', // alias for "codex" + 'pi', // alias for "pi-coding-agent" 'openai', // legacy/future support 'bedrock', // legacy/future support 'vertex', // legacy/future support From 24f087e4b14321d235336bfa080830573a84646f Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 12:37:15 +0000 Subject: [PATCH 02/16] refactor(pi-coding-agent): use native @path file syntax Use pi CLI's native @path syntax for file attachments instead of embedding file:// URIs in the prompt text. 
--- .../src/evaluation/providers/pi-coding-agent.ts | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index a6d68f61..a040fe22 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -6,7 +6,7 @@ import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import path from 'node:path'; -import { buildPromptDocument, normalizeInputFiles } from './preread.js'; +import { normalizeInputFiles } from './preread.js'; import type { PiCodingAgentResolvedConfig } from './targets.js'; import type { OutputMessage, @@ -69,11 +69,11 @@ export class PiCodingAgentProvider implements Provider { const workspaceRoot = await this.createWorkspace(); const logger = await this.createStreamLogger(request).catch(() => undefined); try { - const promptContent = buildPromptDocument(request, inputFiles); + // Save prompt to file for debugging/logging const promptFile = path.join(workspaceRoot, PROMPT_FILENAME); - await writeFile(promptFile, promptContent, 'utf8'); + await writeFile(promptFile, request.question, 'utf8'); - const args = this.buildPiArgs(promptContent); + const args = this.buildPiArgs(request.question, inputFiles); const cwd = this.resolveCwd(workspaceRoot); const result = await this.executePi(args, cwd, request.signal, logger); @@ -122,7 +122,7 @@ export class PiCodingAgentProvider implements Provider { return path.resolve(this.config.cwd); } - private buildPiArgs(prompt: string): string[] { + private buildPiArgs(prompt: string, inputFiles: readonly string[] | undefined): string[] { const args: string[] = []; // Provider and model configuration @@ -160,6 +160,13 @@ export class PiCodingAgentProvider implements Provider { args.push(...this.config.args); } + // Input files passed with @path syntax (pi-native 
file inclusion) + if (inputFiles && inputFiles.length > 0) { + for (const file of inputFiles) { + args.push(`@${file}`); + } + } + // Prompt is passed as the final argument args.push(prompt); From a6f2531499616066b73d34afa4118f0b8360b4bf Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 12:45:14 +0000 Subject: [PATCH 03/16] fix(pi-coding-agent): escape @ symbols in prompts to avoid file conflicts Pi CLI interprets @ as file prefix, but AgentV prompts use @[Role]: markers for multi-turn conversations. Escape these patterns to [[Role]]: format. Also removes pi-agent specific evals - use basic evals which are agent-agnostic. --- examples/features/evals/pi-agent/hello.yaml | 14 -------------- .../src/evaluation/providers/pi-coding-agent.ts | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 15 deletions(-) delete mode 100644 examples/features/evals/pi-agent/hello.yaml diff --git a/examples/features/evals/pi-agent/hello.yaml b/examples/features/evals/pi-agent/hello.yaml deleted file mode 100644 index 8182b439..00000000 --- a/examples/features/evals/pi-agent/hello.yaml +++ /dev/null @@ -1,14 +0,0 @@ -$schema: agentv-eval-v2 -description: Test Pi Coding Agent provider -target: pi_coding_agent - -evalcases: - - id: hello-world - expected_outcome: | - The agent should respond with a greeting like "Hello" or "Hi". - - input_messages: - - role: user - content: - - type: text - value: Say hello in one word. Only output the word, nothing else. 
diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index a040fe22..b70d5860 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -167,8 +167,12 @@ export class PiCodingAgentProvider implements Provider { } } + // Escape @ symbols in prompt that aren't file references + // Pi CLI interprets @ as file prefix, but AgentV uses @[Role]: for multi-turn + const escapedPrompt = escapeAtSymbols(prompt); + // Prompt is passed as the final argument - args.push(prompt); + args.push(escapedPrompt); return args; } @@ -681,6 +685,17 @@ function extractAssistantText(messages: readonly OutputMessage[]): string { return ''; } +/** + * Escape @ symbols in prompt text that pi CLI would interpret as file references. + * Pi CLI uses @path syntax for file inclusion, but AgentV prompts use @[Role]: markers. + * We replace @[ with [[ to avoid pi trying to read these as files. + */ +function escapeAtSymbols(prompt: string): string { + // Replace @[Role]: patterns with [[Role]]: to avoid pi file interpretation + // This handles @[System]:, @[User]:, @[Assistant]:, @[Tool]: etc. + return prompt.replace(/@\[([^\]]+)\]:/g, '[[$1]]:'); +} + function pickDetail(stderr: string, stdout: string): string | undefined { const errorText = stderr.trim(); if (errorText.length > 0) { From d9af94fa8c96fc2562d80d7fcdcd55b380b7d899 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:19:16 +0000 Subject: [PATCH 04/16] feat(pi-coding-agent): display log paths in console output Add log path tracking for Pi Coding Agent similar to Codex CLI. Log file paths are now displayed in the console during eval runs with a "Pi Coding Agent logs:" header. 
--- .../cli/src/commands/eval/progress-display.ts | 5 +- apps/cli/src/commands/eval/run-eval.ts | 17 ++++- .../core/src/evaluation/providers/index.ts | 1 + .../evaluation/providers/pi-coding-agent.ts | 7 ++ .../evaluation/providers/pi-log-tracker.ts | 72 +++++++++++++++++++ 5 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 packages/core/src/evaluation/providers/pi-log-tracker.ts diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index 6df5cc61..0b18013c 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -78,7 +78,7 @@ export class ProgressDisplay { } } - addLogPaths(paths: readonly string[]): void { + addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi'): void { const newPaths: string[] = []; for (const path of paths) { if (this.logPathSet.has(path)) { @@ -96,7 +96,8 @@ export class ProgressDisplay { if (!this.hasPrintedLogHeader) { console.log(''); - console.log('Codex CLI logs:'); + const label = provider === 'pi' ? 
'Pi Coding Agent' : 'Codex CLI'; + console.log(`${label} logs:`); this.hasPrintedLogHeader = true; } diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 21e04a55..79e05438 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -11,6 +11,7 @@ import { ensureVSCodeSubagents, loadEvalCases, subscribeToCodexLogEntries, + subscribeToPiLogEntries, } from '@agentv/core'; import { loadEnvFromHierarchy } from './env.js'; @@ -170,7 +171,7 @@ type ProgressReporter = { setTotal(total: number): void; update(workerId: number, progress: WorkerProgress): void; finish(): void; - addLogPaths(paths: readonly string[]): void; + addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi'): void; }; function createProgressReporter( @@ -185,7 +186,8 @@ function createProgressReporter( update: (workerId: number, progress: WorkerProgress) => display.updateWorker({ ...progress, workerId }), finish: () => display.finish(), - addLogPaths: (paths: readonly string[]) => display.addLogPaths(paths), + addLogPaths: (paths: readonly string[], provider?: 'codex' | 'pi') => + display.addLogPaths(paths, provider), }; } @@ -494,7 +496,15 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise return; } seenCodexLogPaths.add(entry.filePath); - progressReporter.addLogPaths([entry.filePath]); + progressReporter.addLogPaths([entry.filePath], 'codex'); + }); + const seenPiLogPaths = new Set(); + const unsubscribePiLogs = subscribeToPiLogEntries((entry) => { + if (!entry.filePath || seenPiLogPaths.has(entry.filePath)) { + return; + } + seenPiLogPaths.add(entry.filePath); + progressReporter.addLogPaths([entry.filePath], 'pi'); }); for (const [testFilePath, meta] of fileMetadata.entries()) { for (const evalId of meta.evalIds) { @@ -553,6 +563,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise } } finally { unsubscribeCodexLogs(); + unsubscribePiLogs(); await 
outputWriter.close().catch(() => undefined); } } diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 85c29dad..2651ddd3 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -36,6 +36,7 @@ export { type EnsureSubagentsResult, } from './vscode.js'; export { consumeCodexLogEntries, subscribeToCodexLogEntries } from './codex-log-tracker.js'; +export { consumePiLogEntries, subscribeToPiLogEntries } from './pi-log-tracker.js'; export function createProvider(target: ResolvedTarget): Provider { switch (target.kind) { diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index b70d5860..d9fd3b55 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -6,6 +6,7 @@ import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import path from 'node:path'; +import { recordPiLogEntry } from './pi-log-tracker.js'; import { normalizeInputFiles } from './preread.js'; import type { PiCodingAgentResolvedConfig } from './targets.js'; import type { @@ -278,6 +279,12 @@ export class PiCodingAgentProvider implements Provider { evalCaseId: request.evalCaseId, attempt: request.attempt, }); + recordPiLogEntry({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + }); return logger; } catch (error) { const message = error instanceof Error ? 
error.message : String(error); diff --git a/packages/core/src/evaluation/providers/pi-log-tracker.ts b/packages/core/src/evaluation/providers/pi-log-tracker.ts new file mode 100644 index 00000000..dad8b51b --- /dev/null +++ b/packages/core/src/evaluation/providers/pi-log-tracker.ts @@ -0,0 +1,72 @@ +export type PiLogEntry = { + readonly filePath: string; + readonly evalCaseId?: string; + readonly targetName: string; + readonly attempt?: number; +}; + +const GLOBAL_LOGS_KEY = Symbol.for('agentv.piLogs'); +const GLOBAL_SUBSCRIBERS_KEY = Symbol.for('agentv.piLogSubscribers'); + +type PiLogListener = (entry: PiLogEntry) => void; + +type GlobalWithPiLogs = typeof globalThis & { + [GLOBAL_LOGS_KEY]?: PiLogEntry[]; + [GLOBAL_SUBSCRIBERS_KEY]?: Set<PiLogListener>; +}; + +function getPiLogStore(): PiLogEntry[] { + const globalObject = globalThis as GlobalWithPiLogs; + const existing = globalObject[GLOBAL_LOGS_KEY]; + if (existing) { + return existing; + } + const created: PiLogEntry[] = []; + globalObject[GLOBAL_LOGS_KEY] = created; + return created; +} + +function getSubscriberStore(): Set<PiLogListener> { + const globalObject = globalThis as GlobalWithPiLogs; + const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY]; + if (existing) { + return existing; + } + const created = new Set<PiLogListener>(); + globalObject[GLOBAL_SUBSCRIBERS_KEY] = created; + return created; +} + +function notifySubscribers(entry: PiLogEntry): void { + const subscribers = Array.from(getSubscriberStore()); + for (const listener of subscribers) { + try { + listener(entry); + } catch (error) { + // Avoid surfacing subscriber errors to providers; log for visibility. + const message = error instanceof Error ? 
error.message : String(error); + console.warn(`Pi log subscriber failed: ${message}`); + } + } +} + +export function recordPiLogEntry(entry: PiLogEntry): void { + getPiLogStore().push(entry); + notifySubscribers(entry); +} + +export function consumePiLogEntries(): PiLogEntry[] { + const store = getPiLogStore(); + if (store.length === 0) { + return []; + } + return store.splice(0, store.length); +} + +export function subscribeToPiLogEntries(listener: PiLogListener): () => void { + const store = getSubscriberStore(); + store.add(listener); + return () => { + store.delete(listener); + }; +} From 5681f3e14697e8f956b2f554312730d98a302b23 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:35:51 +0000 Subject: [PATCH 05/16] feat(pi-coding-agent): add log_format option for full JSON logs Add log_format config option (default: 'summary', or 'json' for raw events). This matches Codex CLI behavior and allows full output preservation. --- examples/features/.agentv/targets.yaml | 1 + .../core/src/evaluation/providers/pi-coding-agent.ts | 11 ++++++++--- packages/core/src/evaluation/providers/targets.ts | 6 ++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index fe8d1e2b..1a627a9c 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -65,6 +65,7 @@ targets: judge_target: gemini_base timeout_seconds: 180 tools: read,bash,edit,write # Default tools for coding tasks + log_format: json # 'summary' (default) or 'json' for raw event logs - name: local_cli provider: cli diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index d9fd3b55..e71f9d27 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -278,6 +278,7 @@ export class PiCodingAgentProvider implements 
Provider { targetName: this.targetName, evalCaseId: request.evalCaseId, attempt: request.attempt, + format: this.config.logFormat ?? 'summary', }); recordPiLogEntry({ filePath, @@ -300,9 +301,11 @@ class PiStreamLogger { private readonly startedAt = Date.now(); private stdoutBuffer = ''; private stderrBuffer = ''; + private readonly format: 'summary' | 'json'; - private constructor(filePath: string) { + private constructor(filePath: string, format: 'summary' | 'json') { this.filePath = filePath; + this.format = format; this.stream = createWriteStream(filePath, { flags: 'a' }); } @@ -311,8 +314,9 @@ class PiStreamLogger { readonly targetName: string; readonly evalCaseId?: string; readonly attempt?: number; + readonly format: 'summary' | 'json'; }): Promise { - const logger = new PiStreamLogger(options.filePath); + const logger = new PiStreamLogger(options.filePath, options.format); const header = [ '# Pi Coding Agent stream log', `# target: ${options.targetName}`, @@ -374,7 +378,8 @@ class PiStreamLogger { if (trimmed.length === 0) { return undefined; } - const message = formatPiLogMessage(trimmed, source); + const message = + this.format === 'json' ? trimmed : formatPiLogMessage(trimmed, source); return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`; } diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index ace60299..289e5f37 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -440,6 +440,7 @@ export interface PiCodingAgentResolvedConfig { readonly cwd?: string; readonly timeoutMs?: number; readonly logDir?: string; + readonly logFormat?: 'summary' | 'json'; } export interface MockResolvedConfig { @@ -857,6 +858,7 @@ function resolvePiCodingAgentConfig( const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds; const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? 
target.logDirectory; + const logFormatSource = target.log_format ?? target.logFormat; const executable = resolveOptionalString(executableSource, env, `${target.name} pi executable`, { @@ -903,6 +905,9 @@ function resolvePiCodingAgentConfig( optionalEnv: true, }); + const logFormat = + logFormatSource === 'json' || logFormatSource === 'summary' ? logFormatSource : undefined; + return { executable, provider, @@ -914,6 +919,7 @@ function resolvePiCodingAgentConfig( cwd, timeoutMs, logDir, + logFormat, }; } From c5c2b59b4455dc5d36f43643b97d38585ddb03ba Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:42:51 +0000 Subject: [PATCH 06/16] fix(pi-coding-agent): pretty-print JSON logs like Codex --- .../src/evaluation/providers/pi-coding-agent.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index e71f9d27..1ece2033 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -379,7 +379,7 @@ class PiStreamLogger { return undefined; } const message = - this.format === 'json' ? trimmed : formatPiLogMessage(trimmed, source); + this.format === 'json' ? 
formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source); return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`; } @@ -443,6 +443,18 @@ function formatPiLogMessage(rawLine: string, source: 'stdout' | 'stderr'): strin return rawLine; } +function formatPiJsonLog(rawLine: string): string { + const parsed = tryParseJsonValue(rawLine); + if (!parsed) { + return rawLine; + } + try { + return JSON.stringify(parsed, null, 2); + } catch { + return rawLine; + } +} + function summarizePiEvent(event: unknown): string | undefined { if (!event || typeof event !== 'object') { return undefined; From 28313aa6573dc5f0be334b80e878f5a3a2a9652d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:46:58 +0000 Subject: [PATCH 07/16] chore: add changeset for pi-coding-agent provider --- .changeset/add-pi-coding-agent-provider.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .changeset/add-pi-coding-agent-provider.md diff --git a/.changeset/add-pi-coding-agent-provider.md b/.changeset/add-pi-coding-agent-provider.md new file mode 100644 index 00000000..c9791769 --- /dev/null +++ b/.changeset/add-pi-coding-agent-provider.md @@ -0,0 +1,12 @@ +--- +"@agentv/core": minor +"agentv": minor +--- + +Add Pi Coding Agent provider for autonomous coding evaluations + +- New `pi-coding-agent` provider for the Pi Coding Agent CLI from pi-mono +- Support file attachments using Pi's native `@path` syntax +- Extract tool trajectory/traces from Pi's JSONL output +- Display log file paths in console during eval runs +- Add `log_format` option ('summary' or 'json') for log verbosity From 7f20fa76256ab5e64d21e9b424cfe9b5398159ab Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:51:07 +0000 Subject: [PATCH 08/16] docs: add Pi Coding Agent to README --- README.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 93b427f6..63c6460e 100644 --- a/README.md +++ 
b/README.md @@ -1,6 +1,6 @@ # AgentV -A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI and Azure OpenAI. +A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI, Pi Coding Agent, and Azure OpenAI. ## Installation and Setup @@ -162,7 +162,7 @@ Execution targets in `.agentv/targets.yaml` decouple evals from providers/settin Each target specifies: - `name`: Unique identifier for the target -- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `vscode`, `vscode-insiders`, `cli`, or `mock`) +- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `pi-coding-agent`, `vscode`, `vscode-insiders`, `cli`, or `mock`) - Provider-specific configuration fields at the top level (no `settings` wrapper needed) - Optional fields: `judge_target`, `workers`, `provider_batching` @@ -240,6 +240,24 @@ Note: Environment variables are referenced using `${{ VARIABLE_NAME }}` syntax. Codex targets require the standalone `codex` CLI and a configured profile (via `codex configure`) so credentials are stored in `~/.codex/config` (or whatever path the CLI already uses). AgentV mirrors all guideline and attachment files into a fresh scratch workspace, so the `file://` preread links remain valid even when the CLI runs outside your repo tree. Confirm the CLI works by running `codex exec --json --profile "ping"` (or any supported dry run) before starting an eval. This prints JSONL events; seeing `item.completed` messages indicates the CLI is healthy. 
+**Pi Coding Agent targets:** + +```yaml +- name: pi_agent + provider: pi-coding-agent + judge_target: gemini_base + executable: node /path/to/pi-mono/packages/coding-agent/dist/cli.js + pi_provider: google # google, anthropic, openai, groq, xai, openrouter + model: ${{ GEMINI_MODEL_NAME }} + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + tools: read,bash,edit,write # Available tools for the agent + timeout_seconds: 180 + cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory + log_format: json # 'summary' (default) or 'json' for full logs +``` + +Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. + ## Writing Custom Evaluators ### Code Evaluator I/O Contract From f35b2e45253a439de551676959c2d0c27afc3cbd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:58:59 +0000 Subject: [PATCH 09/16] fix: default pi executable to 'pi' command --- README.md | 4 ++-- apps/cli/README.md | 22 +++++++++++++++++-- examples/features/.agentv/targets.yaml | 2 +- .../core/src/evaluation/providers/targets.ts | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 63c6460e..68d62d3e 100644 --- a/README.md +++ b/README.md @@ -246,7 +246,7 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or - name: pi_agent provider: pi-coding-agent judge_target: gemini_base - executable: node /path/to/pi-mono/packages/coding-agent/dist/cli.js + executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted pi_provider: google # google, anthropic, openai, groq, xai, openrouter model: ${{ GEMINI_MODEL_NAME }} api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} @@ -256,7 +256,7 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or log_format: 
json # 'summary' (default) or 'json' for full logs ``` -Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. +Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). Install it globally with `npm install -g @anthropic/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. ## Writing Custom Evaluators diff --git a/apps/cli/README.md b/apps/cli/README.md index 93b427f6..68d62d3e 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -1,6 +1,6 @@ # AgentV -A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI and Azure OpenAI. +A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI, Pi Coding Agent, and Azure OpenAI. 
## Installation and Setup @@ -162,7 +162,7 @@ Execution targets in `.agentv/targets.yaml` decouple evals from providers/settin Each target specifies: - `name`: Unique identifier for the target -- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `vscode`, `vscode-insiders`, `cli`, or `mock`) +- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `pi-coding-agent`, `vscode`, `vscode-insiders`, `cli`, or `mock`) - Provider-specific configuration fields at the top level (no `settings` wrapper needed) - Optional fields: `judge_target`, `workers`, `provider_batching` @@ -240,6 +240,24 @@ Note: Environment variables are referenced using `${{ VARIABLE_NAME }}` syntax. Codex targets require the standalone `codex` CLI and a configured profile (via `codex configure`) so credentials are stored in `~/.codex/config` (or whatever path the CLI already uses). AgentV mirrors all guideline and attachment files into a fresh scratch workspace, so the `file://` preread links remain valid even when the CLI runs outside your repo tree. Confirm the CLI works by running `codex exec --json --profile "ping"` (or any supported dry run) before starting an eval. This prints JSONL events; seeing `item.completed` messages indicates the CLI is healthy. +**Pi Coding Agent targets:** + +```yaml +- name: pi_agent + provider: pi-coding-agent + judge_target: gemini_base + executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted + pi_provider: google # google, anthropic, openai, groq, xai, openrouter + model: ${{ GEMINI_MODEL_NAME }} + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + tools: read,bash,edit,write # Available tools for the agent + timeout_seconds: 180 + cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory + log_format: json # 'summary' (default) or 'json' for full logs +``` + +Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). 
Install it globally with `npm install -g @anthropic/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. + ## Writing Custom Evaluators ### Code Evaluator I/O Contract diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index 1a627a9c..b64937e5 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -58,7 +58,7 @@ targets: # Pi Coding Agent - autonomous coding CLI from pi-mono - name: pi_coding_agent provider: pi-coding-agent - executable: node /root/projects/pi-mono/packages/coding-agent/dist/cli.js + executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted pi_provider: google model: ${{ GEMINI_MODEL_NAME }} api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 289e5f37..4ef18a62 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -864,7 +864,7 @@ function resolvePiCodingAgentConfig( resolveOptionalString(executableSource, env, `${target.name} pi executable`, { allowLiteral: true, optionalEnv: true, - }) ?? 'node /root/projects/pi-mono/packages/coding-agent/dist/cli.js'; + }) ?? 
'pi'; const provider = resolveOptionalString(providerSource, env, `${target.name} pi provider`, { allowLiteral: true, From 0e4a76964d275bf6baecbd4657eab27715f20ce1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:59:51 +0000 Subject: [PATCH 10/16] chore: rename pi target from pi_coding_agent to pi --- README.md | 2 +- examples/features/.agentv/targets.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 68d62d3e..28e78e42 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or **Pi Coding Agent targets:** ```yaml -- name: pi_agent +- name: pi provider: pi-coding-agent judge_target: gemini_base executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index b64937e5..4d7494b5 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -56,7 +56,7 @@ targets: model: ${{ GEMINI_MODEL_NAME }} # Pi Coding Agent - autonomous coding CLI from pi-mono - - name: pi_coding_agent + - name: pi provider: pi-coding-agent executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted pi_provider: google From d3254b2d47eac185150a5beefa4c3250077c7dea Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 20:12:28 +0000 Subject: [PATCH 11/16] feat(pi-coding-agent): add system_prompt config option --- README.md | 3 +++ apps/cli/README.md | 5 ++++- examples/features/.agentv/targets.yaml | 3 +++ packages/core/src/evaluation/providers/pi-coding-agent.ts | 8 +++++++- packages/core/src/evaluation/providers/targets.ts | 8 ++++++++ 5 files changed, 25 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 28e78e42..7f093b6a 100644 --- a/README.md +++ b/README.md @@ -254,6 +254,9 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or 
timeout_seconds: 180 cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory log_format: json # 'summary' (default) or 'json' for full logs + system_prompt: | # Optional: prepended to all prompts + Always include your complete code in your response. + Do not just write files - show the code in your response text. ``` Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). Install it globally with `npm install -g @anthropic/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. diff --git a/apps/cli/README.md b/apps/cli/README.md index 68d62d3e..7f093b6a 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -243,7 +243,7 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or **Pi Coding Agent targets:** ```yaml -- name: pi_agent +- name: pi provider: pi-coding-agent judge_target: gemini_base executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted @@ -254,6 +254,9 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or timeout_seconds: 180 cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory log_format: json # 'summary' (default) or 'json' for full logs + system_prompt: | # Optional: prepended to all prompts + Always include your complete code in your response. + Do not just write files - show the code in your response text. ``` Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). Install it globally with `npm install -g @anthropic/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. 
diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index 4d7494b5..f1785722 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -66,6 +66,9 @@ targets: timeout_seconds: 180 tools: read,bash,edit,write # Default tools for coding tasks log_format: json # 'summary' (default) or 'json' for raw event logs + system_prompt: | + Always include your complete code in your response. + Do not just write files - show the code in your response text. - name: local_cli provider: cli diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index 1ece2033..b1eb6f99 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -168,9 +168,15 @@ export class PiCodingAgentProvider implements Provider { } } + // Prepend system prompt from target config if provided + let fullPrompt = prompt; + if (this.config.systemPrompt) { + fullPrompt = `${this.config.systemPrompt}\n\n${prompt}`; + } + // Escape @ symbols in prompt that aren't file references // Pi CLI interprets @ as file prefix, but AgentV uses @[Role]: for multi-turn - const escapedPrompt = escapeAtSymbols(prompt); + const escapedPrompt = escapeAtSymbols(fullPrompt); // Prompt is passed as the final argument args.push(escapedPrompt); diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 4ef18a62..042e4de7 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -441,6 +441,7 @@ export interface PiCodingAgentResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + readonly systemPrompt?: string; } export interface MockResolvedConfig { @@ -859,6 +860,7 @@ function resolvePiCodingAgentConfig( const logDirSource = 
target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory; const logFormatSource = target.log_format ?? target.logFormat; + const systemPromptSource = target.system_prompt ?? target.systemPrompt; const executable = resolveOptionalString(executableSource, env, `${target.name} pi executable`, { @@ -908,6 +910,11 @@ function resolvePiCodingAgentConfig( const logFormat = logFormatSource === 'json' || logFormatSource === 'summary' ? logFormatSource : undefined; + const systemPrompt = + typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 + ? systemPromptSource.trim() + : undefined; + return { executable, provider, @@ -920,6 +927,7 @@ function resolvePiCodingAgentConfig( timeoutMs, logDir, logFormat, + systemPrompt, }; } From d8ed2a6d5b2788c23154dcc770dc81cc63cfe6f2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 22:10:23 +0000 Subject: [PATCH 12/16] feat(pi-coding-agent): add default system prompt for eval scoring --- README.md | 6 +++--- apps/cli/README.md | 6 +++--- examples/features/.agentv/targets.yaml | 4 +--- .../src/evaluation/providers/pi-coding-agent.ts | 16 +++++++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 7f093b6a..78c13df6 100644 --- a/README.md +++ b/README.md @@ -254,13 +254,13 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or timeout_seconds: 180 cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory log_format: json # 'summary' (default) or 'json' for full logs - system_prompt: | # Optional: prepended to all prompts - Always include your complete code in your response. - Do not just write files - show the code in your response text. + # system_prompt: optional override for the default system prompt ``` Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). 
Install it globally with `npm install -g @mariozechner/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. +By default, a system prompt instructs the agent to include code in its response (required for evaluation scoring). Use `system_prompt` to override this behavior. + ## Writing Custom Evaluators ### Code Evaluator I/O Contract diff --git a/apps/cli/README.md b/apps/cli/README.md index 7f093b6a..78c13df6 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -254,13 +254,13 @@ Confirm the CLI works by running `codex exec --json --profile "ping"` (or timeout_seconds: 180 cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory log_format: json # 'summary' (default) or 'json' for full logs - system_prompt: | # Optional: prepended to all prompts - Always include your complete code in your response. - Do not just write files - show the code in your response text. + # system_prompt: optional override for the default system prompt ``` Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). Install it globally with `npm install -g @mariozechner/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. +By default, a system prompt instructs the agent to include code in its response (required for evaluation scoring). Use `system_prompt` to override this behavior.
+ ## Writing Custom Evaluators ### Code Evaluator I/O Contract diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index f1785722..e47e65f7 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -66,9 +66,7 @@ targets: timeout_seconds: 180 tools: read,bash,edit,write # Default tools for coding tasks log_format: json # 'summary' (default) or 'json' for raw event logs - system_prompt: | - Always include your complete code in your response. - Do not just write files - show the code in your response text. + # system_prompt: optional override (default instructs agent to include code in response) - name: local_cli provider: cli diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index b1eb6f99..bf9b884e 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -20,6 +20,14 @@ import type { const WORKSPACE_PREFIX = 'agentv-pi-'; const PROMPT_FILENAME = 'prompt.md'; +/** + * Default system prompt for Pi Coding Agent evaluations. + * Ensures the agent returns code in its response rather than just writing files. + */ +const DEFAULT_SYSTEM_PROMPT = `IMPORTANT: Always include your complete code and solutions in your response text. +Do not just write files - show all code, diffs, and outputs directly in your response. +This is required for evaluation scoring.`; + interface PiRunOptions { readonly executable: string; readonly args: readonly string[]; @@ -168,11 +176,9 @@ export class PiCodingAgentProvider implements Provider { } } - // Prepend system prompt from target config if provided - let fullPrompt = prompt; - if (this.config.systemPrompt) { - fullPrompt = `${this.config.systemPrompt}\n\n${prompt}`; - } + // Prepend system prompt (use default if not configured) + const systemPrompt = this.config.systemPrompt ?? 
DEFAULT_SYSTEM_PROMPT; + const fullPrompt = `${systemPrompt}\n\n${prompt}`; // Escape @ symbols in prompt that aren't file references // Pi CLI interprets @ as file prefix, but AgentV uses @[Role]: for multi-turn From 002ba3860e4ec7bf2aac4a6b09e81ca753380958 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 22:11:30 +0000 Subject: [PATCH 13/16] refactor(pi-coding-agent): align default system prompt with VS Code format --- packages/core/src/evaluation/providers/pi-coding-agent.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index bf9b884e..b42d3612 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -24,8 +24,10 @@ const PROMPT_FILENAME = 'prompt.md'; * Default system prompt for Pi Coding Agent evaluations. * Ensures the agent returns code in its response rather than just writing files. */ -const DEFAULT_SYSTEM_PROMPT = `IMPORTANT: Always include your complete code and solutions in your response text. -Do not just write files - show all code, diffs, and outputs directly in your response. +const DEFAULT_SYSTEM_PROMPT = `**IMPORTANT**: Follow these instructions for your response: +- Do NOT create any additional output files in the workspace. +- All intended file outputs/changes MUST be written in your response. +- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`. 
This is required for evaluation scoring.`; interface PiRunOptions { From 8439c9020cf0578f10802c91c8edbb2fb76df930 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 22:13:24 +0000 Subject: [PATCH 14/16] feat(codex): add default system prompt for eval scoring --- packages/core/src/evaluation/providers/codex.ts | 14 +++++++++++++- packages/core/src/evaluation/providers/targets.ts | 8 ++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts index c8cea955..8e3ce6a1 100644 --- a/packages/core/src/evaluation/providers/codex.ts +++ b/packages/core/src/evaluation/providers/codex.ts @@ -17,6 +17,16 @@ const WORKSPACE_PREFIX = 'agentv-codex-'; const PROMPT_FILENAME = 'prompt.md'; const JSONL_TYPE_ITEM_COMPLETED = 'item.completed'; +/** + * Default system prompt for Codex CLI evaluations. + * Ensures the agent returns code in its response rather than just writing files. + */ +const DEFAULT_SYSTEM_PROMPT = `**IMPORTANT**: Follow these instructions for your response: +- Do NOT create any additional output files in the workspace. +- All intended file outputs/changes MUST be written in your response. +- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`. +This is required for evaluation scoring.`; + interface CodexRunOptions { readonly executable: string; readonly args: readonly string[]; @@ -72,7 +82,9 @@ export class CodexProvider implements Provider { const workspaceRoot = await this.createWorkspace(); const logger = await this.createStreamLogger(request).catch(() => undefined); try { - const promptContent = buildPromptDocument(request, inputFiles); + const basePrompt = buildPromptDocument(request, inputFiles); + const systemPrompt = this.config.systemPrompt ?? 
DEFAULT_SYSTEM_PROMPT; + const promptContent = `${systemPrompt}\n\n${basePrompt}`; const promptFile = path.join(workspaceRoot, PROMPT_FILENAME); await writeFile(promptFile, promptContent, 'utf8'); diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 042e4de7..80dd509b 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -427,6 +427,7 @@ export interface CodexResolvedConfig { readonly timeoutMs?: number; readonly logDir?: string; readonly logFormat?: 'summary' | 'json'; + readonly systemPrompt?: string; } export interface PiCodingAgentResolvedConfig { @@ -800,6 +801,7 @@ function resolveCodexConfig( target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT; + const systemPromptSource = target.system_prompt ?? target.systemPrompt; const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, { @@ -820,6 +822,11 @@ function resolveCodexConfig( }); const logFormat = normalizeCodexLogFormat(logFormatSource); + const systemPrompt = + typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0 + ? 
systemPromptSource.trim() + : undefined; + return { executable, args, @@ -827,6 +834,7 @@ function resolveCodexConfig( timeoutMs, logDir, logFormat, + systemPrompt, }; } From 5a23b547ba00ac11cefda8bbc6417977806aa00a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 22:54:40 +0000 Subject: [PATCH 15/16] chore: update changeset with system_prompt features --- .changeset/add-pi-coding-agent-provider.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.changeset/add-pi-coding-agent-provider.md b/.changeset/add-pi-coding-agent-provider.md index c9791769..db318476 100644 --- a/.changeset/add-pi-coding-agent-provider.md +++ b/.changeset/add-pi-coding-agent-provider.md @@ -3,10 +3,12 @@ "agentv": minor --- -Add Pi Coding Agent provider for autonomous coding evaluations +Add Pi Coding Agent provider and default system prompts for agent evaluations - New `pi-coding-agent` provider for the Pi Coding Agent CLI from pi-mono - Support file attachments using Pi's native `@path` syntax - Extract tool trajectory/traces from Pi's JSONL output - Display log file paths in console during eval runs - Add `log_format` option ('summary' or 'json') for log verbosity +- Add default system prompt for Pi and Codex providers instructing agents to include code in response using git diff format +- Add `system_prompt` config option to override default behavior via targets.yaml From 00cf038e0e6db0bdbde1ba21002d57b23cf34fc3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 22:56:27 +0000 Subject: [PATCH 16/16] chore: fix lint formatting --- apps/cli/package.json | 5 +---- packages/core/package.json | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index 2d4fcf8e..59cb89a8 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], 
"scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/packages/core/package.json b/packages/core/package.json index f8b30af5..5f1187ff 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78",