diff --git a/.changeset/add-pi-coding-agent-provider.md b/.changeset/add-pi-coding-agent-provider.md new file mode 100644 index 0000000..db31847 --- /dev/null +++ b/.changeset/add-pi-coding-agent-provider.md @@ -0,0 +1,14 @@ +--- +"@agentv/core": minor +"agentv": minor +--- + +Add Pi Coding Agent provider and default system prompts for agent evaluations + +- New `pi-coding-agent` provider for the Pi Coding Agent CLI from pi-mono +- Support file attachments using Pi's native `@path` syntax +- Extract tool trajectory/traces from Pi's JSONL output +- Display log file paths in console during eval runs +- Add `log_format` option ('summary' or 'json') for log verbosity +- Add default system prompt for Pi and Codex providers instructing agents to include code in response using git diff format +- Add `system_prompt` config option to override default behavior via targets.yaml diff --git a/README.md b/README.md index 93b427f..78c13df 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # AgentV -A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI and Azure OpenAI. +A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI, Pi Coding Agent, and Azure OpenAI. 
## Installation and Setup @@ -162,7 +162,7 @@ Execution targets in `.agentv/targets.yaml` decouple evals from providers/settin Each target specifies: - `name`: Unique identifier for the target -- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `vscode`, `vscode-insiders`, `cli`, or `mock`) +- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `pi-coding-agent`, `vscode`, `vscode-insiders`, `cli`, or `mock`) - Provider-specific configuration fields at the top level (no `settings` wrapper needed) - Optional fields: `judge_target`, `workers`, `provider_batching` @@ -240,6 +240,27 @@ Note: Environment variables are referenced using `${{ VARIABLE_NAME }}` syntax. Codex targets require the standalone `codex` CLI and a configured profile (via `codex configure`) so credentials are stored in `~/.codex/config` (or whatever path the CLI already uses). AgentV mirrors all guideline and attachment files into a fresh scratch workspace, so the `file://` preread links remain valid even when the CLI runs outside your repo tree. Confirm the CLI works by running `codex exec --json --profile "ping"` (or any supported dry run) before starting an eval. This prints JSONL events; seeing `item.completed` messages indicates the CLI is healthy. 
+**Pi Coding Agent targets:** + +```yaml +- name: pi + provider: pi-coding-agent + judge_target: gemini_base + executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted + pi_provider: google # google, anthropic, openai, groq, xai, openrouter + model: ${{ GEMINI_MODEL_NAME }} + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + tools: read,bash,edit,write # Available tools for the agent + timeout_seconds: 180 + cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory + log_format: json # 'summary' (default) or 'json' for full logs + # system_prompt: optional override for the default system prompt +``` + +Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). Install it globally with `npm install -g @mariozechner/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. + +By default, a system prompt instructs the agent to include code in its response (required for evaluation scoring). Use `system_prompt` to override this behavior. + ## Writing Custom Evaluators ### Code Evaluator I/O Contract diff --git a/apps/cli/README.md b/apps/cli/README.md index 93b427f..78c13df 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -1,6 +1,6 @@ # AgentV -A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI and Azure OpenAI. +A TypeScript-based AI agent evaluation and optimization framework using YAML specifications to score task completion. Built for modern development workflows with first-class support for VS Code Copilot, OpenAI Codex CLI, Pi Coding Agent, and Azure OpenAI. 
## Installation and Setup @@ -162,7 +162,7 @@ Execution targets in `.agentv/targets.yaml` decouple evals from providers/settin Each target specifies: - `name`: Unique identifier for the target -- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `vscode`, `vscode-insiders`, `cli`, or `mock`) +- `provider`: The model provider (`azure`, `anthropic`, `gemini`, `codex`, `pi-coding-agent`, `vscode`, `vscode-insiders`, `cli`, or `mock`) - Provider-specific configuration fields at the top level (no `settings` wrapper needed) - Optional fields: `judge_target`, `workers`, `provider_batching` @@ -240,6 +240,27 @@ Note: Environment variables are referenced using `${{ VARIABLE_NAME }}` syntax. Codex targets require the standalone `codex` CLI and a configured profile (via `codex configure`) so credentials are stored in `~/.codex/config` (or whatever path the CLI already uses). AgentV mirrors all guideline and attachment files into a fresh scratch workspace, so the `file://` preread links remain valid even when the CLI runs outside your repo tree. Confirm the CLI works by running `codex exec --json --profile "ping"` (or any supported dry run) before starting an eval. This prints JSONL events; seeing `item.completed` messages indicates the CLI is healthy. 
+**Pi Coding Agent targets:** + +```yaml +- name: pi + provider: pi-coding-agent + judge_target: gemini_base + executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted + pi_provider: google # google, anthropic, openai, groq, xai, openrouter + model: ${{ GEMINI_MODEL_NAME }} + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + tools: read,bash,edit,write # Available tools for the agent + timeout_seconds: 180 + cwd: ${{ PI_WORKSPACE_DIR }} # Optional: run in specific directory + log_format: json # 'summary' (default) or 'json' for full logs + # system_prompt: optional override for the default system prompt +``` + +Pi Coding Agent is an autonomous coding CLI from [pi-mono](https://github.com/badlogic/pi-mono). Install it globally with `npm install -g @mariozechner/pi-coding-agent` (or use a local path via `executable`). It supports multiple LLM providers and outputs JSONL events. AgentV extracts tool trajectories from the output for trace-based evaluation. File attachments are passed using Pi's native `@path` syntax. + +By default, a system prompt instructs the agent to include code in its response (required for evaluation scoring). Use `system_prompt` to override this behavior. 
+ ## Writing Custom Evaluators ### Code Evaluator I/O Contract diff --git a/apps/cli/package.json b/apps/cli/package.json index 2d4fcf8..59cb89a 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index 6df5cc6..0b18013 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -78,7 +78,7 @@ export class ProgressDisplay { } } - addLogPaths(paths: readonly string[]): void { + addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi'): void { const newPaths: string[] = []; for (const path of paths) { if (this.logPathSet.has(path)) { @@ -96,7 +96,8 @@ export class ProgressDisplay { if (!this.hasPrintedLogHeader) { console.log(''); - console.log('Codex CLI logs:'); + const label = provider === 'pi' ? 
'Pi Coding Agent' : 'Codex CLI'; + console.log(`${label} logs:`); this.hasPrintedLogHeader = true; } diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 21e04a5..79e0543 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -11,6 +11,7 @@ import { ensureVSCodeSubagents, loadEvalCases, subscribeToCodexLogEntries, + subscribeToPiLogEntries, } from '@agentv/core'; import { loadEnvFromHierarchy } from './env.js'; @@ -170,7 +171,7 @@ type ProgressReporter = { setTotal(total: number): void; update(workerId: number, progress: WorkerProgress): void; finish(): void; - addLogPaths(paths: readonly string[]): void; + addLogPaths(paths: readonly string[], provider?: 'codex' | 'pi'): void; }; function createProgressReporter( @@ -185,7 +186,8 @@ function createProgressReporter( update: (workerId: number, progress: WorkerProgress) => display.updateWorker({ ...progress, workerId }), finish: () => display.finish(), - addLogPaths: (paths: readonly string[]) => display.addLogPaths(paths), + addLogPaths: (paths: readonly string[], provider?: 'codex' | 'pi') => + display.addLogPaths(paths, provider), }; } @@ -494,7 +496,15 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise return; } seenCodexLogPaths.add(entry.filePath); - progressReporter.addLogPaths([entry.filePath]); + progressReporter.addLogPaths([entry.filePath], 'codex'); + }); + const seenPiLogPaths = new Set(); + const unsubscribePiLogs = subscribeToPiLogEntries((entry) => { + if (!entry.filePath || seenPiLogPaths.has(entry.filePath)) { + return; + } + seenPiLogPaths.add(entry.filePath); + progressReporter.addLogPaths([entry.filePath], 'pi'); }); for (const [testFilePath, meta] of fileMetadata.entries()) { for (const evalId of meta.evalIds) { @@ -553,6 +563,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise } } finally { unsubscribeCodexLogs(); + unsubscribePiLogs(); await 
outputWriter.close().catch(() => undefined); } } diff --git a/examples/features/.agentv/targets.yaml b/examples/features/.agentv/targets.yaml index bab6ea4..e47e65f 100644 --- a/examples/features/.agentv/targets.yaml +++ b/examples/features/.agentv/targets.yaml @@ -55,6 +55,19 @@ targets: api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} model: ${{ GEMINI_MODEL_NAME }} + # Pi Coding Agent - autonomous coding CLI from pi-mono + - name: pi + provider: pi-coding-agent + executable: ${{ PI_CLI_PATH }} # Optional: defaults to `pi` if omitted + pi_provider: google + model: ${{ GEMINI_MODEL_NAME }} + api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }} + judge_target: gemini_base + timeout_seconds: 180 + tools: read,bash,edit,write # Default tools for coding tasks + log_format: json # 'summary' (default) or 'json' for raw event logs + # system_prompt: optional override (default instructs agent to include code in response) + - name: local_cli provider: cli judge_target: azure_base diff --git a/packages/core/package.json b/packages/core/package.json index f8b30af..5f1187f 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts index c8cea95..8e3ce6a 100644 --- a/packages/core/src/evaluation/providers/codex.ts +++ b/packages/core/src/evaluation/providers/codex.ts @@ -17,6 +17,16 @@ const WORKSPACE_PREFIX = 'agentv-codex-'; const PROMPT_FILENAME = 'prompt.md'; const JSONL_TYPE_ITEM_COMPLETED = 'item.completed'; +/** + * Default system prompt for Codex CLI evaluations. + * Ensures the agent returns code in its response rather than just writing files. 
+ */ +const DEFAULT_SYSTEM_PROMPT = `**IMPORTANT**: Follow these instructions for your response: +- Do NOT create any additional output files in the workspace. +- All intended file outputs/changes MUST be written in your response. +- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`. +This is required for evaluation scoring.`; + interface CodexRunOptions { readonly executable: string; readonly args: readonly string[]; @@ -72,7 +82,9 @@ export class CodexProvider implements Provider { const workspaceRoot = await this.createWorkspace(); const logger = await this.createStreamLogger(request).catch(() => undefined); try { - const promptContent = buildPromptDocument(request, inputFiles); + const basePrompt = buildPromptDocument(request, inputFiles); + const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT; + const promptContent = `${systemPrompt}\n\n${basePrompt}`; const promptFile = path.join(workspaceRoot, PROMPT_FILENAME); await writeFile(promptFile, promptContent, 'utf8'); diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index c95120a..2651ddd 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -2,6 +2,7 @@ import { AnthropicProvider, AzureProvider, GeminiProvider } from './ai-sdk.js'; import { CliProvider } from './cli.js'; import { CodexProvider } from './codex.js'; import { MockProvider } from './mock.js'; +import { PiCodingAgentProvider } from './pi-coding-agent.js'; import type { ResolvedTarget } from './targets.js'; import { resolveTargetDefinition } from './targets.js'; import type { EnvLookup, Provider, TargetDefinition } from './types.js'; @@ -22,6 +23,7 @@ export type { CliResolvedConfig, GeminiResolvedConfig, MockResolvedConfig, + PiCodingAgentResolvedConfig, ResolvedTarget, VSCodeResolvedConfig, } from './targets.js'; @@ -34,6 +36,7 @@ export { type 
EnsureSubagentsResult, } from './vscode.js'; export { consumeCodexLogEntries, subscribeToCodexLogEntries } from './codex-log-tracker.js'; +export { consumePiLogEntries, subscribeToPiLogEntries } from './pi-log-tracker.js'; export function createProvider(target: ResolvedTarget): Provider { switch (target.kind) { @@ -47,6 +50,8 @@ export function createProvider(target: ResolvedTarget): Provider { return new CliProvider(target.name, target.config); case 'codex': return new CodexProvider(target.name, target.config); + case 'pi-coding-agent': + return new PiCodingAgentProvider(target.name, target.config); case 'mock': return new MockProvider(target.name, target.config); case 'vscode': diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts new file mode 100644 index 0000000..b42d361 --- /dev/null +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -0,0 +1,833 @@ +import { spawn } from 'node:child_process'; +import { randomUUID } from 'node:crypto'; +import { createWriteStream } from 'node:fs'; +import type { WriteStream } from 'node:fs'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { recordPiLogEntry } from './pi-log-tracker.js'; +import { normalizeInputFiles } from './preread.js'; +import type { PiCodingAgentResolvedConfig } from './targets.js'; +import type { + OutputMessage, + Provider, + ProviderRequest, + ProviderResponse, + ToolCall, +} from './types.js'; + +const WORKSPACE_PREFIX = 'agentv-pi-'; +const PROMPT_FILENAME = 'prompt.md'; + +/** + * Default system prompt for Pi Coding Agent evaluations. + * Ensures the agent returns code in its response rather than just writing files. + */ +const DEFAULT_SYSTEM_PROMPT = `**IMPORTANT**: Follow these instructions for your response: +- Do NOT create any additional output files in the workspace. 
+- All intended file outputs/changes MUST be written in your response. +- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`. +This is required for evaluation scoring.`; + +interface PiRunOptions { + readonly executable: string; + readonly args: readonly string[]; + readonly cwd: string; + readonly timeoutMs?: number; + readonly env: NodeJS.ProcessEnv; + readonly signal?: AbortSignal; + readonly onStdoutChunk?: (chunk: string) => void; + readonly onStderrChunk?: (chunk: string) => void; +} + +interface PiRunResult { + readonly stdout: string; + readonly stderr: string; + readonly exitCode: number; + readonly timedOut?: boolean; +} + +type PiRunner = (options: PiRunOptions) => Promise; + +export class PiCodingAgentProvider implements Provider { + readonly id: string; + readonly kind = 'pi-coding-agent' as const; + readonly targetName: string; + readonly supportsBatch = false; + + private readonly config: PiCodingAgentResolvedConfig; + private readonly runPi: PiRunner; + + constructor( + targetName: string, + config: PiCodingAgentResolvedConfig, + runner: PiRunner = defaultPiRunner, + ) { + this.id = `pi-coding-agent:${targetName}`; + this.targetName = targetName; + this.config = config; + this.runPi = runner; + } + + async invoke(request: ProviderRequest): Promise { + if (request.signal?.aborted) { + throw new Error('Pi coding agent request was aborted before execution'); + } + + const inputFiles = normalizeInputFiles(request.inputFiles); + + const workspaceRoot = await this.createWorkspace(); + const logger = await this.createStreamLogger(request).catch(() => undefined); + try { + // Save prompt to file for debugging/logging + const promptFile = path.join(workspaceRoot, PROMPT_FILENAME); + await writeFile(promptFile, request.question, 'utf8'); + + const args = this.buildPiArgs(request.question, inputFiles); + const cwd = this.resolveCwd(workspaceRoot); + + const result = await this.executePi(args, cwd, 
request.signal, logger); + + if (result.timedOut) { + throw new Error( + `Pi coding agent timed out${formatTimeoutSuffix(this.config.timeoutMs ?? undefined)}`, + ); + } + + if (result.exitCode !== 0) { + const detail = pickDetail(result.stderr, result.stdout); + const prefix = `Pi coding agent exited with code ${result.exitCode}`; + throw new Error(detail ? `${prefix}: ${detail}` : prefix); + } + + const parsed = parsePiJsonl(result.stdout); + const outputMessages = extractOutputMessages(parsed); + const assistantText = extractAssistantText(outputMessages); + + return { + raw: { + response: parsed, + stdout: result.stdout, + stderr: result.stderr, + exitCode: result.exitCode, + args, + executable: this.config.executable, + promptFile, + workspace: workspaceRoot, + inputFiles, + logFile: logger?.filePath, + }, + outputMessages, + }; + } finally { + await logger?.close(); + await this.cleanupWorkspace(workspaceRoot); + } + } + + private resolveCwd(workspaceRoot: string): string { + if (!this.config.cwd) { + return workspaceRoot; + } + return path.resolve(this.config.cwd); + } + + private buildPiArgs(prompt: string, inputFiles: readonly string[] | undefined): string[] { + const args: string[] = []; + + // Provider and model configuration + if (this.config.provider) { + args.push('--provider', this.config.provider); + } + if (this.config.model) { + args.push('--model', this.config.model); + } + if (this.config.apiKey) { + args.push('--api-key', this.config.apiKey); + } + + // Output mode - always use JSON for structured output + args.push('--mode', 'json'); + + // Non-interactive mode + args.push('--print'); + + // No session storage for eval runs + args.push('--no-session'); + + // Tools configuration + if (this.config.tools) { + args.push('--tools', this.config.tools); + } + + // Thinking level + if (this.config.thinking) { + args.push('--thinking', this.config.thinking); + } + + // Custom args + if (this.config.args && this.config.args.length > 0) { + 
args.push(...this.config.args); + } + + // Input files passed with @path syntax (pi-native file inclusion) + if (inputFiles && inputFiles.length > 0) { + for (const file of inputFiles) { + args.push(`@${file}`); + } + } + + // Prepend system prompt (use default if not configured) + const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT; + const fullPrompt = `${systemPrompt}\n\n${prompt}`; + + // Escape @ symbols in prompt that aren't file references + // Pi CLI interprets @ as file prefix, but AgentV uses @[Role]: for multi-turn + const escapedPrompt = escapeAtSymbols(fullPrompt); + + // Prompt is passed as the final argument + args.push(escapedPrompt); + + return args; + } + + private async executePi( + args: readonly string[], + cwd: string, + signal: AbortSignal | undefined, + logger: PiStreamLogger | undefined, + ): Promise { + try { + return await this.runPi({ + executable: this.config.executable, + args, + cwd, + timeoutMs: this.config.timeoutMs, + env: this.buildEnv(), + signal, + onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : undefined, + onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : undefined, + }); + } catch (error) { + const err = error as NodeJS.ErrnoException; + if (err.code === 'ENOENT') { + throw new Error( + `Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`, + ); + } + throw error; + } + } + + private buildEnv(): NodeJS.ProcessEnv { + const env = { ...process.env }; + + // Map provider-specific API key to the correct env var + if (this.config.apiKey) { + const provider = this.config.provider?.toLowerCase() ?? 
'google'; + switch (provider) { + case 'google': + case 'gemini': + env.GEMINI_API_KEY = this.config.apiKey; + break; + case 'anthropic': + env.ANTHROPIC_API_KEY = this.config.apiKey; + break; + case 'openai': + env.OPENAI_API_KEY = this.config.apiKey; + break; + case 'groq': + env.GROQ_API_KEY = this.config.apiKey; + break; + case 'xai': + env.XAI_API_KEY = this.config.apiKey; + break; + case 'openrouter': + env.OPENROUTER_API_KEY = this.config.apiKey; + break; + } + } + + return env; + } + + private async createWorkspace(): Promise { + return await mkdtemp(path.join(tmpdir(), WORKSPACE_PREFIX)); + } + + private async cleanupWorkspace(workspaceRoot: string): Promise { + try { + await rm(workspaceRoot, { recursive: true, force: true }); + } catch { + // Best-effort cleanup + } + } + + private resolveLogDirectory(): string | undefined { + if (this.config.logDir) { + return path.resolve(this.config.logDir); + } + return path.join(process.cwd(), '.agentv', 'logs', 'pi-coding-agent'); + } + + private async createStreamLogger(request: ProviderRequest): Promise { + const logDir = this.resolveLogDirectory(); + if (!logDir) { + return undefined; + } + try { + await mkdir(logDir, { recursive: true }); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`); + return undefined; + } + + const filePath = path.join(logDir, buildLogFilename(request, this.targetName)); + + try { + const logger = await PiStreamLogger.create({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + format: this.config.logFormat ?? 'summary', + }); + recordPiLogEntry({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + }); + return logger; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`); + return undefined; + } + } +} + +class PiStreamLogger { + readonly filePath: string; + private readonly stream: WriteStream; + private readonly startedAt = Date.now(); + private stdoutBuffer = ''; + private stderrBuffer = ''; + private readonly format: 'summary' | 'json'; + + private constructor(filePath: string, format: 'summary' | 'json') { + this.filePath = filePath; + this.format = format; + this.stream = createWriteStream(filePath, { flags: 'a' }); + } + + static async create(options: { + readonly filePath: string; + readonly targetName: string; + readonly evalCaseId?: string; + readonly attempt?: number; + readonly format: 'summary' | 'json'; + }): Promise { + const logger = new PiStreamLogger(options.filePath, options.format); + const header = [ + '# Pi Coding Agent stream log', + `# target: ${options.targetName}`, + options.evalCaseId ? `# eval: ${options.evalCaseId}` : undefined, + options.attempt !== undefined ? `# attempt: ${options.attempt + 1}` : undefined, + `# started: ${new Date().toISOString()}`, + '', + ].filter((line): line is string => Boolean(line)); + logger.writeLines(header); + return logger; + } + + handleStdoutChunk(chunk: string): void { + this.stdoutBuffer += chunk; + this.flushBuffer('stdout'); + } + + handleStderrChunk(chunk: string): void { + this.stderrBuffer += chunk; + this.flushBuffer('stderr'); + } + + async close(): Promise { + this.flushBuffer('stdout'); + this.flushBuffer('stderr'); + this.flushRemainder(); + await new Promise((resolve, reject) => { + this.stream.once('error', reject); + this.stream.end(() => resolve()); + }); + } + + private writeLines(lines: readonly string[]): void { + for (const line of lines) { + this.stream.write(`${line}\n`); + } + } + + private flushBuffer(source: 'stdout' | 'stderr'): void { + const buffer = source === 'stdout' ? 
this.stdoutBuffer : this.stderrBuffer; + const lines = buffer.split(/\r?\n/); + const remainder = lines.pop() ?? ''; + if (source === 'stdout') { + this.stdoutBuffer = remainder; + } else { + this.stderrBuffer = remainder; + } + for (const line of lines) { + const formatted = this.formatLine(line, source); + if (formatted) { + this.stream.write(formatted); + this.stream.write('\n'); + } + } + } + + private formatLine(rawLine: string, source: 'stdout' | 'stderr'): string | undefined { + const trimmed = rawLine.trim(); + if (trimmed.length === 0) { + return undefined; + } + const message = + this.format === 'json' ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source); + return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`; + } + + private flushRemainder(): void { + const stdoutRemainder = this.stdoutBuffer.trim(); + if (stdoutRemainder.length > 0) { + const formatted = this.formatLine(stdoutRemainder, 'stdout'); + if (formatted) { + this.stream.write(formatted); + this.stream.write('\n'); + } + } + const stderrRemainder = this.stderrBuffer.trim(); + if (stderrRemainder.length > 0) { + const formatted = this.formatLine(stderrRemainder, 'stderr'); + if (formatted) { + this.stream.write(formatted); + this.stream.write('\n'); + } + } + this.stdoutBuffer = ''; + this.stderrBuffer = ''; + } +} + +function buildLogFilename(request: ProviderRequest, targetName: string): string { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const evalId = sanitizeForFilename(request.evalCaseId ?? 'pi'); + const attemptSuffix = request.attempt !== undefined ? `_attempt-${request.attempt + 1}` : ''; + const target = sanitizeForFilename(targetName); + return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID().slice(0, 8)}.log`; +} + +function sanitizeForFilename(value: string): string { + const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, '_'); + return sanitized.length > 0 ? 
sanitized : 'pi'; +} + +function formatElapsed(startedAt: number): string { + const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1000); + const hours = Math.floor(elapsedSeconds / 3600); + const minutes = Math.floor((elapsedSeconds % 3600) / 60); + const seconds = elapsedSeconds % 60; + if (hours > 0) { + return `${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`; + } + return `${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`; +} + +function formatPiLogMessage(rawLine: string, source: 'stdout' | 'stderr'): string { + const parsed = tryParseJsonValue(rawLine); + if (parsed) { + const summary = summarizePiEvent(parsed); + if (summary) { + return summary; + } + } + if (source === 'stderr') { + return `stderr: ${rawLine}`; + } + return rawLine; +} + +function formatPiJsonLog(rawLine: string): string { + const parsed = tryParseJsonValue(rawLine); + if (!parsed) { + return rawLine; + } + try { + return JSON.stringify(parsed, null, 2); + } catch { + return rawLine; + } +} + +function summarizePiEvent(event: unknown): string | undefined { + if (!event || typeof event !== 'object') { + return undefined; + } + const record = event as Record; + const type = typeof record.type === 'string' ? 
record.type : undefined; + + if (!type) { + return undefined; + } + + // Handle specific event types + switch (type) { + case 'agent_start': + return 'agent_start'; + case 'agent_end': + return 'agent_end'; + case 'turn_start': + return 'turn_start'; + case 'turn_end': + return 'turn_end'; + case 'message_start': + case 'message_end': { + const message = record.message as Record | undefined; + const role = message?.role; + return `${type}: ${role}`; + } + case 'message_update': { + const event = record.assistantMessageEvent as Record | undefined; + const eventType = event?.type; + if (eventType === 'text_delta') { + const delta = event?.delta; + if (typeof delta === 'string') { + const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta; + return `text_delta: ${preview}`; + } + } + return `message_update: ${eventType}`; + } + default: + return type; + } +} + +function tryParseJsonValue(rawLine: string): unknown | undefined { + try { + return JSON.parse(rawLine); + } catch { + return undefined; + } +} + +/** + * Parse Pi coding agent JSONL output. + * Returns an array of parsed JSON objects from each line. + */ +function parsePiJsonl(output: string): unknown[] { + const trimmed = output.trim(); + if (trimmed.length === 0) { + throw new Error('Pi coding agent produced no output'); + } + + const lines = trimmed + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line.length > 0); + + const parsed: unknown[] = []; + for (const line of lines) { + try { + parsed.push(JSON.parse(line)); + } catch { + // Skip non-JSON lines (e.g., stderr mixed in) + } + } + + if (parsed.length === 0) { + throw new Error('Pi coding agent produced no valid JSON output'); + } + + return parsed; +} + +/** + * Extract OutputMessage array from Pi JSONL events. + * Looks for the agent_end event which contains the full message history. 
/**
 * Extract an OutputMessage array from Pi JSONL events.
 *
 * Scans backwards for the last `agent_end` event whose `messages` field is an
 * array and converts every entry of it. When no such event exists, falls back
 * to converting the `message` of each `turn_end` event, in stream order.
 *
 * @param events Parsed JSONL values; entries that are not objects are ignored.
 * @returns The converted messages; entries that cannot be converted are dropped.
 */
function extractOutputMessages(events: unknown[]): readonly OutputMessage[] {
  // Prefer the agent_end event, which carries the full message history.
  for (let i = events.length - 1; i >= 0; i--) {
    const event = events[i];
    if (!event || typeof event !== 'object') {
      continue;
    }
    const record = event as Record<string, unknown>;
    if (record.type !== 'agent_end') {
      continue;
    }

    const messages = record.messages;
    if (!Array.isArray(messages)) {
      continue;
    }

    return messages
      .map((message: unknown) => convertPiMessage(message))
      .filter((m): m is OutputMessage => m !== undefined);
  }

  // Fallback: collect one message per turn_end event.
  const outputMessages: OutputMessage[] = [];
  for (const event of events) {
    if (!event || typeof event !== 'object') {
      continue;
    }
    const record = event as Record<string, unknown>;
    if (record.type !== 'turn_end') {
      continue;
    }
    const converted = convertPiMessage(record.message);
    if (converted) {
      outputMessages.push(converted);
    }
  }

  return outputMessages;
}

/**
 * Convert a Pi message to AgentV OutputMessage format.
 *
 * @param message Candidate message value taken from a Pi event.
 * @returns The converted message, or `undefined` when `message` is not an
 *   object or has no string `role`.
 */
function convertPiMessage(message: unknown): OutputMessage | undefined {
  if (!message || typeof message !== 'object') {
    return undefined;
  }

  const msg = message as Record<string, unknown>;
  const role = msg.role;
  if (typeof role !== 'string') {
    return undefined;
  }

  // Text and tool calls are both read from the same `content` field.
  const content = extractTextContent(msg.content);
  const toolCalls = extractToolCalls(msg.content);

  // Numeric timestamps are rendered as ISO strings; strings pass through.
  // Date#toISOString throws RangeError for an invalid date (e.g. a JSON
  // number such as 1e400, which parses to Infinity), so out-of-range values
  // are dropped instead of aborting the whole extraction.
  let timestamp: string | undefined;
  if (typeof msg.timestamp === 'number') {
    const date = new Date(msg.timestamp);
    if (!Number.isNaN(date.getTime())) {
      timestamp = date.toISOString();
    }
  } else if (typeof msg.timestamp === 'string') {
    timestamp = msg.timestamp;
  }

  // Copy through only the metadata fields that are present (truthy).
  const metadata: Record<string, unknown> = {};
  if (msg.api) metadata.api = msg.api;
  if (msg.provider) metadata.provider = msg.provider;
  if (msg.model) metadata.model = msg.model;
  if (msg.usage) metadata.usage = msg.usage;
  if (msg.stopReason) metadata.stopReason = msg.stopReason;

  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : undefined,
  };
}

/**
 * Extract text content from a message `content` value.
 *
 * A string is returned unchanged. For an array, the `text` of every part
 * shaped like `{ type: "text", text: "..." }` is collected and joined with
 * newlines.
 *
 * @returns The text, or `undefined` when `content` is neither a string nor an
 *   array, or when the array holds no text parts.
 */
function extractTextContent(content: unknown): string | undefined {
  if (typeof content === 'string') {
    return content;
  }

  if (!Array.isArray(content)) {
    return undefined;
  }

  const textParts: string[] = [];
  for (const part of content) {
    if (!part || typeof part !== 'object') {
      continue;
    }
    const p = part as Record<string, unknown>;
    if (p.type === 'text' && typeof p.text === 'string') {
      textParts.push(p.text);
    }
  }

  return textParts.length > 0 ? textParts.join('\n') : undefined;
}

/**
 * Extract tool calls from a message `content` array.
 *
 * Parts shaped like `{ type: "tool_use", name, input, id }` become tool calls.
 * A later `{ type: "tool_result", tool_use_id, content }` part attaches its
 * `content` as the `output` of the already-collected call with the same id;
 * results without a matching call are ignored.
 *
 * @returns The collected tool calls (empty when `content` is not an array).
 */
function extractToolCalls(content: unknown): readonly ToolCall[] {
  if (!Array.isArray(content)) {
    return [];
  }

  const toolCalls: ToolCall[] = [];
  for (const part of content) {
    if (!part || typeof part !== 'object') {
      continue;
    }
    const p = part as Record<string, unknown>;
    if (p.type === 'tool_use' && typeof p.name === 'string') {
      toolCalls.push({
        tool: p.name,
        input: p.input,
        id: typeof p.id === 'string' ? p.id : undefined,
      });
    }
    if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
      const idx = toolCalls.findIndex((tc) => tc.id === p.tool_use_id);
      const existing = idx === -1 ? undefined : toolCalls[idx];
      if (existing) {
        // Replace rather than mutate: entries are treated as immutable.
        toolCalls[idx] = {
          ...existing,
          output: p.content,
        };
      }
    }
  }

  return toolCalls;
}

/**
 * Extract the final assistant text from output messages.
 *
 * @returns The content of the last message with role `assistant` and truthy
 *   content (JSON-stringified when it is not a string), or `''` when there is
 *   no such message.
 */
function extractAssistantText(messages: readonly OutputMessage[]): string {
  for (let i = messages.length - 1; i >= 0; i--) {
    const msg = messages[i];
    if (!msg) {
      continue;
    }
    if (msg.role === 'assistant' && msg.content) {
      if (typeof msg.content === 'string') {
        return msg.content;
      }
      return JSON.stringify(msg.content);
    }
  }
  return '';
}

/**
 * Rewrite AgentV role markers so the pi CLI does not treat them as file
 * references.
 *
 * Pi CLI uses @path syntax for file inclusion, but AgentV prompts use
 * `@[Role]:` markers (e.g. `@[System]:`, `@[User]:`). Each `@[Role]:` is
 * rewritten to `[[Role]]:`; any other `@` in the prompt is left untouched.
 */
function escapeAtSymbols(prompt: string): string {
  return prompt.replace(/@\[([^\]]+)\]:/g, '[[$1]]:');
}
/**
 * Build the " after Ns" suffix used when describing a timeout.
 *
 * @param timeoutMs Timeout in milliseconds.
 * @returns `" after <seconds>s"` with the duration rounded up to whole
 *   seconds, or an empty string when the timeout is missing, zero, negative
 *   or NaN.
 */
function formatTimeoutSuffix(timeoutMs: number | undefined): string {
  const hasTimeout = typeof timeoutMs === 'number' && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1000)}s` : '';
}
/** Describes one Pi log file produced during an eval run. */
export type PiLogEntry = {
  readonly filePath: string;
  readonly evalCaseId?: string;
  readonly targetName: string;
  readonly attempt?: number;
};

// Registered symbols: every copy of this module loaded into the same realm
// resolves to the same keys on globalThis, and therefore to the same stores.
const GLOBAL_LOGS_KEY = Symbol.for('agentv.piLogs');
const GLOBAL_SUBSCRIBERS_KEY = Symbol.for('agentv.piLogSubscribers');

type PiLogListener = (entry: PiLogEntry) => void;

type GlobalWithPiLogs = typeof globalThis & {
  [GLOBAL_LOGS_KEY]?: PiLogEntry[];
  [GLOBAL_SUBSCRIBERS_KEY]?: Set<PiLogListener>;
};

/** Return the global list of pending log entries, creating it on first use. */
function getPiLogStore(): PiLogEntry[] {
  const registry = globalThis as GlobalWithPiLogs;
  let entries = registry[GLOBAL_LOGS_KEY];
  if (!entries) {
    entries = [];
    registry[GLOBAL_LOGS_KEY] = entries;
  }
  return entries;
}

/** Return the global set of listeners, creating it on first use. */
function getSubscriberStore(): Set<PiLogListener> {
  const registry = globalThis as GlobalWithPiLogs;
  let listeners = registry[GLOBAL_SUBSCRIBERS_KEY];
  if (!listeners) {
    listeners = new Set<PiLogListener>();
    registry[GLOBAL_SUBSCRIBERS_KEY] = listeners;
  }
  return listeners;
}

/**
 * Call every registered listener with `entry`.
 *
 * Iterates over a snapshot, so listeners may subscribe/unsubscribe while being
 * notified. A throwing listener is reported via `console.warn` and does not
 * stop the remaining listeners or propagate to the caller.
 */
function notifySubscribers(entry: PiLogEntry): void {
  [...getSubscriberStore()].forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi log subscriber failed: ${message}`);
    }
  });
}

/** Append `entry` to the pending list, then notify all subscribers. */
export function recordPiLogEntry(entry: PiLogEntry): void {
  getPiLogStore().push(entry);
  notifySubscribers(entry);
}

/**
 * Remove and return all pending log entries.
 *
 * @returns The drained entries in recording order (a fresh empty array when
 *   nothing is pending).
 */
export function consumePiLogEntries(): PiLogEntry[] {
  const pending = getPiLogStore();
  return pending.length === 0 ? [] : pending.splice(0);
}

/**
 * Register a listener for entries recorded from now on.
 *
 * @returns A function that removes the listener again.
 */
export function subscribeToPiLogEntries(listener: PiLogListener): () => void {
  const listeners = getSubscriberStore();
  listeners.add(listener);
  return () => {
    listeners.delete(listener);
  };
}
/**
 * Build the resolved configuration for a `pi-coding-agent` target.
 *
 * Reads each setting from the parsed target definition, accepting several
 * alternative key spellings per setting (first non-nullish wins), and passes
 * string-like settings through the shared resolvers.
 *
 * @param target Parsed target definition.
 * @param env Environment lookup handed to the string resolvers.
 * @returns The resolved config; `executable` falls back to `'pi'`, every other
 *   field may be `undefined`.
 */
function resolvePiCodingAgentConfig(
  target: z.infer,
  env: EnvLookup,
): PiCodingAgentResolvedConfig {
  // Raw values: each setting accepts several key aliases.
  const executableSource = target.executable ?? target.command ?? target.binary;
  const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
  const modelSource = target.model ?? target.pi_model ?? target.piModel;
  const apiKeySource = target.api_key ?? target.apiKey;
  const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
  const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
  const argsSource = target.args ?? target.arguments;
  const cwdSource = target.cwd;
  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
  const logDirSource =
    target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
  const logFormatSource = target.log_format ?? target.logFormat;
  const systemPromptSource = target.system_prompt ?? target.systemPrompt;

  // The only setting with a built-in default: the `pi` binary name.
  const executable =
    resolveOptionalString(executableSource, env, `${target.name} pi executable`, {
      allowLiteral: true,
      optionalEnv: true,
    }) ?? 'pi';

  const provider = resolveOptionalString(providerSource, env, `${target.name} pi provider`, {
    allowLiteral: true,
    optionalEnv: true,
  });

  const model = resolveOptionalString(modelSource, env, `${target.name} pi model`, {
    allowLiteral: true,
    optionalEnv: true,
  });

  // Unlike the other settings this one is resolved with allowLiteral: false.
  // NOTE(review): presumably so the key must come from an environment
  // reference rather than being written inline - confirm against
  // resolveOptionalString.
  const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi api key`, {
    allowLiteral: false,
    optionalEnv: true,
  });

  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
    allowLiteral: true,
    optionalEnv: true,
  });

  const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
    allowLiteral: true,
    optionalEnv: true,
  });

  const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);

  const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
    allowLiteral: true,
    optionalEnv: true,
  });

  // Source keys are named *_seconds while the result field is timeoutMs.
  // NOTE(review): presumably resolveTimeoutMs performs the unit conversion - confirm.
  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);

  const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
    allowLiteral: true,
    optionalEnv: true,
  });

  // Only the exact values 'json' and 'summary' are kept; anything else
  // (including typos) silently becomes undefined rather than raising an error.
  const logFormat =
    logFormatSource === 'json' || logFormatSource === 'summary' ? logFormatSource : undefined;

  // Trimmed; a non-string or blank prompt is treated as "not configured".
  const systemPrompt =
    typeof systemPromptSource === 'string' && systemPromptSource.trim().length > 0
      ? systemPromptSource.trim()
      : undefined;

  return {
    executable,
    provider,
    model,
    apiKey,
    tools,
    thinking,
    args,
    cwd,
    timeoutMs,
    logDir,
    logFormat,
    systemPrompt,
  };
}