Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,34 @@ AgentV aims to provide a robust, declarative framework for evaluating AI agents.
- **Multi-Objective Scoring**: Measure correctness, latency, cost, and safety in a single run.
- **Optimization Ready**: Designed to support future automated hyperparameter tuning and candidate generation.

## IMPORTANT: Design Principles

These principles guide all feature decisions. **Follow these when proposing or implementing changes.**

### 1. Lightweight Core, Plugin Extensibility
AgentV's core should remain minimal. Complex or domain-specific logic belongs in plugins, not built-in features.

**Extension points (prefer these over adding built-ins):**
- `code_judge` scripts for custom evaluation logic
- CLI wrappers that consume AgentV's JSON/JSONL output for post-processing (aggregation, comparison, reporting)

**Ask yourself:** "Can this be achieved with existing primitives + a plugin or wrapper?" If yes, it should not be a built-in.

### 2. Built-ins for Primitives Only
Built-in evaluators provide **universal primitives** that users compose. A primitive:
- Is stateless and deterministic
- Has a single, clear responsibility
- Cannot be trivially composed from other primitives
- Is needed by the majority of users

If a feature serves a niche use case or adds conditional logic, it belongs in a plugin.

### 3. Align with Industry Standards
Before adding features, research how peer frameworks solve the problem. Prefer the **lowest common denominator** that covers most use cases. Novel features without industry precedent require strong justification and should default to plugin implementation.

### 4. Non-Breaking Extensions
New fields should be optional. Existing configurations must continue working unchanged.

## Tech Stack & Tools
- **Language:** TypeScript 5.x targeting ES2022
- **Runtime:** Bun (use `bun` for all package and script operations)
Expand Down
5 changes: 1 addition & 4 deletions apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
"bin": {
"agentv": "./dist/cli.js"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"scripts": {
"dev": "bun --watch src/index.ts",
"build": "tsup && bun run copy-readme",
Expand Down
22 changes: 11 additions & 11 deletions apps/cli/src/commands/compare/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ import { readFileSync } from 'node:fs';
import { command, number, option, optional, positional, string } from 'cmd-ts';

interface EvalResult {
eval_id: string;
evalId: string;
score: number;
}

interface MatchedResult {
eval_id: string;
evalId: string;
score1: number;
score2: number;
delta: number;
Expand Down Expand Up @@ -35,14 +35,14 @@ export function loadJsonlResults(filePath: string): EvalResult[] {
.filter((line) => line.trim());

return lines.map((line) => {
const record = JSON.parse(line) as { eval_id?: string; score?: number };
if (typeof record.eval_id !== 'string') {
throw new Error(`Missing eval_id in result: ${line}`);
const record = JSON.parse(line) as { evalId?: string; score?: number };
if (typeof record.evalId !== 'string') {
throw new Error(`Missing evalId in result: ${line}`);
}
if (typeof record.score !== 'number') {
throw new Error(`Missing or invalid score in result: ${line}`);
}
return { eval_id: record.eval_id, score: record.score };
return { evalId: record.evalId, score: record.score };
});
}

Expand All @@ -57,8 +57,8 @@ export function compareResults(
results2: EvalResult[],
threshold: number,
): ComparisonOutput {
const map1 = new Map(results1.map((r) => [r.eval_id, r.score]));
const map2 = new Map(results2.map((r) => [r.eval_id, r.score]));
const map1 = new Map(results1.map((r) => [r.evalId, r.score]));
const map2 = new Map(results2.map((r) => [r.evalId, r.score]));

const matched: MatchedResult[] = [];
const matchedIds = new Set<string>();
Expand All @@ -68,7 +68,7 @@ export function compareResults(
if (score2 !== undefined) {
const delta = score2 - score1;
matched.push({
eval_id: evalId,
evalId: evalId,
score1,
score2,
delta,
Expand All @@ -78,8 +78,8 @@ export function compareResults(
}
}

const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.eval_id)).length;
const unmatchedFile2 = results2.filter((r) => !map1.has(r.eval_id)).length;
const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.evalId)).length;
const unmatchedFile2 = results2.filter((r) => !map1.has(r.evalId)).length;

const wins = matched.filter((m) => m.outcome === 'win').length;
const losses = matched.filter((m) => m.outcome === 'loss').length;
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/src/commands/eval/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ export function calculateEvaluationSummary(
// Track errors
const errors = results
.filter((result) => result.error !== undefined)
.map((result) => ({ evalId: result.eval_id, error: result.error as string }));
.map((result) => ({ evalId: result.evalId, error: result.error as string }));
const errorCount = errors.length;

if (total === 0) {
Expand Down Expand Up @@ -180,12 +180,12 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string {

lines.push('\nTop performing eval cases:');
summary.topResults.forEach((result, index) => {
lines.push(` ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`);
lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`);
});

lines.push('\nLowest performing eval cases:');
summary.bottomResults.forEach((result, index) => {
lines.push(` ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`);
lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`);
});

return lines.join('\n');
Expand Down
48 changes: 24 additions & 24 deletions apps/cli/test/commands/compare/compare.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,39 +26,39 @@ describe('compare command', () => {
const filePath = path.join(tempDir, 'results.jsonl');
writeFileSync(
filePath,
'{"eval_id": "case-1", "score": 0.8}\n{"eval_id": "case-2", "score": 0.9}\n',
'{"evalId": "case-1", "score": 0.8}\n{"evalId": "case-2", "score": 0.9}\n',
);

const results = loadJsonlResults(filePath);

expect(results).toEqual([
{ eval_id: 'case-1', score: 0.8 },
{ eval_id: 'case-2', score: 0.9 },
{ evalId: 'case-1', score: 0.8 },
{ evalId: 'case-2', score: 0.9 },
]);
});

it('should handle empty lines in JSONL', () => {
const filePath = path.join(tempDir, 'results.jsonl');
writeFileSync(
filePath,
'{"eval_id": "case-1", "score": 0.8}\n\n{"eval_id": "case-2", "score": 0.9}\n',
'{"evalId": "case-1", "score": 0.8}\n\n{"evalId": "case-2", "score": 0.9}\n',
);

const results = loadJsonlResults(filePath);

expect(results).toHaveLength(2);
});

it('should throw error for missing eval_id', () => {
it('should throw error for missing evalId', () => {
const filePath = path.join(tempDir, 'results.jsonl');
writeFileSync(filePath, '{"score": 0.8}\n');

expect(() => loadJsonlResults(filePath)).toThrow('Missing eval_id');
expect(() => loadJsonlResults(filePath)).toThrow('Missing evalId');
});

it('should throw error for missing score', () => {
const filePath = path.join(tempDir, 'results.jsonl');
writeFileSync(filePath, '{"eval_id": "case-1"}\n');
writeFileSync(filePath, '{"evalId": "case-1"}\n');

expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score');
});
Expand Down Expand Up @@ -93,27 +93,27 @@ describe('compare command', () => {
});

describe('compareResults', () => {
it('should match results by eval_id and compute deltas', () => {
it('should match results by evalId and compute deltas', () => {
// Use values that avoid floating point precision issues
const results1 = [
{ eval_id: 'case-1', score: 0.5 },
{ eval_id: 'case-2', score: 0.75 },
{ evalId: 'case-1', score: 0.5 },
{ evalId: 'case-2', score: 0.75 },
];
const results2 = [
{ eval_id: 'case-1', score: 0.7 }, // +0.2 win
{ eval_id: 'case-2', score: 0.5 }, // -0.25 loss
{ evalId: 'case-1', score: 0.7 }, // +0.2 win
{ evalId: 'case-2', score: 0.5 }, // -0.25 loss
];

const comparison = compareResults(results1, results2, 0.1);

expect(comparison.matched).toHaveLength(2);
expect(comparison.matched[0].eval_id).toBe('case-1');
expect(comparison.matched[0].evalId).toBe('case-1');
expect(comparison.matched[0].score1).toBe(0.5);
expect(comparison.matched[0].score2).toBe(0.7);
expect(comparison.matched[0].delta).toBeCloseTo(0.2, 10);
expect(comparison.matched[0].outcome).toBe('win');

expect(comparison.matched[1].eval_id).toBe('case-2');
expect(comparison.matched[1].evalId).toBe('case-2');
expect(comparison.matched[1].score1).toBe(0.75);
expect(comparison.matched[1].score2).toBe(0.5);
expect(comparison.matched[1].delta).toBeCloseTo(-0.25, 10);
Expand All @@ -122,12 +122,12 @@ describe('compare command', () => {

it('should count unmatched results', () => {
const results1 = [
{ eval_id: 'case-1', score: 0.8 },
{ eval_id: 'only-in-1', score: 0.5 },
{ evalId: 'case-1', score: 0.8 },
{ evalId: 'only-in-1', score: 0.5 },
];
const results2 = [
{ eval_id: 'case-1', score: 0.9 },
{ eval_id: 'only-in-2', score: 0.6 },
{ evalId: 'case-1', score: 0.9 },
{ evalId: 'only-in-2', score: 0.6 },
];

const comparison = compareResults(results1, results2, 0.1);
Expand All @@ -138,14 +138,14 @@ describe('compare command', () => {
it('should compute summary statistics', () => {
// Use values that produce clear deltas above/below threshold
const results1 = [
{ eval_id: 'case-1', score: 0.5 },
{ eval_id: 'case-2', score: 0.75 },
{ eval_id: 'case-3', score: 0.6 },
{ evalId: 'case-1', score: 0.5 },
{ evalId: 'case-2', score: 0.75 },
{ evalId: 'case-3', score: 0.6 },
];
const results2 = [
{ eval_id: 'case-1', score: 0.7 }, // win (+0.2)
{ eval_id: 'case-2', score: 0.5 }, // loss (-0.25)
{ eval_id: 'case-3', score: 0.65 }, // tie (+0.05)
{ evalId: 'case-1', score: 0.7 }, // win (+0.2)
{ evalId: 'case-2', score: 0.5 }, // loss (-0.25)
{ evalId: 'case-3', score: 0.65 }, // tie (+0.05)
];

const comparison = compareResults(results1, results2, 0.1);
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ describe('agentv eval CLI', () => {
const results = await readJsonLines(outputPath);
expect(results).toHaveLength(2);
const [firstResult, secondResult] = results as Array<Record<string, unknown>>;
expect(firstResult.eval_id).toBe('case-alpha');
expect(secondResult.eval_id).toBe('case-beta');
expect(firstResult.evalId).toBe('case-alpha');
expect(secondResult.evalId).toBe('case-beta');

const diagnostics = await readDiagnostics(fixture);
expect(diagnostics).toMatchObject({
Expand Down
26 changes: 13 additions & 13 deletions apps/cli/test/fixtures/mock-run-evaluation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,44 +22,44 @@ interface RunEvaluationOptionsLike {
}

interface EvaluationResultLike {
readonly eval_id: string;
readonly evalId: string;
readonly score: number;
readonly hits: readonly string[];
readonly misses: readonly string[];
readonly candidate_answer: string;
readonly expected_aspect_count: number;
readonly candidateAnswer: string;
readonly expectedAspectCount: number;
readonly target: string;
readonly timestamp: string;
readonly reasoning?: string;
readonly raw_aspects?: readonly string[];
readonly rawAspects?: readonly string[];
}

function buildResults(targetName: string): EvaluationResultLike[] {
const baseTime = new Date('2024-01-01T00:00:00.000Z');
return [
{
eval_id: 'case-alpha',
evalId: 'case-alpha',
score: 0.6,
hits: ['alpha'],
misses: [],
candidate_answer: 'Alpha answer',
expected_aspect_count: 1,
candidateAnswer: 'Alpha answer',
expectedAspectCount: 1,
target: targetName,
timestamp: baseTime.toISOString(),
reasoning: 'Alpha reasoning',
raw_aspects: ['alpha'],
rawAspects: ['alpha'],
},
{
eval_id: 'case-beta',
evalId: 'case-beta',
score: 0.9,
hits: ['beta', 'gamma'],
misses: ['delta'],
candidate_answer: 'Beta answer',
expected_aspect_count: 3,
candidateAnswer: 'Beta answer',
expectedAspectCount: 3,
target: targetName,
timestamp: new Date(baseTime.getTime() + 60_000).toISOString(),
reasoning: 'Beta reasoning',
raw_aspects: ['beta', 'gamma', 'delta'],
rawAspects: ['beta', 'gamma', 'delta'],
},
];
}
Expand Down Expand Up @@ -109,7 +109,7 @@ export async function runEvaluation(
await maybeWriteDiagnostics(options, results);
await maybeWritePromptDump(
options.promptDumpDir,
results.map((result) => result.eval_id),
results.map((result) => result.evalId),
);

for (const result of results) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ function isObject(value: unknown): value is Record<string, unknown> {
}

type EvalInput = {
readonly input_messages?: unknown;
readonly expected_messages?: unknown;
readonly candidate_answer?: unknown;
readonly inputMessages?: unknown;
readonly expectedMessages?: unknown;
readonly candidateAnswer?: unknown;
};

function findExpectedDecisionFromExpectedMessages(expectedMessages: unknown): string | undefined {
Expand Down Expand Up @@ -53,9 +53,9 @@ function main(): void {
const input = JSON.parse(stdin) as EvalInput;

const expectedDecision =
findExpectedDecisionFromExpectedMessages(input.expected_messages) ??
findExpectedDecision(input.input_messages);
const candidate = typeof input.candidate_answer === 'string' ? input.candidate_answer : '';
findExpectedDecisionFromExpectedMessages(input.expectedMessages) ??
findExpectedDecision(input.inputMessages);
const candidate = typeof input.candidateAnswer === 'string' ? input.candidateAnswer : '';

let candidateObj: unknown;
try {
Expand All @@ -73,7 +73,7 @@ function main(): void {
const misses: string[] = [];

if (!expectedDecision) {
misses.push('Missing expected decision (expected_messages[].content.decision)');
misses.push('Missing expected decision (expectedMessages[].content.decision)');
} else {
hits.push(`expected.decision present: ${expectedDecision}`);
}
Expand Down
Loading