EntityProcess · christso · Dec 31, 2025 · Jan 1, 2026 · Jan 1, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -9,6 +9,34 @@ AgentV aims to provide a robust, declarative framework for evaluating AI agents.
 - **Multi-Objective Scoring**: Measure correctness, latency, cost, and safety in a single run.
 - **Optimization Ready**: Designed to support future automated hyperparameter tuning and candidate generation.
 
+## IMPORTANT: Design Principles
+
+These principles guide all feature decisions. **Follow these when proposing or implementing changes.**
+
+### 1. Lightweight Core, Plugin Extensibility
+AgentV's core should remain minimal. Complex or domain-specific logic belongs in plugins, not built-in features.
+
+**Extension points (prefer these over adding built-ins):**
+- `code_judge` scripts for custom evaluation logic
+- CLI wrappers that consume AgentV's JSON/JSONL output for post-processing (aggregation, comparison, reporting)
+
+**Ask yourself:** "Can this be achieved with existing primitives + a plugin or wrapper?" If yes, it should not be a built-in.
+
+### 2. Built-ins for Primitives Only
+Built-in evaluators provide **universal primitives** that users compose. A primitive:
+- Is stateless and deterministic
+- Has a single, clear responsibility
+- Cannot be trivially composed from other primitives
+- Is needed by the majority of users
+
+If a feature serves a niche use case or adds conditional logic, it belongs in a plugin.
+
+### 3. Align with Industry Standards
+Before adding features, research how peer frameworks solve the problem. Prefer the **lowest common denominator** that covers most use cases. Novel features without industry precedent require strong justification and should default to plugin implementation.
+
+### 4. Non-Breaking Extensions
+New fields should be optional. Existing configurations must continue working unchanged.
+
 ## Tech Stack & Tools
 - **Language:** TypeScript 5.x targeting ES2022
 - **Runtime:** Bun (use `bun` for all package and script operations)

diff --git a/apps/cli/package.json b/apps/cli/package.json
@@ -14,10 +14,7 @@
   "bin": {
     "agentv": "./dist/cli.js"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "scripts": {
     "dev": "bun --watch src/index.ts",
     "build": "tsup && bun run copy-readme",

diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts
@@ -2,12 +2,12 @@ import { readFileSync } from 'node:fs';
 import { command, number, option, optional, positional, string } from 'cmd-ts';
 
 interface EvalResult {
-  eval_id: string;
+  evalId: string;
   score: number;
 }
 
 interface MatchedResult {
-  eval_id: string;
+  evalId: string;
   score1: number;
   score2: number;
   delta: number;
@@ -35,14 +35,14 @@ export function loadJsonlResults(filePath: string): EvalResult[] {
     .filter((line) => line.trim());
 
   return lines.map((line) => {
-    const record = JSON.parse(line) as { eval_id?: string; score?: number };
-    if (typeof record.eval_id !== 'string') {
-      throw new Error(`Missing eval_id in result: ${line}`);
+    const record = JSON.parse(line) as { evalId?: string; score?: number };
+    if (typeof record.evalId !== 'string') {
+      throw new Error(`Missing evalId in result: ${line}`);
     }
     if (typeof record.score !== 'number') {
       throw new Error(`Missing or invalid score in result: ${line}`);
     }
-    return { eval_id: record.eval_id, score: record.score };
+    return { evalId: record.evalId, score: record.score };
   });
 }
 
@@ -57,8 +57,8 @@ export function compareResults(
   results2: EvalResult[],
   threshold: number,
 ): ComparisonOutput {
-  const map1 = new Map(results1.map((r) => [r.eval_id, r.score]));
-  const map2 = new Map(results2.map((r) => [r.eval_id, r.score]));
+  const map1 = new Map(results1.map((r) => [r.evalId, r.score]));
+  const map2 = new Map(results2.map((r) => [r.evalId, r.score]));
 
   const matched: MatchedResult[] = [];
   const matchedIds = new Set<string>();
@@ -68,7 +68,7 @@ export function compareResults(
     if (score2 !== undefined) {
       const delta = score2 - score1;
       matched.push({
-        eval_id: evalId,
+        evalId: evalId,
         score1,
         score2,
         delta,
@@ -78,8 +78,8 @@ export function compareResults(
     }
   }
 
-  const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.eval_id)).length;
-  const unmatchedFile2 = results2.filter((r) => !map1.has(r.eval_id)).length;
+  const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.evalId)).length;
+  const unmatchedFile2 = results2.filter((r) => !map1.has(r.evalId)).length;
 
   const wins = matched.filter((m) => m.outcome === 'win').length;
   const losses = matched.filter((m) => m.outcome === 'loss').length;

diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts
@@ -86,7 +86,7 @@ export function calculateEvaluationSummary(
   // Track errors
   const errors = results
     .filter((result) => result.error !== undefined)
-    .map((result) => ({ evalId: result.eval_id, error: result.error as string }));
+    .map((result) => ({ evalId: result.evalId, error: result.error as string }));
   const errorCount = errors.length;
 
   if (total === 0) {
@@ -180,12 +180,12 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string {
 
   lines.push('\nTop performing eval cases:');
   summary.topResults.forEach((result, index) => {
-    lines.push(`  ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`);
+    lines.push(`  ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`);
   });
 
   lines.push('\nLowest performing eval cases:');
   summary.bottomResults.forEach((result, index) => {
-    lines.push(`  ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`);
+    lines.push(`  ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`);
   });
 
   return lines.join('\n');

diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts
@@ -26,39 +26,39 @@ describe('compare command', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
       writeFileSync(
         filePath,
-        '{"eval_id": "case-1", "score": 0.8}\n{"eval_id": "case-2", "score": 0.9}\n',
+        '{"evalId": "case-1", "score": 0.8}\n{"evalId": "case-2", "score": 0.9}\n',
       );
 
       const results = loadJsonlResults(filePath);
 
       expect(results).toEqual([
-        { eval_id: 'case-1', score: 0.8 },
-        { eval_id: 'case-2', score: 0.9 },
+        { evalId: 'case-1', score: 0.8 },
+        { evalId: 'case-2', score: 0.9 },
       ]);
     });
 
     it('should handle empty lines in JSONL', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
       writeFileSync(
         filePath,
-        '{"eval_id": "case-1", "score": 0.8}\n\n{"eval_id": "case-2", "score": 0.9}\n',
+        '{"evalId": "case-1", "score": 0.8}\n\n{"evalId": "case-2", "score": 0.9}\n',
       );
 
       const results = loadJsonlResults(filePath);
 
       expect(results).toHaveLength(2);
     });
 
-    it('should throw error for missing eval_id', () => {
+    it('should throw error for missing evalId', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
       writeFileSync(filePath, '{"score": 0.8}\n');
 
-      expect(() => loadJsonlResults(filePath)).toThrow('Missing eval_id');
+      expect(() => loadJsonlResults(filePath)).toThrow('Missing evalId');
     });
 
     it('should throw error for missing score', () => {
       const filePath = path.join(tempDir, 'results.jsonl');
-      writeFileSync(filePath, '{"eval_id": "case-1"}\n');
+      writeFileSync(filePath, '{"evalId": "case-1"}\n');
 
       expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score');
     });
@@ -93,27 +93,27 @@ describe('compare command', () => {
   });
 
   describe('compareResults', () => {
-    it('should match results by eval_id and compute deltas', () => {
+    it('should match results by evalId and compute deltas', () => {
       // Use values that avoid floating point precision issues
       const results1 = [
-        { eval_id: 'case-1', score: 0.5 },
-        { eval_id: 'case-2', score: 0.75 },
+        { evalId: 'case-1', score: 0.5 },
+        { evalId: 'case-2', score: 0.75 },
       ];
       const results2 = [
-        { eval_id: 'case-1', score: 0.7 }, // +0.2 win
-        { eval_id: 'case-2', score: 0.5 }, // -0.25 loss
+        { evalId: 'case-1', score: 0.7 }, // +0.2 win
+        { evalId: 'case-2', score: 0.5 }, // -0.25 loss
       ];
 
       const comparison = compareResults(results1, results2, 0.1);
 
       expect(comparison.matched).toHaveLength(2);
-      expect(comparison.matched[0].eval_id).toBe('case-1');
+      expect(comparison.matched[0].evalId).toBe('case-1');
       expect(comparison.matched[0].score1).toBe(0.5);
       expect(comparison.matched[0].score2).toBe(0.7);
       expect(comparison.matched[0].delta).toBeCloseTo(0.2, 10);
       expect(comparison.matched[0].outcome).toBe('win');
 
-      expect(comparison.matched[1].eval_id).toBe('case-2');
+      expect(comparison.matched[1].evalId).toBe('case-2');
       expect(comparison.matched[1].score1).toBe(0.75);
       expect(comparison.matched[1].score2).toBe(0.5);
       expect(comparison.matched[1].delta).toBeCloseTo(-0.25, 10);
@@ -122,12 +122,12 @@ describe('compare command', () => {
 
     it('should count unmatched results', () => {
       const results1 = [
-        { eval_id: 'case-1', score: 0.8 },
-        { eval_id: 'only-in-1', score: 0.5 },
+        { evalId: 'case-1', score: 0.8 },
+        { evalId: 'only-in-1', score: 0.5 },
       ];
       const results2 = [
-        { eval_id: 'case-1', score: 0.9 },
-        { eval_id: 'only-in-2', score: 0.6 },
+        { evalId: 'case-1', score: 0.9 },
+        { evalId: 'only-in-2', score: 0.6 },
       ];
 
       const comparison = compareResults(results1, results2, 0.1);
@@ -138,14 +138,14 @@ describe('compare command', () => {
     it('should compute summary statistics', () => {
       // Use values that produce clear deltas above/below threshold
       const results1 = [
-        { eval_id: 'case-1', score: 0.5 },
-        { eval_id: 'case-2', score: 0.75 },
-        { eval_id: 'case-3', score: 0.6 },
+        { evalId: 'case-1', score: 0.5 },
+        { evalId: 'case-2', score: 0.75 },
+        { evalId: 'case-3', score: 0.6 },
       ];
       const results2 = [
-        { eval_id: 'case-1', score: 0.7 }, // win (+0.2)
-        { eval_id: 'case-2', score: 0.5 }, // loss (-0.25)
-        { eval_id: 'case-3', score: 0.65 }, // tie (+0.05)
+        { evalId: 'case-1', score: 0.7 }, // win (+0.2)
+        { evalId: 'case-2', score: 0.5 }, // loss (-0.25)
+        { evalId: 'case-3', score: 0.65 }, // tie (+0.05)
       ];
 
       const comparison = compareResults(results1, results2, 0.1);

diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
@@ -167,8 +167,8 @@ describe('agentv eval CLI', () => {
     const results = await readJsonLines(outputPath);
     expect(results).toHaveLength(2);
     const [firstResult, secondResult] = results as Array<Record<string, unknown>>;
-    expect(firstResult.eval_id).toBe('case-alpha');
-    expect(secondResult.eval_id).toBe('case-beta');
+    expect(firstResult.evalId).toBe('case-alpha');
+    expect(secondResult.evalId).toBe('case-beta');
 
     const diagnostics = await readDiagnostics(fixture);
     expect(diagnostics).toMatchObject({

diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts
@@ -22,44 +22,44 @@ interface RunEvaluationOptionsLike {
 }
 
 interface EvaluationResultLike {
-  readonly eval_id: string;
+  readonly evalId: string;
   readonly score: number;
   readonly hits: readonly string[];
   readonly misses: readonly string[];
-  readonly candidate_answer: string;
-  readonly expected_aspect_count: number;
+  readonly candidateAnswer: string;
+  readonly expectedAspectCount: number;
   readonly target: string;
   readonly timestamp: string;
   readonly reasoning?: string;
-  readonly raw_aspects?: readonly string[];
+  readonly rawAspects?: readonly string[];
 }
 
 function buildResults(targetName: string): EvaluationResultLike[] {
   const baseTime = new Date('2024-01-01T00:00:00.000Z');
   return [
     {
-      eval_id: 'case-alpha',
+      evalId: 'case-alpha',
       score: 0.6,
       hits: ['alpha'],
       misses: [],
-      candidate_answer: 'Alpha answer',
-      expected_aspect_count: 1,
+      candidateAnswer: 'Alpha answer',
+      expectedAspectCount: 1,
       target: targetName,
       timestamp: baseTime.toISOString(),
       reasoning: 'Alpha reasoning',
-      raw_aspects: ['alpha'],
+      rawAspects: ['alpha'],
     },
     {
-      eval_id: 'case-beta',
+      evalId: 'case-beta',
       score: 0.9,
       hits: ['beta', 'gamma'],
       misses: ['delta'],
-      candidate_answer: 'Beta answer',
-      expected_aspect_count: 3,
+      candidateAnswer: 'Beta answer',
+      expectedAspectCount: 3,
       target: targetName,
       timestamp: new Date(baseTime.getTime() + 60_000).toISOString(),
       reasoning: 'Beta reasoning',
-      raw_aspects: ['beta', 'gamma', 'delta'],
+      rawAspects: ['beta', 'gamma', 'delta'],
     },
   ];
 }
@@ -109,7 +109,7 @@ export async function runEvaluation(
   await maybeWriteDiagnostics(options, results);
   await maybeWritePromptDump(
     options.promptDumpDir,
-    results.map((result) => result.eval_id),
+    results.map((result) => result.evalId),
   );
 
   for (const result of results) {

diff --git a/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts b/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts
@@ -5,9 +5,9 @@ function isObject(value: unknown): value is Record<string, unknown> {
 }
 
 type EvalInput = {
-  readonly input_messages?: unknown;
-  readonly expected_messages?: unknown;
-  readonly candidate_answer?: unknown;
+  readonly inputMessages?: unknown;
+  readonly expectedMessages?: unknown;
+  readonly candidateAnswer?: unknown;
 };
 
 function findExpectedDecisionFromExpectedMessages(expectedMessages: unknown): string | undefined {
@@ -53,9 +53,9 @@ function main(): void {
   const input = JSON.parse(stdin) as EvalInput;
 
   const expectedDecision =
-    findExpectedDecisionFromExpectedMessages(input.expected_messages) ??
-    findExpectedDecision(input.input_messages);
-  const candidate = typeof input.candidate_answer === 'string' ? input.candidate_answer : '';
+    findExpectedDecisionFromExpectedMessages(input.expectedMessages) ??
+    findExpectedDecision(input.inputMessages);
+  const candidate = typeof input.candidateAnswer === 'string' ? input.candidateAnswer : '';
 
   let candidateObj: unknown;
   try {
@@ -73,7 +73,7 @@ function main(): void {
   const misses: string[] = [];
 
   if (!expectedDecision) {
-    misses.push('Missing expected decision (expected_messages[].content.decision)');
+    misses.push('Missing expected decision (expectedMessages[].content.decision)');
   } else {
     hits.push(`expected.decision present: ${expectedDecision}`);
   }