From c6fe9a2084c73813e408c009798c09e078a91636 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:48:36 +0000 Subject: [PATCH 1/5] docs: add tool evaluation proposals and plugin showcase --- AGENTS.md | 28 +++ .../tool-trajectory/tool-trajectory-demo.yaml | 69 ++++++ .../tool-evaluation-plugins/README.md | 119 ++++++++++ .../scripts/efficiency_scorer.py | 214 +++++++++++++++++ .../scripts/pairwise_tool_compare.py | 220 ++++++++++++++++++ .../scripts/tool_selection_judge.py | 166 +++++++++++++ .../tool-eval-demo.yaml | 131 +++++++++++ .../changes/add-execution-metrics/proposal.md | 81 +++++++ .../specs/evaluation/spec.md | 80 +++++++ .../changes/add-execution-metrics/tasks.md | 34 +++ .../proposal.md | 49 ++++ .../specs/evaluation/spec.md | 192 +++++++++++++++ .../add-trajectory-argument-matching/tasks.md | 22 ++ 13 files changed, 1405 insertions(+) create mode 100644 examples/showcase/tool-evaluation-plugins/README.md create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py create mode 100644 examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml create mode 100644 openspec/changes/add-execution-metrics/proposal.md create mode 100644 openspec/changes/add-execution-metrics/specs/evaluation/spec.md create mode 100644 openspec/changes/add-execution-metrics/tasks.md create mode 100644 openspec/changes/add-trajectory-argument-matching/proposal.md create mode 100644 openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md create mode 100644 openspec/changes/add-trajectory-argument-matching/tasks.md diff --git a/AGENTS.md b/AGENTS.md index 364f217f..57b09312 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -9,6 +9,34 @@ AgentV aims to provide a robust, declarative framework for evaluating AI agents. - **Multi-Objective Scoring**: Measure correctness, latency, cost, and safety in a single run. - **Optimization Ready**: Designed to support future automated hyperparameter tuning and candidate generation. +## IMPORTANT: Design Principles + +These principles guide all feature decisions. **Follow these when proposing or implementing changes.** + +### 1. Lightweight Core, Plugin Extensibility +AgentV's core should remain minimal. Complex or domain-specific logic belongs in plugins, not built-in features. + +**Extension points (prefer these over adding built-ins):** +- `code_judge` scripts for custom evaluation logic +- CLI wrappers that consume AgentV's JSON/JSONL output for post-processing (aggregation, comparison, reporting) + +**Ask yourself:** "Can this be achieved with existing primitives + a plugin or wrapper?" If yes, it should not be a built-in. + +### 2. Built-ins for Primitives Only +Built-in evaluators provide **universal primitives** that users compose. A primitive is: +- Stateless and deterministic +- Has a single, clear responsibility +- Cannot be trivially composed from other primitives +- Needed by the majority of users + +If a feature serves a niche use case or adds conditional logic, it belongs in a plugin. + +### 3. Align with Industry Standards +Before adding features, research how peer frameworks solve the problem. Prefer the **lowest common denominator** that covers most use cases. Novel features without industry precedent require strong justification and should default to plugin implementation. + +### 4. 
Non-Breaking Extensions +New fields should be optional. Existing configurations must continue working unchanged. + ## Tech Stack & Tools - **Language:** TypeScript 5.x targeting ES2022 - **Runtime:** Bun (use `bun` for all package and script operations) diff --git a/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml b/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml index 8941426a..ab16557a 100644 --- a/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml +++ b/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml @@ -9,6 +9,11 @@ # - in_order: Validates tools appear in expected sequence (allows gaps) # - exact: Validates exact tool sequence match (no gaps, no extra tools) # +# Argument matching (PLANNED - see openspec/changes/add-trajectory-argument-matching/): +# - Exact: args: { key: "value" } - must match exactly (deep equality) +# - Skip: args: any - validate tool name only, ignore arguments +# Note: For pattern/regex matching, use a code_judge evaluator instead. +# # This demo uses a CLI provider (mock-agent.ts) that simulates an agent with tool usage. # The mock agent generates different traces based on the prompt content. # @@ -148,3 +153,67 @@ evalcases: knowledgeSearch: 1 # Present in research trace (will pass) documentRetrieve: 1 # Present in research trace (will pass) generateReport: 1 # NOT present (will fail - demonstrates partial scoring) + + # ========================================== + # PLANNED FEATURES - Argument Matching + # The examples below show the intended syntax for argument validation. + # See: openspec/changes/add-trajectory-argument-matching/ + # ========================================== + + # ========================================== + # Example 6: Exact argument matching + # Use case: Validate tool is called with specific argument values + # PLANNED - will not run until feature is implemented + # ========================================== + - id: exact-args-match + + expected_outcome: |- + Agent searches for weather and retrieves forecast for the correct city. + + input_messages: + - role: user + content: What's the weather like in Paris? + + execution: + evaluators: + - name: arg-validation + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: + query: "weather Paris" + - tool: get_weather + args: + location: "Paris" + + # ========================================== + # Example 7: Skip argument validation with `any` + # Use case: Validate tool sequence but ignore specific arguments + # PLANNED - will not run until feature is implemented + # ========================================== + - id: skip-args-validation + + expected_outcome: |- + Agent loads data, transforms it, and saves. Arguments don't matter. 
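+
+    # Note: omitting `args` on an expected item checks the tool name only,
+    # exactly like `args: any`; writing `any` just makes that intent explicit.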
+ + input_messages: + - role: user + content: Load customer data, normalize it, and save + + execution: + evaluators: + - name: workflow-sequence-only + type: tool_trajectory + mode: in_order + expected: + # Exact match: must use specific source + - tool: load_data + args: + source: "customers" + # Skip: any transformation is acceptable + - tool: transform + args: any + # Skip: we don't care about save arguments + - tool: save_data + args: any diff --git a/examples/showcase/tool-evaluation-plugins/README.md b/examples/showcase/tool-evaluation-plugins/README.md new file mode 100644 index 00000000..94350618 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/README.md @@ -0,0 +1,119 @@ +# Tool Evaluation Plugin Patterns + +This showcase demonstrates **plugin-based tool evaluation patterns** that complement AgentV's built-in `tool_trajectory` evaluator. These patterns are intentionally implemented as plugins (code judges) rather than built-ins because they involve domain-specific logic or semantic evaluation. + +## When to Use Plugins vs Built-ins + +| Pattern | Implementation | Reason | +|---------|----------------|--------| +| Tool name/sequence matching | Built-in (`tool_trajectory`) | Deterministic, reusable primitive | +| Argument matching | Built-in (planned) | Extension of existing primitive | +| Tool selection correctness | **Plugin** | Requires semantic judgment | +| Tool input appropriateness | **Plugin** | Domain-specific criteria | +| Tool output utilization | **Plugin** | Requires understanding tool purposes | +| Efficiency scoring | **Plugin** | Custom thresholds, domain-specific | +| Pairwise comparison | **Plugin** | Specialized evaluation pattern | + +## Plugin Examples + +### 1. Tool Selection Evaluator (`tool_selection_judge.py`) + +Evaluates whether the agent selected the **right tools** for the task. Uses LLM-as-judge pattern to semantically assess tool choices. + +```yaml +evaluators: + - name: tool-selection + type: code_judge + script: scripts/tool_selection_judge.py +``` + +### 2. Tool Input Validator (`tool_input_validator.ts`) + +Validates that tool **arguments are semantically appropriate** (not just syntactically correct). Checks if argument values make sense in context. + +```yaml +evaluators: + - name: input-validation + type: code_judge + script: scripts/tool_input_validator.ts +``` + +### 3. Tool Efficiency Scorer (`efficiency_scorer.py`) + +Computes efficiency metrics and scores based on configurable thresholds. Demonstrates how to use execution metrics in evaluation. + +```yaml +evaluators: + - name: efficiency + type: code_judge + script: scripts/efficiency_scorer.py +``` + +### 4. Pairwise Tool Comparison (`pairwise_tool_compare.py`) + +Compares two agent responses for tool usage quality with position bias mitigation (runs comparison twice with swapped order). + +```yaml +evaluators: + - name: pairwise-compare + type: code_judge + script: scripts/pairwise_tool_compare.py +``` + +## Running the Examples + +```bash +cd examples/showcase/tool-evaluation-plugins +npx agentv eval tool-eval-demo.yaml --target mock_agent +``` + +## Input Contract + +All code judges receive a JSON object on stdin with: + +```json +{ + "question": "User's question/task", + "expected_outcome": "Expected behavior description", + "candidate_answer": "Agent's final response", + "output_messages": [ + { + "role": "assistant", + "content": "...", + "toolCalls": [ + { "id": "...", "tool": "search", "args": { "query": "..." 
} } + ] + }, + { + "role": "tool", + "toolCallId": "...", + "toolName": "search", + "content": "Tool result..." + } + ], + "candidate_trace_summary": { + "eventCount": 5, + "toolNames": ["search", "fetch"], + "toolCallsByName": { "search": 2, "fetch": 1 }, + "errorCount": 0 + }, + "execution_metrics": { + "tokenUsage": { "input": 1000, "output": 500 }, + "durationMs": 3500, + "costUsd": 0.0015 + } +} +``` + +## Output Contract + +Code judges must output JSON with: + +```json +{ + "score": 0.85, + "hits": ["Used appropriate search tool", "Validated input before fetch"], + "misses": ["Redundant search call"], + "reasoning": "Agent demonstrated good tool selection with minor inefficiency" +} +``` diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py b/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py new file mode 100644 index 00000000..15c68ecc --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +Tool Efficiency Scorer - Code Judge Plugin + +Evaluates agent efficiency based on execution metrics: +- Token usage relative to task complexity +- Number of tool calls (redundancy detection) +- Exploration ratio (read-only vs action tools) +- Cost efficiency + +Why this is a plugin (not built-in): +- Efficiency thresholds are domain-specific +- What's "efficient" depends on the task type +- Different projects have different cost/performance tradeoffs + +Usage in eval YAML: + evaluators: + - name: efficiency + type: code_judge + script: scripts/efficiency_scorer.py + +Input (stdin JSON): + - candidate_trace_summary: Tool call statistics + - execution_metrics: Token usage, cost, duration (if available) + - expected_outcome: Task description (for complexity estimation) + +Output (stdout JSON): + - score: 0.0-1.0 efficiency score + - hits: Efficiency wins + - misses: Efficiency issues + - reasoning: Explanation +""" + +import json +import sys +from typing import Any + + +# Configurable thresholds (customize for your domain) +THRESHOLDS = { + # Maximum tool calls before penalty + "max_tool_calls": 10, + # Ideal exploration ratio (read-only tools / total) + "target_exploration_ratio": 0.6, + "exploration_tolerance": 0.2, + # Token budgets + "max_tokens_simple": 2000, + "max_tokens_complex": 10000, + # Cost thresholds (USD) + "max_cost_simple": 0.01, + "max_cost_complex": 0.10, +} + +# Tools considered "exploration" (read-only) +EXPLORATION_TOOLS = { + "read", "grep", "glob", "search", "list", "find", + "get", "fetch", "query", "inspect", "view", +} + + +def estimate_task_complexity(expected_outcome: str) -> str: + """Estimate task complexity from expected outcome description.""" + text = expected_outcome.lower() + complex_indicators = [ + "multiple", "several", "comprehensive", "thorough", + "analyze", "compare", "synthesize", "integrate", + ] + if any(indicator in text for indicator in complex_indicators): + return "complex" + return "simple" + + +def calculate_exploration_ratio(trace_summary: dict) -> float: + """Calculate ratio of exploration tools to total tools.""" + tool_calls = trace_summary.get("toolCallsByName", {}) + total = sum(tool_calls.values()) + if total == 0: + return 0.0 + + exploration_count = sum( + count for tool, count in tool_calls.items() + if any(exp in tool.lower() for exp in EXPLORATION_TOOLS) + ) + return exploration_count / total + + +def evaluate_efficiency( + trace_summary: dict | None, + execution_metrics: dict | None, + expected_outcome: 
str, +) -> dict[str, Any]: + """Evaluate agent efficiency against configurable thresholds.""" + hits = [] + misses = [] + scores = [] + + complexity = estimate_task_complexity(expected_outcome) + + # 1. Tool call count evaluation + if trace_summary: + tool_count = trace_summary.get("eventCount", 0) + max_calls = THRESHOLDS["max_tool_calls"] + + if tool_count <= max_calls: + hits.append(f"Tool calls ({tool_count}) within budget ({max_calls})") + scores.append(1.0) + else: + penalty = min((tool_count - max_calls) / max_calls, 1.0) + scores.append(1.0 - penalty) + misses.append(f"Excessive tool calls: {tool_count} (budget: {max_calls})") + + # 2. Exploration ratio evaluation + exp_ratio = calculate_exploration_ratio(trace_summary) + target = THRESHOLDS["target_exploration_ratio"] + tolerance = THRESHOLDS["exploration_tolerance"] + + if abs(exp_ratio - target) <= tolerance: + hits.append(f"Good exploration ratio: {exp_ratio:.2f}") + scores.append(1.0) + elif exp_ratio < target - tolerance: + scores.append(0.7) + misses.append(f"Low exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") + else: + scores.append(0.7) + misses.append(f"High exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") + + # 3. Token usage evaluation + if execution_metrics and "tokenUsage" in execution_metrics: + tokens = execution_metrics["tokenUsage"] + total_tokens = tokens.get("input", 0) + tokens.get("output", 0) + max_tokens = ( + THRESHOLDS["max_tokens_complex"] + if complexity == "complex" + else THRESHOLDS["max_tokens_simple"] + ) + + if total_tokens <= max_tokens: + hits.append(f"Token usage ({total_tokens}) within budget") + scores.append(1.0) + else: + penalty = min((total_tokens - max_tokens) / max_tokens, 1.0) + scores.append(1.0 - penalty * 0.5) # Softer penalty + misses.append(f"High token usage: {total_tokens} (budget: {max_tokens})") + + # 4. Cost evaluation + if execution_metrics and "costUsd" in execution_metrics: + cost = execution_metrics["costUsd"] + max_cost = ( + THRESHOLDS["max_cost_complex"] + if complexity == "complex" + else THRESHOLDS["max_cost_simple"] + ) + + if cost <= max_cost: + hits.append(f"Cost (${cost:.4f}) within budget") + scores.append(1.0) + else: + scores.append(0.5) + misses.append(f"High cost: ${cost:.4f} (budget: ${max_cost:.4f})") + + # Calculate final score + if not scores: + return { + "score": 0.5, + "hits": ["No efficiency metrics available"], + "misses": [], + "reasoning": "Could not evaluate efficiency - no metrics provided", + } + + final_score = sum(scores) / len(scores) + + reasoning = ( + f"Task complexity: {complexity}. " + f"Evaluated {len(scores)} efficiency criteria. 
" + f"Score: {final_score:.2f}" + ) + + return { + "score": round(final_score, 2), + "hits": hits[:4], + "misses": misses[:4], + "reasoning": reasoning, + } + + +def main(): + try: + input_data = json.loads(sys.stdin.read()) + + trace_summary = input_data.get("candidate_trace_summary") + execution_metrics = input_data.get("execution_metrics") + expected_outcome = input_data.get("expected_outcome", "") + + result = evaluate_efficiency( + trace_summary=trace_summary, + execution_metrics=execution_metrics, + expected_outcome=expected_outcome, + ) + + print(json.dumps(result, indent=2)) + + except Exception as e: + error_result = { + "score": 0.0, + "hits": [], + "misses": [f"Evaluator error: {str(e)}"], + "reasoning": f"Evaluation failed: {str(e)}", + } + print(json.dumps(error_result, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py b/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py new file mode 100644 index 00000000..e0bc842c --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +Pairwise Tool Comparison - Code Judge Plugin + +Compares tool usage quality between two agent responses with +position bias mitigation (runs comparison twice with swapped order). + +Why this is a plugin (not built-in): +- Pairwise comparison is a specialized evaluation pattern +- Requires reference response (not always available) +- Position bias mitigation adds complexity +- Not all evaluations need comparative assessment + +Usage in eval YAML: + evaluators: + - name: pairwise-compare + type: code_judge + script: scripts/pairwise_tool_compare.py + +Input (stdin JSON): + - candidate_answer: Agent's response (Response A) + - reference_answer: Reference/baseline response (Response B) + - output_messages: Tool calls from candidate + - expected_outcome: Task description + +Output (stdout JSON): + - score: 0.0-1.0 (1.0 = candidate wins, 0.5 = tie, 0.0 = reference wins) + - hits: Candidate advantages + - misses: Reference advantages + - reasoning: Comparison explanation with bias check result +""" + +import json +import sys +from typing import Any + + +def extract_tool_summary(messages: list[dict] | None) -> dict: + """Extract tool usage summary from messages.""" + if not messages: + return {"tools": [], "count": 0} + + tools = [] + for msg in messages: + if msg.get("role") == "assistant" and msg.get("toolCalls"): + for call in msg["toolCalls"]: + tools.append(call.get("tool", "unknown")) + + return { + "tools": tools, + "count": len(tools), + "unique": list(set(tools)), + } + + +def compare_responses( + response_a: str, + response_b: str, + tools_a: dict, + tools_b: dict, + task: str, +) -> dict[str, Any]: + """ + Compare two responses for tool usage quality. + Returns winner and reasoning. + """ + a_advantages = [] + b_advantages = [] + + # 1. Compare tool count efficiency + if tools_a["count"] < tools_b["count"] and tools_a["count"] > 0: + a_advantages.append(f"More efficient: {tools_a['count']} vs {tools_b['count']} tools") + elif tools_b["count"] < tools_a["count"] and tools_b["count"] > 0: + b_advantages.append(f"More efficient: {tools_b['count']} vs {tools_a['count']} tools") + + # 2. 
Compare tool diversity + if len(tools_a["unique"]) > len(tools_b["unique"]): + a_advantages.append(f"More diverse tools: {len(tools_a['unique'])} types") + elif len(tools_b["unique"]) > len(tools_a["unique"]): + b_advantages.append(f"More diverse tools: {len(tools_b['unique'])} types") + + # 3. Compare response length (proxy for completeness) + len_a, len_b = len(response_a), len(response_b) + if len_a > len_b * 1.2: + a_advantages.append("More comprehensive response") + elif len_b > len_a * 1.2: + b_advantages.append("More comprehensive response") + + # 4. Check for no tools (penalty) + if tools_a["count"] == 0 and tools_b["count"] > 0: + b_advantages.append("Response B used tools; A did not") + elif tools_b["count"] == 0 and tools_a["count"] > 0: + a_advantages.append("Response A used tools; B did not") + + # Determine winner + a_score = len(a_advantages) + b_score = len(b_advantages) + + if a_score > b_score: + return {"winner": "A", "a_advantages": a_advantages, "b_advantages": b_advantages} + elif b_score > a_score: + return {"winner": "B", "a_advantages": a_advantages, "b_advantages": b_advantages} + else: + return {"winner": "TIE", "a_advantages": a_advantages, "b_advantages": b_advantages} + + +def pairwise_with_bias_mitigation( + candidate: str, + reference: str, + candidate_tools: dict, + reference_tools: dict, + task: str, +) -> dict[str, Any]: + """ + Run pairwise comparison twice with position swap to mitigate bias. + """ + # Pass 1: Candidate as A, Reference as B + pass1 = compare_responses( + candidate, reference, candidate_tools, reference_tools, task + ) + + # Pass 2: Reference as A, Candidate as B (swapped) + pass2 = compare_responses( + reference, candidate, reference_tools, candidate_tools, task + ) + + # Map pass2 result back (if A wins in pass2, that means Reference won) + pass2_mapped = { + "A": "B", # A in pass2 = Reference = B in pass1 terms + "B": "A", # B in pass2 = Candidate = A in pass1 terms + "TIE": "TIE", + }.get(pass2["winner"], "TIE") + + # Check consistency + consistent = pass1["winner"] == pass2_mapped + + if consistent: + final_winner = pass1["winner"] + confidence = "high" + else: + # Inconsistent results indicate position bias - return TIE + final_winner = "TIE" + confidence = "low (position bias detected)" + + # Convert to score (candidate perspective) + if final_winner == "A": # Candidate wins + score = 1.0 + elif final_winner == "B": # Reference wins + score = 0.0 + else: # TIE + score = 0.5 + + hits = pass1["a_advantages"][:4] # Candidate advantages + misses = pass1["b_advantages"][:4] # Reference advantages + + reasoning = ( + f"Pass 1: {pass1['winner']} wins. " + f"Pass 2 (swapped): {pass2['winner']} wins (maps to {pass2_mapped}). " + f"Consistency: {consistent}. 
" + f"Final: {final_winner} ({confidence} confidence)" + ) + + return { + "score": score, + "hits": hits, + "misses": misses, + "reasoning": reasoning, + } + + +def main(): + try: + input_data = json.loads(sys.stdin.read()) + + candidate = input_data.get("candidate_answer", "") + reference = input_data.get("reference_answer", "") + output_messages = input_data.get("output_messages", []) + task = input_data.get("expected_outcome", "") + + # If no reference, we can't do pairwise comparison + if not reference: + print(json.dumps({ + "score": 0.5, + "hits": ["Candidate response provided"], + "misses": ["No reference for comparison"], + "reasoning": "Pairwise comparison requires reference_answer field", + }, indent=2)) + return + + # Extract tool summaries + candidate_tools = extract_tool_summary(output_messages) + + # For reference, we'd need reference_output_messages + # In practice, this would come from a baseline run + reference_messages = input_data.get("reference_output_messages", []) + reference_tools = extract_tool_summary(reference_messages) + + result = pairwise_with_bias_mitigation( + candidate=candidate, + reference=reference, + candidate_tools=candidate_tools, + reference_tools=reference_tools, + task=task, + ) + + print(json.dumps(result, indent=2)) + + except Exception as e: + error_result = { + "score": 0.0, + "hits": [], + "misses": [f"Evaluator error: {str(e)}"], + "reasoning": f"Evaluation failed: {str(e)}", + } + print(json.dumps(error_result, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py b/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py new file mode 100644 index 00000000..18f8f560 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Tool Selection Evaluator - Code Judge Plugin + +Evaluates whether the agent selected the RIGHT tools for the task. +This is a semantic evaluation that requires understanding task requirements +and matching them against available tools. 
+ +Why this is a plugin (not built-in): +- Requires domain-specific knowledge of what tools are "appropriate" +- Involves semantic judgment, not just pattern matching +- Different projects have different tool selection criteria + +Usage in eval YAML: + evaluators: + - name: tool-selection + type: code_judge + script: scripts/tool_selection_judge.py + +Input (stdin JSON): + - question: The user's task/question + - expected_outcome: Description of expected behavior + - output_messages: Array of messages including tool calls + - candidate_trace_summary: Summary of tool usage + +Output (stdout JSON): + - score: 0.0-1.0 (1.0 = all tools appropriate, 0.0 = all inappropriate) + - hits: List of appropriate tool selections + - misses: List of missing or inappropriate tools + - reasoning: Explanation of the evaluation +""" + +import json +import sys +from typing import Any + + +def extract_tool_calls(messages: list[dict]) -> list[dict]: + """Extract all tool calls from output messages.""" + tool_calls = [] + for msg in messages: + if msg.get("role") == "assistant" and msg.get("toolCalls"): + for call in msg["toolCalls"]: + tool_calls.append({ + "tool": call.get("tool"), + "args": call.get("args", {}), + }) + return tool_calls + + +def evaluate_tool_selection( + question: str, + expected_outcome: str, + tool_calls: list[dict], + trace_summary: dict | None, +) -> dict[str, Any]: + """ + Evaluate tool selection based on task requirements. + + This is a simplified heuristic-based evaluation. + For production use, you might: + 1. Use an LLM to judge appropriateness + 2. Define explicit tool-to-task mappings + 3. Use a decision tree based on task classification + """ + hits = [] + misses = [] + + # Extract keywords from question and expected outcome + task_text = f"{question} {expected_outcome}".lower() + + # Define tool-to-task mappings (customize for your domain) + tool_task_mappings = { + "search": ["find", "search", "look", "query", "discover"], + "fetch": ["get", "retrieve", "fetch", "download", "load"], + "read": ["read", "open", "view", "examine", "inspect"], + "write": ["write", "save", "create", "output", "generate"], + "analyze": ["analyze", "process", "compute", "calculate"], + "validate": ["check", "validate", "verify", "confirm"], + } + + # Determine expected tools based on task keywords + expected_tools = set() + for tool, keywords in tool_task_mappings.items(): + if any(kw in task_text for kw in keywords): + expected_tools.add(tool) + + # Get actual tools used + actual_tools = set(call["tool"] for call in tool_calls) + + # Evaluate selection + if not tool_calls: + return { + "score": 0.0, + "hits": [], + "misses": ["No tools were called"], + "reasoning": "Agent did not use any tools. 
Expected at least some tool usage.", + } + + # Check for appropriate selections + for tool in actual_tools: + tool_lower = tool.lower() + is_relevant = any( + tool_lower in expected or expected in tool_lower + for expected in expected_tools + ) + if is_relevant or not expected_tools: + hits.append(f"Tool '{tool}' appears relevant to task") + else: + misses.append(f"Tool '{tool}' may not be needed for this task") + + # Check for missing expected tools + for expected in expected_tools: + if not any(expected in t.lower() for t in actual_tools): + misses.append(f"Expected a '{expected}'-type tool but none used") + + # Calculate score + total_checks = len(hits) + len(misses) + score = len(hits) / total_checks if total_checks > 0 else 0.5 + + reasoning = ( + f"Evaluated {len(actual_tools)} tool(s) against task requirements. " + f"{len(hits)} appropriate, {len(misses)} issues found." + ) + + return { + "score": round(score, 2), + "hits": hits[:4], # Cap at 4 per contract + "misses": misses[:4], + "reasoning": reasoning, + } + + +def main(): + try: + input_data = json.loads(sys.stdin.read()) + + question = input_data.get("question", "") + expected_outcome = input_data.get("expected_outcome", "") + output_messages = input_data.get("output_messages", []) + trace_summary = input_data.get("candidate_trace_summary") + + tool_calls = extract_tool_calls(output_messages) + + result = evaluate_tool_selection( + question=question, + expected_outcome=expected_outcome, + tool_calls=tool_calls, + trace_summary=trace_summary, + ) + + print(json.dumps(result, indent=2)) + + except Exception as e: + error_result = { + "score": 0.0, + "hits": [], + "misses": [f"Evaluator error: {str(e)}"], + "reasoning": f"Evaluation failed: {str(e)}", + } + print(json.dumps(error_result, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml new file mode 100644 index 00000000..0b830617 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml @@ -0,0 +1,131 @@ +# Tool Evaluation Plugins Demo +# Demonstrates plugin-based (code judge) tool evaluation patterns +# +# These patterns complement the built-in tool_trajectory evaluator with +# semantic evaluation capabilities that require domain-specific logic. +# +# Run: cd examples/showcase/tool-evaluation-plugins +# npx agentv eval tool-eval-demo.yaml --target mock_agent + +$schema: agentv-eval-v2 +description: Showcase of tool evaluation plugin patterns + +# Use mock_agent target (configure in .agentv/targets.yaml) +target: mock_agent + +evalcases: + # ========================================== + # Example 1: Tool Selection Evaluation + # Use case: Verify agent chose appropriate tools for the task + # ========================================== + - id: tool-selection-demo + + expected_outcome: |- + Agent should search for relevant information and fetch data from APIs. + Uses search and fetch tools appropriately for the research task. + + input_messages: + - role: user + content: Find information about the current weather in Tokyo and fetch the detailed forecast. 
+ + execution: + evaluators: + # Built-in: Check minimum tool calls + - name: trajectory-check + type: tool_trajectory + mode: any_order + minimums: + search: 1 + fetch: 1 + + # Plugin: Semantic tool selection evaluation + - name: selection-quality + type: code_judge + script: scripts/tool_selection_judge.py + + # ========================================== + # Example 2: Efficiency Scoring + # Use case: Evaluate resource efficiency of agent execution + # ========================================== + - id: efficiency-demo + + expected_outcome: |- + Agent efficiently processes the request with minimal redundant operations. + Simple task requiring straightforward tool usage. + + input_messages: + - role: user + content: Get the current time. + + execution: + evaluators: + # Plugin: Efficiency metrics scoring + - name: efficiency-check + type: code_judge + script: scripts/efficiency_scorer.py + + # ========================================== + # Example 3: Combined Built-in + Plugin Evaluation + # Use case: Comprehensive tool usage assessment + # ========================================== + - id: combined-evaluation + + expected_outcome: |- + Agent performs comprehensive data analysis: + 1. Search multiple sources + 2. Validate data quality + 3. Process and transform results + 4. Output formatted report + + input_messages: + - role: user + content: Analyze the quarterly sales data and generate a summary report. + + execution: + evaluators: + # Built-in: Verify required workflow sequence + - name: workflow-trajectory + type: tool_trajectory + mode: in_order + expected: + - tool: search + - tool: validate + - tool: process + + # Plugin: Check if tools were appropriate choices + - name: selection-check + type: code_judge + script: scripts/tool_selection_judge.py + + # Plugin: Evaluate efficiency + - name: efficiency + type: code_judge + script: scripts/efficiency_scorer.py + + # ========================================== + # Example 4: Pairwise Comparison + # Use case: Compare candidate against baseline response + # Requires reference_answer field + # ========================================== + - id: pairwise-demo + + expected_outcome: |- + Agent should retrieve and summarize the document efficiently. + + input_messages: + - role: user + content: Summarize the main points of the user manual. + + # Reference answer for comparison (from a baseline agent) + reference_answer: |- + Here is a summary of the user manual: + 1. Installation: Follow the setup wizard + 2. Configuration: Edit settings.json + 3. Usage: Run the main command + + execution: + evaluators: + # Plugin: Pairwise comparison with position bias mitigation + - name: pairwise-quality + type: code_judge + script: scripts/pairwise_tool_compare.py diff --git a/openspec/changes/add-execution-metrics/proposal.md b/openspec/changes/add-execution-metrics/proposal.md new file mode 100644 index 00000000..669d0385 --- /dev/null +++ b/openspec/changes/add-execution-metrics/proposal.md @@ -0,0 +1,81 @@ +# Change: Add Extended Execution Metrics + +## Why + +Tracking **how** agents work is as important as **what** they produce. Currently, AgentV's `TraceSummary` only captures basic tool call counts. Extended metrics like token usage, cost, duration, and efficiency ratios provide valuable signals for: + +1. Cost optimization (tokens, API costs) +2. Efficiency analysis (tokens per tool, exploration ratio) +3. Performance tracking (duration, tool latency) + +This is infrastructure/data collection - not domain logic. 
Providers optionally report metrics; the framework aggregates them. + +## What Changes + +- Extend `TraceSummary` to `ExecutionMetrics` with optional fields for token usage, cost, duration +- Add computed metrics: `explorationRatio`, `tokensPerTool` +- Make metrics available to evaluators and output writers +- Add example demonstrating metrics-based evaluation + +## Impact + +- Affected specs: `evaluation` +- Affected code: `packages/core/src/evaluation/trace.ts`, provider types +- Non-breaking: all new fields are optional; existing traces work unchanged + +## Implementation Notes + +### Data Model +Extend `TraceSummary` (don't create a separate type): +```typescript +// In packages/core/src/evaluation/trace.ts +export interface TraceSummary { + // Existing fields + eventCount: number; + toolNames: string[]; + toolCallsByName: Record; + errorCount: number; + + // NEW optional fields + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; + toolDurations?: Record; +} +``` + +### Provider Response +Extend `ProviderResponse` in `packages/core/src/evaluation/providers/types.ts`: +```typescript +export interface ProviderResponse { + // Existing fields... + + // NEW optional metrics (providers report what they can) + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; +} +``` + +### Computed Metrics +Add computation functions in `trace.ts`: +```typescript +// Default exploration tools (can be overridden per-eval via config) +const DEFAULT_EXPLORATION_TOOLS = ['read', 'grep', 'glob', 'search', 'list']; + +export function computeExplorationRatio( + summary: TraceSummary, + explorationTools: string[] = DEFAULT_EXPLORATION_TOOLS +): number | undefined { + if (summary.eventCount === 0) return undefined; + const explorationCalls = explorationTools.reduce( + (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0), 0 + ); + return explorationCalls / summary.eventCount; +} +``` + +### Integration Points +1. **EvaluationContext**: Add `executionMetrics?: TraceSummary` (already has `traceSummary`) +2. **Code judge stdin**: Include metrics in the JSON passed to scripts +3. **JSONL output**: Add `execution_metrics` field to result objects diff --git a/openspec/changes/add-execution-metrics/specs/evaluation/spec.md b/openspec/changes/add-execution-metrics/specs/evaluation/spec.md new file mode 100644 index 00000000..5f5f00b7 --- /dev/null +++ b/openspec/changes/add-execution-metrics/specs/evaluation/spec.md @@ -0,0 +1,80 @@ +## ADDED Requirements + +### Requirement: Extended Execution Metrics + +The system SHALL capture extended execution metrics from providers and make them available to evaluators. + +#### Scenario: Provider reports token usage +- **GIVEN** a provider invocation completes successfully +- **AND** the provider response includes token usage data +- **WHEN** the trace is processed +- **THEN** `execution_metrics.tokenUsage` contains `{ input, output, cached? 
}` +- **AND** the metrics are available to evaluators via `context.executionMetrics` + +#### Scenario: Provider reports cost +- **GIVEN** a provider invocation completes successfully +- **AND** the provider response includes cost data +- **WHEN** the trace is processed +- **THEN** `execution_metrics.costUsd` contains the reported cost +- **AND** the cost is included in evaluation results + +#### Scenario: Provider reports duration +- **GIVEN** a provider invocation completes successfully +- **WHEN** the trace is processed +- **THEN** `execution_metrics.durationMs` contains the total execution time +- **AND** if individual tool durations are available, `execution_metrics.toolDurations` maps tool names to duration arrays + +#### Scenario: Metrics not available +- **GIVEN** a provider invocation completes successfully +- **AND** the provider does not report metrics +- **WHEN** the trace is processed +- **THEN** `execution_metrics` fields are `undefined` or omitted +- **AND** evaluation proceeds normally without metrics + +#### Scenario: Computed exploration ratio +- **GIVEN** execution metrics with tool call data +- **AND** a configured list of exploration tools (e.g., `["read", "grep", "glob", "search"]`) +- **WHEN** `explorationRatio` is computed +- **THEN** the ratio equals `explorationToolCalls / totalToolCalls` +- **AND** the ratio is between 0.0 and 1.0 + +#### Scenario: Computed tokens per tool +- **GIVEN** execution metrics with `tokenUsage.output` and `toolCallCount` +- **WHEN** `tokensPerTool` is computed +- **THEN** the value equals `tokenUsage.output / toolCallCount` +- **AND** returns `undefined` if tool call count is zero + +#### Scenario: Code judge receives metrics +- **GIVEN** an eval case with a `code_judge` evaluator +- **AND** the provider reported execution metrics +- **WHEN** the code judge script is invoked +- **THEN** the stdin JSON includes `execution_metrics` with available fields +- **AND** the script can use metrics for scoring decisions + +#### Scenario: Metrics in evaluation results +- **GIVEN** an evaluation completes with execution metrics +- **WHEN** results are written to JSONL output +- **THEN** each result includes `execution_metrics` object with available fields +- **AND** undefined fields are omitted from output + +### Requirement: Execution Metrics Data Model + +The system SHALL define a structured data model for execution metrics. + +#### Scenario: Token usage structure +- **GIVEN** a provider reports token usage +- **WHEN** the data is captured +- **THEN** `tokenUsage` has required fields `input: number` and `output: number` +- **AND** optional field `cached?: number` for cache-hit tokens + +#### Scenario: Tool durations structure +- **GIVEN** a provider reports individual tool timing +- **WHEN** the data is captured +- **THEN** `toolDurations` is a map of `{ [toolName: string]: number[] }` +- **AND** each array contains durations in milliseconds for each invocation of that tool + +#### Scenario: Metrics schema validation +- **GIVEN** a provider returns metrics data +- **WHEN** the data is validated +- **THEN** numeric fields are non-negative +- **AND** invalid data is logged and omitted rather than causing failure diff --git a/openspec/changes/add-execution-metrics/tasks.md b/openspec/changes/add-execution-metrics/tasks.md new file mode 100644 index 00000000..aa80072a --- /dev/null +++ b/openspec/changes/add-execution-metrics/tasks.md @@ -0,0 +1,34 @@ +## 1. 
Data Model + +- [ ] 1.1 Extend `TraceSummary` type with optional `tokenUsage` field +- [ ] 1.2 Add optional `costUsd` field to trace +- [ ] 1.3 Add optional `durationMs` field to trace +- [ ] 1.4 Add optional `toolDurations` map (tool name -> duration array) + +## 2. Computed Metrics + +- [ ] 2.1 Implement `explorationRatio` computation (configurable exploration tool list) +- [ ] 2.2 Implement `tokensPerTool` computation +- [ ] 2.3 Add `avgToolDurationMs` computation + +## 3. Provider Integration + +- [ ] 3.1 Define provider metric reporting interface +- [ ] 3.2 Update CLI provider to report duration metrics +- [ ] 3.3 Document metric reporting for custom providers + +## 4. Output & Evaluation + +- [ ] 4.1 Include metrics in evaluation results JSON +- [ ] 4.2 Make metrics available to code judges via stdin +- [ ] 4.3 Add metrics to JSONL output format + +## 5. Examples & Documentation + +- [ ] 5.1 Add metrics evaluation example to `examples/features/` +- [ ] 5.2 Create code judge example that uses metrics + +## 6. Testing + +- [ ] 6.1 Unit tests for metric computation +- [ ] 6.2 Integration test with metric-aware code judge diff --git a/openspec/changes/add-trajectory-argument-matching/proposal.md b/openspec/changes/add-trajectory-argument-matching/proposal.md new file mode 100644 index 00000000..c00d84a3 --- /dev/null +++ b/openspec/changes/add-trajectory-argument-matching/proposal.md @@ -0,0 +1,49 @@ +# Change: Add Argument Matching to Tool Trajectory Evaluator + +## Why + +The current `tool_trajectory` evaluator only validates tool **names**, not their **arguments**. Argument validation is a core primitive for tool use evaluation. Without it, users cannot verify that agents pass correct parameters to tools. + +This is a lightweight extension to an existing primitive - not domain logic. It aligns with Google ADK's trajectory evaluator which supports exact argument matching in EXACT mode. + +## What Changes + +- Extend `tool_trajectory` evaluator to support optional `args` matching in `expected` items +- Support two argument matching modes: **exact** (deep equality) and **skip** (`any`) +- Add examples demonstrating argument matching + +**Note:** Pattern/regex matching is intentionally excluded - use `code_judge` for complex validation logic. See AGENTS.md "Design Principles" for rationale. + +## Impact + +- Affected specs: `evaluation` +- Affected code: `packages/core/src/evaluation/evaluators.ts` (ToolTrajectoryEvaluator) +- Non-breaking: existing configs without `args` continue to work unchanged + +## Implementation Notes + +### Data Source +Tool arguments are already available in `ToolCall.input` (see `packages/core/src/evaluation/providers/types.ts`). 
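+Each `ToolCall` already carries the tool name in `tool` and the raw argument object in `input`, so no new provider plumbing is required.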
+Currently, `extractToolCallsFromMessages()` discards this - change to preserve it: +```typescript +// Current (discards args): +toolCalls.push({ name: call.tool }); + +// New (preserves args): +toolCalls.push({ name: call.tool, args: call.input }); +``` + +### Type Definition +Extend `ToolTrajectoryExpectedItem` in `trace.ts`: +```typescript +interface ToolTrajectoryExpectedItem { + tool: string; + args?: 'any' | Record; // NEW +} +``` + +### Matching Semantics +- `args: any` → skip argument validation entirely +- `args: { key: value }` → partial match (only validate specified keys, use deep equality) +- If tool name matches but args don't → **full miss** (score 0 for that expected item) +- Use deep equality for nested objects diff --git a/openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md b/openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md new file mode 100644 index 00000000..d890474c --- /dev/null +++ b/openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md @@ -0,0 +1,192 @@ +## MODIFIED Requirements + +### Requirement: Tool Trajectory Evaluator + +The system SHALL provide a built-in `tool_trajectory` evaluator that asserts tool-call constraints, including optional argument validation. + +#### Scenario: Minimum calls met - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: any_order + minimums: + semanticSearch: 3 + ``` +- **AND** trace summary `toolCallsByName: { "semanticSearch": 3 }` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` +- **AND** `hits` includes a message like `"semanticSearch called 3 times (minimum: 3)"` + +#### Scenario: Minimum calls not met - FAIL +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: any_order + minimums: + semanticSearch: 3 + ``` +- **AND** trace summary `toolCallsByName: { "semanticSearch": 1 }` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` includes a message like `"semanticSearch called 1 time (minimum: 3)"` + +#### Scenario: Multiple minimums - partial pass +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: any_order + minimums: + toolA: 2 + toolB: 2 + ``` +- **AND** trace summary `toolCallsByName: { "toolA": 2, "toolB": 1 }` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.5` (1 of 2 constraints met) +- **AND** `hits` includes message for toolA +- **AND** `misses` includes message for toolB + +#### Scenario: In-order sequence - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: A + - tool: B + - tool: C + ``` +- **AND** trace contains tool calls in order `[A, X, B, Y, C]` (extra tools allowed) +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: In-order sequence - FAIL (wrong order) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: A + - tool: B + ``` +- **AND** trace contains tool calls in order `[B, A]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` explains the order mismatch + +#### Scenario: Exact sequence - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: exact + expected: + - tool: A + - tool: B + ``` +- **AND** trace contains exactly tool calls `[A, B]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: Exact sequence - 
FAIL (extra tools) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: exact + expected: + - tool: A + - tool: B + ``` +- **AND** trace contains tool calls `[A, B, C]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` explains the extra tool + +#### Scenario: No trace available +- **GIVEN** an eval case with a `tool_trajectory` evaluator +- **AND** the provider did not return a trace +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` includes `"No trace available for evaluation"` + +#### Scenario: In-order with exact argument matching - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: + query: "weather forecast" + - tool: get_weather + args: + location: "Paris" + ``` +- **AND** trace contains tool calls `[search(query="weather forecast"), get_weather(location="Paris")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: In-order with exact argument matching - FAIL (wrong args) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: + query: "weather forecast" + ``` +- **AND** trace contains tool calls `[search(query="stock prices")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` explains the argument mismatch + +#### Scenario: Argument matching with `any` skip mode +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: any + - tool: process + args: + format: "json" + ``` +- **AND** trace contains tool calls `[search(query="anything"), process(format="json")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` +- **AND** the `search` tool's arguments are not validated +- **AND** the `process` tool's `format` argument is validated + +#### Scenario: Exact mode with argument matching +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: exact + expected: + - tool: auth + args: + method: "oauth" + - tool: fetch + args: + endpoint: "/api/users" + ``` +- **AND** trace contains exactly `[auth(method="oauth"), fetch(endpoint="/api/users")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: Partial argument matching (subset validation) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: api_call + args: + method: "POST" + # url not specified - not validated + ``` +- **AND** trace contains tool calls `[api_call(method="POST", url="https://example.com", headers={})]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` +- **AND** only the specified `method` argument is validated +- **AND** extra arguments `url` and `headers` are ignored diff --git a/openspec/changes/add-trajectory-argument-matching/tasks.md b/openspec/changes/add-trajectory-argument-matching/tasks.md new file mode 100644 index 00000000..5669ba16 --- /dev/null +++ b/openspec/changes/add-trajectory-argument-matching/tasks.md @@ -0,0 +1,22 @@ +## 1. 
Implementation + +- [ ] 1.1 Extend `ToolTrajectoryExpectedItem` type to include optional `args` field +- [ ] 1.2 Implement exact argument matching (deep equality) +- [ ] 1.3 Implement `any` mode (skip argument validation) +- [ ] 1.4 Update `evaluateInOrder` to check arguments +- [ ] 1.5 Update `evaluateExact` to check arguments +- [ ] 1.6 Update `extractToolCallsFromMessages` to preserve `ToolCall.input` + +## 2. Schema & Validation + +- [ ] 2.1 Update YAML schema for `expected[].args` field + +## 3. Examples & Documentation + +- [x] 3.1 Add argument matching examples to `examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml` + +## 4. Testing + +- [ ] 4.1 Unit tests for exact argument matching +- [ ] 4.2 Unit tests for `any` mode +- [ ] 4.3 Integration tests with mock agent From 362c22a444ef6fd103632d509ab1892bc9808027 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 09:45:08 +0000 Subject: [PATCH 2/5] feat(core): add execution metrics and trajectory argument matching Execution metrics: - Extend TraceSummary with tokenUsage, costUsd, durationMs, toolDurations - Add helper functions: explorationRatio, tokensPerTool, avgToolDurationMs - Extend ProviderResponse with metrics fields - Orchestrator merges provider metrics into TraceSummary Trajectory argument matching: - Add args field to ToolTrajectoryExpectedItem ('any' or partial object) - Implement deep equality matching for tool arguments - Update evaluateInOrder and evaluateExact to check args - Parse args field from YAML in evaluator-parser --- apps/cli/package.json | 5 +- .../changes/add-execution-metrics/proposal.md | 4 +- .../changes/add-execution-metrics/tasks.md | 24 +- .../add-trajectory-argument-matching/tasks.md | 20 +- packages/core/package.json | 5 +- packages/core/src/evaluation/evaluators.ts | 109 ++++++- .../evaluation/loaders/evaluator-parser.ts | 9 +- packages/core/src/evaluation/orchestrator.ts | 21 +- .../core/src/evaluation/providers/index.ts | 1 + .../core/src/evaluation/providers/types.ts | 18 ++ packages/core/src/evaluation/trace.ts | 130 ++++++++ .../test/evaluation/execution-metrics.test.ts | 248 ++++++++++++++ .../tool-trajectory-evaluator.test.ts | 304 ++++++++++++++++++ 13 files changed, 848 insertions(+), 50 deletions(-) create mode 100644 packages/core/test/evaluation/execution-metrics.test.ts diff --git a/apps/cli/package.json b/apps/cli/package.json index adaf747c..402e7bc5 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/openspec/changes/add-execution-metrics/proposal.md b/openspec/changes/add-execution-metrics/proposal.md index 669d0385..1cb8b917 100644 --- a/openspec/changes/add-execution-metrics/proposal.md +++ b/openspec/changes/add-execution-metrics/proposal.md @@ -12,8 +12,8 @@ This is infrastructure/data collection - not domain logic. 
Providers optionally ## What Changes -- Extend `TraceSummary` to `ExecutionMetrics` with optional fields for token usage, cost, duration -- Add computed metrics: `explorationRatio`, `tokensPerTool` +- Add optional execution metrics fields to `TraceSummary` (token usage, cost, duration) +- Add helper functions to compute derived metrics (`explorationRatio`, `tokensPerTool`) - Make metrics available to evaluators and output writers - Add example demonstrating metrics-based evaluation diff --git a/openspec/changes/add-execution-metrics/tasks.md b/openspec/changes/add-execution-metrics/tasks.md index aa80072a..d3891971 100644 --- a/openspec/changes/add-execution-metrics/tasks.md +++ b/openspec/changes/add-execution-metrics/tasks.md @@ -1,27 +1,27 @@ ## 1. Data Model -- [ ] 1.1 Extend `TraceSummary` type with optional `tokenUsage` field -- [ ] 1.2 Add optional `costUsd` field to trace -- [ ] 1.3 Add optional `durationMs` field to trace -- [ ] 1.4 Add optional `toolDurations` map (tool name -> duration array) +- [x] 1.1 Extend `TraceSummary` type with optional `tokenUsage` field +- [x] 1.2 Add optional `costUsd` field to trace +- [x] 1.3 Add optional `durationMs` field to trace +- [x] 1.4 Add optional `toolDurations` map (tool name -> duration array) ## 2. Computed Metrics -- [ ] 2.1 Implement `explorationRatio` computation (configurable exploration tool list) -- [ ] 2.2 Implement `tokensPerTool` computation -- [ ] 2.3 Add `avgToolDurationMs` computation +- [x] 2.1 Implement `explorationRatio` computation (configurable exploration tool list) +- [x] 2.2 Implement `tokensPerTool` computation +- [x] 2.3 Add `avgToolDurationMs` computation ## 3. Provider Integration -- [ ] 3.1 Define provider metric reporting interface +- [x] 3.1 Define provider metric reporting interface - [ ] 3.2 Update CLI provider to report duration metrics - [ ] 3.3 Document metric reporting for custom providers ## 4. Output & Evaluation -- [ ] 4.1 Include metrics in evaluation results JSON -- [ ] 4.2 Make metrics available to code judges via stdin -- [ ] 4.3 Add metrics to JSONL output format +- [x] 4.1 Include metrics in evaluation results JSON +- [x] 4.2 Make metrics available to code judges via stdin +- [x] 4.3 Add metrics to JSONL output format ## 5. Examples & Documentation @@ -30,5 +30,5 @@ ## 6. Testing -- [ ] 6.1 Unit tests for metric computation +- [x] 6.1 Unit tests for metric computation - [ ] 6.2 Integration test with metric-aware code judge diff --git a/openspec/changes/add-trajectory-argument-matching/tasks.md b/openspec/changes/add-trajectory-argument-matching/tasks.md index 5669ba16..12b7f20c 100644 --- a/openspec/changes/add-trajectory-argument-matching/tasks.md +++ b/openspec/changes/add-trajectory-argument-matching/tasks.md @@ -1,15 +1,15 @@ ## 1. 
Implementation -- [ ] 1.1 Extend `ToolTrajectoryExpectedItem` type to include optional `args` field -- [ ] 1.2 Implement exact argument matching (deep equality) -- [ ] 1.3 Implement `any` mode (skip argument validation) -- [ ] 1.4 Update `evaluateInOrder` to check arguments -- [ ] 1.5 Update `evaluateExact` to check arguments -- [ ] 1.6 Update `extractToolCallsFromMessages` to preserve `ToolCall.input` +- [x] 1.1 Extend `ToolTrajectoryExpectedItem` type to include optional `args` field +- [x] 1.2 Implement exact argument matching (deep equality) +- [x] 1.3 Implement `any` mode (skip argument validation) +- [x] 1.4 Update `evaluateInOrder` to check arguments +- [x] 1.5 Update `evaluateExact` to check arguments +- [x] 1.6 Update `extractToolCallsFromMessages` to preserve `ToolCall.input` ## 2. Schema & Validation -- [ ] 2.1 Update YAML schema for `expected[].args` field +- [x] 2.1 Update YAML schema for `expected[].args` field (updated evaluator-parser.ts) ## 3. Examples & Documentation @@ -17,6 +17,6 @@ ## 4. Testing -- [ ] 4.1 Unit tests for exact argument matching -- [ ] 4.2 Unit tests for `any` mode -- [ ] 4.3 Integration tests with mock agent +- [x] 4.1 Unit tests for exact argument matching +- [x] 4.2 Unit tests for `any` mode +- [x] 4.3 Integration tests with mock agent (covered by unit tests with mock context) diff --git a/packages/core/package.json b/packages/core/package.json index 765f7e0a..5328d44b 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", diff --git a/packages/core/src/evaluation/evaluators.ts b/packages/core/src/evaluation/evaluators.ts index ad6ccebd..1c233e91 100644 --- a/packages/core/src/evaluation/evaluators.ts +++ b/packages/core/src/evaluation/evaluators.ts @@ -11,7 +11,11 @@ import { extractLastAssistantContent, } from './providers/types.js'; import { TEMPLATE_VARIABLES } from './template-variables.js'; -import type { ToolTrajectoryEvaluatorConfig, TraceSummary } from './trace.js'; +import type { + ToolTrajectoryEvaluatorConfig, + ToolTrajectoryExpectedItem, + TraceSummary, +} from './trace.js'; import type { EvalCase, EvaluationVerdict, @@ -584,6 +588,58 @@ function substituteVariables(template: string, variables: Record // Tool Trajectory Evaluator +/** Extracted tool call with optional arguments */ +interface ExtractedToolCall { + readonly name: string; + readonly args?: Record; +} + +/** + * Deep equality check for two values. + * Handles primitives, arrays, and plain objects. + */ +function deepEqual(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (a === null || b === null) return a === b; + if (typeof a !== typeof b) return false; + if (typeof a !== 'object') return a === b; + + if (Array.isArray(a) !== Array.isArray(b)) return false; + if (Array.isArray(a) && Array.isArray(b)) { + if (a.length !== b.length) return false; + return a.every((val, i) => deepEqual(val, b[i])); + } + + const aObj = a as Record; + const bObj = b as Record; + const aKeys = Object.keys(aObj); + const bKeys = Object.keys(bObj); + if (aKeys.length !== bKeys.length) return false; + return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key])); +} + +/** + * Check if actual args match expected args. 
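+ * - undefined → no constraint; only the tool name is validated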
+ * - 'any' → always matches + * - object → partial match (only specified keys, deep equality) + */ +function argsMatch( + expected: ToolTrajectoryExpectedItem['args'], + actual: Record | undefined, +): boolean { + // No args constraint means match + if (expected === undefined) return true; + // 'any' means skip validation + if (expected === 'any') return true; + // Partial match: check only specified keys + if (actual === undefined) return false; + for (const key of Object.keys(expected)) { + if (!Object.hasOwn(actual, key)) return false; + if (!deepEqual(expected[key], actual[key])) return false; + } + return true; +} + export interface ToolTrajectoryEvaluatorOptions { readonly config: ToolTrajectoryEvaluatorConfig; } @@ -650,16 +706,19 @@ export class ToolTrajectoryEvaluator implements Evaluator { */ private extractToolCallsFromMessages( messages: readonly OutputMessage[] | undefined, - ): readonly { name: string }[] { + ): readonly ExtractedToolCall[] { if (!messages) { return []; } - const toolCalls: { name: string }[] = []; + const toolCalls: ExtractedToolCall[] = []; for (const message of messages) { if (message.toolCalls) { for (const call of message.toolCalls) { - toolCalls.push({ name: call.tool }); + toolCalls.push({ + name: call.tool, + args: call.input as Record | undefined, + }); } } } @@ -669,7 +728,7 @@ export class ToolTrajectoryEvaluator implements Evaluator { /** * Build a summary from extracted tool calls. */ - private buildSummary(toolCalls: readonly { name: string }[]): TraceSummary { + private buildSummary(toolCalls: readonly ExtractedToolCall[]): TraceSummary { const toolCallsByName: Record = {}; for (const call of toolCalls) { toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1; @@ -721,7 +780,7 @@ export class ToolTrajectoryEvaluator implements Evaluator { }; } - private evaluateInOrder(toolCalls: readonly { name: string }[]): EvaluationScore { + private evaluateInOrder(toolCalls: readonly ExtractedToolCall[]): EvaluationScore { const expected = this.config.expected ?? 
[]; if (expected.length === 0) { @@ -739,20 +798,33 @@ export class ToolTrajectoryEvaluator implements Evaluator { let actualIndex = 0; for (let i = 0; i < expected.length; i++) { - const expectedTool = expected[i].tool; + const expectedItem = expected[i]; + const expectedTool = expectedItem.tool; let found = false; + let argsMismatch = false; while (actualIndex < toolCalls.length) { - if (toolCalls[actualIndex].name === expectedTool) { - hits.push(`Found ${expectedTool} at position ${actualIndex}`); + const actualCall = toolCalls[actualIndex]; + if (actualCall.name === expectedTool) { + // Tool name matches, check args if specified + if (argsMatch(expectedItem.args, actualCall.args)) { + hits.push(`Found ${expectedTool} at position ${actualIndex}`); + actualIndex++; + found = true; + break; + } + // Tool name matches but args don't - this is a miss for this expected item + misses.push( + `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`, + ); actualIndex++; - found = true; + argsMismatch = true; break; } actualIndex++; } - if (!found) { + if (!found && !argsMismatch) { misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`); } } @@ -768,7 +840,7 @@ export class ToolTrajectoryEvaluator implements Evaluator { }; } - private evaluateExact(toolCalls: readonly { name: string }[]): EvaluationScore { + private evaluateExact(toolCalls: readonly ExtractedToolCall[]): EvaluationScore { const expected = this.config.expected ?? []; if (expected.length === 0) { @@ -790,10 +862,17 @@ export class ToolTrajectoryEvaluator implements Evaluator { const checkLength = Math.min(expected.length, toolCalls.length); for (let i = 0; i < checkLength; i++) { - const expectedTool = expected[i].tool; - const actualTool = toolCalls[i].name; + const expectedItem = expected[i]; + const expectedTool = expectedItem.tool; + const actualCall = toolCalls[i]; + const actualTool = actualCall.name; if (actualTool === expectedTool) { - hits.push(`Position ${i}: ${expectedTool} ✓`); + // Tool name matches, check args if specified + if (argsMatch(expectedItem.args, actualCall.args)) { + hits.push(`Position ${i}: ${expectedTool}`); + } else { + misses.push(`Position ${i}: ${expectedTool} args mismatch`); + } } else { misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`); } diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 7381ac87..a3ad708c 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -258,7 +258,14 @@ export async function parseEvaluators( expected = []; for (const item of rawExpected) { if (isJsonObject(item) && typeof item.tool === 'string') { - expected.push({ tool: item.tool }); + // Parse optional args field: 'any' or Record + let args: ToolTrajectoryExpectedItem['args']; + if (item.args === 'any') { + args = 'any'; + } else if (isJsonObject(item.args)) { + args = item.args as Record; + } + expected.push({ tool: item.tool, ...(args !== undefined ? 
{ args } : {}) }); } } } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index e1728ee9..1ee66fdb 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -28,6 +28,7 @@ import { type ToolTrajectoryEvaluatorConfig, type TraceSummary, computeTraceSummary, + mergeExecutionMetrics, } from './trace.js'; import type { EvalCase, @@ -411,7 +412,15 @@ async function runBatchEvaluation(options: { // Extract outputMessages from batch response const outputMessages = providerResponse.outputMessages; - const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : undefined; + const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : undefined; + // Merge execution metrics from provider response + const traceSummary = baseSummary + ? mergeExecutionMetrics(baseSummary, { + tokenUsage: providerResponse.tokenUsage, + costUsd: providerResponse.costUsd, + durationMs: providerResponse.durationMs, + }) + : undefined; // Extract candidate from last assistant message in output_messages const candidate = extractLastAssistantContent(outputMessages); @@ -550,7 +559,15 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise; } +/** + * Token usage metrics reported by provider. + */ +export interface ProviderTokenUsage { + /** Input/prompt tokens consumed */ + readonly input: number; + /** Output/completion tokens generated */ + readonly output: number; + /** Cached tokens (optional, provider-specific) */ + readonly cached?: number; +} + export interface ProviderResponse { readonly raw?: unknown; readonly usage?: JsonObject; /** Output messages from agent execution (primary source for tool trajectory) */ readonly outputMessages?: readonly OutputMessage[]; + /** Token usage metrics (optional) */ + readonly tokenUsage?: ProviderTokenUsage; + /** Total cost in USD (optional) */ + readonly costUsd?: number; + /** Execution duration in milliseconds (optional) */ + readonly durationMs?: number; } /** diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 5a78378e..bcf61a1d 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -3,6 +3,18 @@ * Provides a normalized, provider-agnostic model for tool-call trajectories. */ +/** + * Token usage metrics from provider execution. + */ +export interface TokenUsage { + /** Input/prompt tokens consumed */ + readonly input: number; + /** Output/completion tokens generated */ + readonly output: number; + /** Cached tokens (optional, provider-specific) */ + readonly cached?: number; +} + /** * Compact summary of a trace for lightweight persistence. * Included in results by default to avoid payload bloat. 
@@ -16,6 +28,14 @@ export interface TraceSummary { readonly toolCallsByName: Readonly>; /** Number of error events */ readonly errorCount: number; + /** Token usage metrics (optional, from provider) */ + readonly tokenUsage?: TokenUsage; + /** Total cost in USD (optional, from provider) */ + readonly costUsd?: number; + /** Total execution duration in milliseconds (optional) */ + readonly durationMs?: number; + /** Per-tool duration arrays in milliseconds (optional) */ + readonly toolDurations?: Readonly>; } /** @@ -39,6 +59,8 @@ export interface ToolTrajectoryEvaluatorConfig { */ export interface ToolTrajectoryExpectedItem { readonly tool: string; + /** Optional argument matching: 'any' skips validation, object performs partial deep equality */ + readonly args?: 'any' | Record; } /** @@ -77,3 +99,111 @@ export function computeTraceSummary(messages: readonly OutputMessageLike[]): Tra errorCount: 0, }; } + +/** + * Default tool names considered as exploration/read-only operations. + * Can be overridden per-evaluation via config. + */ +export const DEFAULT_EXPLORATION_TOOLS = [ + 'read', + 'grep', + 'glob', + 'search', + 'list', + 'Read', + 'Grep', + 'Glob', + 'WebSearch', + 'WebFetch', +] as const; + +/** + * Ratio of exploration tool calls to total tool calls. + * Returns undefined if there are no tool calls. + * + * @param summary - Trace summary with tool call counts + * @param explorationTools - Tool names considered exploration (defaults to DEFAULT_EXPLORATION_TOOLS) + * @returns Ratio between 0 and 1, or undefined if no tool calls + */ +export function explorationRatio( + summary: TraceSummary, + explorationTools: readonly string[] = DEFAULT_EXPLORATION_TOOLS, +): number | undefined { + if (summary.eventCount === 0) return undefined; + + const explorationCalls = explorationTools.reduce( + (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0), + 0, + ); + + return explorationCalls / summary.eventCount; +} + +/** + * Average tokens consumed per tool call. + * Returns undefined if tokenUsage is not available or no tool calls. + * + * @param summary - Trace summary with optional token usage + * @returns Average tokens per tool call, or undefined + */ +export function tokensPerTool(summary: TraceSummary): number | undefined { + if (!summary.tokenUsage || summary.eventCount === 0) return undefined; + + const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output; + return totalTokens / summary.eventCount; +} + +/** + * Average tool duration across all tool calls. + * Returns undefined if toolDurations is not available or empty. + * + * @param summary - Trace summary with optional tool durations + * @returns Average duration in milliseconds, or undefined + */ +export function avgToolDurationMs(summary: TraceSummary): number | undefined { + if (!summary.toolDurations) return undefined; + + let totalDuration = 0; + let totalCalls = 0; + + for (const durations of Object.values(summary.toolDurations)) { + for (const duration of durations) { + totalDuration += duration; + totalCalls++; + } + } + + if (totalCalls === 0) return undefined; + return totalDuration / totalCalls; +} + +/** + * Execution metrics from provider response. + */ +export interface ExecutionMetrics { + readonly tokenUsage?: TokenUsage; + readonly costUsd?: number; + readonly durationMs?: number; +} + +/** + * Merge execution metrics from provider response into a trace summary. + * Returns a new TraceSummary with metrics fields populated. 
+ * + * @param summary - Base trace summary from computeTraceSummary + * @param metrics - Optional execution metrics from provider + * @returns TraceSummary with merged metrics + */ +export function mergeExecutionMetrics( + summary: TraceSummary, + metrics?: ExecutionMetrics, +): TraceSummary { + if (!metrics) return summary; + + return { + ...summary, + tokenUsage: metrics.tokenUsage, + costUsd: metrics.costUsd, + durationMs: metrics.durationMs, + }; +} diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts new file mode 100644 index 00000000..666d2de1 --- /dev/null +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -0,0 +1,248 @@ +import { describe, expect, it } from 'bun:test'; + +import { + type TraceSummary, + avgToolDurationMs, + explorationRatio, + tokensPerTool, + mergeExecutionMetrics, +} from '../../src/evaluation/trace.js'; + +describe('Execution Metrics', () => { + describe('explorationRatio', () => { + it('returns undefined when there are no tool calls', () => { + const summary: TraceSummary = { + eventCount: 0, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + }; + + expect(explorationRatio(summary)).toBeUndefined(); + }); + + it('returns 1.0 when all calls are exploration tools', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read', 'Grep', 'Glob'], + toolCallsByName: { Read: 2, Grep: 2, Glob: 1 }, + errorCount: 0, + }; + + expect(explorationRatio(summary)).toBe(1.0); + }); + + it('returns 0.0 when no calls are exploration tools', () => { + const summary: TraceSummary = { + eventCount: 3, + toolNames: ['Edit', 'Write', 'Bash'], + toolCallsByName: { Edit: 1, Write: 1, Bash: 1 }, + errorCount: 0, + }; + + expect(explorationRatio(summary)).toBe(0.0); + }); + + it('returns correct ratio for mixed tool usage', () => { + const summary: TraceSummary = { + eventCount: 10, + toolNames: ['Edit', 'Grep', 'Read', 'Write'], + toolCallsByName: { Read: 4, Grep: 2, Edit: 3, Write: 1 }, + errorCount: 0, + }; + + // 6 exploration calls (Read: 4, Grep: 2) out of 10 + expect(explorationRatio(summary)).toBe(0.6); + }); + + it('accepts custom exploration tools list', () => { + const summary: TraceSummary = { + eventCount: 6, + toolNames: ['CustomTool', 'Edit', 'OtherTool'], + toolCallsByName: { CustomTool: 3, Edit: 2, OtherTool: 1 }, + errorCount: 0, + }; + + // 4 calls (CustomTool: 3, OtherTool: 1) are exploration with custom list + expect(explorationRatio(summary, ['CustomTool', 'OtherTool'])).toBeCloseTo(4 / 6); + }); + }); + + describe('tokensPerTool', () => { + it('returns undefined when tokenUsage is not available', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read'], + toolCallsByName: { Read: 5 }, + errorCount: 0, + }; + + expect(tokensPerTool(summary)).toBeUndefined(); + }); + + it('returns undefined when there are no tool calls', () => { + const summary: TraceSummary = { + eventCount: 0, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + tokenUsage: { input: 1000, output: 500 }, + }; + + expect(tokensPerTool(summary)).toBeUndefined(); + }); + + it('computes correct tokens per tool', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 3, Edit: 2 }, + errorCount: 0, + tokenUsage: { input: 1000, output: 500 }, + }; + + // Total tokens: 1500, divided by 5 tool calls = 300 tokens per tool + expect(tokensPerTool(summary)).toBe(300); + }); + + it('handles cached tokens in 
total calculation', () => { + const summary: TraceSummary = { + eventCount: 4, + toolNames: ['Read'], + toolCallsByName: { Read: 4 }, + errorCount: 0, + tokenUsage: { input: 800, output: 400, cached: 200 }, + }; + + // Total tokens: 800 + 400 = 1200 (cached not added to total) + expect(tokensPerTool(summary)).toBe(300); + }); + }); + + describe('avgToolDurationMs', () => { + it('returns undefined when toolDurations is not available', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read'], + toolCallsByName: { Read: 5 }, + errorCount: 0, + }; + + expect(avgToolDurationMs(summary)).toBeUndefined(); + }); + + it('returns undefined when toolDurations is empty', () => { + const summary: TraceSummary = { + eventCount: 0, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + toolDurations: {}, + }; + + expect(avgToolDurationMs(summary)).toBeUndefined(); + }); + + it('computes correct average duration', () => { + const summary: TraceSummary = { + eventCount: 4, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 3, Edit: 1 }, + errorCount: 0, + toolDurations: { + Read: [100, 150, 200], // avg: 150 + Edit: [50], // avg: 50 + }, + }; + + // Total duration: 100 + 150 + 200 + 50 = 500ms + // Total calls: 4 + // Average: 125ms + expect(avgToolDurationMs(summary)).toBe(125); + }); + + it('handles single tool with multiple calls', () => { + const summary: TraceSummary = { + eventCount: 3, + toolNames: ['Grep'], + toolCallsByName: { Grep: 3 }, + errorCount: 0, + toolDurations: { + Grep: [100, 200, 300], + }, + }; + + expect(avgToolDurationMs(summary)).toBe(200); + }); + }); + + describe('mergeExecutionMetrics', () => { + const baseSummary: TraceSummary = { + eventCount: 5, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 3, Edit: 2 }, + errorCount: 0, + }; + + it('returns the same summary when no metrics provided', () => { + const result = mergeExecutionMetrics(baseSummary); + + expect(result).toBe(baseSummary); + }); + + it('returns the same summary when metrics is undefined', () => { + const result = mergeExecutionMetrics(baseSummary, undefined); + + expect(result).toBe(baseSummary); + }); + + it('merges tokenUsage into summary', () => { + const result = mergeExecutionMetrics(baseSummary, { + tokenUsage: { input: 1000, output: 500 }, + }); + + expect(result.eventCount).toBe(5); + expect(result.toolNames).toEqual(['Read', 'Edit']); + expect(result.tokenUsage).toEqual({ input: 1000, output: 500 }); + expect(result.costUsd).toBeUndefined(); + expect(result.durationMs).toBeUndefined(); + }); + + it('merges all metrics into summary', () => { + const result = mergeExecutionMetrics(baseSummary, { + tokenUsage: { input: 1000, output: 500, cached: 100 }, + costUsd: 0.05, + durationMs: 12000, + }); + + expect(result.eventCount).toBe(5); + expect(result.toolNames).toEqual(['Read', 'Edit']); + expect(result.tokenUsage).toEqual({ input: 1000, output: 500, cached: 100 }); + expect(result.costUsd).toBe(0.05); + expect(result.durationMs).toBe(12000); + }); + + it('preserves existing summary fields', () => { + const summaryWithError: TraceSummary = { + ...baseSummary, + errorCount: 2, + }; + + const result = mergeExecutionMetrics(summaryWithError, { + costUsd: 0.1, + }); + + expect(result.errorCount).toBe(2); + expect(result.costUsd).toBe(0.1); + }); + + it('does not mutate the original summary', () => { + const result = mergeExecutionMetrics(baseSummary, { + tokenUsage: { input: 1000, output: 500 }, + }); + + expect(baseSummary.tokenUsage).toBeUndefined(); + 
expect(result.tokenUsage).toEqual({ input: 1000, output: 500 }); + }); + }); +}); diff --git a/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts b/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts index 39d2adb9..624a92ea 100644 --- a/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts +++ b/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts @@ -381,4 +381,308 @@ describe('ToolTrajectoryEvaluator', () => { expect(result.misses.some((m) => m.includes('expected analyze, got nothing'))).toBe(true); }); }); + + describe('argument matching', () => { + describe('exact mode with args', () => { + it('passes when args match exactly', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'search', input: { query: 'test', limit: 10 } }, + { tool: 'analyze', input: { format: 'json' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [ + { tool: 'search', args: { query: 'test', limit: 10 } }, + { tool: 'analyze', args: { format: 'json' } }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails when args do not match', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { query: 'wrong', limit: 10 } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { query: 'test', limit: 10 } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + expect(result.misses.some((m) => m.includes('args mismatch'))).toBe(true); + }); + + it('skips arg validation with args: any', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { query: 'anything', limit: 999 } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: 'any' }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('performs partial matching - only validates specified keys', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { query: 'test', limit: 10, extra: 'ignored' } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { query: 'test' } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('handles nested objects with deep equality', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { + tool: 'search', + input: { options: { nested: { value: 123 } }, other: 'field' }, + }, + ], + }, + ]; + + const config: 
ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { options: { nested: { value: 123 } } } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails on nested object mismatch', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { options: { nested: { value: 999 } } } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { options: { nested: { value: 123 } } } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + }); + + it('matches without args field (backward compatibility)', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { any: 'args' } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search' }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + }); + + describe('in_order mode with args', () => { + it('passes when args match in sequence', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'init', input: {} }, + { tool: 'search', input: { query: 'test' } }, + { tool: 'analyze', input: { format: 'json' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'in_order', + expected: [ + { tool: 'search', args: { query: 'test' } }, + { tool: 'analyze', args: { format: 'json' } }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails when tool found but args mismatch', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'search', input: { query: 'wrong' } }, + { tool: 'analyze', input: { format: 'json' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'in_order', + expected: [ + { tool: 'search', args: { query: 'test' } }, + { tool: 'analyze', args: { format: 'json' } }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0.5); + expect(result.verdict).toBe('fail'); + expect(result.misses.some((m) => m.includes('args mismatch'))).toBe(true); + }); + + it('uses args: any to skip validation in sequence', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'search', input: { query: 'anything' } }, + { tool: 'analyze', input: { format: 'xml' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'in_order', + expected: [ + { 
tool: 'search', args: 'any' }, + { tool: 'analyze', args: 'any' }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + }); + + describe('array argument matching', () => { + it('matches arrays with deep equality', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { tags: ['a', 'b', 'c'] } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { tags: ['a', 'b', 'c'] } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails on array order mismatch', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { tags: ['c', 'b', 'a'] } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { tags: ['a', 'b', 'c'] } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + }); + }); + }); }); From 1048559b5a58038341655b85ef06ae7be55ff074 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 13:19:23 +0000 Subject: [PATCH 3/5] refactor(core): rename evaluation properties to camelCase Update evaluation result types and code evaluator payload to use consistent camelCase naming convention throughout the codebase. 
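
Downstream consumers that read the JSONL output should switch to the new field
names. A minimal sketch of such a consumer (the wrapper script and the
results.jsonl path are illustrative only; just the field names come from this
change):

    // read-scores.ts - sketch of reading the renamed JSONL fields with Bun/Node.
    import { readFileSync } from 'node:fs';

    interface ResultLine {
      readonly evalId: string;           // was: eval_id
      readonly score: number;
      readonly candidateAnswer?: string; // was: candidate_answer
    }

    const lines = readFileSync('results.jsonl', 'utf8')
      .split('\n')
      .filter((line) => line.trim().length > 0);

    for (const line of lines) {
      const result = JSON.parse(line) as ResultLine;
      console.log(`${result.evalId}: ${result.score.toFixed(2)}`);
    }
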
--- apps/cli/src/commands/compare/index.ts | 22 +++--- apps/cli/src/commands/eval/statistics.ts | 6 +- .../cli/test/commands/compare/compare.test.ts | 48 ++++++------- apps/cli/test/eval.integration.test.ts | 4 +- apps/cli/test/fixtures/mock-run-evaluation.ts | 26 +++---- .../scripts/check-batch-cli-output.ts | 14 ++-- .../evals/validate_risk_output.ts | 6 +- packages/core/src/evaluation/evaluators.ts | 18 ++--- packages/core/src/evaluation/orchestrator.ts | 42 ++++++------ packages/core/src/evaluation/types.ts | 24 +++---- .../core/test/evaluation/evaluators.test.ts | 8 +-- .../test/evaluation/execution-metrics.test.ts | 2 +- .../core/test/evaluation/orchestrator.test.ts | 68 +++++++++---------- 13 files changed, 144 insertions(+), 144 deletions(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index f3cad4c7..3fda72c8 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -2,12 +2,12 @@ import { readFileSync } from 'node:fs'; import { command, number, option, optional, positional, string } from 'cmd-ts'; interface EvalResult { - eval_id: string; + evalId: string; score: number; } interface MatchedResult { - eval_id: string; + evalId: string; score1: number; score2: number; delta: number; @@ -35,14 +35,14 @@ export function loadJsonlResults(filePath: string): EvalResult[] { .filter((line) => line.trim()); return lines.map((line) => { - const record = JSON.parse(line) as { eval_id?: string; score?: number }; - if (typeof record.eval_id !== 'string') { - throw new Error(`Missing eval_id in result: ${line}`); + const record = JSON.parse(line) as { evalId?: string; score?: number }; + if (typeof record.evalId !== 'string') { + throw new Error(`Missing evalId in result: ${line}`); } if (typeof record.score !== 'number') { throw new Error(`Missing or invalid score in result: ${line}`); } - return { eval_id: record.eval_id, score: record.score }; + return { evalId: record.evalId, score: record.score }; }); } @@ -57,8 +57,8 @@ export function compareResults( results2: EvalResult[], threshold: number, ): ComparisonOutput { - const map1 = new Map(results1.map((r) => [r.eval_id, r.score])); - const map2 = new Map(results2.map((r) => [r.eval_id, r.score])); + const map1 = new Map(results1.map((r) => [r.evalId, r.score])); + const map2 = new Map(results2.map((r) => [r.evalId, r.score])); const matched: MatchedResult[] = []; const matchedIds = new Set(); @@ -68,7 +68,7 @@ export function compareResults( if (score2 !== undefined) { const delta = score2 - score1; matched.push({ - eval_id: evalId, + evalId: evalId, score1, score2, delta, @@ -78,8 +78,8 @@ export function compareResults( } } - const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.eval_id)).length; - const unmatchedFile2 = results2.filter((r) => !map1.has(r.eval_id)).length; + const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.evalId)).length; + const unmatchedFile2 = results2.filter((r) => !map1.has(r.evalId)).length; const wins = matched.filter((m) => m.outcome === 'win').length; const losses = matched.filter((m) => m.outcome === 'loss').length; diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 0bd37a6f..e4d08356 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -86,7 +86,7 @@ export function calculateEvaluationSummary( // Track errors const errors = results .filter((result) => result.error !== undefined) - .map((result) 
=> ({ evalId: result.eval_id, error: result.error as string })); + .map((result) => ({ evalId: result.evalId, error: result.error as string })); const errorCount = errors.length; if (total === 0) { @@ -180,12 +180,12 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push('\nTop performing eval cases:'); summary.topResults.forEach((result, index) => { - lines.push(` ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`); + lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`); }); lines.push('\nLowest performing eval cases:'); summary.bottomResults.forEach((result, index) => { - lines.push(` ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`); + lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`); }); return lines.join('\n'); diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts index 83ad1a58..d64d5d1d 100644 --- a/apps/cli/test/commands/compare/compare.test.ts +++ b/apps/cli/test/commands/compare/compare.test.ts @@ -26,14 +26,14 @@ describe('compare command', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync( filePath, - '{"eval_id": "case-1", "score": 0.8}\n{"eval_id": "case-2", "score": 0.9}\n', + '{"evalId": "case-1", "score": 0.8}\n{"evalId": "case-2", "score": 0.9}\n', ); const results = loadJsonlResults(filePath); expect(results).toEqual([ - { eval_id: 'case-1', score: 0.8 }, - { eval_id: 'case-2', score: 0.9 }, + { evalId: 'case-1', score: 0.8 }, + { evalId: 'case-2', score: 0.9 }, ]); }); @@ -41,7 +41,7 @@ describe('compare command', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync( filePath, - '{"eval_id": "case-1", "score": 0.8}\n\n{"eval_id": "case-2", "score": 0.9}\n', + '{"evalId": "case-1", "score": 0.8}\n\n{"evalId": "case-2", "score": 0.9}\n', ); const results = loadJsonlResults(filePath); @@ -49,16 +49,16 @@ describe('compare command', () => { expect(results).toHaveLength(2); }); - it('should throw error for missing eval_id', () => { + it('should throw error for missing evalId', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync(filePath, '{"score": 0.8}\n'); - expect(() => loadJsonlResults(filePath)).toThrow('Missing eval_id'); + expect(() => loadJsonlResults(filePath)).toThrow('Missing evalId'); }); it('should throw error for missing score', () => { const filePath = path.join(tempDir, 'results.jsonl'); - writeFileSync(filePath, '{"eval_id": "case-1"}\n'); + writeFileSync(filePath, '{"evalId": "case-1"}\n'); expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score'); }); @@ -93,27 +93,27 @@ describe('compare command', () => { }); describe('compareResults', () => { - it('should match results by eval_id and compute deltas', () => { + it('should match results by evalId and compute deltas', () => { // Use values that avoid floating point precision issues const results1 = [ - { eval_id: 'case-1', score: 0.5 }, - { eval_id: 'case-2', score: 0.75 }, + { evalId: 'case-1', score: 0.5 }, + { evalId: 'case-2', score: 0.75 }, ]; const results2 = [ - { eval_id: 'case-1', score: 0.7 }, // +0.2 win - { eval_id: 'case-2', score: 0.5 }, // -0.25 loss + { evalId: 'case-1', score: 0.7 }, // +0.2 win + { evalId: 'case-2', score: 0.5 }, // -0.25 loss ]; const comparison = compareResults(results1, results2, 0.1); expect(comparison.matched).toHaveLength(2); - expect(comparison.matched[0].eval_id).toBe('case-1'); + 
expect(comparison.matched[0].evalId).toBe('case-1'); expect(comparison.matched[0].score1).toBe(0.5); expect(comparison.matched[0].score2).toBe(0.7); expect(comparison.matched[0].delta).toBeCloseTo(0.2, 10); expect(comparison.matched[0].outcome).toBe('win'); - expect(comparison.matched[1].eval_id).toBe('case-2'); + expect(comparison.matched[1].evalId).toBe('case-2'); expect(comparison.matched[1].score1).toBe(0.75); expect(comparison.matched[1].score2).toBe(0.5); expect(comparison.matched[1].delta).toBeCloseTo(-0.25, 10); @@ -122,12 +122,12 @@ describe('compare command', () => { it('should count unmatched results', () => { const results1 = [ - { eval_id: 'case-1', score: 0.8 }, - { eval_id: 'only-in-1', score: 0.5 }, + { evalId: 'case-1', score: 0.8 }, + { evalId: 'only-in-1', score: 0.5 }, ]; const results2 = [ - { eval_id: 'case-1', score: 0.9 }, - { eval_id: 'only-in-2', score: 0.6 }, + { evalId: 'case-1', score: 0.9 }, + { evalId: 'only-in-2', score: 0.6 }, ]; const comparison = compareResults(results1, results2, 0.1); @@ -138,14 +138,14 @@ describe('compare command', () => { it('should compute summary statistics', () => { // Use values that produce clear deltas above/below threshold const results1 = [ - { eval_id: 'case-1', score: 0.5 }, - { eval_id: 'case-2', score: 0.75 }, - { eval_id: 'case-3', score: 0.6 }, + { evalId: 'case-1', score: 0.5 }, + { evalId: 'case-2', score: 0.75 }, + { evalId: 'case-3', score: 0.6 }, ]; const results2 = [ - { eval_id: 'case-1', score: 0.7 }, // win (+0.2) - { eval_id: 'case-2', score: 0.5 }, // loss (-0.25) - { eval_id: 'case-3', score: 0.65 }, // tie (+0.05) + { evalId: 'case-1', score: 0.7 }, // win (+0.2) + { evalId: 'case-2', score: 0.5 }, // loss (-0.25) + { evalId: 'case-3', score: 0.65 }, // tie (+0.05) ]; const comparison = compareResults(results1, results2, 0.1); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 8d21a069..b291e115 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -167,8 +167,8 @@ describe('agentv eval CLI', () => { const results = await readJsonLines(outputPath); expect(results).toHaveLength(2); const [firstResult, secondResult] = results as Array>; - expect(firstResult.eval_id).toBe('case-alpha'); - expect(secondResult.eval_id).toBe('case-beta'); + expect(firstResult.evalId).toBe('case-alpha'); + expect(secondResult.evalId).toBe('case-beta'); const diagnostics = await readDiagnostics(fixture); expect(diagnostics).toMatchObject({ diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 0e9c0321..1ad4fc7a 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -22,44 +22,44 @@ interface RunEvaluationOptionsLike { } interface EvaluationResultLike { - readonly eval_id: string; + readonly evalId: string; readonly score: number; readonly hits: readonly string[]; readonly misses: readonly string[]; - readonly candidate_answer: string; - readonly expected_aspect_count: number; + readonly candidateAnswer: string; + readonly expectedAspectCount: number; readonly target: string; readonly timestamp: string; readonly reasoning?: string; - readonly raw_aspects?: readonly string[]; + readonly rawAspects?: readonly string[]; } function buildResults(targetName: string): EvaluationResultLike[] { const baseTime = new Date('2024-01-01T00:00:00.000Z'); return [ { - eval_id: 'case-alpha', + evalId: 'case-alpha', score: 0.6, hits: 
['alpha'], misses: [], - candidate_answer: 'Alpha answer', - expected_aspect_count: 1, + candidateAnswer: 'Alpha answer', + expectedAspectCount: 1, target: targetName, timestamp: baseTime.toISOString(), reasoning: 'Alpha reasoning', - raw_aspects: ['alpha'], + rawAspects: ['alpha'], }, { - eval_id: 'case-beta', + evalId: 'case-beta', score: 0.9, hits: ['beta', 'gamma'], misses: ['delta'], - candidate_answer: 'Beta answer', - expected_aspect_count: 3, + candidateAnswer: 'Beta answer', + expectedAspectCount: 3, target: targetName, timestamp: new Date(baseTime.getTime() + 60_000).toISOString(), reasoning: 'Beta reasoning', - raw_aspects: ['beta', 'gamma', 'delta'], + rawAspects: ['beta', 'gamma', 'delta'], }, ]; } @@ -109,7 +109,7 @@ export async function runEvaluation( await maybeWriteDiagnostics(options, results); await maybeWritePromptDump( options.promptDumpDir, - results.map((result) => result.eval_id), + results.map((result) => result.evalId), ); for (const result of results) { diff --git a/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts b/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts index 3087b118..93b7e758 100644 --- a/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts +++ b/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts @@ -5,9 +5,9 @@ function isObject(value: unknown): value is Record { } type EvalInput = { - readonly input_messages?: unknown; - readonly expected_messages?: unknown; - readonly candidate_answer?: unknown; + readonly inputMessages?: unknown; + readonly expectedMessages?: unknown; + readonly candidateAnswer?: unknown; }; function findExpectedDecisionFromExpectedMessages(expectedMessages: unknown): string | undefined { @@ -53,9 +53,9 @@ function main(): void { const input = JSON.parse(stdin) as EvalInput; const expectedDecision = - findExpectedDecisionFromExpectedMessages(input.expected_messages) ?? - findExpectedDecision(input.input_messages); - const candidate = typeof input.candidate_answer === 'string' ? input.candidate_answer : ''; + findExpectedDecisionFromExpectedMessages(input.expectedMessages) ?? + findExpectedDecision(input.inputMessages); + const candidate = typeof input.candidateAnswer === 'string' ? input.candidateAnswer : ''; let candidateObj: unknown; try { @@ -73,7 +73,7 @@ function main(): void { const misses: string[] = []; if (!expectedDecision) { - misses.push('Missing expected decision (expected_messages[].content.decision)'); + misses.push('Missing expected decision (expectedMessages[].content.decision)'); } else { hits.push(`expected.decision present: ${expectedDecision}`); } diff --git a/examples/showcase/export-screening/evals/validate_risk_output.ts b/examples/showcase/export-screening/evals/validate_risk_output.ts index e7ffbd29..c3a4daaf 100644 --- a/examples/showcase/export-screening/evals/validate_risk_output.ts +++ b/examples/showcase/export-screening/evals/validate_risk_output.ts @@ -12,8 +12,8 @@ const VALID_RISK_LEVELS = new Set(['High', 'Medium', 'Low']); const REQUIRED_KEYS = ['riskLevel', 'reasoning']; interface EvalInput { - candidate_answer: string; - expected_messages?: Array<{ + candidateAnswer: string; + expectedMessages?: Array<{ role: string; content: unknown; }>; @@ -170,7 +170,7 @@ async function main(): Promise { process.exit(1); } - const result = validateRiskOutput(evalData.candidate_answer ?? '', evalData.expected_messages); + const result = validateRiskOutput(evalData.candidateAnswer ?? 
'', evalData.expectedMessages); console.log(JSON.stringify(result, null, 2)); } diff --git a/packages/core/src/evaluation/evaluators.ts b/packages/core/src/evaluation/evaluators.ts index 1c233e91..f5f5bf68 100644 --- a/packages/core/src/evaluation/evaluators.ts +++ b/packages/core/src/evaluation/evaluators.ts @@ -452,17 +452,17 @@ export class CodeEvaluator implements Evaluator { const inputPayload = JSON.stringify( { question: context.evalCase.question, - expected_outcome: context.evalCase.expected_outcome, - expected_messages: context.evalCase.expected_messages, - reference_answer: context.evalCase.reference_answer, - candidate_answer: context.candidate, - output_messages: context.outputMessages ?? null, - guideline_files: context.evalCase.guideline_paths, - input_files: context.evalCase.file_paths.filter( + expectedOutcome: context.evalCase.expected_outcome, + expectedMessages: context.evalCase.expected_messages, + referenceAnswer: context.evalCase.reference_answer, + candidateAnswer: context.candidate, + outputMessages: context.outputMessages ?? null, + guidelineFiles: context.evalCase.guideline_paths, + inputFiles: context.evalCase.file_paths.filter( (path) => !context.evalCase.guideline_paths.includes(path), ), - input_messages: context.evalCase.input_messages, - candidate_trace_summary: context.traceSummary ?? null, + inputMessages: context.evalCase.input_messages, + traceSummary: context.traceSummary ?? null, }, null, 2, diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 1ee66fdb..aa095cf2 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -662,21 +662,21 @@ async function evaluateCandidate(options: { return { timestamp: completedAt.toISOString(), - eval_id: evalCase.id, + evalId: evalCase.id, dataset: evalCase.dataset, - conversation_id: evalCase.conversation_id, + conversationId: evalCase.conversation_id, score: score.score, hits: score.hits, misses: score.misses, - candidate_answer: candidate, + candidateAnswer: candidate, target: target.name, reasoning: score.reasoning, - raw_aspects: score.rawAspects, - agent_provider_request: agentProviderRequest, - lm_provider_request: lmProviderRequest, - evaluator_provider_request: evaluatorResults ? undefined : score.evaluatorRawRequest, - evaluator_results: evaluatorResults, - trace_summary: traceSummary, + rawAspects: score.rawAspects, + agentProviderRequest: agentProviderRequest, + lmProviderRequest: lmProviderRequest, + evaluatorProviderRequest: evaluatorResults ? 
undefined : score.evaluatorRawRequest, + evaluatorResults: evaluatorResults, + traceSummary: traceSummary, }; } @@ -816,7 +816,7 @@ async function runEvaluatorList(options: { hits: score.hits, misses: score.misses, reasoning: score.reasoning, - evaluator_provider_request: score.evaluatorRawRequest, + evaluatorProviderRequest: score.evaluatorRawRequest, }); } @@ -848,7 +848,7 @@ async function runEvaluatorList(options: { hits: score.hits, misses: score.misses, reasoning: score.reasoning, - evaluator_provider_request: score.evaluatorRawRequest, + evaluatorProviderRequest: score.evaluatorRawRequest, }); } @@ -910,8 +910,8 @@ async function runEvaluatorList(options: { hits: score.hits, misses: score.misses, reasoning: score.reasoning, - evaluator_provider_request: score.evaluatorRawRequest, - evaluator_results: mapChildResults(score.evaluatorResults), + evaluatorProviderRequest: score.evaluatorRawRequest, + evaluatorResults: mapChildResults(score.evaluatorResults), }); } @@ -1212,17 +1212,17 @@ function buildErrorResult( return { timestamp: timestamp.toISOString(), - eval_id: evalCase.id, + evalId: evalCase.id, dataset: evalCase.dataset, - conversation_id: evalCase.conversation_id, + conversationId: evalCase.conversation_id, score: 0, hits: [], misses: [`Error: ${message}`], - candidate_answer: `Error occurred: ${message}`, + candidateAnswer: `Error occurred: ${message}`, target: targetName, - raw_aspects: [], - agent_provider_request: agentProviderRequest, - lm_provider_request: lmProviderRequest, + rawAspects: [], + agentProviderRequest: agentProviderRequest, + lmProviderRequest: lmProviderRequest, error: message, } satisfies EvaluationResult; } @@ -1282,8 +1282,8 @@ function mapChildResults( hits: child.hits, misses: child.misses, reasoning: child.reasoning, - evaluator_provider_request: child.evaluatorRawRequest, - evaluator_results: mapChildResults(child.evaluatorResults), + evaluatorProviderRequest: child.evaluatorRawRequest, + evaluatorResults: mapChildResults(child.evaluatorResults), })); } diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 96d40107..f686b1d6 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -239,23 +239,23 @@ export interface EvalCase { */ export interface EvaluationResult { readonly timestamp: string; - readonly eval_id: string; + readonly evalId: string; readonly dataset?: string; - readonly conversation_id?: string; + readonly conversationId?: string; readonly score: number; readonly hits: readonly string[]; readonly misses: readonly string[]; - readonly candidate_answer: string; + readonly candidateAnswer: string; readonly target: string; readonly reasoning?: string; - readonly raw_aspects?: readonly string[]; - readonly agent_provider_request?: JsonObject; - readonly lm_provider_request?: JsonObject; - readonly evaluator_provider_request?: JsonObject; - readonly evaluator_results?: readonly EvaluatorResult[]; + readonly rawAspects?: readonly string[]; + readonly agentProviderRequest?: JsonObject; + readonly lmProviderRequest?: JsonObject; + readonly evaluatorProviderRequest?: JsonObject; + readonly evaluatorResults?: readonly EvaluatorResult[]; readonly error?: string; /** Lightweight summary of the execution trace (always included when available) */ - readonly trace_summary?: TraceSummary; + readonly traceSummary?: TraceSummary; } export type EvaluationVerdict = 'pass' | 'fail' | 'borderline'; @@ -269,9 +269,9 @@ export interface EvaluatorResult { readonly hits: 
readonly string[]; readonly misses: readonly string[]; readonly reasoning?: string; - readonly raw_request?: JsonObject; - readonly evaluator_provider_request?: JsonObject; - readonly evaluator_results?: readonly EvaluatorResult[]; + readonly rawRequest?: JsonObject; + readonly evaluatorProviderRequest?: JsonObject; + readonly evaluatorResults?: readonly EvaluatorResult[]; } /** diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index c26b6557..8dd2d5e7 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -441,7 +441,7 @@ describe('CodeEvaluator', () => { const expectedCandidate = '{"decision":"ACCEPT"}'; const script = - "bun -e \"import fs from 'node:fs'; const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_messages); const hasCandidate = typeof input.candidate_answer === 'string'; let candidateDecisionOk = false; try { const obj = JSON.parse(input.candidate_answer); candidateDecisionOk = obj && obj.decision === 'ACCEPT'; } catch {} const ok = hasExpected && hasCandidate && candidateDecisionOk; console.log(JSON.stringify({ score: ok ? 1 : 0, hits: [hasExpected ? 'expected_messages present' : null, hasCandidate ? 'candidate_answer present' : null, candidateDecisionOk ? 'candidate_answer parses' : null].filter(Boolean), misses: [hasExpected ? null : 'expected_messages missing', hasCandidate ? null : 'candidate_answer missing', candidateDecisionOk ? null : 'candidate_answer invalid'].filter(Boolean) }));\""; + "bun -e \"import fs from 'node:fs'; const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expectedMessages); const hasCandidate = typeof input.candidateAnswer === 'string'; let candidateDecisionOk = false; try { const obj = JSON.parse(input.candidateAnswer); candidateDecisionOk = obj && obj.decision === 'ACCEPT'; } catch {} const ok = hasExpected && hasCandidate && candidateDecisionOk; console.log(JSON.stringify({ score: ok ? 1 : 0, hits: [hasExpected ? 'expectedMessages present' : null, hasCandidate ? 'candidateAnswer present' : null, candidateDecisionOk ? 'candidateAnswer parses' : null].filter(Boolean), misses: [hasExpected ? null : 'expectedMessages missing', hasCandidate ? null : 'candidateAnswer missing', candidateDecisionOk ? 
null : 'candidateAnswer invalid'].filter(Boolean) }));\""; const evaluator = new CodeEvaluator({ script }); @@ -457,8 +457,8 @@ describe('CodeEvaluator', () => { expect(result.score).toBe(1); expect(result.verdict).toBe('pass'); - expect(result.hits).toContain('expected_messages present'); - expect(result.hits).toContain('candidate_answer present'); - expect(result.hits).toContain('candidate_answer parses'); + expect(result.hits).toContain('expectedMessages present'); + expect(result.hits).toContain('candidateAnswer present'); + expect(result.hits).toContain('candidateAnswer parses'); }); }); diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts index 666d2de1..96f1e27b 100644 --- a/packages/core/test/evaluation/execution-metrics.test.ts +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -4,8 +4,8 @@ import { type TraceSummary, avgToolDurationMs, explorationRatio, - tokensPerTool, mergeExecutionMetrics, + tokensPerTool, } from '../../src/evaluation/trace.js'; describe('Execution Metrics', () => { diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4701ef93..a8118c6f 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -183,7 +183,7 @@ describe('runTestCase', () => { useCache: true, }); - expect(first.candidate_answer).toContain('structured logging'); + expect(first.candidateAnswer).toContain('structured logging'); const second = await runEvalCase({ evalCase: baseTestCase, @@ -194,7 +194,7 @@ describe('runTestCase', () => { useCache: true, }); - expect(second.candidate_answer).toBe(first.candidate_answer); + expect(second.candidateAnswer).toBe(first.candidateAnswer); expect(provider.callIndex).toBe(1); }); @@ -314,13 +314,13 @@ describe('runTestCase', () => { ); expect(judgeProvider.lastRequest?.systemPrompt).not.toContain('CUSTOM PROMPT CONTENT'); - expect(result.evaluator_results?.[0]?.evaluator_provider_request?.userPrompt).toContain( + expect(result.evaluatorResults?.[0]?.evaluatorProviderRequest?.userPrompt).toContain( 'CUSTOM PROMPT CONTENT', ); - expect(result.evaluator_results?.[0]?.evaluator_provider_request?.systemPrompt).toContain( + expect(result.evaluatorResults?.[0]?.evaluatorProviderRequest?.systemPrompt).toContain( 'You must respond with a single JSON object', ); - expect(result.evaluator_results?.[0]?.evaluator_provider_request?.systemPrompt).not.toContain( + expect(result.evaluatorResults?.[0]?.evaluatorProviderRequest?.systemPrompt).not.toContain( 'CUSTOM PROMPT CONTENT', ); }); @@ -374,7 +374,7 @@ describe('runTestCase', () => { content: '\ncode()\n\nReview', }); expect(chatPrompt[2]).toEqual({ role: 'assistant', content: 'Ack' }); - expect(result.lm_provider_request?.chat_prompt).toBeDefined(); + expect(result.lmProviderRequest?.chat_prompt).toBeDefined(); }); it('omits chatPrompt for single-turn evals', async () => { @@ -429,9 +429,9 @@ describe('runTestCase', () => { evaluators: evaluatorRegistry, }); - expect(result.agent_provider_request).toBeDefined(); - expect(result.lm_provider_request).toBeUndefined(); - expect(result.agent_provider_request?.question).toBe('Explain logging improvements'); + expect(result.agentProviderRequest).toBeDefined(); + expect(result.lmProviderRequest).toBeUndefined(); + expect(result.agentProviderRequest?.question).toBe('Explain logging improvements'); }); }); @@ -474,7 +474,7 @@ describe('runEvalCase trace 
integration', () => { evaluator: 'llm_judge', }; - it('includes trace_summary in result when provider returns outputMessages with tool calls', async () => { + it('includes traceSummary in result when provider returns outputMessages with tool calls', async () => { const outputMessages: OutputMessage[] = [ { role: 'assistant', @@ -504,14 +504,14 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace_summary).toBeDefined(); - expect(result.trace_summary?.eventCount).toBe(1); - expect(result.trace_summary?.toolNames).toEqual(['getWeather']); - expect(result.trace_summary?.toolCallsByName).toEqual({ getWeather: 1 }); - expect(result.trace_summary?.errorCount).toBe(0); + expect(result.traceSummary).toBeDefined(); + expect(result.traceSummary?.eventCount).toBe(1); + expect(result.traceSummary?.toolNames).toEqual(['getWeather']); + expect(result.traceSummary?.toolCallsByName).toEqual({ getWeather: 1 }); + expect(result.traceSummary?.errorCount).toBe(0); }); - it('omits trace_summary when provider returns no outputMessages', async () => { + it('omits traceSummary when provider returns no outputMessages', async () => { const provider = new TraceProvider('mock', { outputMessages: [{ role: 'assistant', content: 'The weather is sunny' }], }); @@ -523,7 +523,7 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace_summary).toBeUndefined(); + expect(result.traceSummary).toBeUndefined(); }); it('runs tool_trajectory evaluator with outputMessages', async () => { @@ -586,9 +586,9 @@ describe('runEvalCase trace integration', () => { }); expect(result.score).toBe(1); - expect(result.evaluator_results).toHaveLength(1); - expect(result.evaluator_results?.[0]?.name).toBe('tool-check'); - expect(result.evaluator_results?.[0]?.verdict).toBe('pass'); + expect(result.evaluatorResults).toHaveLength(1); + expect(result.evaluatorResults?.[0]?.name).toBe('tool-check'); + expect(result.evaluatorResults?.[0]?.verdict).toBe('pass'); }); it('fails tool_trajectory evaluator when no trace available', async () => { @@ -626,8 +626,8 @@ describe('runEvalCase trace integration', () => { }); expect(result.score).toBe(0); - expect(result.evaluator_results?.[0]?.verdict).toBe('fail'); - expect(result.evaluator_results?.[0]?.misses).toContain('No trace available for evaluation'); + expect(result.evaluatorResults?.[0]?.verdict).toBe('fail'); + expect(result.evaluatorResults?.[0]?.misses).toContain('No trace available for evaluation'); }); it('computes correct trace summary with multiple tool calls', async () => { @@ -657,11 +657,11 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace_summary).toBeDefined(); - expect(result.trace_summary?.eventCount).toBe(4); - expect(result.trace_summary?.toolNames).toEqual(['toolA', 'toolB', 'toolC']); - expect(result.trace_summary?.toolCallsByName).toEqual({ toolA: 2, toolB: 1, toolC: 1 }); - expect(result.trace_summary?.errorCount).toBe(0); + expect(result.traceSummary).toBeDefined(); + expect(result.traceSummary?.eventCount).toBe(4); + expect(result.traceSummary?.toolNames).toEqual(['toolA', 'toolB', 'toolC']); + expect(result.traceSummary?.toolCallsByName).toEqual({ toolA: 2, toolB: 1, toolC: 1 }); + expect(result.traceSummary?.errorCount).toBe(0); }); describe('weighted evaluators', () => { @@ -692,9 +692,9 @@ describe('runEvalCase trace integration', () => { // eval2 weight=1.0, score=0.8 -> 0.8 // Total: (1.6 + 0.8) / (2.0 + 1.0) 
= 2.4 / 3.0 = 0.8 expect(result.score).toBeCloseTo(0.8); - expect(result.evaluator_results).toHaveLength(2); - expect(result.evaluator_results?.[0]?.weight).toBe(2.0); - expect(result.evaluator_results?.[1]?.weight).toBe(1.0); + expect(result.evaluatorResults).toHaveLength(2); + expect(result.evaluatorResults?.[0]?.weight).toBe(2.0); + expect(result.evaluatorResults?.[1]?.weight).toBe(1.0); }); it('defaults missing weights to 1.0', async () => { @@ -724,8 +724,8 @@ describe('runEvalCase trace integration', () => { // eval2 weight=1.0 (default), score=0.8 -> 0.8 // Total: (2.4 + 0.8) / (3.0 + 1.0) = 3.2 / 4.0 = 0.8 expect(result.score).toBeCloseTo(0.8); - expect(result.evaluator_results?.[0]?.weight).toBe(3.0); - expect(result.evaluator_results?.[1]?.weight).toBe(1.0); + expect(result.evaluatorResults?.[0]?.weight).toBe(3.0); + expect(result.evaluatorResults?.[1]?.weight).toBe(1.0); }); it('excludes evaluators with weight 0', async () => { @@ -755,8 +755,8 @@ describe('runEvalCase trace integration', () => { // eval2 weight=1.0, score=0.8 -> 0.8 // Total: (0 + 0.8) / (0 + 1.0) = 0.8 / 1.0 = 0.8 expect(result.score).toBeCloseTo(0.8); - expect(result.evaluator_results?.[0]?.weight).toBe(0); - expect(result.evaluator_results?.[1]?.weight).toBe(1.0); + expect(result.evaluatorResults?.[0]?.weight).toBe(0); + expect(result.evaluatorResults?.[1]?.weight).toBe(1.0); }); it('returns 0 when all evaluators have weight 0', async () => { From f0f4b78062389a3840bf47caff538a9e10d36c7b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 13:37:16 +0000 Subject: [PATCH 4/5] chore: add claude settings with superpowers plugin --- .claude/settings.json | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .claude/settings.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..07fa4272 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,5 @@ +{ + "enabledPlugins": { + "superpowers@superpowers-marketplace": true + } +} From 4e6a0465ebc62cac064d39d8aea2804b919835a1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 14:07:56 +0000 Subject: [PATCH 5/5] refactor: convert Python plugin scripts to TypeScript - Convert efficiency_scorer.py, pairwise_tool_compare.py, and tool_selection_judge.py to TypeScript using bun - Add execution-metrics example to examples/features/evals/ - Add integration test for code judge receiving traceSummary - Update README and YAML to use camelCase field names - Update tasks.md to reflect completed work --- .../execution-metrics-demo.yaml | 88 +++++++ .../scripts/check-efficiency.ts | 141 ++++++++++ .../tool-evaluation-plugins/README.md | 37 +-- .../scripts/efficiency-scorer.ts | 239 +++++++++++++++++ .../scripts/efficiency_scorer.py | 214 --------------- .../scripts/pairwise-tool-compare.ts | 243 ++++++++++++++++++ .../scripts/pairwise_tool_compare.py | 220 ---------------- .../scripts/tool-selection-judge.ts | 186 ++++++++++++++ .../scripts/tool_selection_judge.py | 166 ------------ .../tool-eval-demo.yaml | 10 +- .../changes/add-execution-metrics/tasks.md | 6 +- .../test/evaluation/execution-metrics.test.ts | 111 ++++++++ 12 files changed, 1028 insertions(+), 633 deletions(-) create mode 100644 examples/features/evals/execution-metrics/execution-metrics-demo.yaml create mode 100644 examples/features/evals/execution-metrics/scripts/check-efficiency.ts create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts delete mode 100644 
examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts delete mode 100644 examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts delete mode 100644 examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py diff --git a/examples/features/evals/execution-metrics/execution-metrics-demo.yaml b/examples/features/evals/execution-metrics/execution-metrics-demo.yaml new file mode 100644 index 00000000..22563e8b --- /dev/null +++ b/examples/features/evals/execution-metrics/execution-metrics-demo.yaml @@ -0,0 +1,88 @@ +# Execution Metrics Demo +# Demonstrates how to use execution metrics in evaluation +# +# Execution metrics capture runtime information from provider invocations: +# - tokenUsage: { input, output, cached? } - token consumption +# - costUsd: API cost in USD +# - durationMs: execution time in milliseconds +# +# These metrics are available in: +# 1. TraceSummary (included in evaluation results) +# 2. Code judge stdin (for custom metric-based evaluation) +# +# Run: cd examples/features/evals/execution-metrics +# npx agentv eval execution-metrics-demo.yaml --target your_target + +$schema: agentv-eval-v2 +description: Demonstrates execution metrics collection and evaluation + +target: default + +evalcases: + # ========================================== + # Example 1: Basic metrics collection + # Metrics are automatically included in results when available + # ========================================== + - id: metrics-collection + + expected_outcome: |- + Agent responds to a simple query. Execution metrics are captured + automatically and included in the evaluation result. + + input_messages: + - role: user + content: What is 2 + 2? + + execution: + evaluators: + - name: basic-check + type: llm_judge + + # ========================================== + # Example 2: Metric-aware code judge + # Use custom thresholds to evaluate efficiency + # ========================================== + - id: efficiency-evaluation + + expected_outcome: |- + Agent efficiently answers a simple question without excessive + token usage or tool calls. + + input_messages: + - role: user + content: List three primary colors. + + execution: + evaluators: + # Custom code judge that evaluates efficiency metrics + - name: efficiency-check + type: code_judge + script: bun run scripts/check-efficiency.ts + + # ========================================== + # Example 3: Combined trajectory and metrics + # Evaluate both tool usage and efficiency together + # ========================================== + - id: combined-evaluation + + expected_outcome: |- + Agent searches for information and provides a response. + Evaluation checks both tool trajectory and execution efficiency. + + input_messages: + - role: user + content: Find information about the weather in New York. 
+ + execution: + evaluators: + # Built-in: Check tool sequence + - name: trajectory-check + type: tool_trajectory + mode: any_order + minimums: + search: 1 + + # Custom: Check efficiency metrics + - name: metrics-check + type: code_judge + script: bun run scripts/check-efficiency.ts diff --git a/examples/features/evals/execution-metrics/scripts/check-efficiency.ts b/examples/features/evals/execution-metrics/scripts/check-efficiency.ts new file mode 100644 index 00000000..3a47158e --- /dev/null +++ b/examples/features/evals/execution-metrics/scripts/check-efficiency.ts @@ -0,0 +1,141 @@ +#!/usr/bin/env bun +/** + * Efficiency Check - Code Judge for Execution Metrics + * + * Demonstrates how to evaluate agent efficiency using execution metrics + * available in the TraceSummary. + * + * Input (stdin JSON): + * - traceSummary: Contains execution metrics when available + * - eventCount: Number of tool calls + * - tokenUsage?: { input, output, cached? } + * - costUsd?: API cost + * - durationMs?: Execution time + * + * Output (stdout JSON): + * - score: 0.0-1.0 + * - hits: Efficiency wins + * - misses: Efficiency issues + * - reasoning: Explanation + */ + +interface TraceSummary { + eventCount: number; + toolNames: string[]; + toolCallsByName: Record; + errorCount: number; + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; +} + +interface EvalInput { + traceSummary?: TraceSummary; + expectedOutcome?: string; +} + +interface EvalOutput { + score: number; + hits: string[]; + misses: string[]; + reasoning: string; +} + +// Configurable thresholds +const THRESHOLDS = { + maxToolCalls: 5, + maxTokens: 2000, + maxCostUsd: 0.01, + maxDurationMs: 10000, +}; + +function checkEfficiency(input: EvalInput): EvalOutput { + const hits: string[] = []; + const misses: string[] = []; + const checks: boolean[] = []; + + const summary = input.traceSummary; + + if (!summary) { + return { + score: 0.5, + hits: [], + misses: ['No trace summary available'], + reasoning: 'Cannot evaluate efficiency without trace data', + }; + } + + // Check tool call count + if (summary.eventCount <= THRESHOLDS.maxToolCalls) { + hits.push(`Tool calls (${summary.eventCount}) within limit (${THRESHOLDS.maxToolCalls})`); + checks.push(true); + } else { + misses.push(`Too many tool calls: ${summary.eventCount} (max: ${THRESHOLDS.maxToolCalls})`); + checks.push(false); + } + + // Check token usage if available + if (summary.tokenUsage) { + const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output; + if (totalTokens <= THRESHOLDS.maxTokens) { + hits.push(`Token usage (${totalTokens}) within limit`); + checks.push(true); + } else { + misses.push(`High token usage: ${totalTokens} (max: ${THRESHOLDS.maxTokens})`); + checks.push(false); + } + } + + // Check cost if available + if (summary.costUsd !== undefined) { + if (summary.costUsd <= THRESHOLDS.maxCostUsd) { + hits.push(`Cost ($${summary.costUsd.toFixed(4)}) within budget`); + checks.push(true); + } else { + misses.push(`High cost: $${summary.costUsd.toFixed(4)} (max: $${THRESHOLDS.maxCostUsd})`); + checks.push(false); + } + } + + // Check duration if available + if (summary.durationMs !== undefined) { + if (summary.durationMs <= THRESHOLDS.maxDurationMs) { + hits.push(`Duration (${summary.durationMs}ms) within limit`); + checks.push(true); + } else { + misses.push(`Slow execution: ${summary.durationMs}ms (max: ${THRESHOLDS.maxDurationMs}ms)`); + checks.push(false); + } + } + + // Calculate score + const passCount = 
checks.filter((c) => c).length; + const score = checks.length > 0 ? passCount / checks.length : 0.5; + + return { + score: Math.round(score * 100) / 100, + hits: hits.slice(0, 4), + misses: misses.slice(0, 4), + reasoning: `Checked ${checks.length} efficiency metrics: ${passCount} passed, ${checks.length - passCount} failed`, + }; +} + +async function main(): Promise { + try { + const stdin = await Bun.stdin.text(); + const input = JSON.parse(stdin) as EvalInput; + const result = checkEfficiency(input); + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + const errorResult: EvalOutput = { + score: 0, + hits: [], + misses: [`Error: ${error instanceof Error ? error.message : String(error)}`], + reasoning: 'Evaluation failed due to error', + }; + console.log(JSON.stringify(errorResult, null, 2)); + process.exit(1); + } +} + +main(); diff --git a/examples/showcase/tool-evaluation-plugins/README.md b/examples/showcase/tool-evaluation-plugins/README.md index 94350618..c46403c2 100644 --- a/examples/showcase/tool-evaluation-plugins/README.md +++ b/examples/showcase/tool-evaluation-plugins/README.md @@ -16,29 +16,18 @@ This showcase demonstrates **plugin-based tool evaluation patterns** that comple ## Plugin Examples -### 1. Tool Selection Evaluator (`tool_selection_judge.py`) +### 1. Tool Selection Evaluator (`tool-selection-judge.ts`) -Evaluates whether the agent selected the **right tools** for the task. Uses LLM-as-judge pattern to semantically assess tool choices. +Evaluates whether the agent selected the **right tools** for the task. Uses heuristic matching to assess tool choices against task keywords. ```yaml evaluators: - name: tool-selection type: code_judge - script: scripts/tool_selection_judge.py + script: bun run scripts/tool-selection-judge.ts ``` -### 2. Tool Input Validator (`tool_input_validator.ts`) - -Validates that tool **arguments are semantically appropriate** (not just syntactically correct). Checks if argument values make sense in context. - -```yaml -evaluators: - - name: input-validation - type: code_judge - script: scripts/tool_input_validator.ts -``` - -### 3. Tool Efficiency Scorer (`efficiency_scorer.py`) +### 2. Tool Efficiency Scorer (`efficiency-scorer.ts`) Computes efficiency metrics and scores based on configurable thresholds. Demonstrates how to use execution metrics in evaluation. @@ -46,10 +35,10 @@ Computes efficiency metrics and scores based on configurable thresholds. Demonst evaluators: - name: efficiency type: code_judge - script: scripts/efficiency_scorer.py + script: bun run scripts/efficiency-scorer.ts ``` -### 4. Pairwise Tool Comparison (`pairwise_tool_compare.py`) +### 3. Pairwise Tool Comparison (`pairwise-tool-compare.ts`) Compares two agent responses for tool usage quality with position bias mitigation (runs comparison twice with swapped order). 
@@ -57,7 +46,7 @@ Compares two agent responses for tool usage quality with position bias mitigatio evaluators: - name: pairwise-compare type: code_judge - script: scripts/pairwise_tool_compare.py + script: bun run scripts/pairwise-tool-compare.ts ``` ## Running the Examples @@ -74,9 +63,9 @@ All code judges receive a JSON object on stdin with: ```json { "question": "User's question/task", - "expected_outcome": "Expected behavior description", - "candidate_answer": "Agent's final response", - "output_messages": [ + "expectedOutcome": "Expected behavior description", + "candidateAnswer": "Agent's final response", + "outputMessages": [ { "role": "assistant", "content": "...", @@ -91,13 +80,11 @@ All code judges receive a JSON object on stdin with: "content": "Tool result..." } ], - "candidate_trace_summary": { + "traceSummary": { "eventCount": 5, "toolNames": ["search", "fetch"], "toolCallsByName": { "search": 2, "fetch": 1 }, - "errorCount": 0 - }, - "execution_metrics": { + "errorCount": 0, "tokenUsage": { "input": 1000, "output": 500 }, "durationMs": 3500, "costUsd": 0.0015 diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts b/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts new file mode 100644 index 00000000..93682f24 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts @@ -0,0 +1,239 @@ +#!/usr/bin/env bun +/** + * Tool Efficiency Scorer - Code Judge Plugin + * + * Evaluates agent efficiency based on execution metrics: + * - Token usage relative to task complexity + * - Number of tool calls (redundancy detection) + * - Exploration ratio (read-only vs action tools) + * - Cost efficiency + * + * Why this is a plugin (not built-in): + * - Efficiency thresholds are domain-specific + * - What's "efficient" depends on the task type + * - Different projects have different cost/performance tradeoffs + * + * Usage in eval YAML: + * evaluators: + * - name: efficiency + * type: code_judge + * script: bun run scripts/efficiency-scorer.ts + * + * Input (stdin JSON): + * - traceSummary: Tool call statistics + * - expectedOutcome: Task description (for complexity estimation) + * + * Output (stdout JSON): + * - score: 0.0-1.0 efficiency score + * - hits: Efficiency wins + * - misses: Efficiency issues + * - reasoning: Explanation + */ + +interface TraceSummary { + eventCount: number; + toolCallsByName: Record; + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; +} + +interface EvalInput { + traceSummary?: TraceSummary; + expectedOutcome?: string; +} + +interface EvalOutput { + score: number; + hits: string[]; + misses: string[]; + reasoning: string; +} + +// Configurable thresholds (customize for your domain) +const THRESHOLDS = { + // Maximum tool calls before penalty + maxToolCalls: 10, + // Ideal exploration ratio (read-only tools / total) + targetExplorationRatio: 0.6, + explorationTolerance: 0.2, + // Token budgets + maxTokensSimple: 2000, + maxTokensComplex: 10000, + // Cost thresholds (USD) + maxCostSimple: 0.01, + maxCostComplex: 0.1, +}; + +// Tools considered "exploration" (read-only) +const EXPLORATION_TOOLS = new Set([ + 'read', + 'grep', + 'glob', + 'search', + 'list', + 'find', + 'get', + 'fetch', + 'query', + 'inspect', + 'view', +]); + +function estimateTaskComplexity(expectedOutcome: string): 'simple' | 'complex' { + const text = expectedOutcome.toLowerCase(); + const complexIndicators = [ + 'multiple', + 'several', + 'comprehensive', + 
'thorough', + 'analyze', + 'compare', + 'synthesize', + 'integrate', + ]; + if (complexIndicators.some((indicator) => text.includes(indicator))) { + return 'complex'; + } + return 'simple'; +} + +function calculateExplorationRatio(traceSummary: TraceSummary): number { + const toolCalls = traceSummary.toolCallsByName; + const total = Object.values(toolCalls).reduce((sum, count) => sum + count, 0); + if (total === 0) { + return 0; + } + + let explorationCount = 0; + for (const [tool, count] of Object.entries(toolCalls)) { + const toolLower = tool.toLowerCase(); + if ([...EXPLORATION_TOOLS].some((exp) => toolLower.includes(exp))) { + explorationCount += count; + } + } + return explorationCount / total; +} + +function evaluateEfficiency( + traceSummary: TraceSummary | undefined, + expectedOutcome: string, +): EvalOutput { + const hits: string[] = []; + const misses: string[] = []; + const scores: number[] = []; + + const complexity = estimateTaskComplexity(expectedOutcome); + + // 1. Tool call count evaluation + if (traceSummary) { + const toolCount = traceSummary.eventCount; + const maxCalls = THRESHOLDS.maxToolCalls; + + if (toolCount <= maxCalls) { + hits.push(`Tool calls (${toolCount}) within budget (${maxCalls})`); + scores.push(1.0); + } else { + const penalty = Math.min((toolCount - maxCalls) / maxCalls, 1.0); + scores.push(1.0 - penalty); + misses.push(`Excessive tool calls: ${toolCount} (budget: ${maxCalls})`); + } + + // 2. Exploration ratio evaluation + const expRatio = calculateExplorationRatio(traceSummary); + const target = THRESHOLDS.targetExplorationRatio; + const tolerance = THRESHOLDS.explorationTolerance; + + if (Math.abs(expRatio - target) <= tolerance) { + hits.push(`Good exploration ratio: ${expRatio.toFixed(2)}`); + scores.push(1.0); + } else if (expRatio < target - tolerance) { + scores.push(0.7); + misses.push(`Low exploration ratio: ${expRatio.toFixed(2)} (target: ${target.toFixed(2)})`); + } else { + scores.push(0.7); + misses.push(`High exploration ratio: ${expRatio.toFixed(2)} (target: ${target.toFixed(2)})`); + } + + // 3. Token usage evaluation + if (traceSummary.tokenUsage) { + const tokens = traceSummary.tokenUsage; + const totalTokens = tokens.input + tokens.output; + const maxTokens = + complexity === 'complex' ? THRESHOLDS.maxTokensComplex : THRESHOLDS.maxTokensSimple; + + if (totalTokens <= maxTokens) { + hits.push(`Token usage (${totalTokens}) within budget`); + scores.push(1.0); + } else { + const penalty = Math.min((totalTokens - maxTokens) / maxTokens, 1.0); + scores.push(1.0 - penalty * 0.5); // Softer penalty + misses.push(`High token usage: ${totalTokens} (budget: ${maxTokens})`); + } + } + + // 4. Cost evaluation + if (traceSummary.costUsd !== undefined) { + const cost = traceSummary.costUsd; + const maxCost = + complexity === 'complex' ? THRESHOLDS.maxCostComplex : THRESHOLDS.maxCostSimple; + + if (cost <= maxCost) { + hits.push(`Cost ($${cost.toFixed(4)}) within budget`); + scores.push(1.0); + } else { + scores.push(0.5); + misses.push(`High cost: $${cost.toFixed(4)} (budget: $${maxCost.toFixed(4)})`); + } + } + } + + // Calculate final score + if (scores.length === 0) { + return { + score: 0.5, + hits: ['No efficiency metrics available'], + misses: [], + reasoning: 'Could not evaluate efficiency - no metrics provided', + }; + } + + const finalScore = scores.reduce((sum, s) => sum + s, 0) / scores.length; + + const reasoning = + `Task complexity: ${complexity}. ` + + `Evaluated ${scores.length} efficiency criteria. 
` + + `Score: ${finalScore.toFixed(2)}`; + + return { + score: Math.round(finalScore * 100) / 100, + hits: hits.slice(0, 4), + misses: misses.slice(0, 4), + reasoning, + }; +} + +async function main(): Promise { + try { + const stdin = await Bun.stdin.text(); + const inputData = JSON.parse(stdin) as EvalInput; + + const traceSummary = inputData.traceSummary; + const expectedOutcome = inputData.expectedOutcome ?? ''; + + const result = evaluateEfficiency(traceSummary, expectedOutcome); + + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + const errorResult: EvalOutput = { + score: 0, + hits: [], + misses: [`Evaluator error: ${error instanceof Error ? error.message : String(error)}`], + reasoning: `Evaluation failed: ${error instanceof Error ? error.message : String(error)}`, + }; + console.log(JSON.stringify(errorResult, null, 2)); + process.exit(1); + } +} + +main(); diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py b/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py deleted file mode 100644 index 15c68ecc..00000000 --- a/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env python3 -""" -Tool Efficiency Scorer - Code Judge Plugin - -Evaluates agent efficiency based on execution metrics: -- Token usage relative to task complexity -- Number of tool calls (redundancy detection) -- Exploration ratio (read-only vs action tools) -- Cost efficiency - -Why this is a plugin (not built-in): -- Efficiency thresholds are domain-specific -- What's "efficient" depends on the task type -- Different projects have different cost/performance tradeoffs - -Usage in eval YAML: - evaluators: - - name: efficiency - type: code_judge - script: scripts/efficiency_scorer.py - -Input (stdin JSON): - - candidate_trace_summary: Tool call statistics - - execution_metrics: Token usage, cost, duration (if available) - - expected_outcome: Task description (for complexity estimation) - -Output (stdout JSON): - - score: 0.0-1.0 efficiency score - - hits: Efficiency wins - - misses: Efficiency issues - - reasoning: Explanation -""" - -import json -import sys -from typing import Any - - -# Configurable thresholds (customize for your domain) -THRESHOLDS = { - # Maximum tool calls before penalty - "max_tool_calls": 10, - # Ideal exploration ratio (read-only tools / total) - "target_exploration_ratio": 0.6, - "exploration_tolerance": 0.2, - # Token budgets - "max_tokens_simple": 2000, - "max_tokens_complex": 10000, - # Cost thresholds (USD) - "max_cost_simple": 0.01, - "max_cost_complex": 0.10, -} - -# Tools considered "exploration" (read-only) -EXPLORATION_TOOLS = { - "read", "grep", "glob", "search", "list", "find", - "get", "fetch", "query", "inspect", "view", -} - - -def estimate_task_complexity(expected_outcome: str) -> str: - """Estimate task complexity from expected outcome description.""" - text = expected_outcome.lower() - complex_indicators = [ - "multiple", "several", "comprehensive", "thorough", - "analyze", "compare", "synthesize", "integrate", - ] - if any(indicator in text for indicator in complex_indicators): - return "complex" - return "simple" - - -def calculate_exploration_ratio(trace_summary: dict) -> float: - """Calculate ratio of exploration tools to total tools.""" - tool_calls = trace_summary.get("toolCallsByName", {}) - total = sum(tool_calls.values()) - if total == 0: - return 0.0 - - exploration_count = sum( - count for tool, count in tool_calls.items() - if any(exp 
in tool.lower() for exp in EXPLORATION_TOOLS) - ) - return exploration_count / total - - -def evaluate_efficiency( - trace_summary: dict | None, - execution_metrics: dict | None, - expected_outcome: str, -) -> dict[str, Any]: - """Evaluate agent efficiency against configurable thresholds.""" - hits = [] - misses = [] - scores = [] - - complexity = estimate_task_complexity(expected_outcome) - - # 1. Tool call count evaluation - if trace_summary: - tool_count = trace_summary.get("eventCount", 0) - max_calls = THRESHOLDS["max_tool_calls"] - - if tool_count <= max_calls: - hits.append(f"Tool calls ({tool_count}) within budget ({max_calls})") - scores.append(1.0) - else: - penalty = min((tool_count - max_calls) / max_calls, 1.0) - scores.append(1.0 - penalty) - misses.append(f"Excessive tool calls: {tool_count} (budget: {max_calls})") - - # 2. Exploration ratio evaluation - exp_ratio = calculate_exploration_ratio(trace_summary) - target = THRESHOLDS["target_exploration_ratio"] - tolerance = THRESHOLDS["exploration_tolerance"] - - if abs(exp_ratio - target) <= tolerance: - hits.append(f"Good exploration ratio: {exp_ratio:.2f}") - scores.append(1.0) - elif exp_ratio < target - tolerance: - scores.append(0.7) - misses.append(f"Low exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") - else: - scores.append(0.7) - misses.append(f"High exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") - - # 3. Token usage evaluation - if execution_metrics and "tokenUsage" in execution_metrics: - tokens = execution_metrics["tokenUsage"] - total_tokens = tokens.get("input", 0) + tokens.get("output", 0) - max_tokens = ( - THRESHOLDS["max_tokens_complex"] - if complexity == "complex" - else THRESHOLDS["max_tokens_simple"] - ) - - if total_tokens <= max_tokens: - hits.append(f"Token usage ({total_tokens}) within budget") - scores.append(1.0) - else: - penalty = min((total_tokens - max_tokens) / max_tokens, 1.0) - scores.append(1.0 - penalty * 0.5) # Softer penalty - misses.append(f"High token usage: {total_tokens} (budget: {max_tokens})") - - # 4. Cost evaluation - if execution_metrics and "costUsd" in execution_metrics: - cost = execution_metrics["costUsd"] - max_cost = ( - THRESHOLDS["max_cost_complex"] - if complexity == "complex" - else THRESHOLDS["max_cost_simple"] - ) - - if cost <= max_cost: - hits.append(f"Cost (${cost:.4f}) within budget") - scores.append(1.0) - else: - scores.append(0.5) - misses.append(f"High cost: ${cost:.4f} (budget: ${max_cost:.4f})") - - # Calculate final score - if not scores: - return { - "score": 0.5, - "hits": ["No efficiency metrics available"], - "misses": [], - "reasoning": "Could not evaluate efficiency - no metrics provided", - } - - final_score = sum(scores) / len(scores) - - reasoning = ( - f"Task complexity: {complexity}. " - f"Evaluated {len(scores)} efficiency criteria. 
" - f"Score: {final_score:.2f}" - ) - - return { - "score": round(final_score, 2), - "hits": hits[:4], - "misses": misses[:4], - "reasoning": reasoning, - } - - -def main(): - try: - input_data = json.loads(sys.stdin.read()) - - trace_summary = input_data.get("candidate_trace_summary") - execution_metrics = input_data.get("execution_metrics") - expected_outcome = input_data.get("expected_outcome", "") - - result = evaluate_efficiency( - trace_summary=trace_summary, - execution_metrics=execution_metrics, - expected_outcome=expected_outcome, - ) - - print(json.dumps(result, indent=2)) - - except Exception as e: - error_result = { - "score": 0.0, - "hits": [], - "misses": [f"Evaluator error: {str(e)}"], - "reasoning": f"Evaluation failed: {str(e)}", - } - print(json.dumps(error_result, indent=2)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts new file mode 100644 index 00000000..cfee4d06 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts @@ -0,0 +1,243 @@ +#!/usr/bin/env bun +/** + * Pairwise Tool Comparison - Code Judge Plugin + * + * Compares tool usage quality between two agent responses with + * position bias mitigation (runs comparison twice with swapped order). + * + * Why this is a plugin (not built-in): + * - Pairwise comparison is a specialized evaluation pattern + * - Requires reference response (not always available) + * - Position bias mitigation adds complexity + * - Not all evaluations need comparative assessment + * + * Usage in eval YAML: + * evaluators: + * - name: pairwise-compare + * type: code_judge + * script: bun run scripts/pairwise-tool-compare.ts + * + * Input (stdin JSON): + * - candidateAnswer: Agent's response (Response A) + * - referenceAnswer: Reference/baseline response (Response B) + * - outputMessages: Tool calls from candidate + * - expectedOutcome: Task description + * + * Output (stdout JSON): + * - score: 0.0-1.0 (1.0 = candidate wins, 0.5 = tie, 0.0 = reference wins) + * - hits: Candidate advantages + * - misses: Reference advantages + * - reasoning: Comparison explanation with bias check result + */ + +interface OutputMessage { + role: string; + toolCalls?: Array<{ tool: string; args?: Record }>; +} + +interface EvalInput { + candidateAnswer?: string; + referenceAnswer?: string; + outputMessages?: OutputMessage[]; + referenceOutputMessages?: OutputMessage[]; + expectedOutcome?: string; +} + +interface EvalOutput { + score: number; + hits: string[]; + misses: string[]; + reasoning: string; +} + +interface ToolSummary { + tools: string[]; + count: number; + unique: string[]; +} + +interface CompareResult { + winner: 'A' | 'B' | 'TIE'; + aAdvantages: string[]; + bAdvantages: string[]; +} + +function extractToolSummary(messages: OutputMessage[] | undefined): ToolSummary { + if (!messages) { + return { tools: [], count: 0, unique: [] }; + } + + const tools: string[] = []; + for (const msg of messages) { + if (msg.role === 'assistant' && msg.toolCalls) { + for (const call of msg.toolCalls) { + tools.push(call.tool ?? 'unknown'); + } + } + } + + return { + tools, + count: tools.length, + unique: [...new Set(tools)], + }; +} + +function compareResponses( + responseA: string, + responseB: string, + toolsA: ToolSummary, + toolsB: ToolSummary, +): CompareResult { + const aAdvantages: string[] = []; + const bAdvantages: string[] = []; + + // 1. 
Compare tool count efficiency + if (toolsA.count < toolsB.count && toolsA.count > 0) { + aAdvantages.push(`More efficient: ${toolsA.count} vs ${toolsB.count} tools`); + } else if (toolsB.count < toolsA.count && toolsB.count > 0) { + bAdvantages.push(`More efficient: ${toolsB.count} vs ${toolsA.count} tools`); + } + + // 2. Compare tool diversity + if (toolsA.unique.length > toolsB.unique.length) { + aAdvantages.push(`More diverse tools: ${toolsA.unique.length} types`); + } else if (toolsB.unique.length > toolsA.unique.length) { + bAdvantages.push(`More diverse tools: ${toolsB.unique.length} types`); + } + + // 3. Compare response length (proxy for completeness) + const lenA = responseA.length; + const lenB = responseB.length; + if (lenA > lenB * 1.2) { + aAdvantages.push('More comprehensive response'); + } else if (lenB > lenA * 1.2) { + bAdvantages.push('More comprehensive response'); + } + + // 4. Check for no tools (penalty) + if (toolsA.count === 0 && toolsB.count > 0) { + bAdvantages.push('Response B used tools; A did not'); + } else if (toolsB.count === 0 && toolsA.count > 0) { + aAdvantages.push('Response A used tools; B did not'); + } + + // Determine winner + const aScore = aAdvantages.length; + const bScore = bAdvantages.length; + + if (aScore > bScore) { + return { winner: 'A', aAdvantages, bAdvantages }; + } else if (bScore > aScore) { + return { winner: 'B', aAdvantages, bAdvantages }; + } else { + return { winner: 'TIE', aAdvantages, bAdvantages }; + } +} + +function pairwiseWithBiasMitigation( + candidate: string, + reference: string, + candidateTools: ToolSummary, + referenceTools: ToolSummary, +): EvalOutput { + // Pass 1: Candidate as A, Reference as B + const pass1 = compareResponses(candidate, reference, candidateTools, referenceTools); + + // Pass 2: Reference as A, Candidate as B (swapped) + const pass2 = compareResponses(reference, candidate, referenceTools, candidateTools); + + // Map pass2 result back (if A wins in pass2, that means Reference won) + const pass2Mapped: 'A' | 'B' | 'TIE' = + pass2.winner === 'A' ? 'B' : pass2.winner === 'B' ? 'A' : 'TIE'; + + // Check consistency + const consistent = pass1.winner === pass2Mapped; + + let finalWinner: 'A' | 'B' | 'TIE'; + let confidence: string; + + if (consistent) { + finalWinner = pass1.winner; + confidence = 'high'; + } else { + // Inconsistent results indicate position bias - return TIE + finalWinner = 'TIE'; + confidence = 'low (position bias detected)'; + } + + // Convert to score (candidate perspective) + let score: number; + if (finalWinner === 'A') { + // Candidate wins + score = 1.0; + } else if (finalWinner === 'B') { + // Reference wins + score = 0.0; + } else { + // TIE + score = 0.5; + } + + const hits = pass1.aAdvantages.slice(0, 4); // Candidate advantages + const misses = pass1.bAdvantages.slice(0, 4); // Reference advantages + + const reasoning = + `Pass 1: ${pass1.winner} wins. ` + + `Pass 2 (swapped): ${pass2.winner} wins (maps to ${pass2Mapped}). ` + + `Consistency: ${consistent}. ` + + `Final: ${finalWinner} (${confidence} confidence)`; + + return { score, hits, misses, reasoning }; +} + +async function main(): Promise { + try { + const stdin = await Bun.stdin.text(); + const inputData = JSON.parse(stdin) as EvalInput; + + const candidate = inputData.candidateAnswer ?? ''; + const reference = inputData.referenceAnswer ?? ''; + const outputMessages = inputData.outputMessages ?? 
[]; + + // If no reference, we can't do pairwise comparison + if (!reference) { + console.log( + JSON.stringify( + { + score: 0.5, + hits: ['Candidate response provided'], + misses: ['No reference for comparison'], + reasoning: 'Pairwise comparison requires referenceAnswer field', + }, + null, + 2, + ), + ); + return; + } + + // Extract tool summaries + const candidateTools = extractToolSummary(outputMessages); + + // For reference, we'd need referenceOutputMessages + // In practice, this would come from a baseline run + const referenceMessages = inputData.referenceOutputMessages ?? []; + const referenceTools = extractToolSummary(referenceMessages); + + const result = pairwiseWithBiasMitigation(candidate, reference, candidateTools, referenceTools); + + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + const errorResult: EvalOutput = { + score: 0, + hits: [], + misses: [`Evaluator error: ${error instanceof Error ? error.message : String(error)}`], + reasoning: `Evaluation failed: ${error instanceof Error ? error.message : String(error)}`, + }; + console.log(JSON.stringify(errorResult, null, 2)); + process.exit(1); + } +} + +main(); diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py b/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py deleted file mode 100644 index e0bc842c..00000000 --- a/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 -""" -Pairwise Tool Comparison - Code Judge Plugin - -Compares tool usage quality between two agent responses with -position bias mitigation (runs comparison twice with swapped order). - -Why this is a plugin (not built-in): -- Pairwise comparison is a specialized evaluation pattern -- Requires reference response (not always available) -- Position bias mitigation adds complexity -- Not all evaluations need comparative assessment - -Usage in eval YAML: - evaluators: - - name: pairwise-compare - type: code_judge - script: scripts/pairwise_tool_compare.py - -Input (stdin JSON): - - candidate_answer: Agent's response (Response A) - - reference_answer: Reference/baseline response (Response B) - - output_messages: Tool calls from candidate - - expected_outcome: Task description - -Output (stdout JSON): - - score: 0.0-1.0 (1.0 = candidate wins, 0.5 = tie, 0.0 = reference wins) - - hits: Candidate advantages - - misses: Reference advantages - - reasoning: Comparison explanation with bias check result -""" - -import json -import sys -from typing import Any - - -def extract_tool_summary(messages: list[dict] | None) -> dict: - """Extract tool usage summary from messages.""" - if not messages: - return {"tools": [], "count": 0} - - tools = [] - for msg in messages: - if msg.get("role") == "assistant" and msg.get("toolCalls"): - for call in msg["toolCalls"]: - tools.append(call.get("tool", "unknown")) - - return { - "tools": tools, - "count": len(tools), - "unique": list(set(tools)), - } - - -def compare_responses( - response_a: str, - response_b: str, - tools_a: dict, - tools_b: dict, - task: str, -) -> dict[str, Any]: - """ - Compare two responses for tool usage quality. - Returns winner and reasoning. - """ - a_advantages = [] - b_advantages = [] - - # 1. 
Compare tool count efficiency - if tools_a["count"] < tools_b["count"] and tools_a["count"] > 0: - a_advantages.append(f"More efficient: {tools_a['count']} vs {tools_b['count']} tools") - elif tools_b["count"] < tools_a["count"] and tools_b["count"] > 0: - b_advantages.append(f"More efficient: {tools_b['count']} vs {tools_a['count']} tools") - - # 2. Compare tool diversity - if len(tools_a["unique"]) > len(tools_b["unique"]): - a_advantages.append(f"More diverse tools: {len(tools_a['unique'])} types") - elif len(tools_b["unique"]) > len(tools_a["unique"]): - b_advantages.append(f"More diverse tools: {len(tools_b['unique'])} types") - - # 3. Compare response length (proxy for completeness) - len_a, len_b = len(response_a), len(response_b) - if len_a > len_b * 1.2: - a_advantages.append("More comprehensive response") - elif len_b > len_a * 1.2: - b_advantages.append("More comprehensive response") - - # 4. Check for no tools (penalty) - if tools_a["count"] == 0 and tools_b["count"] > 0: - b_advantages.append("Response B used tools; A did not") - elif tools_b["count"] == 0 and tools_a["count"] > 0: - a_advantages.append("Response A used tools; B did not") - - # Determine winner - a_score = len(a_advantages) - b_score = len(b_advantages) - - if a_score > b_score: - return {"winner": "A", "a_advantages": a_advantages, "b_advantages": b_advantages} - elif b_score > a_score: - return {"winner": "B", "a_advantages": a_advantages, "b_advantages": b_advantages} - else: - return {"winner": "TIE", "a_advantages": a_advantages, "b_advantages": b_advantages} - - -def pairwise_with_bias_mitigation( - candidate: str, - reference: str, - candidate_tools: dict, - reference_tools: dict, - task: str, -) -> dict[str, Any]: - """ - Run pairwise comparison twice with position swap to mitigate bias. - """ - # Pass 1: Candidate as A, Reference as B - pass1 = compare_responses( - candidate, reference, candidate_tools, reference_tools, task - ) - - # Pass 2: Reference as A, Candidate as B (swapped) - pass2 = compare_responses( - reference, candidate, reference_tools, candidate_tools, task - ) - - # Map pass2 result back (if A wins in pass2, that means Reference won) - pass2_mapped = { - "A": "B", # A in pass2 = Reference = B in pass1 terms - "B": "A", # B in pass2 = Candidate = A in pass1 terms - "TIE": "TIE", - }.get(pass2["winner"], "TIE") - - # Check consistency - consistent = pass1["winner"] == pass2_mapped - - if consistent: - final_winner = pass1["winner"] - confidence = "high" - else: - # Inconsistent results indicate position bias - return TIE - final_winner = "TIE" - confidence = "low (position bias detected)" - - # Convert to score (candidate perspective) - if final_winner == "A": # Candidate wins - score = 1.0 - elif final_winner == "B": # Reference wins - score = 0.0 - else: # TIE - score = 0.5 - - hits = pass1["a_advantages"][:4] # Candidate advantages - misses = pass1["b_advantages"][:4] # Reference advantages - - reasoning = ( - f"Pass 1: {pass1['winner']} wins. " - f"Pass 2 (swapped): {pass2['winner']} wins (maps to {pass2_mapped}). " - f"Consistency: {consistent}. 
" - f"Final: {final_winner} ({confidence} confidence)" - ) - - return { - "score": score, - "hits": hits, - "misses": misses, - "reasoning": reasoning, - } - - -def main(): - try: - input_data = json.loads(sys.stdin.read()) - - candidate = input_data.get("candidate_answer", "") - reference = input_data.get("reference_answer", "") - output_messages = input_data.get("output_messages", []) - task = input_data.get("expected_outcome", "") - - # If no reference, we can't do pairwise comparison - if not reference: - print(json.dumps({ - "score": 0.5, - "hits": ["Candidate response provided"], - "misses": ["No reference for comparison"], - "reasoning": "Pairwise comparison requires reference_answer field", - }, indent=2)) - return - - # Extract tool summaries - candidate_tools = extract_tool_summary(output_messages) - - # For reference, we'd need reference_output_messages - # In practice, this would come from a baseline run - reference_messages = input_data.get("reference_output_messages", []) - reference_tools = extract_tool_summary(reference_messages) - - result = pairwise_with_bias_mitigation( - candidate=candidate, - reference=reference, - candidate_tools=candidate_tools, - reference_tools=reference_tools, - task=task, - ) - - print(json.dumps(result, indent=2)) - - except Exception as e: - error_result = { - "score": 0.0, - "hits": [], - "misses": [f"Evaluator error: {str(e)}"], - "reasoning": f"Evaluation failed: {str(e)}", - } - print(json.dumps(error_result, indent=2)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts new file mode 100644 index 00000000..c44e4ce5 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts @@ -0,0 +1,186 @@ +#!/usr/bin/env bun +/** + * Tool Selection Evaluator - Code Judge Plugin + * + * Evaluates whether the agent selected the RIGHT tools for the task. + * This is a semantic evaluation that requires understanding task requirements + * and matching them against available tools. 
+ *
+ * Why this is a plugin (not built-in):
+ * - Requires domain-specific knowledge of what tools are "appropriate"
+ * - Involves semantic judgment, not just pattern matching
+ * - Different projects have different tool selection criteria
+ *
+ * Usage in eval YAML:
+ *   evaluators:
+ *     - name: tool-selection
+ *       type: code_judge
+ *       script: bun run scripts/tool-selection-judge.ts
+ *
+ * Input (stdin JSON):
+ * - question: The user's task/question
+ * - expectedOutcome: Description of expected behavior
+ * - outputMessages: Array of messages including tool calls
+ * - traceSummary: Summary of tool usage
+ *
+ * Output (stdout JSON):
+ * - score: 0.0-1.0 (1.0 = all tools appropriate, 0.0 = all inappropriate)
+ * - hits: List of appropriate tool selections
+ * - misses: List of missing or inappropriate tools
+ * - reasoning: Explanation of the evaluation
+ */
+
+interface ToolCall {
+  tool: string;
+  args?: Record<string, unknown>;
+}
+
+interface OutputMessage {
+  role: string;
+  toolCalls?: ToolCall[];
+}
+
+interface TraceSummary {
+  eventCount: number;
+  toolCallsByName: Record<string, number>;
+}
+
+interface EvalInput {
+  question?: string;
+  expectedOutcome?: string;
+  outputMessages?: OutputMessage[];
+  traceSummary?: TraceSummary;
+}
+
+interface EvalOutput {
+  score: number;
+  hits: string[];
+  misses: string[];
+  reasoning: string;
+}
+
+interface ExtractedToolCall {
+  tool: string;
+  args: Record<string, unknown>;
+}
+
+function extractToolCalls(messages: OutputMessage[]): ExtractedToolCall[] {
+  const toolCalls: ExtractedToolCall[] = [];
+  for (const msg of messages) {
+    if (msg.role === 'assistant' && msg.toolCalls) {
+      for (const call of msg.toolCalls) {
+        toolCalls.push({
+          tool: call.tool,
+          args: call.args ?? {},
+        });
+      }
+    }
+  }
+  return toolCalls;
+}
+
+function evaluateToolSelection(
+  question: string,
+  expectedOutcome: string,
+  toolCalls: ExtractedToolCall[],
+): EvalOutput {
+  const hits: string[] = [];
+  const misses: string[] = [];
+
+  // Extract keywords from question and expected outcome
+  const taskText = `${question} ${expectedOutcome}`.toLowerCase();
+
+  // Define tool-to-task mappings (customize for your domain)
+  const toolTaskMappings: Record<string, string[]> = {
+    search: ['find', 'search', 'look', 'query', 'discover'],
+    fetch: ['get', 'retrieve', 'fetch', 'download', 'load'],
+    read: ['read', 'open', 'view', 'examine', 'inspect'],
+    write: ['write', 'save', 'create', 'output', 'generate'],
+    analyze: ['analyze', 'process', 'compute', 'calculate'],
+    validate: ['check', 'validate', 'verify', 'confirm'],
+  };
+
+  // Determine expected tools based on task keywords
+  const expectedTools = new Set<string>();
+  for (const [tool, keywords] of Object.entries(toolTaskMappings)) {
+    if (keywords.some((kw) => taskText.includes(kw))) {
+      expectedTools.add(tool);
+    }
+  }
+
+  // Get actual tools used
+  const actualTools = new Set(toolCalls.map((call) => call.tool));
+
+  // Evaluate selection
+  if (toolCalls.length === 0) {
+    return {
+      score: 0,
+      hits: [],
+      misses: ['No tools were called'],
+      reasoning: 'Agent did not use any tools. Expected at least some tool usage.',
+    };
+  }
+
+  // Check for appropriate selections
+  for (const tool of actualTools) {
+    const toolLower = tool.toLowerCase();
+    const isRelevant = [...expectedTools].some(
+      (expected) => toolLower.includes(expected) || expected.includes(toolLower),
+    );
+    if (isRelevant || expectedTools.size === 0) {
+      hits.push(`Tool '${tool}' appears relevant to task`);
+    } else {
+      misses.push(`Tool '${tool}' may not be needed for this task`);
+    }
+  }
+
+  // Check for missing expected tools
+  for (const expected of expectedTools) {
+    if (![...actualTools].some((t) => t.toLowerCase().includes(expected))) {
+      misses.push(`Expected a '${expected}'-type tool but none used`);
+    }
+  }
+
+  // Calculate score
+  const totalChecks = hits.length + misses.length;
+  const score = totalChecks > 0 ? hits.length / totalChecks : 0.5;
+
+  const reasoning =
+    `Evaluated ${actualTools.size} tool(s) against task requirements. ` +
+    `${hits.length} appropriate, ${misses.length} issues found.`;
+
+  return {
+    score: Math.round(score * 100) / 100,
+    hits: hits.slice(0, 4), // Cap at 4 per contract
+    misses: misses.slice(0, 4),
+    reasoning,
+  };
+}
+
+async function main(): Promise<void> {
+  try {
+    const stdin = await Bun.stdin.text();
+    const inputData = JSON.parse(stdin) as EvalInput;
+
+    const question = inputData.question ?? '';
+    const expectedOutcome = inputData.expectedOutcome ?? '';
+    const outputMessages = inputData.outputMessages ?? [];
+
+    const toolCalls = extractToolCalls(outputMessages);
+
+    const result = evaluateToolSelection(question, expectedOutcome, toolCalls);
+
+    console.log(JSON.stringify(result, null, 2));
+  } catch (error) {
+    const errorResult: EvalOutput = {
+      score: 0,
+      hits: [],
+      misses: [`Evaluator error: ${error instanceof Error ? error.message : String(error)}`],
+      reasoning: `Evaluation failed: ${error instanceof Error ? error.message : String(error)}`,
+    };
+    console.log(JSON.stringify(errorResult, null, 2));
+    process.exit(1);
+  }
+}
+
+main();
diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py b/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py
deleted file mode 100644
index 18f8f560..00000000
--- a/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tool Selection Evaluator - Code Judge Plugin
-
-Evaluates whether the agent selected the RIGHT tools for the task.
-This is a semantic evaluation that requires understanding task requirements
-and matching them against available tools.
- -Why this is a plugin (not built-in): -- Requires domain-specific knowledge of what tools are "appropriate" -- Involves semantic judgment, not just pattern matching -- Different projects have different tool selection criteria - -Usage in eval YAML: - evaluators: - - name: tool-selection - type: code_judge - script: scripts/tool_selection_judge.py - -Input (stdin JSON): - - question: The user's task/question - - expected_outcome: Description of expected behavior - - output_messages: Array of messages including tool calls - - candidate_trace_summary: Summary of tool usage - -Output (stdout JSON): - - score: 0.0-1.0 (1.0 = all tools appropriate, 0.0 = all inappropriate) - - hits: List of appropriate tool selections - - misses: List of missing or inappropriate tools - - reasoning: Explanation of the evaluation -""" - -import json -import sys -from typing import Any - - -def extract_tool_calls(messages: list[dict]) -> list[dict]: - """Extract all tool calls from output messages.""" - tool_calls = [] - for msg in messages: - if msg.get("role") == "assistant" and msg.get("toolCalls"): - for call in msg["toolCalls"]: - tool_calls.append({ - "tool": call.get("tool"), - "args": call.get("args", {}), - }) - return tool_calls - - -def evaluate_tool_selection( - question: str, - expected_outcome: str, - tool_calls: list[dict], - trace_summary: dict | None, -) -> dict[str, Any]: - """ - Evaluate tool selection based on task requirements. - - This is a simplified heuristic-based evaluation. - For production use, you might: - 1. Use an LLM to judge appropriateness - 2. Define explicit tool-to-task mappings - 3. Use a decision tree based on task classification - """ - hits = [] - misses = [] - - # Extract keywords from question and expected outcome - task_text = f"{question} {expected_outcome}".lower() - - # Define tool-to-task mappings (customize for your domain) - tool_task_mappings = { - "search": ["find", "search", "look", "query", "discover"], - "fetch": ["get", "retrieve", "fetch", "download", "load"], - "read": ["read", "open", "view", "examine", "inspect"], - "write": ["write", "save", "create", "output", "generate"], - "analyze": ["analyze", "process", "compute", "calculate"], - "validate": ["check", "validate", "verify", "confirm"], - } - - # Determine expected tools based on task keywords - expected_tools = set() - for tool, keywords in tool_task_mappings.items(): - if any(kw in task_text for kw in keywords): - expected_tools.add(tool) - - # Get actual tools used - actual_tools = set(call["tool"] for call in tool_calls) - - # Evaluate selection - if not tool_calls: - return { - "score": 0.0, - "hits": [], - "misses": ["No tools were called"], - "reasoning": "Agent did not use any tools. 
Expected at least some tool usage.", - } - - # Check for appropriate selections - for tool in actual_tools: - tool_lower = tool.lower() - is_relevant = any( - tool_lower in expected or expected in tool_lower - for expected in expected_tools - ) - if is_relevant or not expected_tools: - hits.append(f"Tool '{tool}' appears relevant to task") - else: - misses.append(f"Tool '{tool}' may not be needed for this task") - - # Check for missing expected tools - for expected in expected_tools: - if not any(expected in t.lower() for t in actual_tools): - misses.append(f"Expected a '{expected}'-type tool but none used") - - # Calculate score - total_checks = len(hits) + len(misses) - score = len(hits) / total_checks if total_checks > 0 else 0.5 - - reasoning = ( - f"Evaluated {len(actual_tools)} tool(s) against task requirements. " - f"{len(hits)} appropriate, {len(misses)} issues found." - ) - - return { - "score": round(score, 2), - "hits": hits[:4], # Cap at 4 per contract - "misses": misses[:4], - "reasoning": reasoning, - } - - -def main(): - try: - input_data = json.loads(sys.stdin.read()) - - question = input_data.get("question", "") - expected_outcome = input_data.get("expected_outcome", "") - output_messages = input_data.get("output_messages", []) - trace_summary = input_data.get("candidate_trace_summary") - - tool_calls = extract_tool_calls(output_messages) - - result = evaluate_tool_selection( - question=question, - expected_outcome=expected_outcome, - tool_calls=tool_calls, - trace_summary=trace_summary, - ) - - print(json.dumps(result, indent=2)) - - except Exception as e: - error_result = { - "score": 0.0, - "hits": [], - "misses": [f"Evaluator error: {str(e)}"], - "reasoning": f"Evaluation failed: {str(e)}", - } - print(json.dumps(error_result, indent=2)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml index 0b830617..0892cf68 100644 --- a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml @@ -41,7 +41,7 @@ evalcases: # Plugin: Semantic tool selection evaluation - name: selection-quality type: code_judge - script: scripts/tool_selection_judge.py + script: bun run scripts/tool-selection-judge.ts # ========================================== # Example 2: Efficiency Scoring @@ -62,7 +62,7 @@ evalcases: # Plugin: Efficiency metrics scoring - name: efficiency-check type: code_judge - script: scripts/efficiency_scorer.py + script: bun run scripts/efficiency-scorer.ts # ========================================== # Example 3: Combined Built-in + Plugin Evaluation @@ -95,12 +95,12 @@ evalcases: # Plugin: Check if tools were appropriate choices - name: selection-check type: code_judge - script: scripts/tool_selection_judge.py + script: bun run scripts/tool-selection-judge.ts # Plugin: Evaluate efficiency - name: efficiency type: code_judge - script: scripts/efficiency_scorer.py + script: bun run scripts/efficiency-scorer.ts # ========================================== # Example 4: Pairwise Comparison @@ -128,4 +128,4 @@ evalcases: # Plugin: Pairwise comparison with position bias mitigation - name: pairwise-quality type: code_judge - script: scripts/pairwise_tool_compare.py + script: bun run scripts/pairwise-tool-compare.ts diff --git a/openspec/changes/add-execution-metrics/tasks.md b/openspec/changes/add-execution-metrics/tasks.md index d3891971..9f16312b 100644 --- 
a/openspec/changes/add-execution-metrics/tasks.md +++ b/openspec/changes/add-execution-metrics/tasks.md @@ -25,10 +25,10 @@ ## 5. Examples & Documentation -- [ ] 5.1 Add metrics evaluation example to `examples/features/` -- [ ] 5.2 Create code judge example that uses metrics +- [x] 5.1 Add metrics evaluation example to `examples/features/` +- [x] 5.2 Create code judge example that uses metrics ## 6. Testing - [x] 6.1 Unit tests for metric computation -- [ ] 6.2 Integration test with metric-aware code judge +- [x] 6.2 Integration test with metric-aware code judge diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts index 96f1e27b..9ee0d227 100644 --- a/packages/core/test/evaluation/execution-metrics.test.ts +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -1,5 +1,8 @@ import { describe, expect, it } from 'bun:test'; +import { CodeEvaluator } from '../../src/evaluation/evaluators.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { EvalCase } from '../../src/evaluation/types.js'; import { type TraceSummary, avgToolDurationMs, @@ -246,3 +249,111 @@ describe('Execution Metrics', () => { }); }); }); + +describe('Code Judge Metrics Integration', () => { + const baseTestCase: EvalCase = { + id: 'metrics-test', + dataset: 'test', + question: 'Test question', + input_messages: [{ role: 'user', content: 'Test' }], + input_segments: [{ type: 'text', value: 'Test' }], + expected_messages: [], + reference_answer: '', + guideline_paths: [], + file_paths: [], + code_snippets: [], + expected_outcome: 'Test outcome', + evaluator: 'code_judge', + }; + + const baseTarget: ResolvedTarget = { + kind: 'mock', + name: 'mock', + config: { response: '{}' }, + }; + + it('passes traceSummary to code_judge scripts', async () => { + // Script that checks if traceSummary is present and has expected fields + const script = `bun -e " + import fs from 'node:fs'; + const input = JSON.parse(fs.readFileSync(0, 'utf8')); + const summary = input.traceSummary; + const hasEventCount = summary && typeof summary.eventCount === 'number'; + const hasTokenUsage = summary && summary.tokenUsage && typeof summary.tokenUsage.input === 'number'; + const hasCostUsd = summary && typeof summary.costUsd === 'number'; + const score = (hasEventCount && hasTokenUsage && hasCostUsd) ? 1 : 0; + console.log(JSON.stringify({ + score, + hits: [ + hasEventCount ? 'eventCount present' : null, + hasTokenUsage ? 'tokenUsage present' : null, + hasCostUsd ? 'costUsd present' : null + ].filter(Boolean), + misses: [ + hasEventCount ? null : 'eventCount missing', + hasTokenUsage ? null : 'tokenUsage missing', + hasCostUsd ? 
null : 'costUsd missing' + ].filter(Boolean), + reasoning: 'Checked traceSummary fields' + })); + "`; + + const evaluator = new CodeEvaluator({ script }); + + const traceSummary: TraceSummary = { + eventCount: 3, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 2, Edit: 1 }, + errorCount: 0, + tokenUsage: { input: 1000, output: 500 }, + costUsd: 0.005, + durationMs: 2500, + }; + + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Test answer', + target: baseTarget, + attempt: 0, + promptInputs: { question: '', guidelines: '' }, + now: new Date(), + traceSummary, + }); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + expect(result.hits).toContain('eventCount present'); + expect(result.hits).toContain('tokenUsage present'); + expect(result.hits).toContain('costUsd present'); + }); + + it('handles missing traceSummary gracefully', async () => { + // Script that handles missing traceSummary + const script = `bun -e " + import fs from 'node:fs'; + const input = JSON.parse(fs.readFileSync(0, 'utf8')); + const hasSummary = input.traceSummary !== null && input.traceSummary !== undefined; + console.log(JSON.stringify({ + score: hasSummary ? 0 : 1, + hits: hasSummary ? [] : ['Correctly handled missing summary'], + misses: hasSummary ? ['Expected no summary'] : [], + reasoning: 'Checked for missing traceSummary' + })); + "`; + + const evaluator = new CodeEvaluator({ script }); + + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Test answer', + target: baseTarget, + attempt: 0, + promptInputs: { question: '', guidelines: '' }, + now: new Date(), + // No traceSummary provided + }); + + expect(result.score).toBe(1); + expect(result.hits).toContain('Correctly handled missing summary'); + }); +});
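
Note on trying the converted judges outside an AgentV run: a minimal driver along the lines below should work, but it is only a sketch. The payload fields follow the stdin contract documented in the showcase README, while the file name (`smoke-test.ts`), the payload values, and the working directory are illustrative assumptions rather than part of this patch.

```ts
// smoke-test.ts (illustrative): pipe a hand-built payload into one of the
// converted code judges and print the { score, hits, misses, reasoning } JSON
// it writes to stdout.
const payload = {
  question: 'Find the weather in Paris',
  expectedOutcome: 'Agent searches for current weather information',
  outputMessages: [
    { role: 'assistant', toolCalls: [{ tool: 'search', args: { query: 'weather Paris' } }] },
  ],
  traceSummary: {
    eventCount: 1,
    toolNames: ['search'],
    toolCallsByName: { search: 1 },
    errorCount: 0,
  },
};

// Assumes it is run from the repository root; adjust cwd/paths as needed.
const proc = Bun.spawn(['bun', 'run', 'scripts/tool-selection-judge.ts'], {
  cwd: 'examples/showcase/tool-evaluation-plugins',
  stdin: 'pipe',
  stdout: 'pipe',
});

proc.stdin.write(JSON.stringify(payload));
proc.stdin.end();

const raw = await new Response(proc.stdout).text();
await proc.exited;

console.log(JSON.parse(raw)); // expect { score, hits, misses, reasoning }
```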