From c6fe9a2084c73813e408c009798c09e078a91636 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 31 Dec 2025 13:48:36 +0000 Subject: [PATCH 1/5] docs: add tool evaluation proposals and plugin showcase --- AGENTS.md | 28 +++ .../tool-trajectory/tool-trajectory-demo.yaml | 69 ++++++ .../tool-evaluation-plugins/README.md | 119 ++++++++++ .../scripts/efficiency_scorer.py | 214 +++++++++++++++++ .../scripts/pairwise_tool_compare.py | 220 ++++++++++++++++++ .../scripts/tool_selection_judge.py | 166 +++++++++++++ .../tool-eval-demo.yaml | 131 +++++++++++ .../changes/add-execution-metrics/proposal.md | 81 +++++++ .../specs/evaluation/spec.md | 80 +++++++ .../changes/add-execution-metrics/tasks.md | 34 +++ .../proposal.md | 49 ++++ .../specs/evaluation/spec.md | 192 +++++++++++++++ .../add-trajectory-argument-matching/tasks.md | 22 ++ 13 files changed, 1405 insertions(+) create mode 100644 examples/showcase/tool-evaluation-plugins/README.md create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py create mode 100644 examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml create mode 100644 openspec/changes/add-execution-metrics/proposal.md create mode 100644 openspec/changes/add-execution-metrics/specs/evaluation/spec.md create mode 100644 openspec/changes/add-execution-metrics/tasks.md create mode 100644 openspec/changes/add-trajectory-argument-matching/proposal.md create mode 100644 openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md create mode 100644 openspec/changes/add-trajectory-argument-matching/tasks.md diff --git a/AGENTS.md b/AGENTS.md index 364f217f..57b09312 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -9,6 +9,34 @@ AgentV aims to provide a robust, declarative framework for evaluating AI agents. - **Multi-Objective Scoring**: Measure correctness, latency, cost, and safety in a single run. - **Optimization Ready**: Designed to support future automated hyperparameter tuning and candidate generation. +## IMPORTANT: Design Principles + +These principles guide all feature decisions. **Follow these when proposing or implementing changes.** + +### 1. Lightweight Core, Plugin Extensibility +AgentV's core should remain minimal. Complex or domain-specific logic belongs in plugins, not built-in features. + +**Extension points (prefer these over adding built-ins):** +- `code_judge` scripts for custom evaluation logic +- CLI wrappers that consume AgentV's JSON/JSONL output for post-processing (aggregation, comparison, reporting) + +**Ask yourself:** "Can this be achieved with existing primitives + a plugin or wrapper?" If yes, it should not be a built-in. + +### 2. Built-ins for Primitives Only +Built-in evaluators provide **universal primitives** that users compose. A primitive is: +- Stateless and deterministic +- Has a single, clear responsibility +- Cannot be trivially composed from other primitives +- Needed by the majority of users + +If a feature serves a niche use case or adds conditional logic, it belongs in a plugin. + +### 3. Align with Industry Standards +Before adding features, research how peer frameworks solve the problem. Prefer the **lowest common denominator** that covers most use cases. Novel features without industry precedent require strong justification and should default to plugin implementation. + +### 4. 
Non-Breaking Extensions +New fields should be optional. Existing configurations must continue working unchanged. + ## Tech Stack & Tools - **Language:** TypeScript 5.x targeting ES2022 - **Runtime:** Bun (use `bun` for all package and script operations) diff --git a/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml b/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml index 8941426a..ab16557a 100644 --- a/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml +++ b/examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml @@ -9,6 +9,11 @@ # - in_order: Validates tools appear in expected sequence (allows gaps) # - exact: Validates exact tool sequence match (no gaps, no extra tools) # +# Argument matching (PLANNED - see openspec/changes/add-trajectory-argument-matching/): +# - Exact: args: { key: "value" } - must match exactly (deep equality) +# - Skip: args: any - validate tool name only, ignore arguments +# Note: For pattern/regex matching, use a code_judge evaluator instead. +# # This demo uses a CLI provider (mock-agent.ts) that simulates an agent with tool usage. # The mock agent generates different traces based on the prompt content. # @@ -148,3 +153,67 @@ evalcases: knowledgeSearch: 1 # Present in research trace (will pass) documentRetrieve: 1 # Present in research trace (will pass) generateReport: 1 # NOT present (will fail - demonstrates partial scoring) + + # ========================================== + # PLANNED FEATURES - Argument Matching + # The examples below show the intended syntax for argument validation. + # See: openspec/changes/add-trajectory-argument-matching/ + # ========================================== + + # ========================================== + # Example 6: Exact argument matching + # Use case: Validate tool is called with specific argument values + # PLANNED - will not run until feature is implemented + # ========================================== + - id: exact-args-match + + expected_outcome: |- + Agent searches for weather and retrieves forecast for the correct city. + + input_messages: + - role: user + content: What's the weather like in Paris? + + execution: + evaluators: + - name: arg-validation + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: + query: "weather Paris" + - tool: get_weather + args: + location: "Paris" + + # ========================================== + # Example 7: Skip argument validation with `any` + # Use case: Validate tool sequence but ignore specific arguments + # PLANNED - will not run until feature is implemented + # ========================================== + - id: skip-args-validation + + expected_outcome: |- + Agent loads data, transforms it, and saves. Arguments don't matter. 
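+
+    # Note: omitting `args` on an expected item checks the tool name only,
+    # exactly like `args: any`; writing `any` just makes that intent explicit.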
+ + input_messages: + - role: user + content: Load customer data, normalize it, and save + + execution: + evaluators: + - name: workflow-sequence-only + type: tool_trajectory + mode: in_order + expected: + # Exact match: must use specific source + - tool: load_data + args: + source: "customers" + # Skip: any transformation is acceptable + - tool: transform + args: any + # Skip: we don't care about save arguments + - tool: save_data + args: any diff --git a/examples/showcase/tool-evaluation-plugins/README.md b/examples/showcase/tool-evaluation-plugins/README.md new file mode 100644 index 00000000..94350618 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/README.md @@ -0,0 +1,119 @@ +# Tool Evaluation Plugin Patterns + +This showcase demonstrates **plugin-based tool evaluation patterns** that complement AgentV's built-in `tool_trajectory` evaluator. These patterns are intentionally implemented as plugins (code judges) rather than built-ins because they involve domain-specific logic or semantic evaluation. + +## When to Use Plugins vs Built-ins + +| Pattern | Implementation | Reason | +|---------|----------------|--------| +| Tool name/sequence matching | Built-in (`tool_trajectory`) | Deterministic, reusable primitive | +| Argument matching | Built-in (planned) | Extension of existing primitive | +| Tool selection correctness | **Plugin** | Requires semantic judgment | +| Tool input appropriateness | **Plugin** | Domain-specific criteria | +| Tool output utilization | **Plugin** | Requires understanding tool purposes | +| Efficiency scoring | **Plugin** | Custom thresholds, domain-specific | +| Pairwise comparison | **Plugin** | Specialized evaluation pattern | + +## Plugin Examples + +### 1. Tool Selection Evaluator (`tool_selection_judge.py`) + +Evaluates whether the agent selected the **right tools** for the task. Uses LLM-as-judge pattern to semantically assess tool choices. + +```yaml +evaluators: + - name: tool-selection + type: code_judge + script: scripts/tool_selection_judge.py +``` + +### 2. Tool Input Validator (`tool_input_validator.ts`) + +Validates that tool **arguments are semantically appropriate** (not just syntactically correct). Checks if argument values make sense in context. + +```yaml +evaluators: + - name: input-validation + type: code_judge + script: scripts/tool_input_validator.ts +``` + +### 3. Tool Efficiency Scorer (`efficiency_scorer.py`) + +Computes efficiency metrics and scores based on configurable thresholds. Demonstrates how to use execution metrics in evaluation. + +```yaml +evaluators: + - name: efficiency + type: code_judge + script: scripts/efficiency_scorer.py +``` + +### 4. Pairwise Tool Comparison (`pairwise_tool_compare.py`) + +Compares two agent responses for tool usage quality with position bias mitigation (runs comparison twice with swapped order). + +```yaml +evaluators: + - name: pairwise-compare + type: code_judge + script: scripts/pairwise_tool_compare.py +``` + +## Running the Examples + +```bash +cd examples/showcase/tool-evaluation-plugins +npx agentv eval tool-eval-demo.yaml --target mock_agent +``` + +## Input Contract + +All code judges receive a JSON object on stdin with: + +```json +{ + "question": "User's question/task", + "expected_outcome": "Expected behavior description", + "candidate_answer": "Agent's final response", + "output_messages": [ + { + "role": "assistant", + "content": "...", + "toolCalls": [ + { "id": "...", "tool": "search", "args": { "query": "..." 
} } + ] + }, + { + "role": "tool", + "toolCallId": "...", + "toolName": "search", + "content": "Tool result..." + } + ], + "candidate_trace_summary": { + "eventCount": 5, + "toolNames": ["search", "fetch"], + "toolCallsByName": { "search": 2, "fetch": 1 }, + "errorCount": 0 + }, + "execution_metrics": { + "tokenUsage": { "input": 1000, "output": 500 }, + "durationMs": 3500, + "costUsd": 0.0015 + } +} +``` + +## Output Contract + +Code judges must output JSON with: + +```json +{ + "score": 0.85, + "hits": ["Used appropriate search tool", "Validated input before fetch"], + "misses": ["Redundant search call"], + "reasoning": "Agent demonstrated good tool selection with minor inefficiency" +} +``` diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py b/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py new file mode 100644 index 00000000..15c68ecc --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +Tool Efficiency Scorer - Code Judge Plugin + +Evaluates agent efficiency based on execution metrics: +- Token usage relative to task complexity +- Number of tool calls (redundancy detection) +- Exploration ratio (read-only vs action tools) +- Cost efficiency + +Why this is a plugin (not built-in): +- Efficiency thresholds are domain-specific +- What's "efficient" depends on the task type +- Different projects have different cost/performance tradeoffs + +Usage in eval YAML: + evaluators: + - name: efficiency + type: code_judge + script: scripts/efficiency_scorer.py + +Input (stdin JSON): + - candidate_trace_summary: Tool call statistics + - execution_metrics: Token usage, cost, duration (if available) + - expected_outcome: Task description (for complexity estimation) + +Output (stdout JSON): + - score: 0.0-1.0 efficiency score + - hits: Efficiency wins + - misses: Efficiency issues + - reasoning: Explanation +""" + +import json +import sys +from typing import Any + + +# Configurable thresholds (customize for your domain) +THRESHOLDS = { + # Maximum tool calls before penalty + "max_tool_calls": 10, + # Ideal exploration ratio (read-only tools / total) + "target_exploration_ratio": 0.6, + "exploration_tolerance": 0.2, + # Token budgets + "max_tokens_simple": 2000, + "max_tokens_complex": 10000, + # Cost thresholds (USD) + "max_cost_simple": 0.01, + "max_cost_complex": 0.10, +} + +# Tools considered "exploration" (read-only) +EXPLORATION_TOOLS = { + "read", "grep", "glob", "search", "list", "find", + "get", "fetch", "query", "inspect", "view", +} + + +def estimate_task_complexity(expected_outcome: str) -> str: + """Estimate task complexity from expected outcome description.""" + text = expected_outcome.lower() + complex_indicators = [ + "multiple", "several", "comprehensive", "thorough", + "analyze", "compare", "synthesize", "integrate", + ] + if any(indicator in text for indicator in complex_indicators): + return "complex" + return "simple" + + +def calculate_exploration_ratio(trace_summary: dict) -> float: + """Calculate ratio of exploration tools to total tools.""" + tool_calls = trace_summary.get("toolCallsByName", {}) + total = sum(tool_calls.values()) + if total == 0: + return 0.0 + + exploration_count = sum( + count for tool, count in tool_calls.items() + if any(exp in tool.lower() for exp in EXPLORATION_TOOLS) + ) + return exploration_count / total + + +def evaluate_efficiency( + trace_summary: dict | None, + execution_metrics: dict | None, + expected_outcome: 
str, +) -> dict[str, Any]: + """Evaluate agent efficiency against configurable thresholds.""" + hits = [] + misses = [] + scores = [] + + complexity = estimate_task_complexity(expected_outcome) + + # 1. Tool call count evaluation + if trace_summary: + tool_count = trace_summary.get("eventCount", 0) + max_calls = THRESHOLDS["max_tool_calls"] + + if tool_count <= max_calls: + hits.append(f"Tool calls ({tool_count}) within budget ({max_calls})") + scores.append(1.0) + else: + penalty = min((tool_count - max_calls) / max_calls, 1.0) + scores.append(1.0 - penalty) + misses.append(f"Excessive tool calls: {tool_count} (budget: {max_calls})") + + # 2. Exploration ratio evaluation + exp_ratio = calculate_exploration_ratio(trace_summary) + target = THRESHOLDS["target_exploration_ratio"] + tolerance = THRESHOLDS["exploration_tolerance"] + + if abs(exp_ratio - target) <= tolerance: + hits.append(f"Good exploration ratio: {exp_ratio:.2f}") + scores.append(1.0) + elif exp_ratio < target - tolerance: + scores.append(0.7) + misses.append(f"Low exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") + else: + scores.append(0.7) + misses.append(f"High exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") + + # 3. Token usage evaluation + if execution_metrics and "tokenUsage" in execution_metrics: + tokens = execution_metrics["tokenUsage"] + total_tokens = tokens.get("input", 0) + tokens.get("output", 0) + max_tokens = ( + THRESHOLDS["max_tokens_complex"] + if complexity == "complex" + else THRESHOLDS["max_tokens_simple"] + ) + + if total_tokens <= max_tokens: + hits.append(f"Token usage ({total_tokens}) within budget") + scores.append(1.0) + else: + penalty = min((total_tokens - max_tokens) / max_tokens, 1.0) + scores.append(1.0 - penalty * 0.5) # Softer penalty + misses.append(f"High token usage: {total_tokens} (budget: {max_tokens})") + + # 4. Cost evaluation + if execution_metrics and "costUsd" in execution_metrics: + cost = execution_metrics["costUsd"] + max_cost = ( + THRESHOLDS["max_cost_complex"] + if complexity == "complex" + else THRESHOLDS["max_cost_simple"] + ) + + if cost <= max_cost: + hits.append(f"Cost (${cost:.4f}) within budget") + scores.append(1.0) + else: + scores.append(0.5) + misses.append(f"High cost: ${cost:.4f} (budget: ${max_cost:.4f})") + + # Calculate final score + if not scores: + return { + "score": 0.5, + "hits": ["No efficiency metrics available"], + "misses": [], + "reasoning": "Could not evaluate efficiency - no metrics provided", + } + + final_score = sum(scores) / len(scores) + + reasoning = ( + f"Task complexity: {complexity}. " + f"Evaluated {len(scores)} efficiency criteria. 
" + f"Score: {final_score:.2f}" + ) + + return { + "score": round(final_score, 2), + "hits": hits[:4], + "misses": misses[:4], + "reasoning": reasoning, + } + + +def main(): + try: + input_data = json.loads(sys.stdin.read()) + + trace_summary = input_data.get("candidate_trace_summary") + execution_metrics = input_data.get("execution_metrics") + expected_outcome = input_data.get("expected_outcome", "") + + result = evaluate_efficiency( + trace_summary=trace_summary, + execution_metrics=execution_metrics, + expected_outcome=expected_outcome, + ) + + print(json.dumps(result, indent=2)) + + except Exception as e: + error_result = { + "score": 0.0, + "hits": [], + "misses": [f"Evaluator error: {str(e)}"], + "reasoning": f"Evaluation failed: {str(e)}", + } + print(json.dumps(error_result, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py b/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py new file mode 100644 index 00000000..e0bc842c --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +""" +Pairwise Tool Comparison - Code Judge Plugin + +Compares tool usage quality between two agent responses with +position bias mitigation (runs comparison twice with swapped order). + +Why this is a plugin (not built-in): +- Pairwise comparison is a specialized evaluation pattern +- Requires reference response (not always available) +- Position bias mitigation adds complexity +- Not all evaluations need comparative assessment + +Usage in eval YAML: + evaluators: + - name: pairwise-compare + type: code_judge + script: scripts/pairwise_tool_compare.py + +Input (stdin JSON): + - candidate_answer: Agent's response (Response A) + - reference_answer: Reference/baseline response (Response B) + - output_messages: Tool calls from candidate + - expected_outcome: Task description + +Output (stdout JSON): + - score: 0.0-1.0 (1.0 = candidate wins, 0.5 = tie, 0.0 = reference wins) + - hits: Candidate advantages + - misses: Reference advantages + - reasoning: Comparison explanation with bias check result +""" + +import json +import sys +from typing import Any + + +def extract_tool_summary(messages: list[dict] | None) -> dict: + """Extract tool usage summary from messages.""" + if not messages: + return {"tools": [], "count": 0} + + tools = [] + for msg in messages: + if msg.get("role") == "assistant" and msg.get("toolCalls"): + for call in msg["toolCalls"]: + tools.append(call.get("tool", "unknown")) + + return { + "tools": tools, + "count": len(tools), + "unique": list(set(tools)), + } + + +def compare_responses( + response_a: str, + response_b: str, + tools_a: dict, + tools_b: dict, + task: str, +) -> dict[str, Any]: + """ + Compare two responses for tool usage quality. + Returns winner and reasoning. + """ + a_advantages = [] + b_advantages = [] + + # 1. Compare tool count efficiency + if tools_a["count"] < tools_b["count"] and tools_a["count"] > 0: + a_advantages.append(f"More efficient: {tools_a['count']} vs {tools_b['count']} tools") + elif tools_b["count"] < tools_a["count"] and tools_b["count"] > 0: + b_advantages.append(f"More efficient: {tools_b['count']} vs {tools_a['count']} tools") + + # 2. 
Compare tool diversity + if len(tools_a["unique"]) > len(tools_b["unique"]): + a_advantages.append(f"More diverse tools: {len(tools_a['unique'])} types") + elif len(tools_b["unique"]) > len(tools_a["unique"]): + b_advantages.append(f"More diverse tools: {len(tools_b['unique'])} types") + + # 3. Compare response length (proxy for completeness) + len_a, len_b = len(response_a), len(response_b) + if len_a > len_b * 1.2: + a_advantages.append("More comprehensive response") + elif len_b > len_a * 1.2: + b_advantages.append("More comprehensive response") + + # 4. Check for no tools (penalty) + if tools_a["count"] == 0 and tools_b["count"] > 0: + b_advantages.append("Response B used tools; A did not") + elif tools_b["count"] == 0 and tools_a["count"] > 0: + a_advantages.append("Response A used tools; B did not") + + # Determine winner + a_score = len(a_advantages) + b_score = len(b_advantages) + + if a_score > b_score: + return {"winner": "A", "a_advantages": a_advantages, "b_advantages": b_advantages} + elif b_score > a_score: + return {"winner": "B", "a_advantages": a_advantages, "b_advantages": b_advantages} + else: + return {"winner": "TIE", "a_advantages": a_advantages, "b_advantages": b_advantages} + + +def pairwise_with_bias_mitigation( + candidate: str, + reference: str, + candidate_tools: dict, + reference_tools: dict, + task: str, +) -> dict[str, Any]: + """ + Run pairwise comparison twice with position swap to mitigate bias. + """ + # Pass 1: Candidate as A, Reference as B + pass1 = compare_responses( + candidate, reference, candidate_tools, reference_tools, task + ) + + # Pass 2: Reference as A, Candidate as B (swapped) + pass2 = compare_responses( + reference, candidate, reference_tools, candidate_tools, task + ) + + # Map pass2 result back (if A wins in pass2, that means Reference won) + pass2_mapped = { + "A": "B", # A in pass2 = Reference = B in pass1 terms + "B": "A", # B in pass2 = Candidate = A in pass1 terms + "TIE": "TIE", + }.get(pass2["winner"], "TIE") + + # Check consistency + consistent = pass1["winner"] == pass2_mapped + + if consistent: + final_winner = pass1["winner"] + confidence = "high" + else: + # Inconsistent results indicate position bias - return TIE + final_winner = "TIE" + confidence = "low (position bias detected)" + + # Convert to score (candidate perspective) + if final_winner == "A": # Candidate wins + score = 1.0 + elif final_winner == "B": # Reference wins + score = 0.0 + else: # TIE + score = 0.5 + + hits = pass1["a_advantages"][:4] # Candidate advantages + misses = pass1["b_advantages"][:4] # Reference advantages + + reasoning = ( + f"Pass 1: {pass1['winner']} wins. " + f"Pass 2 (swapped): {pass2['winner']} wins (maps to {pass2_mapped}). " + f"Consistency: {consistent}. 
" + f"Final: {final_winner} ({confidence} confidence)" + ) + + return { + "score": score, + "hits": hits, + "misses": misses, + "reasoning": reasoning, + } + + +def main(): + try: + input_data = json.loads(sys.stdin.read()) + + candidate = input_data.get("candidate_answer", "") + reference = input_data.get("reference_answer", "") + output_messages = input_data.get("output_messages", []) + task = input_data.get("expected_outcome", "") + + # If no reference, we can't do pairwise comparison + if not reference: + print(json.dumps({ + "score": 0.5, + "hits": ["Candidate response provided"], + "misses": ["No reference for comparison"], + "reasoning": "Pairwise comparison requires reference_answer field", + }, indent=2)) + return + + # Extract tool summaries + candidate_tools = extract_tool_summary(output_messages) + + # For reference, we'd need reference_output_messages + # In practice, this would come from a baseline run + reference_messages = input_data.get("reference_output_messages", []) + reference_tools = extract_tool_summary(reference_messages) + + result = pairwise_with_bias_mitigation( + candidate=candidate, + reference=reference, + candidate_tools=candidate_tools, + reference_tools=reference_tools, + task=task, + ) + + print(json.dumps(result, indent=2)) + + except Exception as e: + error_result = { + "score": 0.0, + "hits": [], + "misses": [f"Evaluator error: {str(e)}"], + "reasoning": f"Evaluation failed: {str(e)}", + } + print(json.dumps(error_result, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py b/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py new file mode 100644 index 00000000..18f8f560 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Tool Selection Evaluator - Code Judge Plugin + +Evaluates whether the agent selected the RIGHT tools for the task. +This is a semantic evaluation that requires understanding task requirements +and matching them against available tools. 
+ +Why this is a plugin (not built-in): +- Requires domain-specific knowledge of what tools are "appropriate" +- Involves semantic judgment, not just pattern matching +- Different projects have different tool selection criteria + +Usage in eval YAML: + evaluators: + - name: tool-selection + type: code_judge + script: scripts/tool_selection_judge.py + +Input (stdin JSON): + - question: The user's task/question + - expected_outcome: Description of expected behavior + - output_messages: Array of messages including tool calls + - candidate_trace_summary: Summary of tool usage + +Output (stdout JSON): + - score: 0.0-1.0 (1.0 = all tools appropriate, 0.0 = all inappropriate) + - hits: List of appropriate tool selections + - misses: List of missing or inappropriate tools + - reasoning: Explanation of the evaluation +""" + +import json +import sys +from typing import Any + + +def extract_tool_calls(messages: list[dict]) -> list[dict]: + """Extract all tool calls from output messages.""" + tool_calls = [] + for msg in messages: + if msg.get("role") == "assistant" and msg.get("toolCalls"): + for call in msg["toolCalls"]: + tool_calls.append({ + "tool": call.get("tool"), + "args": call.get("args", {}), + }) + return tool_calls + + +def evaluate_tool_selection( + question: str, + expected_outcome: str, + tool_calls: list[dict], + trace_summary: dict | None, +) -> dict[str, Any]: + """ + Evaluate tool selection based on task requirements. + + This is a simplified heuristic-based evaluation. + For production use, you might: + 1. Use an LLM to judge appropriateness + 2. Define explicit tool-to-task mappings + 3. Use a decision tree based on task classification + """ + hits = [] + misses = [] + + # Extract keywords from question and expected outcome + task_text = f"{question} {expected_outcome}".lower() + + # Define tool-to-task mappings (customize for your domain) + tool_task_mappings = { + "search": ["find", "search", "look", "query", "discover"], + "fetch": ["get", "retrieve", "fetch", "download", "load"], + "read": ["read", "open", "view", "examine", "inspect"], + "write": ["write", "save", "create", "output", "generate"], + "analyze": ["analyze", "process", "compute", "calculate"], + "validate": ["check", "validate", "verify", "confirm"], + } + + # Determine expected tools based on task keywords + expected_tools = set() + for tool, keywords in tool_task_mappings.items(): + if any(kw in task_text for kw in keywords): + expected_tools.add(tool) + + # Get actual tools used + actual_tools = set(call["tool"] for call in tool_calls) + + # Evaluate selection + if not tool_calls: + return { + "score": 0.0, + "hits": [], + "misses": ["No tools were called"], + "reasoning": "Agent did not use any tools. 
Expected at least some tool usage.", + } + + # Check for appropriate selections + for tool in actual_tools: + tool_lower = tool.lower() + is_relevant = any( + tool_lower in expected or expected in tool_lower + for expected in expected_tools + ) + if is_relevant or not expected_tools: + hits.append(f"Tool '{tool}' appears relevant to task") + else: + misses.append(f"Tool '{tool}' may not be needed for this task") + + # Check for missing expected tools + for expected in expected_tools: + if not any(expected in t.lower() for t in actual_tools): + misses.append(f"Expected a '{expected}'-type tool but none used") + + # Calculate score + total_checks = len(hits) + len(misses) + score = len(hits) / total_checks if total_checks > 0 else 0.5 + + reasoning = ( + f"Evaluated {len(actual_tools)} tool(s) against task requirements. " + f"{len(hits)} appropriate, {len(misses)} issues found." + ) + + return { + "score": round(score, 2), + "hits": hits[:4], # Cap at 4 per contract + "misses": misses[:4], + "reasoning": reasoning, + } + + +def main(): + try: + input_data = json.loads(sys.stdin.read()) + + question = input_data.get("question", "") + expected_outcome = input_data.get("expected_outcome", "") + output_messages = input_data.get("output_messages", []) + trace_summary = input_data.get("candidate_trace_summary") + + tool_calls = extract_tool_calls(output_messages) + + result = evaluate_tool_selection( + question=question, + expected_outcome=expected_outcome, + tool_calls=tool_calls, + trace_summary=trace_summary, + ) + + print(json.dumps(result, indent=2)) + + except Exception as e: + error_result = { + "score": 0.0, + "hits": [], + "misses": [f"Evaluator error: {str(e)}"], + "reasoning": f"Evaluation failed: {str(e)}", + } + print(json.dumps(error_result, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml new file mode 100644 index 00000000..0b830617 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml @@ -0,0 +1,131 @@ +# Tool Evaluation Plugins Demo +# Demonstrates plugin-based (code judge) tool evaluation patterns +# +# These patterns complement the built-in tool_trajectory evaluator with +# semantic evaluation capabilities that require domain-specific logic. +# +# Run: cd examples/showcase/tool-evaluation-plugins +# npx agentv eval tool-eval-demo.yaml --target mock_agent + +$schema: agentv-eval-v2 +description: Showcase of tool evaluation plugin patterns + +# Use mock_agent target (configure in .agentv/targets.yaml) +target: mock_agent + +evalcases: + # ========================================== + # Example 1: Tool Selection Evaluation + # Use case: Verify agent chose appropriate tools for the task + # ========================================== + - id: tool-selection-demo + + expected_outcome: |- + Agent should search for relevant information and fetch data from APIs. + Uses search and fetch tools appropriately for the research task. + + input_messages: + - role: user + content: Find information about the current weather in Tokyo and fetch the detailed forecast. 
+ + execution: + evaluators: + # Built-in: Check minimum tool calls + - name: trajectory-check + type: tool_trajectory + mode: any_order + minimums: + search: 1 + fetch: 1 + + # Plugin: Semantic tool selection evaluation + - name: selection-quality + type: code_judge + script: scripts/tool_selection_judge.py + + # ========================================== + # Example 2: Efficiency Scoring + # Use case: Evaluate resource efficiency of agent execution + # ========================================== + - id: efficiency-demo + + expected_outcome: |- + Agent efficiently processes the request with minimal redundant operations. + Simple task requiring straightforward tool usage. + + input_messages: + - role: user + content: Get the current time. + + execution: + evaluators: + # Plugin: Efficiency metrics scoring + - name: efficiency-check + type: code_judge + script: scripts/efficiency_scorer.py + + # ========================================== + # Example 3: Combined Built-in + Plugin Evaluation + # Use case: Comprehensive tool usage assessment + # ========================================== + - id: combined-evaluation + + expected_outcome: |- + Agent performs comprehensive data analysis: + 1. Search multiple sources + 2. Validate data quality + 3. Process and transform results + 4. Output formatted report + + input_messages: + - role: user + content: Analyze the quarterly sales data and generate a summary report. + + execution: + evaluators: + # Built-in: Verify required workflow sequence + - name: workflow-trajectory + type: tool_trajectory + mode: in_order + expected: + - tool: search + - tool: validate + - tool: process + + # Plugin: Check if tools were appropriate choices + - name: selection-check + type: code_judge + script: scripts/tool_selection_judge.py + + # Plugin: Evaluate efficiency + - name: efficiency + type: code_judge + script: scripts/efficiency_scorer.py + + # ========================================== + # Example 4: Pairwise Comparison + # Use case: Compare candidate against baseline response + # Requires reference_answer field + # ========================================== + - id: pairwise-demo + + expected_outcome: |- + Agent should retrieve and summarize the document efficiently. + + input_messages: + - role: user + content: Summarize the main points of the user manual. + + # Reference answer for comparison (from a baseline agent) + reference_answer: |- + Here is a summary of the user manual: + 1. Installation: Follow the setup wizard + 2. Configuration: Edit settings.json + 3. Usage: Run the main command + + execution: + evaluators: + # Plugin: Pairwise comparison with position bias mitigation + - name: pairwise-quality + type: code_judge + script: scripts/pairwise_tool_compare.py diff --git a/openspec/changes/add-execution-metrics/proposal.md b/openspec/changes/add-execution-metrics/proposal.md new file mode 100644 index 00000000..669d0385 --- /dev/null +++ b/openspec/changes/add-execution-metrics/proposal.md @@ -0,0 +1,81 @@ +# Change: Add Extended Execution Metrics + +## Why + +Tracking **how** agents work is as important as **what** they produce. Currently, AgentV's `TraceSummary` only captures basic tool call counts. Extended metrics like token usage, cost, duration, and efficiency ratios provide valuable signals for: + +1. Cost optimization (tokens, API costs) +2. Efficiency analysis (tokens per tool, exploration ratio) +3. Performance tracking (duration, tool latency) + +This is infrastructure/data collection - not domain logic. 
Providers optionally report metrics; the framework aggregates them. + +## What Changes + +- Extend `TraceSummary` to `ExecutionMetrics` with optional fields for token usage, cost, duration +- Add computed metrics: `explorationRatio`, `tokensPerTool` +- Make metrics available to evaluators and output writers +- Add example demonstrating metrics-based evaluation + +## Impact + +- Affected specs: `evaluation` +- Affected code: `packages/core/src/evaluation/trace.ts`, provider types +- Non-breaking: all new fields are optional; existing traces work unchanged + +## Implementation Notes + +### Data Model +Extend `TraceSummary` (don't create a separate type): +```typescript +// In packages/core/src/evaluation/trace.ts +export interface TraceSummary { + // Existing fields + eventCount: number; + toolNames: string[]; + toolCallsByName: Record; + errorCount: number; + + // NEW optional fields + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; + toolDurations?: Record; +} +``` + +### Provider Response +Extend `ProviderResponse` in `packages/core/src/evaluation/providers/types.ts`: +```typescript +export interface ProviderResponse { + // Existing fields... + + // NEW optional metrics (providers report what they can) + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; +} +``` + +### Computed Metrics +Add computation functions in `trace.ts`: +```typescript +// Default exploration tools (can be overridden per-eval via config) +const DEFAULT_EXPLORATION_TOOLS = ['read', 'grep', 'glob', 'search', 'list']; + +export function computeExplorationRatio( + summary: TraceSummary, + explorationTools: string[] = DEFAULT_EXPLORATION_TOOLS +): number | undefined { + if (summary.eventCount === 0) return undefined; + const explorationCalls = explorationTools.reduce( + (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0), 0 + ); + return explorationCalls / summary.eventCount; +} +``` + +### Integration Points +1. **EvaluationContext**: Add `executionMetrics?: TraceSummary` (already has `traceSummary`) +2. **Code judge stdin**: Include metrics in the JSON passed to scripts +3. **JSONL output**: Add `execution_metrics` field to result objects diff --git a/openspec/changes/add-execution-metrics/specs/evaluation/spec.md b/openspec/changes/add-execution-metrics/specs/evaluation/spec.md new file mode 100644 index 00000000..5f5f00b7 --- /dev/null +++ b/openspec/changes/add-execution-metrics/specs/evaluation/spec.md @@ -0,0 +1,80 @@ +## ADDED Requirements + +### Requirement: Extended Execution Metrics + +The system SHALL capture extended execution metrics from providers and make them available to evaluators. + +#### Scenario: Provider reports token usage +- **GIVEN** a provider invocation completes successfully +- **AND** the provider response includes token usage data +- **WHEN** the trace is processed +- **THEN** `execution_metrics.tokenUsage` contains `{ input, output, cached? 
}` +- **AND** the metrics are available to evaluators via `context.executionMetrics` + +#### Scenario: Provider reports cost +- **GIVEN** a provider invocation completes successfully +- **AND** the provider response includes cost data +- **WHEN** the trace is processed +- **THEN** `execution_metrics.costUsd` contains the reported cost +- **AND** the cost is included in evaluation results + +#### Scenario: Provider reports duration +- **GIVEN** a provider invocation completes successfully +- **WHEN** the trace is processed +- **THEN** `execution_metrics.durationMs` contains the total execution time +- **AND** if individual tool durations are available, `execution_metrics.toolDurations` maps tool names to duration arrays + +#### Scenario: Metrics not available +- **GIVEN** a provider invocation completes successfully +- **AND** the provider does not report metrics +- **WHEN** the trace is processed +- **THEN** `execution_metrics` fields are `undefined` or omitted +- **AND** evaluation proceeds normally without metrics + +#### Scenario: Computed exploration ratio +- **GIVEN** execution metrics with tool call data +- **AND** a configured list of exploration tools (e.g., `["read", "grep", "glob", "search"]`) +- **WHEN** `explorationRatio` is computed +- **THEN** the ratio equals `explorationToolCalls / totalToolCalls` +- **AND** the ratio is between 0.0 and 1.0 + +#### Scenario: Computed tokens per tool +- **GIVEN** execution metrics with `tokenUsage.output` and `toolCallCount` +- **WHEN** `tokensPerTool` is computed +- **THEN** the value equals `tokenUsage.output / toolCallCount` +- **AND** returns `undefined` if tool call count is zero + +#### Scenario: Code judge receives metrics +- **GIVEN** an eval case with a `code_judge` evaluator +- **AND** the provider reported execution metrics +- **WHEN** the code judge script is invoked +- **THEN** the stdin JSON includes `execution_metrics` with available fields +- **AND** the script can use metrics for scoring decisions + +#### Scenario: Metrics in evaluation results +- **GIVEN** an evaluation completes with execution metrics +- **WHEN** results are written to JSONL output +- **THEN** each result includes `execution_metrics` object with available fields +- **AND** undefined fields are omitted from output + +### Requirement: Execution Metrics Data Model + +The system SHALL define a structured data model for execution metrics. + +#### Scenario: Token usage structure +- **GIVEN** a provider reports token usage +- **WHEN** the data is captured +- **THEN** `tokenUsage` has required fields `input: number` and `output: number` +- **AND** optional field `cached?: number` for cache-hit tokens + +#### Scenario: Tool durations structure +- **GIVEN** a provider reports individual tool timing +- **WHEN** the data is captured +- **THEN** `toolDurations` is a map of `{ [toolName: string]: number[] }` +- **AND** each array contains durations in milliseconds for each invocation of that tool + +#### Scenario: Metrics schema validation +- **GIVEN** a provider returns metrics data +- **WHEN** the data is validated +- **THEN** numeric fields are non-negative +- **AND** invalid data is logged and omitted rather than causing failure diff --git a/openspec/changes/add-execution-metrics/tasks.md b/openspec/changes/add-execution-metrics/tasks.md new file mode 100644 index 00000000..aa80072a --- /dev/null +++ b/openspec/changes/add-execution-metrics/tasks.md @@ -0,0 +1,34 @@ +## 1. 
Data Model + +- [ ] 1.1 Extend `TraceSummary` type with optional `tokenUsage` field +- [ ] 1.2 Add optional `costUsd` field to trace +- [ ] 1.3 Add optional `durationMs` field to trace +- [ ] 1.4 Add optional `toolDurations` map (tool name -> duration array) + +## 2. Computed Metrics + +- [ ] 2.1 Implement `explorationRatio` computation (configurable exploration tool list) +- [ ] 2.2 Implement `tokensPerTool` computation +- [ ] 2.3 Add `avgToolDurationMs` computation + +## 3. Provider Integration + +- [ ] 3.1 Define provider metric reporting interface +- [ ] 3.2 Update CLI provider to report duration metrics +- [ ] 3.3 Document metric reporting for custom providers + +## 4. Output & Evaluation + +- [ ] 4.1 Include metrics in evaluation results JSON +- [ ] 4.2 Make metrics available to code judges via stdin +- [ ] 4.3 Add metrics to JSONL output format + +## 5. Examples & Documentation + +- [ ] 5.1 Add metrics evaluation example to `examples/features/` +- [ ] 5.2 Create code judge example that uses metrics + +## 6. Testing + +- [ ] 6.1 Unit tests for metric computation +- [ ] 6.2 Integration test with metric-aware code judge diff --git a/openspec/changes/add-trajectory-argument-matching/proposal.md b/openspec/changes/add-trajectory-argument-matching/proposal.md new file mode 100644 index 00000000..c00d84a3 --- /dev/null +++ b/openspec/changes/add-trajectory-argument-matching/proposal.md @@ -0,0 +1,49 @@ +# Change: Add Argument Matching to Tool Trajectory Evaluator + +## Why + +The current `tool_trajectory` evaluator only validates tool **names**, not their **arguments**. Argument validation is a core primitive for tool use evaluation. Without it, users cannot verify that agents pass correct parameters to tools. + +This is a lightweight extension to an existing primitive - not domain logic. It aligns with Google ADK's trajectory evaluator which supports exact argument matching in EXACT mode. + +## What Changes + +- Extend `tool_trajectory` evaluator to support optional `args` matching in `expected` items +- Support two argument matching modes: **exact** (deep equality) and **skip** (`any`) +- Add examples demonstrating argument matching + +**Note:** Pattern/regex matching is intentionally excluded - use `code_judge` for complex validation logic. See AGENTS.md "Design Principles" for rationale. + +## Impact + +- Affected specs: `evaluation` +- Affected code: `packages/core/src/evaluation/evaluators.ts` (ToolTrajectoryEvaluator) +- Non-breaking: existing configs without `args` continue to work unchanged + +## Implementation Notes + +### Data Source +Tool arguments are already available in `ToolCall.input` (see `packages/core/src/evaluation/providers/types.ts`). 
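+Each `ToolCall` already carries the tool name in `tool` and the raw argument object in `input`, so no new provider plumbing is required.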
+Currently, `extractToolCallsFromMessages()` discards this - change to preserve it: +```typescript +// Current (discards args): +toolCalls.push({ name: call.tool }); + +// New (preserves args): +toolCalls.push({ name: call.tool, args: call.input }); +``` + +### Type Definition +Extend `ToolTrajectoryExpectedItem` in `trace.ts`: +```typescript +interface ToolTrajectoryExpectedItem { + tool: string; + args?: 'any' | Record; // NEW +} +``` + +### Matching Semantics +- `args: any` → skip argument validation entirely +- `args: { key: value }` → partial match (only validate specified keys, use deep equality) +- If tool name matches but args don't → **full miss** (score 0 for that expected item) +- Use deep equality for nested objects diff --git a/openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md b/openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md new file mode 100644 index 00000000..d890474c --- /dev/null +++ b/openspec/changes/add-trajectory-argument-matching/specs/evaluation/spec.md @@ -0,0 +1,192 @@ +## MODIFIED Requirements + +### Requirement: Tool Trajectory Evaluator + +The system SHALL provide a built-in `tool_trajectory` evaluator that asserts tool-call constraints, including optional argument validation. + +#### Scenario: Minimum calls met - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: any_order + minimums: + semanticSearch: 3 + ``` +- **AND** trace summary `toolCallsByName: { "semanticSearch": 3 }` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` +- **AND** `hits` includes a message like `"semanticSearch called 3 times (minimum: 3)"` + +#### Scenario: Minimum calls not met - FAIL +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: any_order + minimums: + semanticSearch: 3 + ``` +- **AND** trace summary `toolCallsByName: { "semanticSearch": 1 }` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` includes a message like `"semanticSearch called 1 time (minimum: 3)"` + +#### Scenario: Multiple minimums - partial pass +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: any_order + minimums: + toolA: 2 + toolB: 2 + ``` +- **AND** trace summary `toolCallsByName: { "toolA": 2, "toolB": 1 }` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.5` (1 of 2 constraints met) +- **AND** `hits` includes message for toolA +- **AND** `misses` includes message for toolB + +#### Scenario: In-order sequence - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: A + - tool: B + - tool: C + ``` +- **AND** trace contains tool calls in order `[A, X, B, Y, C]` (extra tools allowed) +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: In-order sequence - FAIL (wrong order) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: A + - tool: B + ``` +- **AND** trace contains tool calls in order `[B, A]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` explains the order mismatch + +#### Scenario: Exact sequence - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: exact + expected: + - tool: A + - tool: B + ``` +- **AND** trace contains exactly tool calls `[A, B]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: Exact sequence - 
FAIL (extra tools) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: exact + expected: + - tool: A + - tool: B + ``` +- **AND** trace contains tool calls `[A, B, C]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` explains the extra tool + +#### Scenario: No trace available +- **GIVEN** an eval case with a `tool_trajectory` evaluator +- **AND** the provider did not return a trace +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` includes `"No trace available for evaluation"` + +#### Scenario: In-order with exact argument matching - PASS +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: + query: "weather forecast" + - tool: get_weather + args: + location: "Paris" + ``` +- **AND** trace contains tool calls `[search(query="weather forecast"), get_weather(location="Paris")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: In-order with exact argument matching - FAIL (wrong args) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: + query: "weather forecast" + ``` +- **AND** trace contains tool calls `[search(query="stock prices")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 0.0` +- **AND** `misses` explains the argument mismatch + +#### Scenario: Argument matching with `any` skip mode +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: search + args: any + - tool: process + args: + format: "json" + ``` +- **AND** trace contains tool calls `[search(query="anything"), process(format="json")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` +- **AND** the `search` tool's arguments are not validated +- **AND** the `process` tool's `format` argument is validated + +#### Scenario: Exact mode with argument matching +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: exact + expected: + - tool: auth + args: + method: "oauth" + - tool: fetch + args: + endpoint: "/api/users" + ``` +- **AND** trace contains exactly `[auth(method="oauth"), fetch(endpoint="/api/users")]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` + +#### Scenario: Partial argument matching (subset validation) +- **GIVEN** an eval case with evaluator: + ```yaml + type: tool_trajectory + mode: in_order + expected: + - tool: api_call + args: + method: "POST" + # url not specified - not validated + ``` +- **AND** trace contains tool calls `[api_call(method="POST", url="https://example.com", headers={})]` +- **WHEN** the evaluator runs +- **THEN** it returns `score: 1.0` +- **AND** only the specified `method` argument is validated +- **AND** extra arguments `url` and `headers` are ignored diff --git a/openspec/changes/add-trajectory-argument-matching/tasks.md b/openspec/changes/add-trajectory-argument-matching/tasks.md new file mode 100644 index 00000000..5669ba16 --- /dev/null +++ b/openspec/changes/add-trajectory-argument-matching/tasks.md @@ -0,0 +1,22 @@ +## 1. 
Implementation + +- [ ] 1.1 Extend `ToolTrajectoryExpectedItem` type to include optional `args` field +- [ ] 1.2 Implement exact argument matching (deep equality) +- [ ] 1.3 Implement `any` mode (skip argument validation) +- [ ] 1.4 Update `evaluateInOrder` to check arguments +- [ ] 1.5 Update `evaluateExact` to check arguments +- [ ] 1.6 Update `extractToolCallsFromMessages` to preserve `ToolCall.input` + +## 2. Schema & Validation + +- [ ] 2.1 Update YAML schema for `expected[].args` field + +## 3. Examples & Documentation + +- [x] 3.1 Add argument matching examples to `examples/features/evals/tool-trajectory/tool-trajectory-demo.yaml` + +## 4. Testing + +- [ ] 4.1 Unit tests for exact argument matching +- [ ] 4.2 Unit tests for `any` mode +- [ ] 4.3 Integration tests with mock agent From 362c22a444ef6fd103632d509ab1892bc9808027 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 09:45:08 +0000 Subject: [PATCH 2/5] feat(core): add execution metrics and trajectory argument matching Execution metrics: - Extend TraceSummary with tokenUsage, costUsd, durationMs, toolDurations - Add helper functions: explorationRatio, tokensPerTool, avgToolDurationMs - Extend ProviderResponse with metrics fields - Orchestrator merges provider metrics into TraceSummary Trajectory argument matching: - Add args field to ToolTrajectoryExpectedItem ('any' or partial object) - Implement deep equality matching for tool arguments - Update evaluateInOrder and evaluateExact to check args - Parse args field from YAML in evaluator-parser --- apps/cli/package.json | 5 +- .../changes/add-execution-metrics/proposal.md | 4 +- .../changes/add-execution-metrics/tasks.md | 24 +- .../add-trajectory-argument-matching/tasks.md | 20 +- packages/core/package.json | 5 +- packages/core/src/evaluation/evaluators.ts | 109 ++++++- .../evaluation/loaders/evaluator-parser.ts | 9 +- packages/core/src/evaluation/orchestrator.ts | 21 +- .../core/src/evaluation/providers/index.ts | 1 + .../core/src/evaluation/providers/types.ts | 18 ++ packages/core/src/evaluation/trace.ts | 130 ++++++++ .../test/evaluation/execution-metrics.test.ts | 248 ++++++++++++++ .../tool-trajectory-evaluator.test.ts | 304 ++++++++++++++++++ 13 files changed, 848 insertions(+), 50 deletions(-) create mode 100644 packages/core/test/evaluation/execution-metrics.test.ts diff --git a/apps/cli/package.json b/apps/cli/package.json index adaf747c..402e7bc5 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/openspec/changes/add-execution-metrics/proposal.md b/openspec/changes/add-execution-metrics/proposal.md index 669d0385..1cb8b917 100644 --- a/openspec/changes/add-execution-metrics/proposal.md +++ b/openspec/changes/add-execution-metrics/proposal.md @@ -12,8 +12,8 @@ This is infrastructure/data collection - not domain logic. 
Providers optionally ## What Changes -- Extend `TraceSummary` to `ExecutionMetrics` with optional fields for token usage, cost, duration -- Add computed metrics: `explorationRatio`, `tokensPerTool` +- Add optional execution metrics fields to `TraceSummary` (token usage, cost, duration) +- Add helper functions to compute derived metrics (`explorationRatio`, `tokensPerTool`) - Make metrics available to evaluators and output writers - Add example demonstrating metrics-based evaluation diff --git a/openspec/changes/add-execution-metrics/tasks.md b/openspec/changes/add-execution-metrics/tasks.md index aa80072a..d3891971 100644 --- a/openspec/changes/add-execution-metrics/tasks.md +++ b/openspec/changes/add-execution-metrics/tasks.md @@ -1,27 +1,27 @@ ## 1. Data Model -- [ ] 1.1 Extend `TraceSummary` type with optional `tokenUsage` field -- [ ] 1.2 Add optional `costUsd` field to trace -- [ ] 1.3 Add optional `durationMs` field to trace -- [ ] 1.4 Add optional `toolDurations` map (tool name -> duration array) +- [x] 1.1 Extend `TraceSummary` type with optional `tokenUsage` field +- [x] 1.2 Add optional `costUsd` field to trace +- [x] 1.3 Add optional `durationMs` field to trace +- [x] 1.4 Add optional `toolDurations` map (tool name -> duration array) ## 2. Computed Metrics -- [ ] 2.1 Implement `explorationRatio` computation (configurable exploration tool list) -- [ ] 2.2 Implement `tokensPerTool` computation -- [ ] 2.3 Add `avgToolDurationMs` computation +- [x] 2.1 Implement `explorationRatio` computation (configurable exploration tool list) +- [x] 2.2 Implement `tokensPerTool` computation +- [x] 2.3 Add `avgToolDurationMs` computation ## 3. Provider Integration -- [ ] 3.1 Define provider metric reporting interface +- [x] 3.1 Define provider metric reporting interface - [ ] 3.2 Update CLI provider to report duration metrics - [ ] 3.3 Document metric reporting for custom providers ## 4. Output & Evaluation -- [ ] 4.1 Include metrics in evaluation results JSON -- [ ] 4.2 Make metrics available to code judges via stdin -- [ ] 4.3 Add metrics to JSONL output format +- [x] 4.1 Include metrics in evaluation results JSON +- [x] 4.2 Make metrics available to code judges via stdin +- [x] 4.3 Add metrics to JSONL output format ## 5. Examples & Documentation @@ -30,5 +30,5 @@ ## 6. Testing -- [ ] 6.1 Unit tests for metric computation +- [x] 6.1 Unit tests for metric computation - [ ] 6.2 Integration test with metric-aware code judge diff --git a/openspec/changes/add-trajectory-argument-matching/tasks.md b/openspec/changes/add-trajectory-argument-matching/tasks.md index 5669ba16..12b7f20c 100644 --- a/openspec/changes/add-trajectory-argument-matching/tasks.md +++ b/openspec/changes/add-trajectory-argument-matching/tasks.md @@ -1,15 +1,15 @@ ## 1. 
Implementation -- [ ] 1.1 Extend `ToolTrajectoryExpectedItem` type to include optional `args` field -- [ ] 1.2 Implement exact argument matching (deep equality) -- [ ] 1.3 Implement `any` mode (skip argument validation) -- [ ] 1.4 Update `evaluateInOrder` to check arguments -- [ ] 1.5 Update `evaluateExact` to check arguments -- [ ] 1.6 Update `extractToolCallsFromMessages` to preserve `ToolCall.input` +- [x] 1.1 Extend `ToolTrajectoryExpectedItem` type to include optional `args` field +- [x] 1.2 Implement exact argument matching (deep equality) +- [x] 1.3 Implement `any` mode (skip argument validation) +- [x] 1.4 Update `evaluateInOrder` to check arguments +- [x] 1.5 Update `evaluateExact` to check arguments +- [x] 1.6 Update `extractToolCallsFromMessages` to preserve `ToolCall.input` ## 2. Schema & Validation -- [ ] 2.1 Update YAML schema for `expected[].args` field +- [x] 2.1 Update YAML schema for `expected[].args` field (updated evaluator-parser.ts) ## 3. Examples & Documentation @@ -17,6 +17,6 @@ ## 4. Testing -- [ ] 4.1 Unit tests for exact argument matching -- [ ] 4.2 Unit tests for `any` mode -- [ ] 4.3 Integration tests with mock agent +- [x] 4.1 Unit tests for exact argument matching +- [x] 4.2 Unit tests for `any` mode +- [x] 4.3 Integration tests with mock agent (covered by unit tests with mock context) diff --git a/packages/core/package.json b/packages/core/package.json index 765f7e0a..5328d44b 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", diff --git a/packages/core/src/evaluation/evaluators.ts b/packages/core/src/evaluation/evaluators.ts index ad6ccebd..1c233e91 100644 --- a/packages/core/src/evaluation/evaluators.ts +++ b/packages/core/src/evaluation/evaluators.ts @@ -11,7 +11,11 @@ import { extractLastAssistantContent, } from './providers/types.js'; import { TEMPLATE_VARIABLES } from './template-variables.js'; -import type { ToolTrajectoryEvaluatorConfig, TraceSummary } from './trace.js'; +import type { + ToolTrajectoryEvaluatorConfig, + ToolTrajectoryExpectedItem, + TraceSummary, +} from './trace.js'; import type { EvalCase, EvaluationVerdict, @@ -584,6 +588,58 @@ function substituteVariables(template: string, variables: Record // Tool Trajectory Evaluator +/** Extracted tool call with optional arguments */ +interface ExtractedToolCall { + readonly name: string; + readonly args?: Record; +} + +/** + * Deep equality check for two values. + * Handles primitives, arrays, and plain objects. + */ +function deepEqual(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (a === null || b === null) return a === b; + if (typeof a !== typeof b) return false; + if (typeof a !== 'object') return a === b; + + if (Array.isArray(a) !== Array.isArray(b)) return false; + if (Array.isArray(a) && Array.isArray(b)) { + if (a.length !== b.length) return false; + return a.every((val, i) => deepEqual(val, b[i])); + } + + const aObj = a as Record; + const bObj = b as Record; + const aKeys = Object.keys(aObj); + const bKeys = Object.keys(bObj); + if (aKeys.length !== bKeys.length) return false; + return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key])); +} + +/** + * Check if actual args match expected args. 
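+ * - undefined → no constraint; only the tool name is validated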
+ * - 'any' → always matches + * - object → partial match (only specified keys, deep equality) + */ +function argsMatch( + expected: ToolTrajectoryExpectedItem['args'], + actual: Record | undefined, +): boolean { + // No args constraint means match + if (expected === undefined) return true; + // 'any' means skip validation + if (expected === 'any') return true; + // Partial match: check only specified keys + if (actual === undefined) return false; + for (const key of Object.keys(expected)) { + if (!Object.hasOwn(actual, key)) return false; + if (!deepEqual(expected[key], actual[key])) return false; + } + return true; +} + export interface ToolTrajectoryEvaluatorOptions { readonly config: ToolTrajectoryEvaluatorConfig; } @@ -650,16 +706,19 @@ export class ToolTrajectoryEvaluator implements Evaluator { */ private extractToolCallsFromMessages( messages: readonly OutputMessage[] | undefined, - ): readonly { name: string }[] { + ): readonly ExtractedToolCall[] { if (!messages) { return []; } - const toolCalls: { name: string }[] = []; + const toolCalls: ExtractedToolCall[] = []; for (const message of messages) { if (message.toolCalls) { for (const call of message.toolCalls) { - toolCalls.push({ name: call.tool }); + toolCalls.push({ + name: call.tool, + args: call.input as Record | undefined, + }); } } } @@ -669,7 +728,7 @@ export class ToolTrajectoryEvaluator implements Evaluator { /** * Build a summary from extracted tool calls. */ - private buildSummary(toolCalls: readonly { name: string }[]): TraceSummary { + private buildSummary(toolCalls: readonly ExtractedToolCall[]): TraceSummary { const toolCallsByName: Record = {}; for (const call of toolCalls) { toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1; @@ -721,7 +780,7 @@ export class ToolTrajectoryEvaluator implements Evaluator { }; } - private evaluateInOrder(toolCalls: readonly { name: string }[]): EvaluationScore { + private evaluateInOrder(toolCalls: readonly ExtractedToolCall[]): EvaluationScore { const expected = this.config.expected ?? 
[]; if (expected.length === 0) { @@ -739,20 +798,33 @@ export class ToolTrajectoryEvaluator implements Evaluator { let actualIndex = 0; for (let i = 0; i < expected.length; i++) { - const expectedTool = expected[i].tool; + const expectedItem = expected[i]; + const expectedTool = expectedItem.tool; let found = false; + let argsMismatch = false; while (actualIndex < toolCalls.length) { - if (toolCalls[actualIndex].name === expectedTool) { - hits.push(`Found ${expectedTool} at position ${actualIndex}`); + const actualCall = toolCalls[actualIndex]; + if (actualCall.name === expectedTool) { + // Tool name matches, check args if specified + if (argsMatch(expectedItem.args, actualCall.args)) { + hits.push(`Found ${expectedTool} at position ${actualIndex}`); + actualIndex++; + found = true; + break; + } + // Tool name matches but args don't - this is a miss for this expected item + misses.push( + `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`, + ); actualIndex++; - found = true; + argsMismatch = true; break; } actualIndex++; } - if (!found) { + if (!found && !argsMismatch) { misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`); } } @@ -768,7 +840,7 @@ export class ToolTrajectoryEvaluator implements Evaluator { }; } - private evaluateExact(toolCalls: readonly { name: string }[]): EvaluationScore { + private evaluateExact(toolCalls: readonly ExtractedToolCall[]): EvaluationScore { const expected = this.config.expected ?? []; if (expected.length === 0) { @@ -790,10 +862,17 @@ export class ToolTrajectoryEvaluator implements Evaluator { const checkLength = Math.min(expected.length, toolCalls.length); for (let i = 0; i < checkLength; i++) { - const expectedTool = expected[i].tool; - const actualTool = toolCalls[i].name; + const expectedItem = expected[i]; + const expectedTool = expectedItem.tool; + const actualCall = toolCalls[i]; + const actualTool = actualCall.name; if (actualTool === expectedTool) { - hits.push(`Position ${i}: ${expectedTool} ✓`); + // Tool name matches, check args if specified + if (argsMatch(expectedItem.args, actualCall.args)) { + hits.push(`Position ${i}: ${expectedTool}`); + } else { + misses.push(`Position ${i}: ${expectedTool} args mismatch`); + } } else { misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`); } diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 7381ac87..a3ad708c 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -258,7 +258,14 @@ export async function parseEvaluators( expected = []; for (const item of rawExpected) { if (isJsonObject(item) && typeof item.tool === 'string') { - expected.push({ tool: item.tool }); + // Parse optional args field: 'any' or Record + let args: ToolTrajectoryExpectedItem['args']; + if (item.args === 'any') { + args = 'any'; + } else if (isJsonObject(item.args)) { + args = item.args as Record; + } + expected.push({ tool: item.tool, ...(args !== undefined ? 
{ args } : {}) }); } } } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index e1728ee9..1ee66fdb 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -28,6 +28,7 @@ import { type ToolTrajectoryEvaluatorConfig, type TraceSummary, computeTraceSummary, + mergeExecutionMetrics, } from './trace.js'; import type { EvalCase, @@ -411,7 +412,15 @@ async function runBatchEvaluation(options: { // Extract outputMessages from batch response const outputMessages = providerResponse.outputMessages; - const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : undefined; + const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : undefined; + // Merge execution metrics from provider response + const traceSummary = baseSummary + ? mergeExecutionMetrics(baseSummary, { + tokenUsage: providerResponse.tokenUsage, + costUsd: providerResponse.costUsd, + durationMs: providerResponse.durationMs, + }) + : undefined; // Extract candidate from last assistant message in output_messages const candidate = extractLastAssistantContent(outputMessages); @@ -550,7 +559,15 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise; } +/** + * Token usage metrics reported by provider. + */ +export interface ProviderTokenUsage { + /** Input/prompt tokens consumed */ + readonly input: number; + /** Output/completion tokens generated */ + readonly output: number; + /** Cached tokens (optional, provider-specific) */ + readonly cached?: number; +} + export interface ProviderResponse { readonly raw?: unknown; readonly usage?: JsonObject; /** Output messages from agent execution (primary source for tool trajectory) */ readonly outputMessages?: readonly OutputMessage[]; + /** Token usage metrics (optional) */ + readonly tokenUsage?: ProviderTokenUsage; + /** Total cost in USD (optional) */ + readonly costUsd?: number; + /** Execution duration in milliseconds (optional) */ + readonly durationMs?: number; } /** diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 5a78378e..bcf61a1d 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -3,6 +3,18 @@ * Provides a normalized, provider-agnostic model for tool-call trajectories. */ +/** + * Token usage metrics from provider execution. + */ +export interface TokenUsage { + /** Input/prompt tokens consumed */ + readonly input: number; + /** Output/completion tokens generated */ + readonly output: number; + /** Cached tokens (optional, provider-specific) */ + readonly cached?: number; +} + /** * Compact summary of a trace for lightweight persistence. * Included in results by default to avoid payload bloat. 
@@ -16,6 +28,14 @@ export interface TraceSummary { readonly toolCallsByName: Readonly>; /** Number of error events */ readonly errorCount: number; + /** Token usage metrics (optional, from provider) */ + readonly tokenUsage?: TokenUsage; + /** Total cost in USD (optional, from provider) */ + readonly costUsd?: number; + /** Total execution duration in milliseconds (optional) */ + readonly durationMs?: number; + /** Per-tool duration arrays in milliseconds (optional) */ + readonly toolDurations?: Readonly>; } /** @@ -39,6 +59,8 @@ export interface ToolTrajectoryEvaluatorConfig { */ export interface ToolTrajectoryExpectedItem { readonly tool: string; + /** Optional argument matching: 'any' skips validation, object performs partial deep equality */ + readonly args?: 'any' | Record; } /** @@ -77,3 +99,111 @@ export function computeTraceSummary(messages: readonly OutputMessageLike[]): Tra errorCount: 0, }; } + +/** + * Default tool names considered as exploration/read-only operations. + * Can be overridden per-evaluation via config. + */ +export const DEFAULT_EXPLORATION_TOOLS = [ + 'read', + 'grep', + 'glob', + 'search', + 'list', + 'Read', + 'Grep', + 'Glob', + 'WebSearch', + 'WebFetch', +] as const; + +/** + * Ratio of exploration tool calls to total tool calls. + * Returns undefined if there are no tool calls. + * + * @param summary - Trace summary with tool call counts + * @param explorationTools - Tool names considered exploration (defaults to DEFAULT_EXPLORATION_TOOLS) + * @returns Ratio between 0 and 1, or undefined if no tool calls + */ +export function explorationRatio( + summary: TraceSummary, + explorationTools: readonly string[] = DEFAULT_EXPLORATION_TOOLS, +): number | undefined { + if (summary.eventCount === 0) return undefined; + + const explorationCalls = explorationTools.reduce( + (sum, tool) => sum + (summary.toolCallsByName[tool] ?? 0), + 0, + ); + + return explorationCalls / summary.eventCount; +} + +/** + * Average tokens consumed per tool call. + * Returns undefined if tokenUsage is not available or no tool calls. + * + * @param summary - Trace summary with optional token usage + * @returns Average tokens per tool call, or undefined + */ +export function tokensPerTool(summary: TraceSummary): number | undefined { + if (!summary.tokenUsage || summary.eventCount === 0) return undefined; + + const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output; + return totalTokens / summary.eventCount; +} + +/** + * Average tool duration across all tool calls. + * Returns undefined if toolDurations is not available or empty. + * + * @param summary - Trace summary with optional tool durations + * @returns Average duration in milliseconds, or undefined + */ +export function avgToolDurationMs(summary: TraceSummary): number | undefined { + if (!summary.toolDurations) return undefined; + + let totalDuration = 0; + let totalCalls = 0; + + for (const durations of Object.values(summary.toolDurations)) { + for (const duration of durations) { + totalDuration += duration; + totalCalls++; + } + } + + if (totalCalls === 0) return undefined; + return totalDuration / totalCalls; +} + +/** + * Execution metrics from provider response. + */ +export interface ExecutionMetrics { + readonly tokenUsage?: TokenUsage; + readonly costUsd?: number; + readonly durationMs?: number; +} + +/** + * Merge execution metrics from provider response into a trace summary. + * Returns a new TraceSummary with metrics fields populated. 
+ * + * @param summary - Base trace summary from computeTraceSummary + * @param metrics - Optional execution metrics from provider + * @returns TraceSummary with merged metrics + */ +export function mergeExecutionMetrics( + summary: TraceSummary, + metrics?: ExecutionMetrics, +): TraceSummary { + if (!metrics) return summary; + + return { + ...summary, + tokenUsage: metrics.tokenUsage, + costUsd: metrics.costUsd, + durationMs: metrics.durationMs, + }; +} diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts new file mode 100644 index 00000000..666d2de1 --- /dev/null +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -0,0 +1,248 @@ +import { describe, expect, it } from 'bun:test'; + +import { + type TraceSummary, + avgToolDurationMs, + explorationRatio, + tokensPerTool, + mergeExecutionMetrics, +} from '../../src/evaluation/trace.js'; + +describe('Execution Metrics', () => { + describe('explorationRatio', () => { + it('returns undefined when there are no tool calls', () => { + const summary: TraceSummary = { + eventCount: 0, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + }; + + expect(explorationRatio(summary)).toBeUndefined(); + }); + + it('returns 1.0 when all calls are exploration tools', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read', 'Grep', 'Glob'], + toolCallsByName: { Read: 2, Grep: 2, Glob: 1 }, + errorCount: 0, + }; + + expect(explorationRatio(summary)).toBe(1.0); + }); + + it('returns 0.0 when no calls are exploration tools', () => { + const summary: TraceSummary = { + eventCount: 3, + toolNames: ['Edit', 'Write', 'Bash'], + toolCallsByName: { Edit: 1, Write: 1, Bash: 1 }, + errorCount: 0, + }; + + expect(explorationRatio(summary)).toBe(0.0); + }); + + it('returns correct ratio for mixed tool usage', () => { + const summary: TraceSummary = { + eventCount: 10, + toolNames: ['Edit', 'Grep', 'Read', 'Write'], + toolCallsByName: { Read: 4, Grep: 2, Edit: 3, Write: 1 }, + errorCount: 0, + }; + + // 6 exploration calls (Read: 4, Grep: 2) out of 10 + expect(explorationRatio(summary)).toBe(0.6); + }); + + it('accepts custom exploration tools list', () => { + const summary: TraceSummary = { + eventCount: 6, + toolNames: ['CustomTool', 'Edit', 'OtherTool'], + toolCallsByName: { CustomTool: 3, Edit: 2, OtherTool: 1 }, + errorCount: 0, + }; + + // 4 calls (CustomTool: 3, OtherTool: 1) are exploration with custom list + expect(explorationRatio(summary, ['CustomTool', 'OtherTool'])).toBeCloseTo(4 / 6); + }); + }); + + describe('tokensPerTool', () => { + it('returns undefined when tokenUsage is not available', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read'], + toolCallsByName: { Read: 5 }, + errorCount: 0, + }; + + expect(tokensPerTool(summary)).toBeUndefined(); + }); + + it('returns undefined when there are no tool calls', () => { + const summary: TraceSummary = { + eventCount: 0, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + tokenUsage: { input: 1000, output: 500 }, + }; + + expect(tokensPerTool(summary)).toBeUndefined(); + }); + + it('computes correct tokens per tool', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 3, Edit: 2 }, + errorCount: 0, + tokenUsage: { input: 1000, output: 500 }, + }; + + // Total tokens: 1500, divided by 5 tool calls = 300 tokens per tool + expect(tokensPerTool(summary)).toBe(300); + }); + + it('handles cached tokens in 
total calculation', () => { + const summary: TraceSummary = { + eventCount: 4, + toolNames: ['Read'], + toolCallsByName: { Read: 4 }, + errorCount: 0, + tokenUsage: { input: 800, output: 400, cached: 200 }, + }; + + // Total tokens: 800 + 400 = 1200 (cached not added to total) + expect(tokensPerTool(summary)).toBe(300); + }); + }); + + describe('avgToolDurationMs', () => { + it('returns undefined when toolDurations is not available', () => { + const summary: TraceSummary = { + eventCount: 5, + toolNames: ['Read'], + toolCallsByName: { Read: 5 }, + errorCount: 0, + }; + + expect(avgToolDurationMs(summary)).toBeUndefined(); + }); + + it('returns undefined when toolDurations is empty', () => { + const summary: TraceSummary = { + eventCount: 0, + toolNames: [], + toolCallsByName: {}, + errorCount: 0, + toolDurations: {}, + }; + + expect(avgToolDurationMs(summary)).toBeUndefined(); + }); + + it('computes correct average duration', () => { + const summary: TraceSummary = { + eventCount: 4, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 3, Edit: 1 }, + errorCount: 0, + toolDurations: { + Read: [100, 150, 200], // avg: 150 + Edit: [50], // avg: 50 + }, + }; + + // Total duration: 100 + 150 + 200 + 50 = 500ms + // Total calls: 4 + // Average: 125ms + expect(avgToolDurationMs(summary)).toBe(125); + }); + + it('handles single tool with multiple calls', () => { + const summary: TraceSummary = { + eventCount: 3, + toolNames: ['Grep'], + toolCallsByName: { Grep: 3 }, + errorCount: 0, + toolDurations: { + Grep: [100, 200, 300], + }, + }; + + expect(avgToolDurationMs(summary)).toBe(200); + }); + }); + + describe('mergeExecutionMetrics', () => { + const baseSummary: TraceSummary = { + eventCount: 5, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 3, Edit: 2 }, + errorCount: 0, + }; + + it('returns the same summary when no metrics provided', () => { + const result = mergeExecutionMetrics(baseSummary); + + expect(result).toBe(baseSummary); + }); + + it('returns the same summary when metrics is undefined', () => { + const result = mergeExecutionMetrics(baseSummary, undefined); + + expect(result).toBe(baseSummary); + }); + + it('merges tokenUsage into summary', () => { + const result = mergeExecutionMetrics(baseSummary, { + tokenUsage: { input: 1000, output: 500 }, + }); + + expect(result.eventCount).toBe(5); + expect(result.toolNames).toEqual(['Read', 'Edit']); + expect(result.tokenUsage).toEqual({ input: 1000, output: 500 }); + expect(result.costUsd).toBeUndefined(); + expect(result.durationMs).toBeUndefined(); + }); + + it('merges all metrics into summary', () => { + const result = mergeExecutionMetrics(baseSummary, { + tokenUsage: { input: 1000, output: 500, cached: 100 }, + costUsd: 0.05, + durationMs: 12000, + }); + + expect(result.eventCount).toBe(5); + expect(result.toolNames).toEqual(['Read', 'Edit']); + expect(result.tokenUsage).toEqual({ input: 1000, output: 500, cached: 100 }); + expect(result.costUsd).toBe(0.05); + expect(result.durationMs).toBe(12000); + }); + + it('preserves existing summary fields', () => { + const summaryWithError: TraceSummary = { + ...baseSummary, + errorCount: 2, + }; + + const result = mergeExecutionMetrics(summaryWithError, { + costUsd: 0.1, + }); + + expect(result.errorCount).toBe(2); + expect(result.costUsd).toBe(0.1); + }); + + it('does not mutate the original summary', () => { + const result = mergeExecutionMetrics(baseSummary, { + tokenUsage: { input: 1000, output: 500 }, + }); + + expect(baseSummary.tokenUsage).toBeUndefined(); + 
expect(result.tokenUsage).toEqual({ input: 1000, output: 500 }); + }); + }); +}); diff --git a/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts b/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts index 39d2adb9..624a92ea 100644 --- a/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts +++ b/packages/core/test/evaluation/tool-trajectory-evaluator.test.ts @@ -381,4 +381,308 @@ describe('ToolTrajectoryEvaluator', () => { expect(result.misses.some((m) => m.includes('expected analyze, got nothing'))).toBe(true); }); }); + + describe('argument matching', () => { + describe('exact mode with args', () => { + it('passes when args match exactly', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'search', input: { query: 'test', limit: 10 } }, + { tool: 'analyze', input: { format: 'json' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [ + { tool: 'search', args: { query: 'test', limit: 10 } }, + { tool: 'analyze', args: { format: 'json' } }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails when args do not match', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { query: 'wrong', limit: 10 } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { query: 'test', limit: 10 } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + expect(result.misses.some((m) => m.includes('args mismatch'))).toBe(true); + }); + + it('skips arg validation with args: any', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { query: 'anything', limit: 999 } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: 'any' }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('performs partial matching - only validates specified keys', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { query: 'test', limit: 10, extra: 'ignored' } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { query: 'test' } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('handles nested objects with deep equality', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { + tool: 'search', + input: { options: { nested: { value: 123 } }, other: 'field' }, + }, + ], + }, + ]; + + const config: 
ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { options: { nested: { value: 123 } } } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails on nested object mismatch', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { options: { nested: { value: 999 } } } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { options: { nested: { value: 123 } } } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + }); + + it('matches without args field (backward compatibility)', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { any: 'args' } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search' }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + }); + + describe('in_order mode with args', () => { + it('passes when args match in sequence', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'init', input: {} }, + { tool: 'search', input: { query: 'test' } }, + { tool: 'analyze', input: { format: 'json' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'in_order', + expected: [ + { tool: 'search', args: { query: 'test' } }, + { tool: 'analyze', args: { format: 'json' } }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails when tool found but args mismatch', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'search', input: { query: 'wrong' } }, + { tool: 'analyze', input: { format: 'json' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'in_order', + expected: [ + { tool: 'search', args: { query: 'test' } }, + { tool: 'analyze', args: { format: 'json' } }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0.5); + expect(result.verdict).toBe('fail'); + expect(result.misses.some((m) => m.includes('args mismatch'))).toBe(true); + }); + + it('uses args: any to skip validation in sequence', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'search', input: { query: 'anything' } }, + { tool: 'analyze', input: { format: 'xml' } }, + ], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'in_order', + expected: [ + { 
tool: 'search', args: 'any' }, + { tool: 'analyze', args: 'any' }, + ], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + }); + + describe('array argument matching', () => { + it('matches arrays with deep equality', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { tags: ['a', 'b', 'c'] } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { tags: ['a', 'b', 'c'] } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + }); + + it('fails on array order mismatch', () => { + const outputMessages: OutputMessage[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'search', input: { tags: ['c', 'b', 'a'] } }], + }, + ]; + + const config: ToolTrajectoryEvaluatorConfig = { + name: 'test', + type: 'tool_trajectory', + mode: 'exact', + expected: [{ tool: 'search', args: { tags: ['a', 'b', 'c'] } }], + }; + const evaluator = new ToolTrajectoryEvaluator({ config }); + + const result = evaluator.evaluate(createContext({ outputMessages })); + + expect(result.score).toBe(0); + expect(result.verdict).toBe('fail'); + }); + }); + }); }); From 1048559b5a58038341655b85ef06ae7be55ff074 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 13:19:23 +0000 Subject: [PATCH 3/5] refactor(core): rename evaluation properties to camelCase Update evaluation result types and code evaluator payload to use consistent camelCase naming convention throughout the codebase. 
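
Downstream consumers that read the JSONL output should switch to the new field
names. A minimal sketch of such a consumer (the wrapper script and the
results.jsonl path are illustrative only; just the field names come from this
change):

    // read-scores.ts - sketch of reading the renamed JSONL fields with Bun/Node.
    import { readFileSync } from 'node:fs';

    interface ResultLine {
      readonly evalId: string;           // was: eval_id
      readonly score: number;
      readonly candidateAnswer?: string; // was: candidate_answer
    }

    const lines = readFileSync('results.jsonl', 'utf8')
      .split('\n')
      .filter((line) => line.trim().length > 0);

    for (const line of lines) {
      const result = JSON.parse(line) as ResultLine;
      console.log(`${result.evalId}: ${result.score.toFixed(2)}`);
    }
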
--- apps/cli/src/commands/compare/index.ts | 22 +++--- apps/cli/src/commands/eval/statistics.ts | 6 +- .../cli/test/commands/compare/compare.test.ts | 48 ++++++------- apps/cli/test/eval.integration.test.ts | 4 +- apps/cli/test/fixtures/mock-run-evaluation.ts | 26 +++---- .../scripts/check-batch-cli-output.ts | 14 ++-- .../evals/validate_risk_output.ts | 6 +- packages/core/src/evaluation/evaluators.ts | 18 ++--- packages/core/src/evaluation/orchestrator.ts | 42 ++++++------ packages/core/src/evaluation/types.ts | 24 +++---- .../core/test/evaluation/evaluators.test.ts | 8 +-- .../test/evaluation/execution-metrics.test.ts | 2 +- .../core/test/evaluation/orchestrator.test.ts | 68 +++++++++---------- 13 files changed, 144 insertions(+), 144 deletions(-) diff --git a/apps/cli/src/commands/compare/index.ts b/apps/cli/src/commands/compare/index.ts index f3cad4c7..3fda72c8 100644 --- a/apps/cli/src/commands/compare/index.ts +++ b/apps/cli/src/commands/compare/index.ts @@ -2,12 +2,12 @@ import { readFileSync } from 'node:fs'; import { command, number, option, optional, positional, string } from 'cmd-ts'; interface EvalResult { - eval_id: string; + evalId: string; score: number; } interface MatchedResult { - eval_id: string; + evalId: string; score1: number; score2: number; delta: number; @@ -35,14 +35,14 @@ export function loadJsonlResults(filePath: string): EvalResult[] { .filter((line) => line.trim()); return lines.map((line) => { - const record = JSON.parse(line) as { eval_id?: string; score?: number }; - if (typeof record.eval_id !== 'string') { - throw new Error(`Missing eval_id in result: ${line}`); + const record = JSON.parse(line) as { evalId?: string; score?: number }; + if (typeof record.evalId !== 'string') { + throw new Error(`Missing evalId in result: ${line}`); } if (typeof record.score !== 'number') { throw new Error(`Missing or invalid score in result: ${line}`); } - return { eval_id: record.eval_id, score: record.score }; + return { evalId: record.evalId, score: record.score }; }); } @@ -57,8 +57,8 @@ export function compareResults( results2: EvalResult[], threshold: number, ): ComparisonOutput { - const map1 = new Map(results1.map((r) => [r.eval_id, r.score])); - const map2 = new Map(results2.map((r) => [r.eval_id, r.score])); + const map1 = new Map(results1.map((r) => [r.evalId, r.score])); + const map2 = new Map(results2.map((r) => [r.evalId, r.score])); const matched: MatchedResult[] = []; const matchedIds = new Set(); @@ -68,7 +68,7 @@ export function compareResults( if (score2 !== undefined) { const delta = score2 - score1; matched.push({ - eval_id: evalId, + evalId: evalId, score1, score2, delta, @@ -78,8 +78,8 @@ export function compareResults( } } - const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.eval_id)).length; - const unmatchedFile2 = results2.filter((r) => !map1.has(r.eval_id)).length; + const unmatchedFile1 = results1.filter((r) => !matchedIds.has(r.evalId)).length; + const unmatchedFile2 = results2.filter((r) => !map1.has(r.evalId)).length; const wins = matched.filter((m) => m.outcome === 'win').length; const losses = matched.filter((m) => m.outcome === 'loss').length; diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 0bd37a6f..e4d08356 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -86,7 +86,7 @@ export function calculateEvaluationSummary( // Track errors const errors = results .filter((result) => result.error !== undefined) - .map((result) 
=> ({ evalId: result.eval_id, error: result.error as string })); + .map((result) => ({ evalId: result.evalId, error: result.error as string })); const errorCount = errors.length; if (total === 0) { @@ -180,12 +180,12 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push('\nTop performing eval cases:'); summary.topResults.forEach((result, index) => { - lines.push(` ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`); + lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`); }); lines.push('\nLowest performing eval cases:'); summary.bottomResults.forEach((result, index) => { - lines.push(` ${index + 1}. ${result.eval_id}: ${formatScore(result.score)}`); + lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`); }); return lines.join('\n'); diff --git a/apps/cli/test/commands/compare/compare.test.ts b/apps/cli/test/commands/compare/compare.test.ts index 83ad1a58..d64d5d1d 100644 --- a/apps/cli/test/commands/compare/compare.test.ts +++ b/apps/cli/test/commands/compare/compare.test.ts @@ -26,14 +26,14 @@ describe('compare command', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync( filePath, - '{"eval_id": "case-1", "score": 0.8}\n{"eval_id": "case-2", "score": 0.9}\n', + '{"evalId": "case-1", "score": 0.8}\n{"evalId": "case-2", "score": 0.9}\n', ); const results = loadJsonlResults(filePath); expect(results).toEqual([ - { eval_id: 'case-1', score: 0.8 }, - { eval_id: 'case-2', score: 0.9 }, + { evalId: 'case-1', score: 0.8 }, + { evalId: 'case-2', score: 0.9 }, ]); }); @@ -41,7 +41,7 @@ describe('compare command', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync( filePath, - '{"eval_id": "case-1", "score": 0.8}\n\n{"eval_id": "case-2", "score": 0.9}\n', + '{"evalId": "case-1", "score": 0.8}\n\n{"evalId": "case-2", "score": 0.9}\n', ); const results = loadJsonlResults(filePath); @@ -49,16 +49,16 @@ describe('compare command', () => { expect(results).toHaveLength(2); }); - it('should throw error for missing eval_id', () => { + it('should throw error for missing evalId', () => { const filePath = path.join(tempDir, 'results.jsonl'); writeFileSync(filePath, '{"score": 0.8}\n'); - expect(() => loadJsonlResults(filePath)).toThrow('Missing eval_id'); + expect(() => loadJsonlResults(filePath)).toThrow('Missing evalId'); }); it('should throw error for missing score', () => { const filePath = path.join(tempDir, 'results.jsonl'); - writeFileSync(filePath, '{"eval_id": "case-1"}\n'); + writeFileSync(filePath, '{"evalId": "case-1"}\n'); expect(() => loadJsonlResults(filePath)).toThrow('Missing or invalid score'); }); @@ -93,27 +93,27 @@ describe('compare command', () => { }); describe('compareResults', () => { - it('should match results by eval_id and compute deltas', () => { + it('should match results by evalId and compute deltas', () => { // Use values that avoid floating point precision issues const results1 = [ - { eval_id: 'case-1', score: 0.5 }, - { eval_id: 'case-2', score: 0.75 }, + { evalId: 'case-1', score: 0.5 }, + { evalId: 'case-2', score: 0.75 }, ]; const results2 = [ - { eval_id: 'case-1', score: 0.7 }, // +0.2 win - { eval_id: 'case-2', score: 0.5 }, // -0.25 loss + { evalId: 'case-1', score: 0.7 }, // +0.2 win + { evalId: 'case-2', score: 0.5 }, // -0.25 loss ]; const comparison = compareResults(results1, results2, 0.1); expect(comparison.matched).toHaveLength(2); - expect(comparison.matched[0].eval_id).toBe('case-1'); + 
expect(comparison.matched[0].evalId).toBe('case-1'); expect(comparison.matched[0].score1).toBe(0.5); expect(comparison.matched[0].score2).toBe(0.7); expect(comparison.matched[0].delta).toBeCloseTo(0.2, 10); expect(comparison.matched[0].outcome).toBe('win'); - expect(comparison.matched[1].eval_id).toBe('case-2'); + expect(comparison.matched[1].evalId).toBe('case-2'); expect(comparison.matched[1].score1).toBe(0.75); expect(comparison.matched[1].score2).toBe(0.5); expect(comparison.matched[1].delta).toBeCloseTo(-0.25, 10); @@ -122,12 +122,12 @@ describe('compare command', () => { it('should count unmatched results', () => { const results1 = [ - { eval_id: 'case-1', score: 0.8 }, - { eval_id: 'only-in-1', score: 0.5 }, + { evalId: 'case-1', score: 0.8 }, + { evalId: 'only-in-1', score: 0.5 }, ]; const results2 = [ - { eval_id: 'case-1', score: 0.9 }, - { eval_id: 'only-in-2', score: 0.6 }, + { evalId: 'case-1', score: 0.9 }, + { evalId: 'only-in-2', score: 0.6 }, ]; const comparison = compareResults(results1, results2, 0.1); @@ -138,14 +138,14 @@ describe('compare command', () => { it('should compute summary statistics', () => { // Use values that produce clear deltas above/below threshold const results1 = [ - { eval_id: 'case-1', score: 0.5 }, - { eval_id: 'case-2', score: 0.75 }, - { eval_id: 'case-3', score: 0.6 }, + { evalId: 'case-1', score: 0.5 }, + { evalId: 'case-2', score: 0.75 }, + { evalId: 'case-3', score: 0.6 }, ]; const results2 = [ - { eval_id: 'case-1', score: 0.7 }, // win (+0.2) - { eval_id: 'case-2', score: 0.5 }, // loss (-0.25) - { eval_id: 'case-3', score: 0.65 }, // tie (+0.05) + { evalId: 'case-1', score: 0.7 }, // win (+0.2) + { evalId: 'case-2', score: 0.5 }, // loss (-0.25) + { evalId: 'case-3', score: 0.65 }, // tie (+0.05) ]; const comparison = compareResults(results1, results2, 0.1); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 8d21a069..b291e115 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -167,8 +167,8 @@ describe('agentv eval CLI', () => { const results = await readJsonLines(outputPath); expect(results).toHaveLength(2); const [firstResult, secondResult] = results as Array>; - expect(firstResult.eval_id).toBe('case-alpha'); - expect(secondResult.eval_id).toBe('case-beta'); + expect(firstResult.evalId).toBe('case-alpha'); + expect(secondResult.evalId).toBe('case-beta'); const diagnostics = await readDiagnostics(fixture); expect(diagnostics).toMatchObject({ diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 0e9c0321..1ad4fc7a 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -22,44 +22,44 @@ interface RunEvaluationOptionsLike { } interface EvaluationResultLike { - readonly eval_id: string; + readonly evalId: string; readonly score: number; readonly hits: readonly string[]; readonly misses: readonly string[]; - readonly candidate_answer: string; - readonly expected_aspect_count: number; + readonly candidateAnswer: string; + readonly expectedAspectCount: number; readonly target: string; readonly timestamp: string; readonly reasoning?: string; - readonly raw_aspects?: readonly string[]; + readonly rawAspects?: readonly string[]; } function buildResults(targetName: string): EvaluationResultLike[] { const baseTime = new Date('2024-01-01T00:00:00.000Z'); return [ { - eval_id: 'case-alpha', + evalId: 'case-alpha', score: 0.6, hits: 
['alpha'], misses: [], - candidate_answer: 'Alpha answer', - expected_aspect_count: 1, + candidateAnswer: 'Alpha answer', + expectedAspectCount: 1, target: targetName, timestamp: baseTime.toISOString(), reasoning: 'Alpha reasoning', - raw_aspects: ['alpha'], + rawAspects: ['alpha'], }, { - eval_id: 'case-beta', + evalId: 'case-beta', score: 0.9, hits: ['beta', 'gamma'], misses: ['delta'], - candidate_answer: 'Beta answer', - expected_aspect_count: 3, + candidateAnswer: 'Beta answer', + expectedAspectCount: 3, target: targetName, timestamp: new Date(baseTime.getTime() + 60_000).toISOString(), reasoning: 'Beta reasoning', - raw_aspects: ['beta', 'gamma', 'delta'], + rawAspects: ['beta', 'gamma', 'delta'], }, ]; } @@ -109,7 +109,7 @@ export async function runEvaluation( await maybeWriteDiagnostics(options, results); await maybeWritePromptDump( options.promptDumpDir, - results.map((result) => result.eval_id), + results.map((result) => result.evalId), ); for (const result of results) { diff --git a/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts b/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts index 3087b118..93b7e758 100644 --- a/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts +++ b/examples/features/evals/batch-cli/scripts/check-batch-cli-output.ts @@ -5,9 +5,9 @@ function isObject(value: unknown): value is Record { } type EvalInput = { - readonly input_messages?: unknown; - readonly expected_messages?: unknown; - readonly candidate_answer?: unknown; + readonly inputMessages?: unknown; + readonly expectedMessages?: unknown; + readonly candidateAnswer?: unknown; }; function findExpectedDecisionFromExpectedMessages(expectedMessages: unknown): string | undefined { @@ -53,9 +53,9 @@ function main(): void { const input = JSON.parse(stdin) as EvalInput; const expectedDecision = - findExpectedDecisionFromExpectedMessages(input.expected_messages) ?? - findExpectedDecision(input.input_messages); - const candidate = typeof input.candidate_answer === 'string' ? input.candidate_answer : ''; + findExpectedDecisionFromExpectedMessages(input.expectedMessages) ?? + findExpectedDecision(input.inputMessages); + const candidate = typeof input.candidateAnswer === 'string' ? input.candidateAnswer : ''; let candidateObj: unknown; try { @@ -73,7 +73,7 @@ function main(): void { const misses: string[] = []; if (!expectedDecision) { - misses.push('Missing expected decision (expected_messages[].content.decision)'); + misses.push('Missing expected decision (expectedMessages[].content.decision)'); } else { hits.push(`expected.decision present: ${expectedDecision}`); } diff --git a/examples/showcase/export-screening/evals/validate_risk_output.ts b/examples/showcase/export-screening/evals/validate_risk_output.ts index e7ffbd29..c3a4daaf 100644 --- a/examples/showcase/export-screening/evals/validate_risk_output.ts +++ b/examples/showcase/export-screening/evals/validate_risk_output.ts @@ -12,8 +12,8 @@ const VALID_RISK_LEVELS = new Set(['High', 'Medium', 'Low']); const REQUIRED_KEYS = ['riskLevel', 'reasoning']; interface EvalInput { - candidate_answer: string; - expected_messages?: Array<{ + candidateAnswer: string; + expectedMessages?: Array<{ role: string; content: unknown; }>; @@ -170,7 +170,7 @@ async function main(): Promise { process.exit(1); } - const result = validateRiskOutput(evalData.candidate_answer ?? '', evalData.expected_messages); + const result = validateRiskOutput(evalData.candidateAnswer ?? 
'', evalData.expectedMessages); console.log(JSON.stringify(result, null, 2)); } diff --git a/packages/core/src/evaluation/evaluators.ts b/packages/core/src/evaluation/evaluators.ts index 1c233e91..f5f5bf68 100644 --- a/packages/core/src/evaluation/evaluators.ts +++ b/packages/core/src/evaluation/evaluators.ts @@ -452,17 +452,17 @@ export class CodeEvaluator implements Evaluator { const inputPayload = JSON.stringify( { question: context.evalCase.question, - expected_outcome: context.evalCase.expected_outcome, - expected_messages: context.evalCase.expected_messages, - reference_answer: context.evalCase.reference_answer, - candidate_answer: context.candidate, - output_messages: context.outputMessages ?? null, - guideline_files: context.evalCase.guideline_paths, - input_files: context.evalCase.file_paths.filter( + expectedOutcome: context.evalCase.expected_outcome, + expectedMessages: context.evalCase.expected_messages, + referenceAnswer: context.evalCase.reference_answer, + candidateAnswer: context.candidate, + outputMessages: context.outputMessages ?? null, + guidelineFiles: context.evalCase.guideline_paths, + inputFiles: context.evalCase.file_paths.filter( (path) => !context.evalCase.guideline_paths.includes(path), ), - input_messages: context.evalCase.input_messages, - candidate_trace_summary: context.traceSummary ?? null, + inputMessages: context.evalCase.input_messages, + traceSummary: context.traceSummary ?? null, }, null, 2, diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 1ee66fdb..aa095cf2 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -662,21 +662,21 @@ async function evaluateCandidate(options: { return { timestamp: completedAt.toISOString(), - eval_id: evalCase.id, + evalId: evalCase.id, dataset: evalCase.dataset, - conversation_id: evalCase.conversation_id, + conversationId: evalCase.conversation_id, score: score.score, hits: score.hits, misses: score.misses, - candidate_answer: candidate, + candidateAnswer: candidate, target: target.name, reasoning: score.reasoning, - raw_aspects: score.rawAspects, - agent_provider_request: agentProviderRequest, - lm_provider_request: lmProviderRequest, - evaluator_provider_request: evaluatorResults ? undefined : score.evaluatorRawRequest, - evaluator_results: evaluatorResults, - trace_summary: traceSummary, + rawAspects: score.rawAspects, + agentProviderRequest: agentProviderRequest, + lmProviderRequest: lmProviderRequest, + evaluatorProviderRequest: evaluatorResults ? 
undefined : score.evaluatorRawRequest, + evaluatorResults: evaluatorResults, + traceSummary: traceSummary, }; } @@ -816,7 +816,7 @@ async function runEvaluatorList(options: { hits: score.hits, misses: score.misses, reasoning: score.reasoning, - evaluator_provider_request: score.evaluatorRawRequest, + evaluatorProviderRequest: score.evaluatorRawRequest, }); } @@ -848,7 +848,7 @@ async function runEvaluatorList(options: { hits: score.hits, misses: score.misses, reasoning: score.reasoning, - evaluator_provider_request: score.evaluatorRawRequest, + evaluatorProviderRequest: score.evaluatorRawRequest, }); } @@ -910,8 +910,8 @@ async function runEvaluatorList(options: { hits: score.hits, misses: score.misses, reasoning: score.reasoning, - evaluator_provider_request: score.evaluatorRawRequest, - evaluator_results: mapChildResults(score.evaluatorResults), + evaluatorProviderRequest: score.evaluatorRawRequest, + evaluatorResults: mapChildResults(score.evaluatorResults), }); } @@ -1212,17 +1212,17 @@ function buildErrorResult( return { timestamp: timestamp.toISOString(), - eval_id: evalCase.id, + evalId: evalCase.id, dataset: evalCase.dataset, - conversation_id: evalCase.conversation_id, + conversationId: evalCase.conversation_id, score: 0, hits: [], misses: [`Error: ${message}`], - candidate_answer: `Error occurred: ${message}`, + candidateAnswer: `Error occurred: ${message}`, target: targetName, - raw_aspects: [], - agent_provider_request: agentProviderRequest, - lm_provider_request: lmProviderRequest, + rawAspects: [], + agentProviderRequest: agentProviderRequest, + lmProviderRequest: lmProviderRequest, error: message, } satisfies EvaluationResult; } @@ -1282,8 +1282,8 @@ function mapChildResults( hits: child.hits, misses: child.misses, reasoning: child.reasoning, - evaluator_provider_request: child.evaluatorRawRequest, - evaluator_results: mapChildResults(child.evaluatorResults), + evaluatorProviderRequest: child.evaluatorRawRequest, + evaluatorResults: mapChildResults(child.evaluatorResults), })); } diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 96d40107..f686b1d6 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -239,23 +239,23 @@ export interface EvalCase { */ export interface EvaluationResult { readonly timestamp: string; - readonly eval_id: string; + readonly evalId: string; readonly dataset?: string; - readonly conversation_id?: string; + readonly conversationId?: string; readonly score: number; readonly hits: readonly string[]; readonly misses: readonly string[]; - readonly candidate_answer: string; + readonly candidateAnswer: string; readonly target: string; readonly reasoning?: string; - readonly raw_aspects?: readonly string[]; - readonly agent_provider_request?: JsonObject; - readonly lm_provider_request?: JsonObject; - readonly evaluator_provider_request?: JsonObject; - readonly evaluator_results?: readonly EvaluatorResult[]; + readonly rawAspects?: readonly string[]; + readonly agentProviderRequest?: JsonObject; + readonly lmProviderRequest?: JsonObject; + readonly evaluatorProviderRequest?: JsonObject; + readonly evaluatorResults?: readonly EvaluatorResult[]; readonly error?: string; /** Lightweight summary of the execution trace (always included when available) */ - readonly trace_summary?: TraceSummary; + readonly traceSummary?: TraceSummary; } export type EvaluationVerdict = 'pass' | 'fail' | 'borderline'; @@ -269,9 +269,9 @@ export interface EvaluatorResult { readonly hits: 
readonly string[]; readonly misses: readonly string[]; readonly reasoning?: string; - readonly raw_request?: JsonObject; - readonly evaluator_provider_request?: JsonObject; - readonly evaluator_results?: readonly EvaluatorResult[]; + readonly rawRequest?: JsonObject; + readonly evaluatorProviderRequest?: JsonObject; + readonly evaluatorResults?: readonly EvaluatorResult[]; } /** diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index c26b6557..8dd2d5e7 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -441,7 +441,7 @@ describe('CodeEvaluator', () => { const expectedCandidate = '{"decision":"ACCEPT"}'; const script = - "bun -e \"import fs from 'node:fs'; const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expected_messages); const hasCandidate = typeof input.candidate_answer === 'string'; let candidateDecisionOk = false; try { const obj = JSON.parse(input.candidate_answer); candidateDecisionOk = obj && obj.decision === 'ACCEPT'; } catch {} const ok = hasExpected && hasCandidate && candidateDecisionOk; console.log(JSON.stringify({ score: ok ? 1 : 0, hits: [hasExpected ? 'expected_messages present' : null, hasCandidate ? 'candidate_answer present' : null, candidateDecisionOk ? 'candidate_answer parses' : null].filter(Boolean), misses: [hasExpected ? null : 'expected_messages missing', hasCandidate ? null : 'candidate_answer missing', candidateDecisionOk ? null : 'candidate_answer invalid'].filter(Boolean) }));\""; + "bun -e \"import fs from 'node:fs'; const input = JSON.parse(fs.readFileSync(0, 'utf8')); const hasExpected = Array.isArray(input.expectedMessages); const hasCandidate = typeof input.candidateAnswer === 'string'; let candidateDecisionOk = false; try { const obj = JSON.parse(input.candidateAnswer); candidateDecisionOk = obj && obj.decision === 'ACCEPT'; } catch {} const ok = hasExpected && hasCandidate && candidateDecisionOk; console.log(JSON.stringify({ score: ok ? 1 : 0, hits: [hasExpected ? 'expectedMessages present' : null, hasCandidate ? 'candidateAnswer present' : null, candidateDecisionOk ? 'candidateAnswer parses' : null].filter(Boolean), misses: [hasExpected ? null : 'expectedMessages missing', hasCandidate ? null : 'candidateAnswer missing', candidateDecisionOk ? 
null : 'candidateAnswer invalid'].filter(Boolean) }));\""; const evaluator = new CodeEvaluator({ script }); @@ -457,8 +457,8 @@ describe('CodeEvaluator', () => { expect(result.score).toBe(1); expect(result.verdict).toBe('pass'); - expect(result.hits).toContain('expected_messages present'); - expect(result.hits).toContain('candidate_answer present'); - expect(result.hits).toContain('candidate_answer parses'); + expect(result.hits).toContain('expectedMessages present'); + expect(result.hits).toContain('candidateAnswer present'); + expect(result.hits).toContain('candidateAnswer parses'); }); }); diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts index 666d2de1..96f1e27b 100644 --- a/packages/core/test/evaluation/execution-metrics.test.ts +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -4,8 +4,8 @@ import { type TraceSummary, avgToolDurationMs, explorationRatio, - tokensPerTool, mergeExecutionMetrics, + tokensPerTool, } from '../../src/evaluation/trace.js'; describe('Execution Metrics', () => { diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 4701ef93..a8118c6f 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -183,7 +183,7 @@ describe('runTestCase', () => { useCache: true, }); - expect(first.candidate_answer).toContain('structured logging'); + expect(first.candidateAnswer).toContain('structured logging'); const second = await runEvalCase({ evalCase: baseTestCase, @@ -194,7 +194,7 @@ describe('runTestCase', () => { useCache: true, }); - expect(second.candidate_answer).toBe(first.candidate_answer); + expect(second.candidateAnswer).toBe(first.candidateAnswer); expect(provider.callIndex).toBe(1); }); @@ -314,13 +314,13 @@ describe('runTestCase', () => { ); expect(judgeProvider.lastRequest?.systemPrompt).not.toContain('CUSTOM PROMPT CONTENT'); - expect(result.evaluator_results?.[0]?.evaluator_provider_request?.userPrompt).toContain( + expect(result.evaluatorResults?.[0]?.evaluatorProviderRequest?.userPrompt).toContain( 'CUSTOM PROMPT CONTENT', ); - expect(result.evaluator_results?.[0]?.evaluator_provider_request?.systemPrompt).toContain( + expect(result.evaluatorResults?.[0]?.evaluatorProviderRequest?.systemPrompt).toContain( 'You must respond with a single JSON object', ); - expect(result.evaluator_results?.[0]?.evaluator_provider_request?.systemPrompt).not.toContain( + expect(result.evaluatorResults?.[0]?.evaluatorProviderRequest?.systemPrompt).not.toContain( 'CUSTOM PROMPT CONTENT', ); }); @@ -374,7 +374,7 @@ describe('runTestCase', () => { content: '\ncode()\n\nReview', }); expect(chatPrompt[2]).toEqual({ role: 'assistant', content: 'Ack' }); - expect(result.lm_provider_request?.chat_prompt).toBeDefined(); + expect(result.lmProviderRequest?.chat_prompt).toBeDefined(); }); it('omits chatPrompt for single-turn evals', async () => { @@ -429,9 +429,9 @@ describe('runTestCase', () => { evaluators: evaluatorRegistry, }); - expect(result.agent_provider_request).toBeDefined(); - expect(result.lm_provider_request).toBeUndefined(); - expect(result.agent_provider_request?.question).toBe('Explain logging improvements'); + expect(result.agentProviderRequest).toBeDefined(); + expect(result.lmProviderRequest).toBeUndefined(); + expect(result.agentProviderRequest?.question).toBe('Explain logging improvements'); }); }); @@ -474,7 +474,7 @@ describe('runEvalCase trace 
integration', () => { evaluator: 'llm_judge', }; - it('includes trace_summary in result when provider returns outputMessages with tool calls', async () => { + it('includes traceSummary in result when provider returns outputMessages with tool calls', async () => { const outputMessages: OutputMessage[] = [ { role: 'assistant', @@ -504,14 +504,14 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace_summary).toBeDefined(); - expect(result.trace_summary?.eventCount).toBe(1); - expect(result.trace_summary?.toolNames).toEqual(['getWeather']); - expect(result.trace_summary?.toolCallsByName).toEqual({ getWeather: 1 }); - expect(result.trace_summary?.errorCount).toBe(0); + expect(result.traceSummary).toBeDefined(); + expect(result.traceSummary?.eventCount).toBe(1); + expect(result.traceSummary?.toolNames).toEqual(['getWeather']); + expect(result.traceSummary?.toolCallsByName).toEqual({ getWeather: 1 }); + expect(result.traceSummary?.errorCount).toBe(0); }); - it('omits trace_summary when provider returns no outputMessages', async () => { + it('omits traceSummary when provider returns no outputMessages', async () => { const provider = new TraceProvider('mock', { outputMessages: [{ role: 'assistant', content: 'The weather is sunny' }], }); @@ -523,7 +523,7 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace_summary).toBeUndefined(); + expect(result.traceSummary).toBeUndefined(); }); it('runs tool_trajectory evaluator with outputMessages', async () => { @@ -586,9 +586,9 @@ describe('runEvalCase trace integration', () => { }); expect(result.score).toBe(1); - expect(result.evaluator_results).toHaveLength(1); - expect(result.evaluator_results?.[0]?.name).toBe('tool-check'); - expect(result.evaluator_results?.[0]?.verdict).toBe('pass'); + expect(result.evaluatorResults).toHaveLength(1); + expect(result.evaluatorResults?.[0]?.name).toBe('tool-check'); + expect(result.evaluatorResults?.[0]?.verdict).toBe('pass'); }); it('fails tool_trajectory evaluator when no trace available', async () => { @@ -626,8 +626,8 @@ describe('runEvalCase trace integration', () => { }); expect(result.score).toBe(0); - expect(result.evaluator_results?.[0]?.verdict).toBe('fail'); - expect(result.evaluator_results?.[0]?.misses).toContain('No trace available for evaluation'); + expect(result.evaluatorResults?.[0]?.verdict).toBe('fail'); + expect(result.evaluatorResults?.[0]?.misses).toContain('No trace available for evaluation'); }); it('computes correct trace summary with multiple tool calls', async () => { @@ -657,11 +657,11 @@ describe('runEvalCase trace integration', () => { evaluators: evaluatorRegistry, }); - expect(result.trace_summary).toBeDefined(); - expect(result.trace_summary?.eventCount).toBe(4); - expect(result.trace_summary?.toolNames).toEqual(['toolA', 'toolB', 'toolC']); - expect(result.trace_summary?.toolCallsByName).toEqual({ toolA: 2, toolB: 1, toolC: 1 }); - expect(result.trace_summary?.errorCount).toBe(0); + expect(result.traceSummary).toBeDefined(); + expect(result.traceSummary?.eventCount).toBe(4); + expect(result.traceSummary?.toolNames).toEqual(['toolA', 'toolB', 'toolC']); + expect(result.traceSummary?.toolCallsByName).toEqual({ toolA: 2, toolB: 1, toolC: 1 }); + expect(result.traceSummary?.errorCount).toBe(0); }); describe('weighted evaluators', () => { @@ -692,9 +692,9 @@ describe('runEvalCase trace integration', () => { // eval2 weight=1.0, score=0.8 -> 0.8 // Total: (1.6 + 0.8) / (2.0 + 1.0) 
= 2.4 / 3.0 = 0.8 expect(result.score).toBeCloseTo(0.8); - expect(result.evaluator_results).toHaveLength(2); - expect(result.evaluator_results?.[0]?.weight).toBe(2.0); - expect(result.evaluator_results?.[1]?.weight).toBe(1.0); + expect(result.evaluatorResults).toHaveLength(2); + expect(result.evaluatorResults?.[0]?.weight).toBe(2.0); + expect(result.evaluatorResults?.[1]?.weight).toBe(1.0); }); it('defaults missing weights to 1.0', async () => { @@ -724,8 +724,8 @@ describe('runEvalCase trace integration', () => { // eval2 weight=1.0 (default), score=0.8 -> 0.8 // Total: (2.4 + 0.8) / (3.0 + 1.0) = 3.2 / 4.0 = 0.8 expect(result.score).toBeCloseTo(0.8); - expect(result.evaluator_results?.[0]?.weight).toBe(3.0); - expect(result.evaluator_results?.[1]?.weight).toBe(1.0); + expect(result.evaluatorResults?.[0]?.weight).toBe(3.0); + expect(result.evaluatorResults?.[1]?.weight).toBe(1.0); }); it('excludes evaluators with weight 0', async () => { @@ -755,8 +755,8 @@ describe('runEvalCase trace integration', () => { // eval2 weight=1.0, score=0.8 -> 0.8 // Total: (0 + 0.8) / (0 + 1.0) = 0.8 / 1.0 = 0.8 expect(result.score).toBeCloseTo(0.8); - expect(result.evaluator_results?.[0]?.weight).toBe(0); - expect(result.evaluator_results?.[1]?.weight).toBe(1.0); + expect(result.evaluatorResults?.[0]?.weight).toBe(0); + expect(result.evaluatorResults?.[1]?.weight).toBe(1.0); }); it('returns 0 when all evaluators have weight 0', async () => { From f0f4b78062389a3840bf47caff538a9e10d36c7b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 13:37:16 +0000 Subject: [PATCH 4/5] chore: add claude settings with superpowers plugin --- .claude/settings.json | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .claude/settings.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..07fa4272 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,5 @@ +{ + "enabledPlugins": { + "superpowers@superpowers-marketplace": true + } +} From 4e6a0465ebc62cac064d39d8aea2804b919835a1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 1 Jan 2026 14:07:56 +0000 Subject: [PATCH 5/5] refactor: convert Python plugin scripts to TypeScript - Convert efficiency_scorer.py, pairwise_tool_compare.py, and tool_selection_judge.py to TypeScript using bun - Add execution-metrics example to examples/features/evals/ - Add integration test for code judge receiving traceSummary - Update README and YAML to use camelCase field names - Update tasks.md to reflect completed work --- .../execution-metrics-demo.yaml | 88 +++++++ .../scripts/check-efficiency.ts | 141 ++++++++++ .../tool-evaluation-plugins/README.md | 37 +-- .../scripts/efficiency-scorer.ts | 239 +++++++++++++++++ .../scripts/efficiency_scorer.py | 214 --------------- .../scripts/pairwise-tool-compare.ts | 243 ++++++++++++++++++ .../scripts/pairwise_tool_compare.py | 220 ---------------- .../scripts/tool-selection-judge.ts | 186 ++++++++++++++ .../scripts/tool_selection_judge.py | 166 ------------ .../tool-eval-demo.yaml | 10 +- .../changes/add-execution-metrics/tasks.md | 6 +- .../test/evaluation/execution-metrics.test.ts | 111 ++++++++ 12 files changed, 1028 insertions(+), 633 deletions(-) create mode 100644 examples/features/evals/execution-metrics/execution-metrics-demo.yaml create mode 100644 examples/features/evals/execution-metrics/scripts/check-efficiency.ts create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts delete mode 100644 
examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts delete mode 100644 examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py create mode 100644 examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts delete mode 100644 examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py diff --git a/examples/features/evals/execution-metrics/execution-metrics-demo.yaml b/examples/features/evals/execution-metrics/execution-metrics-demo.yaml new file mode 100644 index 00000000..22563e8b --- /dev/null +++ b/examples/features/evals/execution-metrics/execution-metrics-demo.yaml @@ -0,0 +1,88 @@ +# Execution Metrics Demo +# Demonstrates how to use execution metrics in evaluation +# +# Execution metrics capture runtime information from provider invocations: +# - tokenUsage: { input, output, cached? } - token consumption +# - costUsd: API cost in USD +# - durationMs: execution time in milliseconds +# +# These metrics are available in: +# 1. TraceSummary (included in evaluation results) +# 2. Code judge stdin (for custom metric-based evaluation) +# +# Run: cd examples/features/evals/execution-metrics +# npx agentv eval execution-metrics-demo.yaml --target your_target + +$schema: agentv-eval-v2 +description: Demonstrates execution metrics collection and evaluation + +target: default + +evalcases: + # ========================================== + # Example 1: Basic metrics collection + # Metrics are automatically included in results when available + # ========================================== + - id: metrics-collection + + expected_outcome: |- + Agent responds to a simple query. Execution metrics are captured + automatically and included in the evaluation result. + + input_messages: + - role: user + content: What is 2 + 2? + + execution: + evaluators: + - name: basic-check + type: llm_judge + + # ========================================== + # Example 2: Metric-aware code judge + # Use custom thresholds to evaluate efficiency + # ========================================== + - id: efficiency-evaluation + + expected_outcome: |- + Agent efficiently answers a simple question without excessive + token usage or tool calls. + + input_messages: + - role: user + content: List three primary colors. + + execution: + evaluators: + # Custom code judge that evaluates efficiency metrics + - name: efficiency-check + type: code_judge + script: bun run scripts/check-efficiency.ts + + # ========================================== + # Example 3: Combined trajectory and metrics + # Evaluate both tool usage and efficiency together + # ========================================== + - id: combined-evaluation + + expected_outcome: |- + Agent searches for information and provides a response. + Evaluation checks both tool trajectory and execution efficiency. + + input_messages: + - role: user + content: Find information about the weather in New York. 
+ + execution: + evaluators: + # Built-in: Check tool sequence + - name: trajectory-check + type: tool_trajectory + mode: any_order + minimums: + search: 1 + + # Custom: Check efficiency metrics + - name: metrics-check + type: code_judge + script: bun run scripts/check-efficiency.ts diff --git a/examples/features/evals/execution-metrics/scripts/check-efficiency.ts b/examples/features/evals/execution-metrics/scripts/check-efficiency.ts new file mode 100644 index 00000000..3a47158e --- /dev/null +++ b/examples/features/evals/execution-metrics/scripts/check-efficiency.ts @@ -0,0 +1,141 @@ +#!/usr/bin/env bun +/** + * Efficiency Check - Code Judge for Execution Metrics + * + * Demonstrates how to evaluate agent efficiency using execution metrics + * available in the TraceSummary. + * + * Input (stdin JSON): + * - traceSummary: Contains execution metrics when available + * - eventCount: Number of tool calls + * - tokenUsage?: { input, output, cached? } + * - costUsd?: API cost + * - durationMs?: Execution time + * + * Output (stdout JSON): + * - score: 0.0-1.0 + * - hits: Efficiency wins + * - misses: Efficiency issues + * - reasoning: Explanation + */ + +interface TraceSummary { + eventCount: number; + toolNames: string[]; + toolCallsByName: Record; + errorCount: number; + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; +} + +interface EvalInput { + traceSummary?: TraceSummary; + expectedOutcome?: string; +} + +interface EvalOutput { + score: number; + hits: string[]; + misses: string[]; + reasoning: string; +} + +// Configurable thresholds +const THRESHOLDS = { + maxToolCalls: 5, + maxTokens: 2000, + maxCostUsd: 0.01, + maxDurationMs: 10000, +}; + +function checkEfficiency(input: EvalInput): EvalOutput { + const hits: string[] = []; + const misses: string[] = []; + const checks: boolean[] = []; + + const summary = input.traceSummary; + + if (!summary) { + return { + score: 0.5, + hits: [], + misses: ['No trace summary available'], + reasoning: 'Cannot evaluate efficiency without trace data', + }; + } + + // Check tool call count + if (summary.eventCount <= THRESHOLDS.maxToolCalls) { + hits.push(`Tool calls (${summary.eventCount}) within limit (${THRESHOLDS.maxToolCalls})`); + checks.push(true); + } else { + misses.push(`Too many tool calls: ${summary.eventCount} (max: ${THRESHOLDS.maxToolCalls})`); + checks.push(false); + } + + // Check token usage if available + if (summary.tokenUsage) { + const totalTokens = summary.tokenUsage.input + summary.tokenUsage.output; + if (totalTokens <= THRESHOLDS.maxTokens) { + hits.push(`Token usage (${totalTokens}) within limit`); + checks.push(true); + } else { + misses.push(`High token usage: ${totalTokens} (max: ${THRESHOLDS.maxTokens})`); + checks.push(false); + } + } + + // Check cost if available + if (summary.costUsd !== undefined) { + if (summary.costUsd <= THRESHOLDS.maxCostUsd) { + hits.push(`Cost ($${summary.costUsd.toFixed(4)}) within budget`); + checks.push(true); + } else { + misses.push(`High cost: $${summary.costUsd.toFixed(4)} (max: $${THRESHOLDS.maxCostUsd})`); + checks.push(false); + } + } + + // Check duration if available + if (summary.durationMs !== undefined) { + if (summary.durationMs <= THRESHOLDS.maxDurationMs) { + hits.push(`Duration (${summary.durationMs}ms) within limit`); + checks.push(true); + } else { + misses.push(`Slow execution: ${summary.durationMs}ms (max: ${THRESHOLDS.maxDurationMs}ms)`); + checks.push(false); + } + } + + // Calculate score + const passCount = 
checks.filter((c) => c).length; + const score = checks.length > 0 ? passCount / checks.length : 0.5; + + return { + score: Math.round(score * 100) / 100, + hits: hits.slice(0, 4), + misses: misses.slice(0, 4), + reasoning: `Checked ${checks.length} efficiency metrics: ${passCount} passed, ${checks.length - passCount} failed`, + }; +} + +async function main(): Promise { + try { + const stdin = await Bun.stdin.text(); + const input = JSON.parse(stdin) as EvalInput; + const result = checkEfficiency(input); + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + const errorResult: EvalOutput = { + score: 0, + hits: [], + misses: [`Error: ${error instanceof Error ? error.message : String(error)}`], + reasoning: 'Evaluation failed due to error', + }; + console.log(JSON.stringify(errorResult, null, 2)); + process.exit(1); + } +} + +main(); diff --git a/examples/showcase/tool-evaluation-plugins/README.md b/examples/showcase/tool-evaluation-plugins/README.md index 94350618..c46403c2 100644 --- a/examples/showcase/tool-evaluation-plugins/README.md +++ b/examples/showcase/tool-evaluation-plugins/README.md @@ -16,29 +16,18 @@ This showcase demonstrates **plugin-based tool evaluation patterns** that comple ## Plugin Examples -### 1. Tool Selection Evaluator (`tool_selection_judge.py`) +### 1. Tool Selection Evaluator (`tool-selection-judge.ts`) -Evaluates whether the agent selected the **right tools** for the task. Uses LLM-as-judge pattern to semantically assess tool choices. +Evaluates whether the agent selected the **right tools** for the task. Uses heuristic matching to assess tool choices against task keywords. ```yaml evaluators: - name: tool-selection type: code_judge - script: scripts/tool_selection_judge.py + script: bun run scripts/tool-selection-judge.ts ``` -### 2. Tool Input Validator (`tool_input_validator.ts`) - -Validates that tool **arguments are semantically appropriate** (not just syntactically correct). Checks if argument values make sense in context. - -```yaml -evaluators: - - name: input-validation - type: code_judge - script: scripts/tool_input_validator.ts -``` - -### 3. Tool Efficiency Scorer (`efficiency_scorer.py`) +### 2. Tool Efficiency Scorer (`efficiency-scorer.ts`) Computes efficiency metrics and scores based on configurable thresholds. Demonstrates how to use execution metrics in evaluation. @@ -46,10 +35,10 @@ Computes efficiency metrics and scores based on configurable thresholds. Demonst evaluators: - name: efficiency type: code_judge - script: scripts/efficiency_scorer.py + script: bun run scripts/efficiency-scorer.ts ``` -### 4. Pairwise Tool Comparison (`pairwise_tool_compare.py`) +### 3. Pairwise Tool Comparison (`pairwise-tool-compare.ts`) Compares two agent responses for tool usage quality with position bias mitigation (runs comparison twice with swapped order). 
@@ -57,7 +46,7 @@ Compares two agent responses for tool usage quality with position bias mitigatio evaluators: - name: pairwise-compare type: code_judge - script: scripts/pairwise_tool_compare.py + script: bun run scripts/pairwise-tool-compare.ts ``` ## Running the Examples @@ -74,9 +63,9 @@ All code judges receive a JSON object on stdin with: ```json { "question": "User's question/task", - "expected_outcome": "Expected behavior description", - "candidate_answer": "Agent's final response", - "output_messages": [ + "expectedOutcome": "Expected behavior description", + "candidateAnswer": "Agent's final response", + "outputMessages": [ { "role": "assistant", "content": "...", @@ -91,13 +80,11 @@ All code judges receive a JSON object on stdin with: "content": "Tool result..." } ], - "candidate_trace_summary": { + "traceSummary": { "eventCount": 5, "toolNames": ["search", "fetch"], "toolCallsByName": { "search": 2, "fetch": 1 }, - "errorCount": 0 - }, - "execution_metrics": { + "errorCount": 0, "tokenUsage": { "input": 1000, "output": 500 }, "durationMs": 3500, "costUsd": 0.0015 diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts b/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts new file mode 100644 index 00000000..93682f24 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/efficiency-scorer.ts @@ -0,0 +1,239 @@ +#!/usr/bin/env bun +/** + * Tool Efficiency Scorer - Code Judge Plugin + * + * Evaluates agent efficiency based on execution metrics: + * - Token usage relative to task complexity + * - Number of tool calls (redundancy detection) + * - Exploration ratio (read-only vs action tools) + * - Cost efficiency + * + * Why this is a plugin (not built-in): + * - Efficiency thresholds are domain-specific + * - What's "efficient" depends on the task type + * - Different projects have different cost/performance tradeoffs + * + * Usage in eval YAML: + * evaluators: + * - name: efficiency + * type: code_judge + * script: bun run scripts/efficiency-scorer.ts + * + * Input (stdin JSON): + * - traceSummary: Tool call statistics + * - expectedOutcome: Task description (for complexity estimation) + * + * Output (stdout JSON): + * - score: 0.0-1.0 efficiency score + * - hits: Efficiency wins + * - misses: Efficiency issues + * - reasoning: Explanation + */ + +interface TraceSummary { + eventCount: number; + toolCallsByName: Record; + tokenUsage?: { input: number; output: number; cached?: number }; + costUsd?: number; + durationMs?: number; +} + +interface EvalInput { + traceSummary?: TraceSummary; + expectedOutcome?: string; +} + +interface EvalOutput { + score: number; + hits: string[]; + misses: string[]; + reasoning: string; +} + +// Configurable thresholds (customize for your domain) +const THRESHOLDS = { + // Maximum tool calls before penalty + maxToolCalls: 10, + // Ideal exploration ratio (read-only tools / total) + targetExplorationRatio: 0.6, + explorationTolerance: 0.2, + // Token budgets + maxTokensSimple: 2000, + maxTokensComplex: 10000, + // Cost thresholds (USD) + maxCostSimple: 0.01, + maxCostComplex: 0.1, +}; + +// Tools considered "exploration" (read-only) +const EXPLORATION_TOOLS = new Set([ + 'read', + 'grep', + 'glob', + 'search', + 'list', + 'find', + 'get', + 'fetch', + 'query', + 'inspect', + 'view', +]); + +function estimateTaskComplexity(expectedOutcome: string): 'simple' | 'complex' { + const text = expectedOutcome.toLowerCase(); + const complexIndicators = [ + 'multiple', + 'several', + 'comprehensive', + 
'thorough', + 'analyze', + 'compare', + 'synthesize', + 'integrate', + ]; + if (complexIndicators.some((indicator) => text.includes(indicator))) { + return 'complex'; + } + return 'simple'; +} + +function calculateExplorationRatio(traceSummary: TraceSummary): number { + const toolCalls = traceSummary.toolCallsByName; + const total = Object.values(toolCalls).reduce((sum, count) => sum + count, 0); + if (total === 0) { + return 0; + } + + let explorationCount = 0; + for (const [tool, count] of Object.entries(toolCalls)) { + const toolLower = tool.toLowerCase(); + if ([...EXPLORATION_TOOLS].some((exp) => toolLower.includes(exp))) { + explorationCount += count; + } + } + return explorationCount / total; +} + +function evaluateEfficiency( + traceSummary: TraceSummary | undefined, + expectedOutcome: string, +): EvalOutput { + const hits: string[] = []; + const misses: string[] = []; + const scores: number[] = []; + + const complexity = estimateTaskComplexity(expectedOutcome); + + // 1. Tool call count evaluation + if (traceSummary) { + const toolCount = traceSummary.eventCount; + const maxCalls = THRESHOLDS.maxToolCalls; + + if (toolCount <= maxCalls) { + hits.push(`Tool calls (${toolCount}) within budget (${maxCalls})`); + scores.push(1.0); + } else { + const penalty = Math.min((toolCount - maxCalls) / maxCalls, 1.0); + scores.push(1.0 - penalty); + misses.push(`Excessive tool calls: ${toolCount} (budget: ${maxCalls})`); + } + + // 2. Exploration ratio evaluation + const expRatio = calculateExplorationRatio(traceSummary); + const target = THRESHOLDS.targetExplorationRatio; + const tolerance = THRESHOLDS.explorationTolerance; + + if (Math.abs(expRatio - target) <= tolerance) { + hits.push(`Good exploration ratio: ${expRatio.toFixed(2)}`); + scores.push(1.0); + } else if (expRatio < target - tolerance) { + scores.push(0.7); + misses.push(`Low exploration ratio: ${expRatio.toFixed(2)} (target: ${target.toFixed(2)})`); + } else { + scores.push(0.7); + misses.push(`High exploration ratio: ${expRatio.toFixed(2)} (target: ${target.toFixed(2)})`); + } + + // 3. Token usage evaluation + if (traceSummary.tokenUsage) { + const tokens = traceSummary.tokenUsage; + const totalTokens = tokens.input + tokens.output; + const maxTokens = + complexity === 'complex' ? THRESHOLDS.maxTokensComplex : THRESHOLDS.maxTokensSimple; + + if (totalTokens <= maxTokens) { + hits.push(`Token usage (${totalTokens}) within budget`); + scores.push(1.0); + } else { + const penalty = Math.min((totalTokens - maxTokens) / maxTokens, 1.0); + scores.push(1.0 - penalty * 0.5); // Softer penalty + misses.push(`High token usage: ${totalTokens} (budget: ${maxTokens})`); + } + } + + // 4. Cost evaluation + if (traceSummary.costUsd !== undefined) { + const cost = traceSummary.costUsd; + const maxCost = + complexity === 'complex' ? THRESHOLDS.maxCostComplex : THRESHOLDS.maxCostSimple; + + if (cost <= maxCost) { + hits.push(`Cost ($${cost.toFixed(4)}) within budget`); + scores.push(1.0); + } else { + scores.push(0.5); + misses.push(`High cost: $${cost.toFixed(4)} (budget: $${maxCost.toFixed(4)})`); + } + } + } + + // Calculate final score + if (scores.length === 0) { + return { + score: 0.5, + hits: ['No efficiency metrics available'], + misses: [], + reasoning: 'Could not evaluate efficiency - no metrics provided', + }; + } + + const finalScore = scores.reduce((sum, s) => sum + s, 0) / scores.length; + + const reasoning = + `Task complexity: ${complexity}. ` + + `Evaluated ${scores.length} efficiency criteria. 
` + + `Score: ${finalScore.toFixed(2)}`; + + return { + score: Math.round(finalScore * 100) / 100, + hits: hits.slice(0, 4), + misses: misses.slice(0, 4), + reasoning, + }; +} + +async function main(): Promise { + try { + const stdin = await Bun.stdin.text(); + const inputData = JSON.parse(stdin) as EvalInput; + + const traceSummary = inputData.traceSummary; + const expectedOutcome = inputData.expectedOutcome ?? ''; + + const result = evaluateEfficiency(traceSummary, expectedOutcome); + + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + const errorResult: EvalOutput = { + score: 0, + hits: [], + misses: [`Evaluator error: ${error instanceof Error ? error.message : String(error)}`], + reasoning: `Evaluation failed: ${error instanceof Error ? error.message : String(error)}`, + }; + console.log(JSON.stringify(errorResult, null, 2)); + process.exit(1); + } +} + +main(); diff --git a/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py b/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py deleted file mode 100644 index 15c68ecc..00000000 --- a/examples/showcase/tool-evaluation-plugins/scripts/efficiency_scorer.py +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env python3 -""" -Tool Efficiency Scorer - Code Judge Plugin - -Evaluates agent efficiency based on execution metrics: -- Token usage relative to task complexity -- Number of tool calls (redundancy detection) -- Exploration ratio (read-only vs action tools) -- Cost efficiency - -Why this is a plugin (not built-in): -- Efficiency thresholds are domain-specific -- What's "efficient" depends on the task type -- Different projects have different cost/performance tradeoffs - -Usage in eval YAML: - evaluators: - - name: efficiency - type: code_judge - script: scripts/efficiency_scorer.py - -Input (stdin JSON): - - candidate_trace_summary: Tool call statistics - - execution_metrics: Token usage, cost, duration (if available) - - expected_outcome: Task description (for complexity estimation) - -Output (stdout JSON): - - score: 0.0-1.0 efficiency score - - hits: Efficiency wins - - misses: Efficiency issues - - reasoning: Explanation -""" - -import json -import sys -from typing import Any - - -# Configurable thresholds (customize for your domain) -THRESHOLDS = { - # Maximum tool calls before penalty - "max_tool_calls": 10, - # Ideal exploration ratio (read-only tools / total) - "target_exploration_ratio": 0.6, - "exploration_tolerance": 0.2, - # Token budgets - "max_tokens_simple": 2000, - "max_tokens_complex": 10000, - # Cost thresholds (USD) - "max_cost_simple": 0.01, - "max_cost_complex": 0.10, -} - -# Tools considered "exploration" (read-only) -EXPLORATION_TOOLS = { - "read", "grep", "glob", "search", "list", "find", - "get", "fetch", "query", "inspect", "view", -} - - -def estimate_task_complexity(expected_outcome: str) -> str: - """Estimate task complexity from expected outcome description.""" - text = expected_outcome.lower() - complex_indicators = [ - "multiple", "several", "comprehensive", "thorough", - "analyze", "compare", "synthesize", "integrate", - ] - if any(indicator in text for indicator in complex_indicators): - return "complex" - return "simple" - - -def calculate_exploration_ratio(trace_summary: dict) -> float: - """Calculate ratio of exploration tools to total tools.""" - tool_calls = trace_summary.get("toolCallsByName", {}) - total = sum(tool_calls.values()) - if total == 0: - return 0.0 - - exploration_count = sum( - count for tool, count in tool_calls.items() - if any(exp 
in tool.lower() for exp in EXPLORATION_TOOLS) - ) - return exploration_count / total - - -def evaluate_efficiency( - trace_summary: dict | None, - execution_metrics: dict | None, - expected_outcome: str, -) -> dict[str, Any]: - """Evaluate agent efficiency against configurable thresholds.""" - hits = [] - misses = [] - scores = [] - - complexity = estimate_task_complexity(expected_outcome) - - # 1. Tool call count evaluation - if trace_summary: - tool_count = trace_summary.get("eventCount", 0) - max_calls = THRESHOLDS["max_tool_calls"] - - if tool_count <= max_calls: - hits.append(f"Tool calls ({tool_count}) within budget ({max_calls})") - scores.append(1.0) - else: - penalty = min((tool_count - max_calls) / max_calls, 1.0) - scores.append(1.0 - penalty) - misses.append(f"Excessive tool calls: {tool_count} (budget: {max_calls})") - - # 2. Exploration ratio evaluation - exp_ratio = calculate_exploration_ratio(trace_summary) - target = THRESHOLDS["target_exploration_ratio"] - tolerance = THRESHOLDS["exploration_tolerance"] - - if abs(exp_ratio - target) <= tolerance: - hits.append(f"Good exploration ratio: {exp_ratio:.2f}") - scores.append(1.0) - elif exp_ratio < target - tolerance: - scores.append(0.7) - misses.append(f"Low exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") - else: - scores.append(0.7) - misses.append(f"High exploration ratio: {exp_ratio:.2f} (target: {target:.2f})") - - # 3. Token usage evaluation - if execution_metrics and "tokenUsage" in execution_metrics: - tokens = execution_metrics["tokenUsage"] - total_tokens = tokens.get("input", 0) + tokens.get("output", 0) - max_tokens = ( - THRESHOLDS["max_tokens_complex"] - if complexity == "complex" - else THRESHOLDS["max_tokens_simple"] - ) - - if total_tokens <= max_tokens: - hits.append(f"Token usage ({total_tokens}) within budget") - scores.append(1.0) - else: - penalty = min((total_tokens - max_tokens) / max_tokens, 1.0) - scores.append(1.0 - penalty * 0.5) # Softer penalty - misses.append(f"High token usage: {total_tokens} (budget: {max_tokens})") - - # 4. Cost evaluation - if execution_metrics and "costUsd" in execution_metrics: - cost = execution_metrics["costUsd"] - max_cost = ( - THRESHOLDS["max_cost_complex"] - if complexity == "complex" - else THRESHOLDS["max_cost_simple"] - ) - - if cost <= max_cost: - hits.append(f"Cost (${cost:.4f}) within budget") - scores.append(1.0) - else: - scores.append(0.5) - misses.append(f"High cost: ${cost:.4f} (budget: ${max_cost:.4f})") - - # Calculate final score - if not scores: - return { - "score": 0.5, - "hits": ["No efficiency metrics available"], - "misses": [], - "reasoning": "Could not evaluate efficiency - no metrics provided", - } - - final_score = sum(scores) / len(scores) - - reasoning = ( - f"Task complexity: {complexity}. " - f"Evaluated {len(scores)} efficiency criteria. 
" - f"Score: {final_score:.2f}" - ) - - return { - "score": round(final_score, 2), - "hits": hits[:4], - "misses": misses[:4], - "reasoning": reasoning, - } - - -def main(): - try: - input_data = json.loads(sys.stdin.read()) - - trace_summary = input_data.get("candidate_trace_summary") - execution_metrics = input_data.get("execution_metrics") - expected_outcome = input_data.get("expected_outcome", "") - - result = evaluate_efficiency( - trace_summary=trace_summary, - execution_metrics=execution_metrics, - expected_outcome=expected_outcome, - ) - - print(json.dumps(result, indent=2)) - - except Exception as e: - error_result = { - "score": 0.0, - "hits": [], - "misses": [f"Evaluator error: {str(e)}"], - "reasoning": f"Evaluation failed: {str(e)}", - } - print(json.dumps(error_result, indent=2)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts new file mode 100644 index 00000000..cfee4d06 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/pairwise-tool-compare.ts @@ -0,0 +1,243 @@ +#!/usr/bin/env bun +/** + * Pairwise Tool Comparison - Code Judge Plugin + * + * Compares tool usage quality between two agent responses with + * position bias mitigation (runs comparison twice with swapped order). + * + * Why this is a plugin (not built-in): + * - Pairwise comparison is a specialized evaluation pattern + * - Requires reference response (not always available) + * - Position bias mitigation adds complexity + * - Not all evaluations need comparative assessment + * + * Usage in eval YAML: + * evaluators: + * - name: pairwise-compare + * type: code_judge + * script: bun run scripts/pairwise-tool-compare.ts + * + * Input (stdin JSON): + * - candidateAnswer: Agent's response (Response A) + * - referenceAnswer: Reference/baseline response (Response B) + * - outputMessages: Tool calls from candidate + * - expectedOutcome: Task description + * + * Output (stdout JSON): + * - score: 0.0-1.0 (1.0 = candidate wins, 0.5 = tie, 0.0 = reference wins) + * - hits: Candidate advantages + * - misses: Reference advantages + * - reasoning: Comparison explanation with bias check result + */ + +interface OutputMessage { + role: string; + toolCalls?: Array<{ tool: string; args?: Record }>; +} + +interface EvalInput { + candidateAnswer?: string; + referenceAnswer?: string; + outputMessages?: OutputMessage[]; + referenceOutputMessages?: OutputMessage[]; + expectedOutcome?: string; +} + +interface EvalOutput { + score: number; + hits: string[]; + misses: string[]; + reasoning: string; +} + +interface ToolSummary { + tools: string[]; + count: number; + unique: string[]; +} + +interface CompareResult { + winner: 'A' | 'B' | 'TIE'; + aAdvantages: string[]; + bAdvantages: string[]; +} + +function extractToolSummary(messages: OutputMessage[] | undefined): ToolSummary { + if (!messages) { + return { tools: [], count: 0, unique: [] }; + } + + const tools: string[] = []; + for (const msg of messages) { + if (msg.role === 'assistant' && msg.toolCalls) { + for (const call of msg.toolCalls) { + tools.push(call.tool ?? 'unknown'); + } + } + } + + return { + tools, + count: tools.length, + unique: [...new Set(tools)], + }; +} + +function compareResponses( + responseA: string, + responseB: string, + toolsA: ToolSummary, + toolsB: ToolSummary, +): CompareResult { + const aAdvantages: string[] = []; + const bAdvantages: string[] = []; + + // 1. 
Compare tool count efficiency + if (toolsA.count < toolsB.count && toolsA.count > 0) { + aAdvantages.push(`More efficient: ${toolsA.count} vs ${toolsB.count} tools`); + } else if (toolsB.count < toolsA.count && toolsB.count > 0) { + bAdvantages.push(`More efficient: ${toolsB.count} vs ${toolsA.count} tools`); + } + + // 2. Compare tool diversity + if (toolsA.unique.length > toolsB.unique.length) { + aAdvantages.push(`More diverse tools: ${toolsA.unique.length} types`); + } else if (toolsB.unique.length > toolsA.unique.length) { + bAdvantages.push(`More diverse tools: ${toolsB.unique.length} types`); + } + + // 3. Compare response length (proxy for completeness) + const lenA = responseA.length; + const lenB = responseB.length; + if (lenA > lenB * 1.2) { + aAdvantages.push('More comprehensive response'); + } else if (lenB > lenA * 1.2) { + bAdvantages.push('More comprehensive response'); + } + + // 4. Check for no tools (penalty) + if (toolsA.count === 0 && toolsB.count > 0) { + bAdvantages.push('Response B used tools; A did not'); + } else if (toolsB.count === 0 && toolsA.count > 0) { + aAdvantages.push('Response A used tools; B did not'); + } + + // Determine winner + const aScore = aAdvantages.length; + const bScore = bAdvantages.length; + + if (aScore > bScore) { + return { winner: 'A', aAdvantages, bAdvantages }; + } else if (bScore > aScore) { + return { winner: 'B', aAdvantages, bAdvantages }; + } else { + return { winner: 'TIE', aAdvantages, bAdvantages }; + } +} + +function pairwiseWithBiasMitigation( + candidate: string, + reference: string, + candidateTools: ToolSummary, + referenceTools: ToolSummary, +): EvalOutput { + // Pass 1: Candidate as A, Reference as B + const pass1 = compareResponses(candidate, reference, candidateTools, referenceTools); + + // Pass 2: Reference as A, Candidate as B (swapped) + const pass2 = compareResponses(reference, candidate, referenceTools, candidateTools); + + // Map pass2 result back (if A wins in pass2, that means Reference won) + const pass2Mapped: 'A' | 'B' | 'TIE' = + pass2.winner === 'A' ? 'B' : pass2.winner === 'B' ? 'A' : 'TIE'; + + // Check consistency + const consistent = pass1.winner === pass2Mapped; + + let finalWinner: 'A' | 'B' | 'TIE'; + let confidence: string; + + if (consistent) { + finalWinner = pass1.winner; + confidence = 'high'; + } else { + // Inconsistent results indicate position bias - return TIE + finalWinner = 'TIE'; + confidence = 'low (position bias detected)'; + } + + // Convert to score (candidate perspective) + let score: number; + if (finalWinner === 'A') { + // Candidate wins + score = 1.0; + } else if (finalWinner === 'B') { + // Reference wins + score = 0.0; + } else { + // TIE + score = 0.5; + } + + const hits = pass1.aAdvantages.slice(0, 4); // Candidate advantages + const misses = pass1.bAdvantages.slice(0, 4); // Reference advantages + + const reasoning = + `Pass 1: ${pass1.winner} wins. ` + + `Pass 2 (swapped): ${pass2.winner} wins (maps to ${pass2Mapped}). ` + + `Consistency: ${consistent}. ` + + `Final: ${finalWinner} (${confidence} confidence)`; + + return { score, hits, misses, reasoning }; +} + +async function main(): Promise { + try { + const stdin = await Bun.stdin.text(); + const inputData = JSON.parse(stdin) as EvalInput; + + const candidate = inputData.candidateAnswer ?? ''; + const reference = inputData.referenceAnswer ?? ''; + const outputMessages = inputData.outputMessages ?? 
[]; + + // If no reference, we can't do pairwise comparison + if (!reference) { + console.log( + JSON.stringify( + { + score: 0.5, + hits: ['Candidate response provided'], + misses: ['No reference for comparison'], + reasoning: 'Pairwise comparison requires referenceAnswer field', + }, + null, + 2, + ), + ); + return; + } + + // Extract tool summaries + const candidateTools = extractToolSummary(outputMessages); + + // For reference, we'd need referenceOutputMessages + // In practice, this would come from a baseline run + const referenceMessages = inputData.referenceOutputMessages ?? []; + const referenceTools = extractToolSummary(referenceMessages); + + const result = pairwiseWithBiasMitigation(candidate, reference, candidateTools, referenceTools); + + console.log(JSON.stringify(result, null, 2)); + } catch (error) { + const errorResult: EvalOutput = { + score: 0, + hits: [], + misses: [`Evaluator error: ${error instanceof Error ? error.message : String(error)}`], + reasoning: `Evaluation failed: ${error instanceof Error ? error.message : String(error)}`, + }; + console.log(JSON.stringify(errorResult, null, 2)); + process.exit(1); + } +} + +main(); diff --git a/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py b/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py deleted file mode 100644 index e0bc842c..00000000 --- a/examples/showcase/tool-evaluation-plugins/scripts/pairwise_tool_compare.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 -""" -Pairwise Tool Comparison - Code Judge Plugin - -Compares tool usage quality between two agent responses with -position bias mitigation (runs comparison twice with swapped order). - -Why this is a plugin (not built-in): -- Pairwise comparison is a specialized evaluation pattern -- Requires reference response (not always available) -- Position bias mitigation adds complexity -- Not all evaluations need comparative assessment - -Usage in eval YAML: - evaluators: - - name: pairwise-compare - type: code_judge - script: scripts/pairwise_tool_compare.py - -Input (stdin JSON): - - candidate_answer: Agent's response (Response A) - - reference_answer: Reference/baseline response (Response B) - - output_messages: Tool calls from candidate - - expected_outcome: Task description - -Output (stdout JSON): - - score: 0.0-1.0 (1.0 = candidate wins, 0.5 = tie, 0.0 = reference wins) - - hits: Candidate advantages - - misses: Reference advantages - - reasoning: Comparison explanation with bias check result -""" - -import json -import sys -from typing import Any - - -def extract_tool_summary(messages: list[dict] | None) -> dict: - """Extract tool usage summary from messages.""" - if not messages: - return {"tools": [], "count": 0} - - tools = [] - for msg in messages: - if msg.get("role") == "assistant" and msg.get("toolCalls"): - for call in msg["toolCalls"]: - tools.append(call.get("tool", "unknown")) - - return { - "tools": tools, - "count": len(tools), - "unique": list(set(tools)), - } - - -def compare_responses( - response_a: str, - response_b: str, - tools_a: dict, - tools_b: dict, - task: str, -) -> dict[str, Any]: - """ - Compare two responses for tool usage quality. - Returns winner and reasoning. - """ - a_advantages = [] - b_advantages = [] - - # 1. 
Compare tool count efficiency - if tools_a["count"] < tools_b["count"] and tools_a["count"] > 0: - a_advantages.append(f"More efficient: {tools_a['count']} vs {tools_b['count']} tools") - elif tools_b["count"] < tools_a["count"] and tools_b["count"] > 0: - b_advantages.append(f"More efficient: {tools_b['count']} vs {tools_a['count']} tools") - - # 2. Compare tool diversity - if len(tools_a["unique"]) > len(tools_b["unique"]): - a_advantages.append(f"More diverse tools: {len(tools_a['unique'])} types") - elif len(tools_b["unique"]) > len(tools_a["unique"]): - b_advantages.append(f"More diverse tools: {len(tools_b['unique'])} types") - - # 3. Compare response length (proxy for completeness) - len_a, len_b = len(response_a), len(response_b) - if len_a > len_b * 1.2: - a_advantages.append("More comprehensive response") - elif len_b > len_a * 1.2: - b_advantages.append("More comprehensive response") - - # 4. Check for no tools (penalty) - if tools_a["count"] == 0 and tools_b["count"] > 0: - b_advantages.append("Response B used tools; A did not") - elif tools_b["count"] == 0 and tools_a["count"] > 0: - a_advantages.append("Response A used tools; B did not") - - # Determine winner - a_score = len(a_advantages) - b_score = len(b_advantages) - - if a_score > b_score: - return {"winner": "A", "a_advantages": a_advantages, "b_advantages": b_advantages} - elif b_score > a_score: - return {"winner": "B", "a_advantages": a_advantages, "b_advantages": b_advantages} - else: - return {"winner": "TIE", "a_advantages": a_advantages, "b_advantages": b_advantages} - - -def pairwise_with_bias_mitigation( - candidate: str, - reference: str, - candidate_tools: dict, - reference_tools: dict, - task: str, -) -> dict[str, Any]: - """ - Run pairwise comparison twice with position swap to mitigate bias. - """ - # Pass 1: Candidate as A, Reference as B - pass1 = compare_responses( - candidate, reference, candidate_tools, reference_tools, task - ) - - # Pass 2: Reference as A, Candidate as B (swapped) - pass2 = compare_responses( - reference, candidate, reference_tools, candidate_tools, task - ) - - # Map pass2 result back (if A wins in pass2, that means Reference won) - pass2_mapped = { - "A": "B", # A in pass2 = Reference = B in pass1 terms - "B": "A", # B in pass2 = Candidate = A in pass1 terms - "TIE": "TIE", - }.get(pass2["winner"], "TIE") - - # Check consistency - consistent = pass1["winner"] == pass2_mapped - - if consistent: - final_winner = pass1["winner"] - confidence = "high" - else: - # Inconsistent results indicate position bias - return TIE - final_winner = "TIE" - confidence = "low (position bias detected)" - - # Convert to score (candidate perspective) - if final_winner == "A": # Candidate wins - score = 1.0 - elif final_winner == "B": # Reference wins - score = 0.0 - else: # TIE - score = 0.5 - - hits = pass1["a_advantages"][:4] # Candidate advantages - misses = pass1["b_advantages"][:4] # Reference advantages - - reasoning = ( - f"Pass 1: {pass1['winner']} wins. " - f"Pass 2 (swapped): {pass2['winner']} wins (maps to {pass2_mapped}). " - f"Consistency: {consistent}. 
" - f"Final: {final_winner} ({confidence} confidence)" - ) - - return { - "score": score, - "hits": hits, - "misses": misses, - "reasoning": reasoning, - } - - -def main(): - try: - input_data = json.loads(sys.stdin.read()) - - candidate = input_data.get("candidate_answer", "") - reference = input_data.get("reference_answer", "") - output_messages = input_data.get("output_messages", []) - task = input_data.get("expected_outcome", "") - - # If no reference, we can't do pairwise comparison - if not reference: - print(json.dumps({ - "score": 0.5, - "hits": ["Candidate response provided"], - "misses": ["No reference for comparison"], - "reasoning": "Pairwise comparison requires reference_answer field", - }, indent=2)) - return - - # Extract tool summaries - candidate_tools = extract_tool_summary(output_messages) - - # For reference, we'd need reference_output_messages - # In practice, this would come from a baseline run - reference_messages = input_data.get("reference_output_messages", []) - reference_tools = extract_tool_summary(reference_messages) - - result = pairwise_with_bias_mitigation( - candidate=candidate, - reference=reference, - candidate_tools=candidate_tools, - reference_tools=reference_tools, - task=task, - ) - - print(json.dumps(result, indent=2)) - - except Exception as e: - error_result = { - "score": 0.0, - "hits": [], - "misses": [f"Evaluator error: {str(e)}"], - "reasoning": f"Evaluation failed: {str(e)}", - } - print(json.dumps(error_result, indent=2)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts new file mode 100644 index 00000000..c44e4ce5 --- /dev/null +++ b/examples/showcase/tool-evaluation-plugins/scripts/tool-selection-judge.ts @@ -0,0 +1,186 @@ +#!/usr/bin/env bun +/** + * Tool Selection Evaluator - Code Judge Plugin + * + * Evaluates whether the agent selected the RIGHT tools for the task. + * This is a semantic evaluation that requires understanding task requirements + * and matching them against available tools. 
+ *
+ * Why this is a plugin (not built-in):
+ * - Requires domain-specific knowledge of what tools are "appropriate"
+ * - Involves semantic judgment, not just pattern matching
+ * - Different projects have different tool selection criteria
+ *
+ * Usage in eval YAML:
+ *   evaluators:
+ *     - name: tool-selection
+ *       type: code_judge
+ *       script: bun run scripts/tool-selection-judge.ts
+ *
+ * Input (stdin JSON):
+ * - question: The user's task/question
+ * - expectedOutcome: Description of expected behavior
+ * - outputMessages: Array of messages including tool calls
+ * - traceSummary: Summary of tool usage
+ *
+ * Output (stdout JSON):
+ * - score: 0.0-1.0 (1.0 = all tools appropriate, 0.0 = all inappropriate)
+ * - hits: List of appropriate tool selections
+ * - misses: List of missing or inappropriate tools
+ * - reasoning: Explanation of the evaluation
+ */
+
+interface ToolCall {
+  tool: string;
+  args?: Record<string, unknown>;
+}
+
+interface OutputMessage {
+  role: string;
+  toolCalls?: ToolCall[];
+}
+
+interface TraceSummary {
+  eventCount: number;
+  toolCallsByName: Record<string, number>;
+}
+
+interface EvalInput {
+  question?: string;
+  expectedOutcome?: string;
+  outputMessages?: OutputMessage[];
+  traceSummary?: TraceSummary;
+}
+
+interface EvalOutput {
+  score: number;
+  hits: string[];
+  misses: string[];
+  reasoning: string;
+}
+
+interface ExtractedToolCall {
+  tool: string;
+  args: Record<string, unknown>;
+}
+
+function extractToolCalls(messages: OutputMessage[]): ExtractedToolCall[] {
+  const toolCalls: ExtractedToolCall[] = [];
+  for (const msg of messages) {
+    if (msg.role === 'assistant' && msg.toolCalls) {
+      for (const call of msg.toolCalls) {
+        toolCalls.push({
+          tool: call.tool,
+          args: call.args ?? {},
+        });
+      }
+    }
+  }
+  return toolCalls;
+}
+
+function evaluateToolSelection(
+  question: string,
+  expectedOutcome: string,
+  toolCalls: ExtractedToolCall[],
+): EvalOutput {
+  const hits: string[] = [];
+  const misses: string[] = [];
+
+  // Extract keywords from question and expected outcome
+  const taskText = `${question} ${expectedOutcome}`.toLowerCase();
+
+  // Define tool-to-task mappings (customize for your domain)
+  const toolTaskMappings: Record<string, string[]> = {
+    search: ['find', 'search', 'look', 'query', 'discover'],
+    fetch: ['get', 'retrieve', 'fetch', 'download', 'load'],
+    read: ['read', 'open', 'view', 'examine', 'inspect'],
+    write: ['write', 'save', 'create', 'output', 'generate'],
+    analyze: ['analyze', 'process', 'compute', 'calculate'],
+    validate: ['check', 'validate', 'verify', 'confirm'],
+  };
+
+  // Determine expected tools based on task keywords
+  const expectedTools = new Set<string>();
+  for (const [tool, keywords] of Object.entries(toolTaskMappings)) {
+    if (keywords.some((kw) => taskText.includes(kw))) {
+      expectedTools.add(tool);
+    }
+  }
+
+  // Get actual tools used
+  const actualTools = new Set(toolCalls.map((call) => call.tool));
+
+  // Evaluate selection
+  if (toolCalls.length === 0) {
+    return {
+      score: 0,
+      hits: [],
+      misses: ['No tools were called'],
+      reasoning: 'Agent did not use any tools. Expected at least some tool usage.',
+    };
+  }
+
+  // Check for appropriate selections
+  for (const tool of actualTools) {
+    const toolLower = tool.toLowerCase();
+    const isRelevant = [...expectedTools].some(
+      (expected) => toolLower.includes(expected) || expected.includes(toolLower),
+    );
+    if (isRelevant || expectedTools.size === 0) {
+      hits.push(`Tool '${tool}' appears relevant to task`);
+    } else {
+      misses.push(`Tool '${tool}' may not be needed for this task`);
+    }
+  }
+
+  // Check for missing expected tools
+  for (const expected of expectedTools) {
+    if (![...actualTools].some((t) => t.toLowerCase().includes(expected))) {
+      misses.push(`Expected a '${expected}'-type tool but none used`);
+    }
+  }
+
+  // Calculate score
+  const totalChecks = hits.length + misses.length;
+  const score = totalChecks > 0 ? hits.length / totalChecks : 0.5;
+
+  const reasoning =
+    `Evaluated ${actualTools.size} tool(s) against task requirements. ` +
+    `${hits.length} appropriate, ${misses.length} issues found.`;
+
+  return {
+    score: Math.round(score * 100) / 100,
+    hits: hits.slice(0, 4), // Cap at 4 per contract
+    misses: misses.slice(0, 4),
+    reasoning,
+  };
+}
+
+async function main(): Promise<void> {
+  try {
+    const stdin = await Bun.stdin.text();
+    const inputData = JSON.parse(stdin) as EvalInput;
+
+    const question = inputData.question ?? '';
+    const expectedOutcome = inputData.expectedOutcome ?? '';
+    const outputMessages = inputData.outputMessages ?? [];
+
+    const toolCalls = extractToolCalls(outputMessages);
+
+    const result = evaluateToolSelection(question, expectedOutcome, toolCalls);
+
+    console.log(JSON.stringify(result, null, 2));
+  } catch (error) {
+    const errorResult: EvalOutput = {
+      score: 0,
+      hits: [],
+      misses: [`Evaluator error: ${error instanceof Error ? error.message : String(error)}`],
+      reasoning: `Evaluation failed: ${error instanceof Error ? error.message : String(error)}`,
+    };
+    console.log(JSON.stringify(errorResult, null, 2));
+    process.exit(1);
+  }
+}
+
+main();
diff --git a/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py b/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py
deleted file mode 100644
index 18f8f560..00000000
--- a/examples/showcase/tool-evaluation-plugins/scripts/tool_selection_judge.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tool Selection Evaluator - Code Judge Plugin
-
-Evaluates whether the agent selected the RIGHT tools for the task.
-This is a semantic evaluation that requires understanding task requirements
-and matching them against available tools.
- -Why this is a plugin (not built-in): -- Requires domain-specific knowledge of what tools are "appropriate" -- Involves semantic judgment, not just pattern matching -- Different projects have different tool selection criteria - -Usage in eval YAML: - evaluators: - - name: tool-selection - type: code_judge - script: scripts/tool_selection_judge.py - -Input (stdin JSON): - - question: The user's task/question - - expected_outcome: Description of expected behavior - - output_messages: Array of messages including tool calls - - candidate_trace_summary: Summary of tool usage - -Output (stdout JSON): - - score: 0.0-1.0 (1.0 = all tools appropriate, 0.0 = all inappropriate) - - hits: List of appropriate tool selections - - misses: List of missing or inappropriate tools - - reasoning: Explanation of the evaluation -""" - -import json -import sys -from typing import Any - - -def extract_tool_calls(messages: list[dict]) -> list[dict]: - """Extract all tool calls from output messages.""" - tool_calls = [] - for msg in messages: - if msg.get("role") == "assistant" and msg.get("toolCalls"): - for call in msg["toolCalls"]: - tool_calls.append({ - "tool": call.get("tool"), - "args": call.get("args", {}), - }) - return tool_calls - - -def evaluate_tool_selection( - question: str, - expected_outcome: str, - tool_calls: list[dict], - trace_summary: dict | None, -) -> dict[str, Any]: - """ - Evaluate tool selection based on task requirements. - - This is a simplified heuristic-based evaluation. - For production use, you might: - 1. Use an LLM to judge appropriateness - 2. Define explicit tool-to-task mappings - 3. Use a decision tree based on task classification - """ - hits = [] - misses = [] - - # Extract keywords from question and expected outcome - task_text = f"{question} {expected_outcome}".lower() - - # Define tool-to-task mappings (customize for your domain) - tool_task_mappings = { - "search": ["find", "search", "look", "query", "discover"], - "fetch": ["get", "retrieve", "fetch", "download", "load"], - "read": ["read", "open", "view", "examine", "inspect"], - "write": ["write", "save", "create", "output", "generate"], - "analyze": ["analyze", "process", "compute", "calculate"], - "validate": ["check", "validate", "verify", "confirm"], - } - - # Determine expected tools based on task keywords - expected_tools = set() - for tool, keywords in tool_task_mappings.items(): - if any(kw in task_text for kw in keywords): - expected_tools.add(tool) - - # Get actual tools used - actual_tools = set(call["tool"] for call in tool_calls) - - # Evaluate selection - if not tool_calls: - return { - "score": 0.0, - "hits": [], - "misses": ["No tools were called"], - "reasoning": "Agent did not use any tools. 
Expected at least some tool usage.", - } - - # Check for appropriate selections - for tool in actual_tools: - tool_lower = tool.lower() - is_relevant = any( - tool_lower in expected or expected in tool_lower - for expected in expected_tools - ) - if is_relevant or not expected_tools: - hits.append(f"Tool '{tool}' appears relevant to task") - else: - misses.append(f"Tool '{tool}' may not be needed for this task") - - # Check for missing expected tools - for expected in expected_tools: - if not any(expected in t.lower() for t in actual_tools): - misses.append(f"Expected a '{expected}'-type tool but none used") - - # Calculate score - total_checks = len(hits) + len(misses) - score = len(hits) / total_checks if total_checks > 0 else 0.5 - - reasoning = ( - f"Evaluated {len(actual_tools)} tool(s) against task requirements. " - f"{len(hits)} appropriate, {len(misses)} issues found." - ) - - return { - "score": round(score, 2), - "hits": hits[:4], # Cap at 4 per contract - "misses": misses[:4], - "reasoning": reasoning, - } - - -def main(): - try: - input_data = json.loads(sys.stdin.read()) - - question = input_data.get("question", "") - expected_outcome = input_data.get("expected_outcome", "") - output_messages = input_data.get("output_messages", []) - trace_summary = input_data.get("candidate_trace_summary") - - tool_calls = extract_tool_calls(output_messages) - - result = evaluate_tool_selection( - question=question, - expected_outcome=expected_outcome, - tool_calls=tool_calls, - trace_summary=trace_summary, - ) - - print(json.dumps(result, indent=2)) - - except Exception as e: - error_result = { - "score": 0.0, - "hits": [], - "misses": [f"Evaluator error: {str(e)}"], - "reasoning": f"Evaluation failed: {str(e)}", - } - print(json.dumps(error_result, indent=2)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml index 0b830617..0892cf68 100644 --- a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml @@ -41,7 +41,7 @@ evalcases: # Plugin: Semantic tool selection evaluation - name: selection-quality type: code_judge - script: scripts/tool_selection_judge.py + script: bun run scripts/tool-selection-judge.ts # ========================================== # Example 2: Efficiency Scoring @@ -62,7 +62,7 @@ evalcases: # Plugin: Efficiency metrics scoring - name: efficiency-check type: code_judge - script: scripts/efficiency_scorer.py + script: bun run scripts/efficiency-scorer.ts # ========================================== # Example 3: Combined Built-in + Plugin Evaluation @@ -95,12 +95,12 @@ evalcases: # Plugin: Check if tools were appropriate choices - name: selection-check type: code_judge - script: scripts/tool_selection_judge.py + script: bun run scripts/tool-selection-judge.ts # Plugin: Evaluate efficiency - name: efficiency type: code_judge - script: scripts/efficiency_scorer.py + script: bun run scripts/efficiency-scorer.ts # ========================================== # Example 4: Pairwise Comparison @@ -128,4 +128,4 @@ evalcases: # Plugin: Pairwise comparison with position bias mitigation - name: pairwise-quality type: code_judge - script: scripts/pairwise_tool_compare.py + script: bun run scripts/pairwise-tool-compare.ts diff --git a/openspec/changes/add-execution-metrics/tasks.md b/openspec/changes/add-execution-metrics/tasks.md index d3891971..9f16312b 100644 --- 
a/openspec/changes/add-execution-metrics/tasks.md +++ b/openspec/changes/add-execution-metrics/tasks.md @@ -25,10 +25,10 @@ ## 5. Examples & Documentation -- [ ] 5.1 Add metrics evaluation example to `examples/features/` -- [ ] 5.2 Create code judge example that uses metrics +- [x] 5.1 Add metrics evaluation example to `examples/features/` +- [x] 5.2 Create code judge example that uses metrics ## 6. Testing - [x] 6.1 Unit tests for metric computation -- [ ] 6.2 Integration test with metric-aware code judge +- [x] 6.2 Integration test with metric-aware code judge diff --git a/packages/core/test/evaluation/execution-metrics.test.ts b/packages/core/test/evaluation/execution-metrics.test.ts index 96f1e27b..9ee0d227 100644 --- a/packages/core/test/evaluation/execution-metrics.test.ts +++ b/packages/core/test/evaluation/execution-metrics.test.ts @@ -1,5 +1,8 @@ import { describe, expect, it } from 'bun:test'; +import { CodeEvaluator } from '../../src/evaluation/evaluators.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { EvalCase } from '../../src/evaluation/types.js'; import { type TraceSummary, avgToolDurationMs, @@ -246,3 +249,111 @@ describe('Execution Metrics', () => { }); }); }); + +describe('Code Judge Metrics Integration', () => { + const baseTestCase: EvalCase = { + id: 'metrics-test', + dataset: 'test', + question: 'Test question', + input_messages: [{ role: 'user', content: 'Test' }], + input_segments: [{ type: 'text', value: 'Test' }], + expected_messages: [], + reference_answer: '', + guideline_paths: [], + file_paths: [], + code_snippets: [], + expected_outcome: 'Test outcome', + evaluator: 'code_judge', + }; + + const baseTarget: ResolvedTarget = { + kind: 'mock', + name: 'mock', + config: { response: '{}' }, + }; + + it('passes traceSummary to code_judge scripts', async () => { + // Script that checks if traceSummary is present and has expected fields + const script = `bun -e " + import fs from 'node:fs'; + const input = JSON.parse(fs.readFileSync(0, 'utf8')); + const summary = input.traceSummary; + const hasEventCount = summary && typeof summary.eventCount === 'number'; + const hasTokenUsage = summary && summary.tokenUsage && typeof summary.tokenUsage.input === 'number'; + const hasCostUsd = summary && typeof summary.costUsd === 'number'; + const score = (hasEventCount && hasTokenUsage && hasCostUsd) ? 1 : 0; + console.log(JSON.stringify({ + score, + hits: [ + hasEventCount ? 'eventCount present' : null, + hasTokenUsage ? 'tokenUsage present' : null, + hasCostUsd ? 'costUsd present' : null + ].filter(Boolean), + misses: [ + hasEventCount ? null : 'eventCount missing', + hasTokenUsage ? null : 'tokenUsage missing', + hasCostUsd ? 
null : 'costUsd missing' + ].filter(Boolean), + reasoning: 'Checked traceSummary fields' + })); + "`; + + const evaluator = new CodeEvaluator({ script }); + + const traceSummary: TraceSummary = { + eventCount: 3, + toolNames: ['Read', 'Edit'], + toolCallsByName: { Read: 2, Edit: 1 }, + errorCount: 0, + tokenUsage: { input: 1000, output: 500 }, + costUsd: 0.005, + durationMs: 2500, + }; + + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Test answer', + target: baseTarget, + attempt: 0, + promptInputs: { question: '', guidelines: '' }, + now: new Date(), + traceSummary, + }); + + expect(result.score).toBe(1); + expect(result.verdict).toBe('pass'); + expect(result.hits).toContain('eventCount present'); + expect(result.hits).toContain('tokenUsage present'); + expect(result.hits).toContain('costUsd present'); + }); + + it('handles missing traceSummary gracefully', async () => { + // Script that handles missing traceSummary + const script = `bun -e " + import fs from 'node:fs'; + const input = JSON.parse(fs.readFileSync(0, 'utf8')); + const hasSummary = input.traceSummary !== null && input.traceSummary !== undefined; + console.log(JSON.stringify({ + score: hasSummary ? 0 : 1, + hits: hasSummary ? [] : ['Correctly handled missing summary'], + misses: hasSummary ? ['Expected no summary'] : [], + reasoning: 'Checked for missing traceSummary' + })); + "`; + + const evaluator = new CodeEvaluator({ script }); + + const result = await evaluator.evaluate({ + evalCase: baseTestCase, + candidate: 'Test answer', + target: baseTarget, + attempt: 0, + promptInputs: { question: '', guidelines: '' }, + now: new Date(), + // No traceSummary provided + }); + + expect(result.score).toBe(1); + expect(result.hits).toContain('Correctly handled missing summary'); + }); +});
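
Note on trying the converted judges outside an AgentV run: a minimal driver along the lines below should work, but it is only a sketch. The payload fields follow the stdin contract documented in the showcase README, while the file name (`smoke-test.ts`), the payload values, and the working directory are illustrative assumptions rather than part of this patch.

```ts
// smoke-test.ts (illustrative): pipe a hand-built payload into one of the
// converted code judges and print the { score, hits, misses, reasoning } JSON
// it writes to stdout.
const payload = {
  question: 'Find the weather in Paris',
  expectedOutcome: 'Agent searches for current weather information',
  outputMessages: [
    { role: 'assistant', toolCalls: [{ tool: 'search', args: { query: 'weather Paris' } }] },
  ],
  traceSummary: {
    eventCount: 1,
    toolNames: ['search'],
    toolCallsByName: { search: 1 },
    errorCount: 0,
  },
};

// Assumes it is run from the repository root; adjust cwd/paths as needed.
const proc = Bun.spawn(['bun', 'run', 'scripts/tool-selection-judge.ts'], {
  cwd: 'examples/showcase/tool-evaluation-plugins',
  stdin: 'pipe',
  stdout: 'pipe',
});

proc.stdin.write(JSON.stringify(payload));
proc.stdin.end();

const raw = await new Response(proc.stdout).text();
await proc.exited;

console.log(JSON.parse(raw)); // expect { score, hits, misses, reasoning }
```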