diff --git a/eval_protocol/adapters/huggingface.py b/eval_protocol/adapters/huggingface.py
index 15391181..2825dafa 100644
--- a/eval_protocol/adapters/huggingface.py
+++ b/eval_protocol/adapters/huggingface.py
@@ -4,21 +4,20 @@
 transformation functions to convert them to EvaluationRow format.
 """
 
-from typing import Any, Callable, Dict, Iterator, List, Optional
 import logging
+from typing import Any, Callable, Dict, Iterator, List, Optional
 
-from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams
+from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
 
 logger = logging.getLogger(__name__)
 
 try:
-    from datasets import load_dataset, Dataset, DatasetDict
+    from datasets import Dataset, DatasetDict, load_dataset
+
     DATASETS_AVAILABLE = True
 except ImportError:
     DATASETS_AVAILABLE = False
-    logger.warning(
-        "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
-    )
+    logger.warning("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'")
 
 # Type alias for transformation function
 TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
@@ -26,11 +25,11 @@ class HuggingFaceAdapter:
     """Generic adapter to load HuggingFace datasets with custom transformations.
-
+
     This adapter loads datasets from HuggingFace Hub and applies a user-provided
-    transformation function to convert each row to the format expected by
+    transformation function to convert each row to the format expected by
     EvaluationRow.
-
+
     The transformation function should take a dataset row dictionary and return:
         {
             'messages': List[Dict] - list of message dictionaries with 'role' and 'content'
@@ -38,7 +37,7 @@ class HuggingFaceAdapter:
             'metadata': Optional[Dict] - any additional metadata to preserve
             'tools': Optional[List[Dict]] - tool definitions for tool calling scenarios
         }
-
+
     Examples:
         Simple Q&A dataset:
             >>> def transform(row):
@@ -49,7 +48,7 @@ class HuggingFaceAdapter:
            ...     }
            >>> adapter = HuggingFaceAdapter("my-dataset", transform_fn=transform)
            >>> rows = list(adapter.get_evaluation_rows(split="test", limit=10))
-
+
        Math problems with system prompt:
            >>> def gsm8k_transform(row):
            ...     return {
@@ -62,7 +61,7 @@ class HuggingFaceAdapter:
            ...     }
            >>> adapter = HuggingFaceAdapter("gsm8k", config_name="main", transform_fn=gsm8k_transform)
    """
-
+
    def __init__(
        self,
        dataset_id: str,
@@ -72,7 +71,7 @@ def __init__(
        **load_dataset_kwargs,
    ):
        """Initialize the HuggingFace adapter.
-
+
        Args:
            dataset_id: HuggingFace dataset identifier (e.g., "gsm8k", "squad", "org/dataset")
            transform_fn: Function to transform dataset rows to evaluation format
@@ -84,16 +83,16 @@
            raise ImportError(
                "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'"
            )
-
+
        self.dataset_id = dataset_id
        self.transform_fn = transform_fn
        self.config_name = config_name
        self.revision = revision
        self.load_dataset_kwargs = load_dataset_kwargs
-
+
        # Load the dataset
        self.dataset = self._load_dataset()
-
+
    @classmethod
    def from_local(
        cls,
@@ -102,53 +101,49 @@
        **load_dataset_kwargs,
    ) -> "HuggingFaceAdapter":
        """Create adapter from local dataset file.
-
+
        Args:
            path: Path to local dataset file (JSON, JSONL, CSV, etc.)
            transform_fn: Function to transform dataset rows
            **load_dataset_kwargs: Additional arguments to pass to load_dataset
-
+
        Returns:
            HuggingFaceAdapter instance
        """
        # Determine file format
-        if path.endswith('.jsonl'):
+        if path.endswith(".jsonl"):
            dataset_type = "json"
-        elif path.endswith('.json'):
+        elif path.endswith(".json"):
            dataset_type = "json"
-        elif path.endswith('.csv'):
+        elif path.endswith(".csv"):
            dataset_type = "csv"
-        elif path.endswith('.parquet'):
+        elif path.endswith(".parquet"):
            dataset_type = "parquet"
        else:
            # Let HuggingFace auto-detect
            dataset_type = None
-
-        load_kwargs = {'data_files': path, **load_dataset_kwargs}
-
-        return cls(
-            dataset_id=dataset_type or "json",
-            transform_fn=transform_fn,
-            **load_kwargs
-        )
-
+
+        load_kwargs = {"data_files": path, **load_dataset_kwargs}
+
+        return cls(dataset_id=dataset_type or "json", transform_fn=transform_fn, **load_kwargs)
+
    def _load_dataset(self) -> "Dataset | DatasetDict":
        """Load the dataset from HuggingFace Hub or local source."""
        try:
            kwargs = {}
            if self.config_name:
-                kwargs['name'] = self.config_name
+                kwargs["name"] = self.config_name
            if self.revision:
-                kwargs['revision'] = self.revision
-
+                kwargs["revision"] = self.revision
+
            kwargs.update(self.load_dataset_kwargs)
-
+
            return load_dataset(self.dataset_id, **kwargs)
-
+
        except (OSError, ValueError, RuntimeError) as e:
            logger.error("Failed to load dataset %s: %s", self.dataset_id, e)
            raise
-
+
    def get_evaluation_rows(
        self,
        split: Optional[str] = None,
@@ -160,7 +155,7 @@
        **completion_params_kwargs,
    ) -> Iterator[EvaluationRow]:
        """Convert dataset entries to EvaluationRow format.
-
+
        Args:
            split: Dataset split to use (if dataset has multiple splits)
            limit: Maximum number of rows to return
@@ -169,7 +164,7 @@
            temperature: Temperature for completion parameters
            max_tokens: Max tokens for completion parameters
            **completion_params_kwargs: Additional completion parameters
-
+
        Yields:
            EvaluationRow: Converted evaluation rows
        """
@@ -183,35 +178,33 @@
            dataset = self.dataset[split]
        elif split is not None:
            logger.warning("Split '%s' specified but dataset is not split", split)
-
+
        # Apply offset and limit
        total_rows = len(dataset)
        end_idx = min(offset + limit, total_rows) if limit else total_rows
-
+
        if offset >= total_rows:
            logger.warning("Offset %d is greater than dataset size %d", offset, total_rows)
            return
-
+
        # Create completion parameters
-        completion_params = CompletionParams(
-            model=model_name,
-            temperature=temperature,
-            max_tokens=max_tokens,
+        completion_params: CompletionParams = {
+            "model": model_name,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
            **completion_params_kwargs,
-        )
-
+        }
+
        # Convert each row
        for i in range(offset, end_idx):
            try:
                raw_row = dataset[i]
-                eval_row = self._convert_row_to_evaluation_row(
-                    raw_row, i, completion_params, split
-                )
+                eval_row = self._convert_row_to_evaluation_row(raw_row, i, completion_params, split)
                yield eval_row
            except (AttributeError, ValueError, KeyError) as e:
                logger.warning("Failed to convert row %d: %s", i, e)
                continue
-
+
    def _convert_row_to_evaluation_row(
        self,
        raw_row: Dict[str, Any],
@@ -220,83 +213,87 @@
        split: Optional[str] = None,
    ) -> EvaluationRow:
        """Convert a single dataset row to EvaluationRow format.
-
+
        Args:
            raw_row: Raw dataset row dictionary
            row_index: Index of the row in the dataset
            completion_params: Completion parameters to use
            split: Dataset split name
-
+
        Returns:
            EvaluationRow object
        """
        # Apply user transformation
        transformed = self.transform_fn(raw_row)
-
+
        # Validate required fields
-        if 'messages' not in transformed:
+        if "messages" not in transformed:
            raise ValueError("Transform function must return 'messages' field")
-
+
        # Convert message dictionaries to Message objects
        messages = []
-        for msg_dict in transformed['messages']:
+        for msg_dict in transformed["messages"]:
            if not isinstance(msg_dict, dict):
                raise ValueError("Each message must be a dictionary")
-            if 'role' not in msg_dict:
+            if "role" not in msg_dict:
                raise ValueError("Each message must have a 'role' field")
-
-            messages.append(Message(
-                role=msg_dict['role'],
-                content=msg_dict.get('content'),
-                name=msg_dict.get('name'),
-                tool_call_id=msg_dict.get('tool_call_id'),
-                tool_calls=msg_dict.get('tool_calls'),
-                function_call=msg_dict.get('function_call'),
-            ))
-
+
+            messages.append(
+                Message(
+                    role=msg_dict["role"],
+                    content=msg_dict.get("content"),
+                    name=msg_dict.get("name"),
+                    tool_call_id=msg_dict.get("tool_call_id"),
+                    tool_calls=msg_dict.get("tool_calls"),
+                    function_call=msg_dict.get("function_call"),
+                )
+            )
+
        # Extract other fields
-        ground_truth = transformed.get('ground_truth')
-        tools = transformed.get('tools')
-        user_metadata = transformed.get('metadata', {})
-
+        ground_truth = transformed.get("ground_truth")
+        tools = transformed.get("tools")
+        user_metadata = transformed.get("metadata", {})
+
        # Create dataset info
        dataset_info = {
-            'dataset_id': self.dataset_id,
-            'config_name': self.config_name,
-            'revision': self.revision,
-            'split': split,
-            'row_index': row_index,
-            'transform_function': self.transform_fn.__name__ if hasattr(self.transform_fn, '__name__') else 'anonymous',
+            "dataset_id": self.dataset_id,
+            "config_name": self.config_name,
+            "revision": self.revision,
+            "split": split,
+            "row_index": row_index,
+            "transform_function": (
+                self.transform_fn.__name__ if hasattr(self.transform_fn, "__name__") else "anonymous"
+            ),
        }
-
+
        # Add user metadata
        dataset_info.update(user_metadata)
-
+
        # Add original row data (with prefix to avoid conflicts)
        for key, value in raw_row.items():
-            dataset_info[f'original_{key}'] = value
-
+            dataset_info[f"original_{key}"] = value
+
        # Create input metadata
        input_metadata = InputMetadata(
            row_id=f"{self.dataset_id}_{row_index}",
            completion_params=completion_params,
            dataset_info=dataset_info,
            session_data={
-                'dataset_source': 'huggingface',
-                'timestamp': None,
-            }
+                "dataset_source": "huggingface",
+                "timestamp": None,
+            },
        )
-
+
        return EvaluationRow(
            messages=messages,
            tools=tools,
            input_metadata=input_metadata,
            ground_truth=str(ground_truth) if ground_truth is not None else None,
        )
-
+
    def get_splits(self) -> List[str]:
        """Get available dataset splits.
-
+
        Returns:
            List of available split names
        """
@@ -304,27 +301,29 @@
            return list(self.dataset.keys())
        else:
            return ["train"]  # Default split name for non-split datasets
-
+
    def get_dataset_info(self) -> Dict[str, Any]:
        """Get information about the loaded dataset.
-
+
        Returns:
            Dictionary with dataset information
        """
        info = {
-            'dataset_id': self.dataset_id,
-            'config_name': self.config_name,
-            'revision': self.revision,
-            'splits': self.get_splits(),
-            'transform_function': self.transform_fn.__name__ if hasattr(self.transform_fn, '__name__') else 'anonymous',
+            "dataset_id": self.dataset_id,
+            "config_name": self.config_name,
+            "revision": self.revision,
+            "splits": self.get_splits(),
+            "transform_function": (
+                self.transform_fn.__name__ if hasattr(self.transform_fn, "__name__") else "anonymous"
+            ),
        }
-
+
        # Add split sizes
        if isinstance(self.dataset, DatasetDict):
-            info['split_sizes'] = {split: len(data) for split, data in self.dataset.items()}
+            info["split_sizes"] = {split: len(data) for split, data in self.dataset.items()}
        else:
-            info['total_size'] = len(self.dataset)
-
+            info["total_size"] = len(self.dataset)
+
        return info
@@ -336,14 +335,14 @@ def create_huggingface_adapter(
    **load_dataset_kwargs,
 ) -> HuggingFaceAdapter:
    """Factory function to create a HuggingFace adapter.
-
+
    Args:
        dataset_id: HuggingFace dataset identifier
        transform_fn: Function to transform dataset rows to evaluation format
        config_name: Optional configuration name
        revision: Optional dataset revision/commit hash
        **load_dataset_kwargs: Additional arguments for load_dataset
-
+
    Returns:
        HuggingFaceAdapter instance
    """
@@ -362,11 +361,11 @@ def create_gsm8k_adapter(
    revision: Optional[str] = None,
 ) -> HuggingFaceAdapter:
    """Create adapter specifically configured for GSM8K dataset.
-
+
    Args:
        system_prompt: Optional system prompt for math problems
        revision: Optional dataset revision/commit
-
+
    Returns:
        HuggingFaceAdapter configured for GSM8K
    """
@@ -374,24 +373,24 @@
        "You are a helpful assistant that solves math problems step by step. "
        "Show your work and provide the final answer."
    )
-
+
    system_content = system_prompt or default_system_prompt
-
+
    def gsm8k_transform(row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform GSM8K row to evaluation format."""
        return {
-            'messages': [
-                {'role': 'system', 'content': system_content},
-                {'role': 'user', 'content': row['question']},
+            "messages": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": row["question"]},
            ],
-            'ground_truth': row['answer'],
-            'metadata': {
-                'dataset': 'gsm8k',
-                'question_length': len(row['question']),
-                'answer_length': len(row['answer']),
-            }
+            "ground_truth": row["answer"],
+            "metadata": {
+                "dataset": "gsm8k",
+                "question_length": len(row["question"]),
+                "answer_length": len(row["answer"]),
+            },
        }
-
+
    return create_huggingface_adapter(
        dataset_id="gsm8k",
        config_name="main",
@@ -405,40 +404,39 @@ def create_math_adapter(
    revision: Optional[str] = None,
 ) -> HuggingFaceAdapter:
    """Create adapter specifically configured for MATH competition dataset.
-
+
    Args:
        system_prompt: Optional system prompt for math problems
        revision: Optional dataset revision/commit
-
+
    Returns:
        HuggingFaceAdapter configured for MATH dataset
    """
    default_system_prompt = (
-        "You are an expert mathematician. Solve this advanced math problem "
-        "step by step, showing detailed work."
+        "You are an expert mathematician. Solve this advanced math problem " "step by step, showing detailed work."
    )
-
+
    system_content = system_prompt or default_system_prompt
-
+
    def math_transform(row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform MATH dataset row to evaluation format."""
        return {
-            'messages': [
-                {'role': 'system', 'content': system_content},
-                {'role': 'user', 'content': row['problem']},
+            "messages": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": row["problem"]},
            ],
-            'ground_truth': row['solution'],
-            'metadata': {
-                'dataset': 'hendrycks_math',
-                'type': row.get('type', 'unknown'),
-                'level': row.get('level', 'unknown'),
-                'problem_length': len(row['problem']),
-                'solution_length': len(row['solution']),
-            }
+            "ground_truth": row["solution"],
+            "metadata": {
+                "dataset": "hendrycks_math",
+                "type": row.get("type", "unknown"),
+                "level": row.get("level", "unknown"),
+                "problem_length": len(row["problem"]),
+                "solution_length": len(row["solution"]),
+            },
        }
-
+
    return create_huggingface_adapter(
        dataset_id="hendrycks/competition_math",
        transform_fn=math_transform,
        revision=revision,
-    )
\ No newline at end of file
+    )
diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py
index a3f35cba..0061b983 100644
--- a/eval_protocol/adapters/langfuse.py
+++ b/eval_protocol/adapters/langfuse.py
@@ -4,11 +4,11 @@
 to EvaluationRow format for use in evaluation pipelines.
 """
 
-from typing import Any, Dict, Iterator, List, Optional
-from datetime import datetime
 import logging
+from datetime import datetime
+from typing import Any, Dict, Iterator, List, Optional
 
-from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams
+from eval_protocol.models import EvaluationRow, InputMetadata, Message
 
 logger = logging.getLogger(__name__)
 
@@ -277,20 +277,20 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe
            InputMetadata object
        """
        # Extract completion parameters from observations
-        completion_params = CompletionParams()
+        completion_params = {}
 
        # Look for model parameters in observations
        for obs in observations:
            if hasattr(obs, "model") and obs.model:
-                completion_params.model = obs.model
+                completion_params["model"] = obs.model
            if hasattr(obs, "model_parameters") and obs.model_parameters:
                params = obs.model_parameters
                if "temperature" in params:
-                    completion_params.temperature = params["temperature"]
+                    completion_params["temperature"] = params["temperature"]
                if "max_tokens" in params:
-                    completion_params.max_tokens = params["max_tokens"]
+                    completion_params["max_tokens"] = params["max_tokens"]
                if "top_p" in params:
-                    completion_params.top_p = params["top_p"]
+                    completion_params["top_p"] = params["top_p"]
            break
 
        # Create dataset info from trace metadata
diff --git a/eval_protocol/benchmarks/suites/aime25.py b/eval_protocol/benchmarks/suites/aime25.py
index 4a5d3a4c..3558eaa1 100644
--- a/eval_protocol/benchmarks/suites/aime25.py
+++ b/eval_protocol/benchmarks/suites/aime25.py
@@ -60,13 +60,18 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 
 @export_benchmark("aime25")
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_dataset=[
        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
    ],
    dataset_adapter=aime2025_dataset_adapter,
-    rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}],
+    completion_params=[
+        {
+            "max_tokens": 131000,
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        }
+    ],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    passed_threshold=None,
diff --git a/eval_protocol/benchmarks/suites/gpqa.py b/eval_protocol/benchmarks/suites/gpqa.py
index 91620c9a..76967beb 100644
--- a/eval_protocol/benchmarks/suites/gpqa.py
+++ b/eval_protocol/benchmarks/suites/gpqa.py
@@ -55,6 +55,7 @@ def _extract_abcd_letter(text: str) -> str | None:
 
 _GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()
 
+
 def _strip_gt_messages(msgs: List[Message]) -> List[Message]:
    return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]
 
@@ -67,16 +68,19 @@ async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) ->
        if gt_tokens:
            gt_val = gt_tokens[-1].split(":", 1)[1].strip()
            r.ground_truth = gt_val
-        r.messages = [m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]
+        r.messages = [
+            m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))
+        ]
        processed.append(r)
    return await default_single_turn_rollout_processor(processed, config)
 
 
 @export_benchmark("gpqa")
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_GPQA_INPUT_MESSAGES,
-    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
+    completion_params=[
+        {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
+    ],
    rollout_processor=gpqa_strip_gt_rollout_processor,
    aggregation_method="mean",
    passed_threshold=None,
diff --git a/eval_protocol/benchmarks/suites/livebench_data_analysis.py b/eval_protocol/benchmarks/suites/livebench_data_analysis.py
index 1c04b6fd..fc5abb4e 100644
--- a/eval_protocol/benchmarks/suites/livebench_data_analysis.py
+++ b/eval_protocol/benchmarks/suites/livebench_data_analysis.py
@@ -1,20 +1,19 @@
-from typing import Any, Dict, List, Optional
-
 import json
 import re
+from typing import Any, Dict, List, Optional
 
+from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
 from eval_protocol.pytest.default_single_turn_rollout_process import (
    default_single_turn_rollout_processor,
 )
 from eval_protocol.pytest.evaluation_test import evaluation_test
-from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark
-
 
 # -------------------------
 # Lightweight ports of LiveBench scoring utilities for data_analysis tasks
 # -------------------------
 
+
 def _lb_clean_text(text: str) -> str:
    text = text.lower().strip()
    text = re.sub(r"[^\w]", "", text)
@@ -36,9 +35,7 @@ def _cta_process_results(ground_truth: str, llm_answer: str) -> int:
    boxed = _extract_last_boxed_segment(parsed_answer)
    if boxed is not None:
        parsed_answer = boxed
-    parsed_answer = (
-        parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "")
-    )
+    parsed_answer = parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "")
 
    gt_clean = _lb_clean_text(ground_truth)
    ans_clean = _lb_clean_text(parsed_answer)
@@ -132,17 +129,15 @@ def _tablejoin_process_results(ground_truth: Any, llm_answer: str) -> float:
    return round((2 * tp) / denom, 2)
 
 
-def _tablereformat_process_results(
-    input_command: str, ground_truth: str, llm_answer: str, version: str
-) -> int:
+def _tablereformat_process_results(input_command: str, ground_truth: str, llm_answer: str, version: str) -> int:
    try:
        import pandas as pd  # type: ignore
    except Exception:
        return 0
 
-    from io import StringIO
    import math as _math
    import traceback as _traceback
+    from io import StringIO
 
    def _read_df_v1(df_type: str, df_str: str):
        if df_type == "json":
@@ -252,8 +247,12 @@ def _read_jsonl_table_from_text(text: str, header_cols: List[str]):
        )
    else:
        lines = input_command.split("\n")
-        input_fmt = [l for l in lines if "Source Format" in l][-1].split("Source Format: ")[-1].strip().lower()
-        output_fmt = [l for l in lines if "Target Format" in l][-1].split("Target Format: ")[-1].strip().lower()
+        input_fmt = (
+            [line for line in lines if "Source Format" in line][-1].split("Source Format: ")[-1].strip().lower()
+        )
+        output_fmt = (
+            [line for line in lines if "Target Format" in line][-1].split("Target Format: ")[-1].strip().lower()
+        )
 
    reader = _read_df_v1 if version == "v1" else _read_df_v2
    gt_df = reader(output_fmt, ground_truth)
@@ -373,9 +372,10 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:
 @export_benchmark("live_bench/data_analysis/cta")
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    completion_params=[
+        {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
+    ],
    input_messages=[[m for m in r.messages] for r in _CTA_ROWS],
-    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    passed_threshold=None,
@@ -416,9 +415,10 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow:
 @export_benchmark("live_bench/data_analysis/tablejoin")
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    completion_params=[
+        {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
+    ],
    input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS],
-    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    passed_threshold=None,
@@ -460,9 +459,10 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow:
 @export_benchmark("live_bench/data_analysis/tablereformat")
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    completion_params=[
+        {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
+    ],
    input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS],
-    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    passed_threshold=None,
@@ -508,5 +507,3 @@
        "live_bench/data_analysis/tablereformat",
    ],
 )
-
-
diff --git a/eval_protocol/benchmarks/suites/tau_bench_retail.py b/eval_protocol/benchmarks/suites/tau_bench_retail.py
index 9e1104d4..8e8aaea0 100644
--- a/eval_protocol/benchmarks/suites/tau_bench_retail.py
+++ b/eval_protocol/benchmarks/suites/tau_bench_retail.py
@@ -11,7 +11,7 @@
 from typing import Any, Dict, List
 
 from eval_protocol.benchmarks.registry import export_benchmark
-from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
+from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 from vendor.tau2.data_model.message import (
@@ -66,8 +66,13 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
 @evaluation_test(
    input_dataset=["tests/pytest/data/retail_dataset.jsonl"],
    dataset_adapter=tau_bench_retail_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
+    completion_params=[
+        {
+            "temperature": 0.8,
+            "extra_body": {"reasoning_effort": "medium"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        }
+    ],
    rollout_processor=default_mcp_gym_rollout_processor,
    rollout_processor_kwargs={"domain": "retail"},
    num_runs=8,
diff --git a/eval_protocol/dataset_logger/__init__.py b/eval_protocol/dataset_logger/__init__.py
index 9478ec6f..1caf6adc 100644
--- a/eval_protocol/dataset_logger/__init__.py
+++ b/eval_protocol/dataset_logger/__init__.py
@@ -1,11 +1,14 @@
-from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
 import os
 
+from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
+from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
+
 # Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs.
-if os.getenv("EP_SQLITE_LOG", "0").strip() == "1":
+if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() != "1":
    default_logger = SqliteDatasetLoggerAdapter()
 else:
-    class _NoOpLogger:
+
+    class _NoOpLogger(DatasetLogger):
        def log(self, row):
            return None
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index e0d101a7..76c7b6cc 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -20,7 +20,7 @@
 from vendor.tau2.data_model.message import AssistantMessage, UserMessage
 from vendor.tau2.user.user_simulator import UserSimulator
 
-from ...models import CompletionParams, EvaluationRow, InputMetadata, Message
+from ...models import EvaluationRow, InputMetadata, Message
 from ...types import MCPSession, MCPToolCall, TerminationReason, Trajectory
 
 if TYPE_CHECKING:
@@ -128,12 +128,12 @@ async def _execute_with_semaphore(idx):
            evaluation_row.messages = messages
            evaluation_row.tools = shared_tool_schema
            evaluation_row.usage = CompletionUsage(**trajectory.usage)
-            evaluation_row.input_metadata.completion_params = CompletionParams(
-                model=policy.model_id,
-                temperature=getattr(policy, "temperature", None),
-                max_tokens=getattr(policy, "max_tokens", None),
-                max_tool_calls=getattr(policy, "max_tools_per_turn", None),
-            )
+            evaluation_row.input_metadata.completion_params = {
+                "model": policy.model_id,
+                "temperature": getattr(policy, "temperature", None),
+                "max_tokens": getattr(policy, "max_tokens", None),
+                "max_tool_calls": getattr(policy, "max_tools_per_turn", None),
+            }
 
            if trajectory.terminated:
                if trajectory.termination_reason == TerminationReason.ERROR:
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 77707c23..3f4391fa 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -1,6 +1,6 @@
 import os
 from datetime import datetime
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
 
 from openai.types import CompletionUsage
 from openai.types.chat.chat_completion_message import (
@@ -178,13 +178,18 @@ def __iter__(self):
        return iter(self.__fields__.keys())  # Changed to __fields__
 
 
-class CompletionParams(BaseModel):
-    """Configuration for the language model used in the session."""
+CompletionParams = Dict[str, Any]
+"""
+Common set of completion parameters that most model providers support in their
+API. This is a plain dict so that extra fields are allowed, since LiteLLM +
+providers have their own set of parameters. The following parameters are
+common fields that are populated.
 
-    model: str = Field(..., description="Model identifier (e.g., 'gpt-4.1', 'fireworks/llama')")
-    temperature: Optional[float] = Field(None, description="Temperature setting for model generation")
-    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
-    max_tool_calls: Optional[int] = Field(None, description="Maximum tool calls per turn")
+model: str
+temperature: Optional[float]
+max_tokens: Optional[int]
+top_p: Optional[float]
+"""
 
 
 class InputMetadata(BaseModel):
@@ -193,7 +198,9 @@ class InputMetadata(BaseModel):
    model_config = ConfigDict(extra="allow")
 
    row_id: Optional[str] = Field(default_factory=generate_id, description="Unique string to ID the row")
-    completion_params: Optional[CompletionParams] = Field(None, description="Completion endpoint parameters used")
+    completion_params: CompletionParams = Field(
+        default_factory=dict, description="Completion endpoint parameters used"
+    )
    dataset_info: Optional[Dict[str, Any]] = Field(
        None, description="Dataset row details: seed, system_prompt, environment_context, etc"
    )
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
index 6a158b54..b3997c49 100644
--- a/eval_protocol/pytest/default_agent_rollout_processor.py
+++ b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -125,7 +125,9 @@ async def default_agent_rollout_processor(
 
    async def process_row(row: EvaluationRow) -> EvaluationRow:
        """Process a single row with agent rollout."""
-        agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
+        agent = Agent(
+            model=config.completion_params["model"], row=row, config_path=config.mcp_config_path, logger=config.logger
+        )
        try:
            await agent.setup()
            await agent.call_agent()
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index de9d8ca1..2b90239d 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -216,10 +216,10 @@ async def default_mcp_gym_rollout_processor(
        server.start()
 
        policy = ep.LiteLLMPolicy(
-            model_id=config.model,
-            temperature=config.input_params.get("temperature", 0.0),
-            max_tokens=config.input_params.get("max_tokens", 4096),
-            reasoning_effort=config.input_params.get("reasoning_effort", None),
+            model_id=config.completion_params["model"],
+            temperature=config.completion_params.get("temperature", 0.0),
+            max_tokens=config.completion_params.get("max_tokens", 4096),
+            reasoning_effort=config.completion_params.get("reasoning_effort", None),
        )
 
        # Create MCP environments directly from evaluation_rows
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index 424347cd..ef2ad48b 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -41,20 +41,20 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
        messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
 
-        request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
+        request_params = {"messages": messages_payload, **config.completion_params}
 
        # Ensure caching is disabled only for this request (review feedback)
        request_params["cache"] = {"no-cache": True}
 
        # Single-level reasoning effort: expect `reasoning_effort` only
        effort_val = None
-        if isinstance(config.input_params, dict):
-            if "reasoning_effort" in config.input_params:
-                effort_val = str(config.input_params["reasoning_effort"])  # flat shape
-            elif (
-                isinstance(config.input_params.get("extra_body"), dict)
-                and "reasoning_effort" in config.input_params["extra_body"]
-            ):
-                # Accept if user passed it directly inside extra_body
-                effort_val = str(config.input_params["extra_body"]["reasoning_effort"])  # already in extra_body
+
+        if "reasoning_effort" in config.completion_params:
+            effort_val = str(config.completion_params["reasoning_effort"])  # flat shape
+        elif (
+            isinstance(config.completion_params.get("extra_body"), dict)
+            and "reasoning_effort" in config.completion_params["extra_body"]
+        ):
+            # Accept if user passed it directly inside extra_body
+            effort_val = str(config.completion_params["extra_body"]["reasoning_effort"])  # already in extra_body
 
        if effort_val:
            # Always under extra_body so LiteLLM forwards to provider-specific param set
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 81856ff6..dd7ecb04 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -32,7 +32,6 @@
    EvaluationTestMode,
    InputMessagesParam,
    ModelParam,
-    RolloutInputParam,
    RolloutProcessor,
    RolloutProcessorConfig,
    RolloutProcessorInputParam,
@@ -52,16 +51,15 @@
 
 def evaluation_test(  # noqa: C901
    *,
-    model: List[ModelParam],
+    completion_params: List[CompletionParams],
    input_messages: Optional[List[InputMessagesParam]] = None,
    input_dataset: Optional[List[DatasetPathParam]] = None,
    dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter,
-    rollout_input_params: Optional[List[RolloutInputParam]] = None,
    rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
    evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
    rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None,
    aggregation_method: AggregationMethod = "mean",
-    passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
+    passed_threshold: Optional[Union[EvaluationThreshold, float, dict]] = None,
    num_runs: int = 1,
    max_dataset_rows: Optional[int] = None,
    mcp_config_path: Optional[str] = None,
@@ -109,7 +107,6 @@
    which can be used to easily group and identify your dataset by.
 
    Args:
-        model: Model identifiers to query.
        input_messages: Messages to send to the model. This is useful if you
            don't have a dataset but can hard-code the messages. Will be passed
            as "input_dataset" to the test function.
@@ -118,12 +115,12 @@
            to a list of EvaluationRows if you have a custom dataset format.
        dataset_adapter: Function to convert the input dataset to a list of
            EvaluationRows. This is useful if you have a custom dataset format.
-        rollout_input_params: Generation parameters for the rollout.
+        completion_params: Generation parameters for the rollout.
        rollout_processor: Function used to perform the rollout.
        evaluation_test_kwargs: Kwargs for the evaluation function.
        rollout_processor_kwargs: Kwargs for the rollout processor.
        aggregation_method: How to aggregate scores across rows.
-        passed_threshold: Threshold configuration for test success.
+        passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object.
            Success rate must be above success, and if set, standard deviation must be below standard_deviation.
        num_runs: Number of times to repeat the rollout and evaluations.
        max_dataset_rows: Limit dataset to the first N rows.
@@ -242,7 +239,7 @@ def generate_combinations():
            datasets = [[input_dataset]]  # type: ignore
        else:
            datasets = [None]
-        rips: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None]  # type: ignore
+        cps: List[Optional[CompletionParams]] = completion_params if completion_params is not None else [None]  # type: ignore
        # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over
        # each row. Instead, pass the entire sliced list through in a single test run
        # so summaries aggregate all rows together (AIME-style behavior).
@@ -259,17 +256,16 @@
        kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None]  # type: ignore
 
        # Generate all combinations
-        for m in model:
-            for ds in datasets:
-                for rip in rips:
-                    for im in messages:
-                        for etk in kwargs:
-                            # if no dataset and no messages, raise an error
-                            if ds is None and im is None:
-                                raise ValueError(
-                                    "No dataset or messages provided. Please provide at least one of input_dataset or input_messages."
-                                )
-                            combinations.append((m, ds, rip, im, etk))
+        for ds in datasets:
+            for cp in cps:
+                for im in messages:
+                    for etk in kwargs:
+                        # if no dataset and no messages, raise an error
+                        if ds is None and im is None:
+                            raise ValueError(
+                                "No dataset or messages provided. Please provide at least one of input_dataset or input_messages."
+                            )
+                        combinations.append((ds, cp, im, etk))
 
        return combinations
 
@@ -282,12 +278,12 @@ def generate_combinations():
    # Create parameter tuples for pytest.mark.parametrize
    param_tuples = []
    for combo in combinations:
-        model_name, dataset, rip, messages, etk = combo
-        param_tuple = [model_name]
+        dataset, cp, messages, etk = combo
+        param_tuple = []
        if input_dataset is not None:
            param_tuple.append(dataset)
-        if rollout_input_params is not None:
-            param_tuple.append(rip)
+        if completion_params is not None:
+            param_tuple.append(cp)
        if input_messages is not None:
            param_tuple.append(messages)
        if evaluation_test_kwargs is not None:
@@ -295,11 +291,11 @@
        param_tuples.append(tuple(param_tuple))
 
    # For batch mode, use the original parameter names
-    test_param_names = ["model"]
+    test_param_names = []
    if input_dataset is not None:
        test_param_names.append("dataset_path")
-    if rollout_input_params is not None:
-        test_param_names.append("input_params")
+    if completion_params is not None:
+        test_param_names.append("completion_params")
    if input_messages is not None:
        test_param_names.append("input_messages")
    if evaluation_test_kwargs is not None:
@@ -311,7 +307,6 @@ def create_wrapper_with_signature() -> Callable:
        invocation_id = generate_id()
 
        async def wrapper_body(**kwargs):
-            model_name = kwargs["model"]
            eval_metadata = None
 
            all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
@@ -352,7 +347,16 @@ def _log_eval_error(
            else:
                raise ValueError("No input dataset or input messages provided")
 
-            input_params = kwargs.get("input_params") or {}
+            if "completion_params" not in kwargs or not kwargs["completion_params"]:
+                raise ValueError(
+                    "No completion parameters provided. Please provide a completion parameters object."
+                )
+            completion_params = kwargs["completion_params"]
+            if "model" not in completion_params or not completion_params["model"]:
+                raise ValueError(
+                    "No model provided. Please provide a model in the completion parameters object."
+                )
+
            # Optional global overrides via environment for ad-hoc experimentation
            # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged
            # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}').
@@ -363,7 +367,7 @@ def _log_eval_error(
                if _env_override:
                    override_obj = _json.loads(_env_override)
                    if isinstance(override_obj, dict):
-                        input_params = _deep_update_dict(dict(input_params), override_obj)
+                        completion_params = _deep_update_dict(dict(completion_params), override_obj)
            except Exception:
                pass
 
@@ -378,14 +382,6 @@ def _log_eval_error(
                passed=None,
            )
 
-            # Populate completion_params in input_metadata for all rows and initialize eval_metadata BEFORE rollouts
-            completion_params = CompletionParams(
-                model=model_name,
-                temperature=input_params.get("temperature"),
-                max_tokens=input_params.get("max_tokens"),
-                max_tool_calls=input_params.get("max_tool_calls"),
-            )
-
            for row in data:
                if row.input_metadata is None:
                    row.input_metadata = InputMetadata()
@@ -405,14 +401,13 @@ def _log_eval_error(
 
            # Prepare rollout processor config once; we will generate fresh outputs per run
            config = RolloutProcessorConfig(
-                model=model_name,
-                input_params=input_params,
+                completion_params=completion_params,
                mcp_config_path=mcp_config_path or "",
                max_concurrent_rollouts=max_concurrent_rollouts,
                server_script_path=server_script_path,
                steps=steps,
                logger=active_logger,
-                kwargs=rollout_processor_kwargs,
+                kwargs=rollout_processor_kwargs or {},
            )
 
            for i in range(num_runs):
@@ -535,7 +530,7 @@ async def _execute_with_semaphore(row):
            should_print = os.getenv("EP_PRINT_SUMMARY") == "1"
            summary_path = os.getenv("EP_SUMMARY_JSON")
            suite_name = test_func.__name__
-            model_used = model_name
+            model_used = config.completion_params["model"]
            total_rows = len([item for sublist in all_results for item in sublist])
            summary_obj = {
                "suite": suite_name,
@@ -619,7 +614,7 @@ def _extract_effort_tag(params: dict) -> str | None:
                return None
 
            model_slug = _sanitize_filename(model_used)
-            effort_tag = _extract_effort_tag(input_params) or ""
+            effort_tag = _extract_effort_tag(completion_params) or ""
            effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else ""
 
            base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json"
@@ -756,11 +751,10 @@ async def dual_mode_wrapper(*args, **kwargs):
        try:
            dual_mode_wrapper.__ep_original_test_func = test_func  # type: ignore[attr-defined]
            dual_mode_wrapper.__ep_config = {
-                "model": model,
                "input_messages": input_messages,
                "input_dataset": input_dataset,
                "dataset_adapter": dataset_adapter,
-                "rollout_input_params": rollout_input_params,
+                "rollout_input_params": completion_params,
                "rollout_processor": rollout_processor,
                "evaluation_test_kwargs": evaluation_test_kwargs,
                "rollout_processor_kwargs": rollout_processor_kwargs,
@@ -794,14 +788,13 @@ def __ep_run_direct(
            rip = rip_list[0] if isinstance(rip_list, list) and rip_list else {}
            return run_evaluation_test_direct(
                test_func=dual_mode_wrapper.__ep_original_test_func,  # type: ignore[attr-defined]
-                model=_model,
                input_messages=cfg.get("input_messages"),
                input_dataset=cfg.get("input_dataset"),
                dataset_adapter=cfg.get("dataset_adapter"),
-                rollout_input_params=rip,
+                completion_params=rip,
                rollout_processor=cfg.get("rollout_processor"),
                aggregation_method=cfg.get("aggregation_method"),
-                threshold_of_success=cfg.get("passed_threshold"),
+                passed_threshold=cfg.get("passed_threshold"),
                num_runs=(num_runs_override if num_runs_override is not None else cfg.get("num_runs")),
                max_dataset_rows=cfg.get("max_dataset_rows"),
                mcp_config_path=cfg.get("mcp_config_path"),
@@ -825,15 +818,14 @@ def __ep_run_direct(
 def run_evaluation_test_direct(
    *,
    test_func: TestFunction,
-    model: str,
    input_messages: Optional[List[InputMessagesParam]] = None,
    input_dataset: Optional[List[DatasetPathParam]] = None,
    dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter,
-    rollout_input_params: Optional[RolloutInputParam] = None,
+    completion_params: Optional[CompletionParams] = None,
    rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
    rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None,
    aggregation_method: AggregationMethod = "mean",
-    threshold_of_success: Optional[float] = None,
+    passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
    num_runs: int = 1,
    max_dataset_rows: Optional[int] = None,
    mcp_config_path: Optional[str] = None,
@@ -849,6 +841,9 @@ def run_evaluation_test_direct(
 
    Returns a dict with keys: summary, results.
    """
+    if passed_threshold is not None and not isinstance(passed_threshold, EvaluationThreshold):
+        passed_threshold = EvaluationThreshold(success=passed_threshold)
+
    def _parse_ep_max_rows(default_value: int | None) -> int | None:
        raw = os.getenv("EP_MAX_DATASET_ROWS")
        if raw is None:
@@ -893,7 +888,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict:
        raise ValueError("No input dataset or input messages provided")
 
    # Build input params and apply env JSON override
-    input_params: Dict[str, Any] = rollout_input_params or {}
+    completion_params: Dict[str, Any] = completion_params or {}
    try:
        import json as _json
 
@@ -901,7 +896,7 @@
        if _env_override:
            override_obj = _json.loads(_env_override)
            if isinstance(override_obj, dict):
-                input_params = _deep_update_dict(dict(input_params), override_obj)
+                completion_params = _deep_update_dict(dict(completion_params), override_obj)
    except Exception:
        pass
 
@@ -912,17 +907,10 @@ def _deep_update_dict(base: dict, override: dict) -> dict:
        status="running",
        num_runs=num_runs,
        aggregation_method=aggregation_method,
-        threshold_of_success=threshold_of_success,
+        passed_threshold=passed_threshold,
        passed=None,
    )
 
-    completion_params = CompletionParams(
-        model=model,
-        temperature=input_params.get("temperature"),
-        max_tokens=input_params.get("max_tokens"),
-        max_tool_calls=input_params.get("max_tool_calls"),
-    )
-
    for row in data:
        if row.input_metadata is None:
            row.input_metadata = InputMetadata()
@@ -935,13 +923,12 @@ def _deep_update_dict(base: dict, override: dict) -> dict:
            default_logger.log(row)
 
    config = RolloutProcessorConfig(
-        model=model,
-        input_params=input_params,
+        completion_params=completion_params,
        mcp_config_path=mcp_config_path or "",
        max_concurrent_rollouts=max_concurrent_rollouts,
        server_script_path=server_script_path,
        steps=steps,
-        kwargs=rollout_processor_kwargs,
+        kwargs=rollout_processor_kwargs or {},
    )
 
    all_results: List[EvaluationRow] = []
@@ -986,8 +973,8 @@ def _deep_update_dict(base: dict, override: dict) -> dict:
        ci_high = None
 
        passed = None
-        if threshold_of_success is not None:
-            passed = agg_score >= threshold_of_success
+        if passed_threshold is not None:
+            passed = agg_score >= passed_threshold.success
        for r in all_results:
            if r.eval_metadata is not None:
                r.eval_metadata.status = "finished"
@@ -1003,7 +990,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict:
        total_rows = len(all_results)
        summary_obj = {
            "suite": suite_name,
-            "model": model,
+            "model": config.completion_params["model"],
            "agg_score": float(agg_score) if agg_score is not None else None,
            "num_runs": num_runs,
            "rows": total_rows,
@@ -1014,11 +1001,11 @@ def _deep_update_dict(base: dict, override: dict) -> dict:
        if should_print:
            if ci_low is not None and ci_high is not None:
                print(
-                    f"EP Summary | suite={suite_name} model={model} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
+                    f"EP Summary | suite={suite_name} model={config.completion_params['model']} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
                )
            else:
                print(
-                    f"EP Summary | suite={suite_name} model={model} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}"
+                    f"EP Summary | suite={suite_name} model={config.completion_params['model']} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}"
                )
        if summary_path:
            import json as _json
@@ -1050,8 +1037,8 @@ def _extract_effort_tag(params: dict) -> str | None:
                return None
            return None
 
-        model_slug = _sanitize_filename(model)
-        effort_tag = _extract_effort_tag(input_params) or ""
+        model_slug = _sanitize_filename(config.completion_params["model"])
+        effort_tag = _extract_effort_tag(completion_params) or ""
        effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else ""
 
        base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json"
@@ -1073,10 +1060,10 @@ def _extract_effort_tag(params: dict) -> str | None:
    except Exception:
        pass
 
-    if threshold_of_success is not None and not passed:
+    if passed_threshold is not None and not passed:
        assert (
-            agg_score >= threshold_of_success
-        ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
+            agg_score >= passed_threshold.success
+        ), f"Aggregated score {agg_score:.3f} below threshold {passed_threshold}"
 
        return {"summary": summary_obj, "results": all_results}
    except Exception:
diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py
index 9f564ce1..1a80254b 100644
--- a/eval_protocol/pytest/types.py
+++ b/eval_protocol/pytest/types.py
@@ -8,11 +8,10 @@
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
 
-from ..models import EvaluationRow, Message
+from ..models import CompletionParams, EvaluationRow, Message
 
 ModelParam = str  # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
 DatasetPathParam = str
-RolloutInputParam = Dict[str, Any]
 InputMessagesParam = List[Message]
 EvaluationInputParam = Dict[str, Any]
 RolloutProcessorInputParam = Dict[str, Any]
@@ -41,8 +40,7 @@
 @dataclass
 class RolloutProcessorConfig:
-    model: ModelParam
-    input_params: RolloutInputParam  # optional input parameters for inference
+    completion_params: CompletionParams  # input parameters for inference
    mcp_config_path: str
    server_script_path: Optional[str] = (
        None  # TODO: change from server_script_path to mcp_config_path for agent rollout processor
diff --git a/examples/gpqa/tests/test_gpqa.py b/examples/gpqa/tests/test_gpqa.py
index c59d51e4..dcbf7b53 100644
--- a/examples/gpqa/tests/test_gpqa.py
+++ b/examples/gpqa/tests/test_gpqa.py
@@ -62,10 +62,9 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]:
 
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_GPQA_INPUT_MESSAGES,
-    rollout_input_params=[
-        {"extra_body": {"reasoning_effort": "low"}}
+    completion_params=[
+        {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
    ],  # default to low effort; override via CLI plugin
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
diff --git a/examples/healthbench/tests/test_evaluation.py b/examples/healthbench/tests/test_evaluation.py
index be70c261..a40c5d96 100644
--- a/examples/healthbench/tests/test_evaluation.py
+++ b/examples/healthbench/tests/test_evaluation.py
@@ -47,9 +47,10 @@
 
 @evaluation_test(
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_HB_INPUT_MESSAGES,
-    rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}],
+    completion_params=[
+        {"temperature": 0.2, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
+    ],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    passed_threshold=None,
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
index f9c84695..7cb976ac 100644
--- a/tests/pytest/test_apps_coding.py
+++ b/tests/pytest/test_apps_coding.py
@@ -26,8 +26,9 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
 @evaluation_test(
    input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"],
    dataset_adapter=apps_dataset_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    completion_params=[
+        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+    ],
    passed_threshold=0.33,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
index c96a8302..2b1c2a4a 100644
--- a/tests/pytest/test_basic_coding.py
+++ b/tests/pytest/test_basic_coding.py
@@ -28,8 +28,9 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 @evaluation_test(
    input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"],
    dataset_adapter=coding_dataset_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    completion_params=[
+        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+    ],
    passed_threshold=0.8,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
index 74d5e317..bea42bed 100644
--- a/tests/pytest/test_frozen_lake.py
+++ b/tests/pytest/test_frozen_lake.py
@@ -7,7 +7,7 @@
 
 from typing import Any, Dict, List
 
-from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
+from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 
@@ -38,8 +38,9 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
 @evaluation_test(
    input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"],
    dataset_adapter=frozen_lake_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    completion_params=[
+        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+    ],
    rollout_processor=default_mcp_gym_rollout_processor,
    passed_threshold=0.66,
    num_runs=1,
diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
index 54779f09..b29fb53c 100644
--- a/tests/pytest/test_hallucination.py
+++ b/tests/pytest/test_hallucination.py
@@ -32,8 +32,9 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
 @evaluation_test(
    input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
    dataset_adapter=hallucination_dataset_adapter,
-    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
+    completion_params=[
+        {"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}
+    ],
    rollout_processor=default_single_turn_rollout_processor,
    passed_threshold=0.33,
    num_runs=1,
diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py
index ab4dad69..3fddac62 100644
--- a/tests/pytest/test_lunar_lander.py
+++ b/tests/pytest/test_lunar_lander.py
@@ -7,7 +7,7 @@
 
 from typing import Any, Dict, List
 
-from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
+from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 
@@ -38,8 +38,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
 @evaluation_test(
    input_dataset=["tests/pytest/data/lunar_lander_dataset.jsonl"],
    dataset_adapter=lunar_lander_to_evaluation_row,
-    model=["gpt-4.1"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    completion_params=[{"temperature": 0.0, "max_tokens": 4096, "model": "gpt-4.1"}],
    rollout_processor=default_mcp_gym_rollout_processor,
    passed_threshold=0.0,
    num_runs=1,
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index cf32cb5f..9c70721f 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -28,8 +28,9 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
 @evaluation_test(
    input_dataset=["tests/pytest/data/markdown_dataset.jsonl"],
    dataset_adapter=markdown_dataset_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
+    completion_params=[
+        {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
+    ],
    passed_threshold=0.5,
    rollout_processor=default_single_turn_rollout_processor,
    num_runs=1,
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
index 60f6fd6a..1cfc2db6 100644
--- a/tests/pytest/test_pytest_async.py
+++ b/tests/pytest/test_pytest_async.py
@@ -17,7 +17,7 @@
            Message(role="user", content="What is the capital of the moon?"),
        ],
    ],
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}],
 )
 async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """Run math evaluation on sample dataset using pytest interface."""
@@ -30,7 +30,7 @@ async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
            Message(role="user", content="What is the capital of France?"),
        ],
    ],
-    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}],
    mode="pointwise",
 )
 async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow:
diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
index 06762046..8320ec8a 100644
--- a/tests/pytest/test_pytest_default_agent_rollout_processor.py
+++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from typing import List
 
-from eval_protocol.models import Message, EvaluationRow
+from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test
 
@@ -17,7 +17,7 @@
        ]
    ],
    rollout_processor=default_agent_rollout_processor,
-    model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}],
 )
 def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
    """Run math evaluation on sample dataset using pytest interface."""
diff --git a/tests/pytest/test_pytest_ensure_logging.py b/tests/pytest/test_pytest_ensure_logging.py
new file mode 100644
index 00000000..4300e1b4
--- /dev/null
+++ b/tests/pytest/test_pytest_ensure_logging.py
@@ -0,0 +1,73 @@
+from typing import List
+from unittest.mock import Mock, patch
+
+import eval_protocol.dataset_logger as dataset_logger
+from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
+from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
+from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row
+
+
+async def test_ensure_logging(monkeypatch):
+    """
+    Ensure that default SQLITE logger gets called by mocking the storage and checking that the storage is called.
+ """ + from eval_protocol.pytest.evaluation_test import evaluation_test + + # Mock the SqliteEvaluationRowStore to track calls + mock_store = Mock(spec=SqliteEvaluationRowStore) + mock_store.upsert_row = Mock() + mock_store.read_rows = Mock(return_value=[]) + mock_store.db_path = "/tmp/test.db" + + # Create a custom logger that uses our mocked store + class MockSqliteLogger(DatasetLogger): + def __init__(self, store: SqliteEvaluationRowStore): + self._store = store + + def log(self, row: EvaluationRow) -> None: + data = row.model_dump(exclude_none=True, mode="json") + self._store.upsert_row(data=data) + + def read(self, rollout_id=None) -> List[EvaluationRow]: + results = self._store.read_rows(rollout_id=rollout_id) + return [EvaluationRow(**data) for data in results] + + mock_logger = MockSqliteLogger(mock_store) + + @evaluation_test( + input_dataset=[ + "tests/pytest/data/markdown_dataset.jsonl", + ], + completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], + dataset_adapter=markdown_dataset_to_evaluation_row, + rollout_processor=default_no_op_rollout_processor, + mode="pointwise", + combine_datasets=False, + num_runs=2, + logger=mock_logger, # Use our mocked logger + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + return row + + await eval_fn( + dataset_path=["tests/pytest/data/markdown_dataset.jsonl"], + completion_params={"temperature": 0.0, "model": "dummy/local-model"}, + ) + + # Verify that the store's upsert_row method was called + assert mock_store.upsert_row.called, "SqliteEvaluationRowStore.upsert_row should have been called" + + # Check that it was called multiple times (once for each row) + call_count = mock_store.upsert_row.call_count + assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times" + + # Verify the calls were made with proper data structure + for call in mock_store.upsert_row.call_args_list: + args, kwargs = call + data = args[0] if args else kwargs.get("data") + assert data is not None, "upsert_row should be called with data parameter" + assert isinstance(data, dict), "data should be a dictionary" + assert "execution_metadata" in data, "data should contain execution_metadata" + assert "rollout_id" in data["execution_metadata"], "data should contain rollout_id in execution_metadata" diff --git a/tests/pytest/test_pytest_flaky_sometimes.py b/tests/pytest/test_pytest_flaky_sometimes.py index cb70ec1e..65e1e63d 100644 --- a/tests/pytest/test_pytest_flaky_sometimes.py +++ b/tests/pytest/test_pytest_flaky_sometimes.py @@ -12,7 +12,7 @@ @pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping flaky test in CI") @evaluation_test( input_messages=[[Message(role="user", content="Return HEADS or TAILS at random.")]], - model=["dummy/local-model"], + completion_params=[{"model": "dummy/local-model"}], rollout_processor=default_no_op_rollout_processor, mode="pointwise", num_runs=5, diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py index 84f44fc5..63488dbe 100644 --- a/tests/pytest/test_pytest_function_calling.py +++ b/tests/pytest/test_pytest_function_calling.py @@ -20,7 +20,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( input_dataset=["tests/pytest/data/function_calling.jsonl"], - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], mode="pointwise", 
diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py
index 24ba3baf..045d2a19 100644
--- a/tests/pytest/test_pytest_ids.py
+++ b/tests/pytest/test_pytest_ids.py
@@ -28,7 +28,7 @@ async def test_evaluation_test_decorator(monkeypatch):
         input_dataset=[
             "tests/pytest/data/markdown_dataset.jsonl",
         ],
-        model=["dummy/local-model"],
+        completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}],
         dataset_adapter=markdown_dataset_to_evaluation_row,
         rollout_processor=default_no_op_rollout_processor,
         mode="pointwise",
@@ -45,7 +45,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:

     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
-        await eval_fn(model="dummy/local-model", dataset_path=[ds_path])
+        await eval_fn(dataset_path=[ds_path], completion_params={"temperature": 0.0, "model": "dummy/local-model"})

     # Assertions on IDs generated by the decorator logic
     assert len(logger.read()) == 38
@@ -66,8 +66,10 @@ async def test_evaluation_test_decorator_ids_single(monkeypatch):
             "tests/pytest/data/markdown_dataset.jsonl",
             "tests/pytest/data/markdown_dataset.jsonl",
         ],
-        rollout_input_params=[{"temperature": 0.0}, {"temperature": 1.0}],
-        model=["dummy/local-model"],
+        completion_params=[
+            {"temperature": 0.0, "model": "dummy/local-model"},
+            {"temperature": 1.0, "model": "dummy/local-model"},
+        ],
         dataset_adapter=markdown_dataset_to_evaluation_row,
         rollout_processor=default_no_op_rollout_processor,
         mode="pointwise",
@@ -87,12 +89,15 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
         "tests/pytest/data/markdown_dataset.jsonl",
         "tests/pytest/data/markdown_dataset.jsonl",
     ]
-    input_params_list = [{"temperature": 0.0}, {"temperature": 1.0}]
+    completion_params_list = [
+        {"temperature": 0.0, "model": "dummy/local-model"},
+        {"temperature": 1.0, "model": "dummy/local-model"},
+    ]

     # Manually invoke all parameter combinations within a single test
     for ds_path in dataset_paths:
-        for params in input_params_list:
-            await eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params)
+        for params in completion_params_list:
+            await eval_fn(dataset_path=[ds_path], completion_params=params)

     # Assertions on IDs generated by the decorator logic
     assert len(unique_invocation_ids) == 1
diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py
index edb69b83..dc460aa5 100644
--- a/tests/pytest/test_pytest_input_messages.py
+++ b/tests/pytest/test_pytest_input_messages.py
@@ -10,7 +10,7 @@
             Message(role="user", content="What is the capital of France?"),
         ]
     ],
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     rollout_processor=default_single_turn_rollout_processor,
 )
 def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py
index 3c18ff2b..158874f1 100644
--- a/tests/pytest/test_pytest_json_schema.py
+++ b/tests/pytest/test_pytest_json_schema.py
@@ -24,7 +24,7 @@ def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
 @evaluation_test(
     input_dataset=["tests/pytest/data/json_schema.jsonl"],
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     mode="pointwise",
     rollout_processor=default_single_turn_rollout_processor,
     dataset_adapter=json_schema_to_evaluation_row,
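The test_pytest_ids hunks above also change how the decorated function is driven by hand: instead of model= plus input_params=, each manual call now receives a complete completion_params dict. The loop is effectively the cross product of dataset paths and parameter dicts — roughly this shape (placeholder values, mirroring the test):

import itertools

dataset_paths = ["tests/pytest/data/markdown_dataset.jsonl"]
completion_params_list = [
    {"temperature": 0.0, "model": "dummy/local-model"},
    {"temperature": 1.0, "model": "dummy/local-model"},
]


async def run_all_combinations(eval_fn) -> None:
    # One manual invocation per (dataset, completion_params) pair,
    # matching what the decorator's own parametrization would generate.
    for ds_path, params in itertools.product(dataset_paths, completion_params_list):
        await eval_fn(dataset_path=[ds_path], completion_params=params)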
"fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], mode="pointwise", rollout_processor=default_single_turn_rollout_processor, dataset_adapter=json_schema_to_evaluation_row, diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py index afe74a4e..23010797 100644 --- a/tests/pytest/test_pytest_math_example.py +++ b/tests/pytest/test_pytest_math_example.py @@ -8,8 +8,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0}], + completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.0, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py index e51b062f..5bba5c0e 100644 --- a/tests/pytest/test_pytest_math_format_length.py +++ b/tests/pytest/test_pytest_math_format_length.py @@ -11,8 +11,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0}], + completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.0, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_mcp_config.py b/tests/pytest/test_pytest_mcp_config.py index c1b55d51..dde15aa9 100644 --- a/tests/pytest/test_pytest_mcp_config.py +++ b/tests/pytest/test_pytest_mcp_config.py @@ -20,7 +20,7 @@ ] ], rollout_processor=default_agent_rollout_processor, - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-20b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json", ) diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py index 2a1c1cfc..01c06c45 100644 --- a/tests/pytest/test_pytest_mcp_url.py +++ b/tests/pytest/test_pytest_mcp_url.py @@ -1,4 +1,4 @@ -from eval_protocol.models import EvaluateResult, Message, EvaluationRow +from eval_protocol.models import EvaluateResult, EvaluationRow, Message from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test @@ -19,7 +19,7 @@ ] ], rollout_processor=default_agent_rollout_processor, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json", ) diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py index b0c4850d..339c5152 100644 --- a/tests/pytest/test_pytest_word_count_example.py +++ b/tests/pytest/test_pytest_word_count_example.py @@ -8,8 +8,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=word_count_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0}], + completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.3, # Reasonable threshold for word count evaluation 
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index f5472092..0eeba626 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -10,7 +10,7 @@
 from pathlib import Path
 from typing import Any, Dict, List

-from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
+from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 from vendor.tau2.data_model.message import (
@@ -64,8 +64,14 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
 @evaluation_test(
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
+    completion_params=[
+        {
+            "temperature": 0.8,
+            "max_tokens": 4096,
+            "extra_body": {"reasoning_effort": "low"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        }
+    ],
     rollout_processor=default_mcp_gym_rollout_processor,
     passed_threshold={"success": 0.4, "standard_deviation": 0.1},
     num_runs=8,
diff --git a/tests/test_models.py b/tests/test_models.py
index 61c3b3c0..1358344b 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -4,7 +4,6 @@
 import pytest

 from eval_protocol.models import (  # Added Message to existing import
-    CompletionParams,
     EvaluateResult,
     EvaluationRow,
     InputMetadata,
@@ -271,7 +270,7 @@ def test_evaluation_row_creation():
         evaluation_result=evaluation_result,
         input_metadata=InputMetadata(
             row_id="math_001",
-            completion_params=CompletionParams(model="gpt-4"),
+            completion_params={"model": "gpt-4"},
             dataset_info={"source": "math_eval"},
             session_data={"timestamp": 1234567890},
         ),
@@ -322,7 +321,7 @@ def test_evaluation_row_serialization():
         evaluation_result=evaluation_result,
         input_metadata=InputMetadata(
             row_id="test_123",
-            completion_params=CompletionParams(model="gpt-4"),
+            completion_params={"model": "gpt-4"},
             dataset_info={"test": True},
             session_data={"timestamp": 1234567890},
         ),
diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py
index e96baabe..200f7ca8 100644
--- a/tests/test_tau_bench_airline_smoke.py
+++ b/tests/test_tau_bench_airline_smoke.py
@@ -65,8 +65,13 @@ def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
 @evaluation_test(
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_smoke_to_evaluation_row,
-    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
+    completion_params=[
+        {
+            "temperature": 0.8,
+            "extra_body": {"reasoning_effort": "medium"},
+            "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+        }
+    ],
     rollout_processor=default_mcp_gym_rollout_processor,
     passed_threshold=0.36,
     num_runs=1,  # Smoke test: single run for quick feedback
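With CompletionParams dropped from these imports, InputMetadata now takes completion_params as a plain dict, so provider-specific keys such as extra_body need no schema change. A small sketch of the resulting call site (values are placeholders in the style of the tests above):

from eval_protocol.models import EvaluationRow, InputMetadata, Message

row = EvaluationRow(
    messages=[Message(role="user", content="What is 2 + 2?")],
    input_metadata=InputMetadata(
        row_id="example_001",
        completion_params={
            # Arbitrary keys pass through now that this is a plain dict.
            "model": "gpt-4",
            "temperature": 0.8,
            "extra_body": {"reasoning_effort": "medium"},
        },
    ),
)
assert row.input_metadata.completion_params["model"] == "gpt-4"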
diff --git a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx
index fdeaf03c..03412e61 100644
--- a/vite-app/src/components/EvaluationRow.tsx
+++ b/vite-app/src/components/EvaluationRow.tsx
@@ -231,7 +231,7 @@ export const EvaluationRow = observer(
           {/* Model */}
-
+
           {/* Score */}
diff --git a/vite-app/src/types/eval-protocol.ts b/vite-app/src/types/eval-protocol.ts
index b18697f1..f86ea058 100644
--- a/vite-app/src/types/eval-protocol.ts
+++ b/vite-app/src/types/eval-protocol.ts
@@ -54,16 +54,11 @@ export const EvaluateResultSchema = z.object({
   final_control_plane_info: z.record(z.string(), z.any()).optional().describe('The final control plane state that led to termination.')
 });

-export const CompletionParamsSchema = z.object({
-  model: z.string().describe('Model identifier (e.g., \'gpt-4.1\', \'fireworks/llama\')'),
-  temperature: z.number().optional().describe('Temperature setting for model generation'),
-  max_tokens: z.number().optional().describe('Maximum tokens to generate'),
-  max_tool_calls: z.number().optional().describe('Maximum tool calls per turn')
-});
+export const CompletionParamsSchema = z.record(z.string(), z.any());

 export const InputMetadataSchema = z.object({
   row_id: z.string().optional().describe('Unique string to ID the row'),
-  completion_params: CompletionParamsSchema.optional().describe('Completion endpoint parameters used'),
+  completion_params: CompletionParamsSchema.describe('Completion endpoint parameters used'),
   dataset_info: z.record(z.string(), z.any()).optional().describe('Dataset row details: seed, system_prompt, environment_context, etc'),
   session_data: z.record(z.string(), z.any()).optional().describe('Session metadata like timestamp (input only, no duration/usage)')
 }).loose();
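On the TypeScript side, the loosened schema mirrors that change: CompletionParamsSchema is now an open record, so any provider-specific field parses. A quick illustrative check (standalone zod usage; the object is a made-up example):

import { z } from "zod";

// Same shape as the schema above: arbitrary string keys, any values.
const CompletionParamsSchema = z.record(z.string(), z.any());

const params = CompletionParamsSchema.parse({
  model: "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
  temperature: 0.8,
  extra_body: { reasoning_effort: "medium" },
});

console.log(params.model); // validated as a record, so this key is allowed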