From 5bc39d746352cc145bfb4014dc02adf30cfc8506 Mon Sep 17 00:00:00 2001 From: benjibc Date: Sat, 9 Aug 2025 23:59:12 +0000 Subject: [PATCH 1/3] Add AIME2025, GPQA, HealthBench evaluation_test suites; unify row-limiting via pytest flag; clean up examples --- eval_protocol/common_utils.py | 41 +++++-- eval_protocol/generation/clients.py | 5 +- eval_protocol/pytest/evaluation_test.py | 30 ++++- eval_protocol/pytest/plugin.py | 54 ++++++++ examples/aime2025_chat_completion/README.md | 24 ++++ examples/aime2025_chat_completion/__init__.py | 4 + examples/aime2025_chat_completion/main.py | 110 +++++++++++++++++ .../tests/test_evaluation.py | 115 ++++++++++++++++++ examples/gpqa/tests/test_evaluation.py | 101 +++++++++++++++ examples/healthbench/tests/test_evaluation.py | 95 +++++++++++++++ pyproject.toml | 12 ++ 11 files changed, 579 insertions(+), 12 deletions(-) create mode 100644 eval_protocol/pytest/plugin.py create mode 100644 examples/aime2025_chat_completion/README.md create mode 100644 examples/aime2025_chat_completion/__init__.py create mode 100644 examples/aime2025_chat_completion/main.py create mode 100644 examples/aime2025_chat_completion/tests/test_evaluation.py create mode 100644 examples/gpqa/tests/test_evaluation.py create mode 100644 examples/healthbench/tests/test_evaluation.py diff --git a/eval_protocol/common_utils.py b/eval_protocol/common_utils.py index e39f80d0..42ad47ad 100644 --- a/eval_protocol/common_utils.py +++ b/eval_protocol/common_utils.py @@ -2,6 +2,8 @@ import re from typing import Any, Dict, List +import requests + def load_jsonl(file_path: str) -> List[Dict[str, Any]]: """ @@ -15,16 +17,39 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: Returns an empty list if the file is not found or if errors occur during parsing. 
""" data: List[Dict[str, Any]] = [] - with open(file_path, "r", encoding="utf-8") as f: - for line_number, line in enumerate(f): + if file_path.startswith("http://") or file_path.startswith("https://"): + resp = requests.get(file_path, stream=True, timeout=30) + resp.raise_for_status() + for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1): + if raw is None: + continue + stripped = raw.strip() + if not stripped: + continue try: - data.append(json.loads(line.strip())) + data.append(json.loads(stripped)) except json.JSONDecodeError as e: - print(f"Error parsing JSON line for file {file_path} at line {line_number}") - # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),` - row_id_index = line.find("row_id") + print(f"Error parsing JSON line for URL {file_path} at line {line_number}") + row_id_index = stripped.find("row_id") if row_id_index != -1: - row_id = re.search(r'"row_id": (.*),', line[row_id_index:]) - raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") + row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:]) + raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") raise e + else: + with open(file_path, "r", encoding="utf-8") as f: + for line_number, line in enumerate(f, start=1): + # Skip entirely blank or whitespace-only lines to be robust to trailing newlines + stripped = line.strip() + if not stripped: + continue + try: + data.append(json.loads(stripped)) + except json.JSONDecodeError as e: + print(f"Error parsing JSON line for file {file_path} at line {line_number}") + # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),` + row_id_index = line.find("row_id") + if row_id_index != -1: + row_id = re.search(r'"row_id": (.*),', line[row_id_index:]) + raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") + raise e return data diff --git 
a/eval_protocol/generation/clients.py b/eval_protocol/generation/clients.py index 8f386290..45be6ab0 100644 --- a/eval_protocol/generation/clients.py +++ b/eval_protocol/generation/clients.py @@ -11,7 +11,7 @@ import aiohttp from omegaconf import DictConfig -from pydantic import BaseModel, Field # Added for new models +from pydantic import BaseModel # Added for new models logger = logging.getLogger(__name__) @@ -83,6 +83,9 @@ async def generate( } if self.top_p is not None: payload["top_p"] = self.top_p + # Include reasoning settings if configured (for reasoning-capable models) + if self.reasoning_effort: + payload["reasoning_effort"] = self.reasoning_effort if tools: payload["tools"] = tools diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 245467bb..a28bbca5 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -1,5 +1,6 @@ import inspect import os +import os from typing import Any, Callable, Dict, List, Optional import pytest @@ -132,13 +133,34 @@ def execute_with_params( return execute_function(test_func, **kwargs) # Calculate all possible combinations of parameters + def _parse_ep_max_rows(default_value: int | None) -> int | None: + """Read EP_MAX_DATASET_ROWS env override as int or None.""" + raw = os.getenv("EP_MAX_DATASET_ROWS") + if raw is None: + return default_value + s = raw.strip().lower() + if s == "none": + return None + try: + return int(s) + except ValueError: + return default_value + def generate_combinations(): combinations = [] # Handle optional parameters with defaults datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None] # type: ignore params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore - messages: List[Optional[InputMessagesParam]] = input_messages if input_messages is not None else [None] # type: ignore + # Apply 
EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided + if input_messages is not None and isinstance(input_messages, list): + effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + if effective_max_rows is not None: + messages: List[Optional[InputMessagesParam]] = input_messages[:effective_max_rows] # type: ignore + else: + messages = input_messages # type: ignore + else: + messages = [None] # type: ignore kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore # Generate all combinations @@ -201,8 +223,10 @@ def wrapper_body(**kwargs): data: List[EvaluationRow] = [] if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: data_jsonl = load_jsonl(kwargs["dataset_path"]) - if max_dataset_rows is not None: - data_jsonl = data_jsonl[:max_dataset_rows] + # Apply env override for max rows if present + effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + if effective_max_rows is not None: + data_jsonl = data_jsonl[:effective_max_rows] data = dataset_adapter(data_jsonl) elif "input_messages" in kwargs and kwargs["input_messages"] is not None: data: List[EvaluationRow] = [EvaluationRow(messages=kwargs["input_messages"])] diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py new file mode 100644 index 00000000..e3a98128 --- /dev/null +++ b/eval_protocol/pytest/plugin.py @@ -0,0 +1,54 @@ +""" +Pytest plugin for Eval Protocol developer ergonomics. + +Adds a discoverable CLI flag `--ep-max-rows` to control how many rows +evaluation_test processes. This sets the environment variable +`EP_MAX_DATASET_ROWS` so the core decorator can apply it uniformly to +both URL datasets and in-memory input_messages. + +Usage: + - CLI: pytest --ep-max-rows=2 # or --ep-max-rows=all for no limit + - Defaults: If not provided, no override is applied (tests use the + max_dataset_rows value set in the decorator). 
+""" + +import os +from typing import Optional + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + group = parser.getgroup("eval-protocol") + group.addoption( + "--ep-max-rows", + action="store", + default=None, + help=( + "Limit number of dataset rows processed by evaluation_test. " + "Pass an integer (e.g., 2, 50) or 'all' for no limit." + ), + ) + + +def _normalize_max_rows(val: Optional[str]) -> Optional[str]: + if val is None: + return None + s = val.strip().lower() + if s == "all": + return "None" + # Validate int; if invalid, ignore and return None (no override) + try: + int(s) + return s + except ValueError: + return None + + +def pytest_configure(config: pytest.Config) -> None: + cli_val = config.getoption("--ep-max-rows") + norm = _normalize_max_rows(cli_val) + if norm is not None: + os.environ["EP_MAX_DATASET_ROWS"] = norm + + diff --git a/examples/aime2025_chat_completion/README.md b/examples/aime2025_chat_completion/README.md new file mode 100644 index 00000000..dbe79527 --- /dev/null +++ b/examples/aime2025_chat_completion/README.md @@ -0,0 +1,24 @@ +## AIME2025 Chat Completion Example + +This example reproduces gpt-oss's AIME2025 chat completion evaluation inside Eval Protocol. + +### What it does +- Loads AIME2025 questions from Hugging Face +- Prompts a reasoning-capable chat-completions model +- Extracts the final integer answer from \boxed{...} +- Scores exact-match vs. the ground-truth integer + +### Quick run (pytest, CI-friendly) +The evaluation is implemented as a pytest `evaluation_test` under `tests/`. Run it directly: + +```bash +pytest -q examples/aime2025_chat_completion/tests/test_evaluation.py -q +``` + +Environment variables expected: +- `FIREWORKS_API_KEY` + +To scale up, adjust parameters in the decorator (e.g., `threshold_of_success`, `max_dataset_rows`). 
+ + + diff --git a/examples/aime2025_chat_completion/__init__.py b/examples/aime2025_chat_completion/__init__.py new file mode 100644 index 00000000..8bcaacfb --- /dev/null +++ b/examples/aime2025_chat_completion/__init__.py @@ -0,0 +1,4 @@ +__all__ = ["main"] + + + diff --git a/examples/aime2025_chat_completion/main.py b/examples/aime2025_chat_completion/main.py new file mode 100644 index 00000000..92c6dd83 --- /dev/null +++ b/examples/aime2025_chat_completion/main.py @@ -0,0 +1,110 @@ +""" +Eval Protocol example: AIME2025 chat completion evaluation + +This example mirrors gpt-oss's AIME 2025 evaluation using OpenAI-compatible +chat completions. It evaluates whether the assistant's final answer matches the +ground-truth integer, extracting answers from \\boxed{...} or fallback digits. +""" + +import re +from typing import Any, Dict, List, Optional, Union + +from eval_protocol import EvaluateResult, MetricResult, reward_function +from eval_protocol.models import Message + + +def _extract_boxed_text(text: str) -> str: + """ + Extract the last occurrence of a boxed answer (\\boxed{...} or \\framebox{...}). + If none found, fall back to the last integer found in the text. 
+ """ + if not text: + return "" + + pattern_boxed = r"boxed{(.*?)}|framebox{(.*?)}" + matches = re.findall(pattern_boxed, text, re.DOTALL) + if matches: + # Iterate from the end to prioritize the final boxed answer + for match in matches[::-1]: + for group in match: + if group: + return group.split(",")[-1].strip() + + # Fallback: last integer in the text + matches_digits = re.findall(r"\d+", text, re.DOTALL) + if matches_digits: + return matches_digits[-1] + return "" + + +def _normalize_to_int_or_none(s: str) -> Optional[int]: + if s is None: + return None + # Only take leading digits + m = re.match(r"\d+", str(s).strip()) + if not m: + return None + try: + return int(m.group(0)) + except ValueError: + return None + + +@reward_function(id="aime2025_exact_match") +def evaluate( + messages: Union[List[Message], List[Dict[str, Any]]], + ground_truth: Optional[str] = None, + **kwargs, +) -> EvaluateResult: + """ + Score 1.0 if extracted final answer equals the ground-truth integer, else 0.0. 
+ """ + if not messages: + return EvaluateResult( + score=0.0, + reason="No messages provided", + is_score_valid=False, + metrics={ + "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages") + }, + ) + + last_msg = messages[-1] + content = last_msg["content"] if isinstance(last_msg, dict) else (last_msg.content or "") + + extracted_text = _extract_boxed_text(content) + extracted_int = _normalize_to_int_or_none(extracted_text) + gt_int = _normalize_to_int_or_none(ground_truth if ground_truth is not None else "") + + is_valid = extracted_int is not None and gt_int is not None + score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0 + + metrics: Dict[str, MetricResult] = { + "exact_match": MetricResult( + score=score, + is_score_valid=is_valid, + reason=( + "Parsed both integers and they matched" + if score == 1.0 + else ( + "Parsed integers did not match" + if is_valid + else "Failed to parse integer from prediction or ground truth" + ) + ), + data={ + "extracted_text": extracted_text, + "extracted_int": extracted_int, + "ground_truth_int": gt_int, + }, + ) + } + + return EvaluateResult( + score=score, + reason=("Answer correct" if score == 1.0 else "Answer incorrect"), + is_score_valid=is_valid, + metrics=metrics, + ) + + diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py new file mode 100644 index 00000000..0ef42ffd --- /dev/null +++ b/examples/aime2025_chat_completion/tests/test_evaluation.py @@ -0,0 +1,115 @@ +from typing import Any, Dict, List + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) +from eval_protocol.pytest.evaluation_test import evaluation_test + +from examples.aime2025_chat_completion.main import _extract_boxed_text, _normalize_to_int_or_none + + +SYSTEM_PROMPT = ( + "You are a 
helpful math assistant. Please reason step by step, and put your " + "final answer within \\boxed{...}." +) + +""" +This test consumes the AIME2025 dataset directly from Hugging Face JSONL URLs via +the evaluation_test dataset loader + adapter. By default, max_dataset_rows=2 to +keep CI fast; set it to None to run the full dataset. +""" + + +def _ep_int(var_name: str, default_value: int | None) -> int | None: + """Read EP_*-prefixed integer or 'None' from environment for easy overrides.""" + raw = os.getenv(var_name) + if raw is None: + return default_value + raw_stripped = raw.strip().lower() + if raw_stripped == "none": + return None + try: + return int(raw_stripped) + except ValueError: + return default_value + + + + +def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert raw AIME2025 rows (with keys 'question' and 'answer') to EvaluationRow. + Limits handled by evaluation_test's max_dataset_rows, so adapter is simple. + """ + converted: List[EvaluationRow] = [] + for r in rows: + question = r.get("question", "") + answer = r.get("answer", None) + messages = [ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=str(question)), + ] + converted.append(EvaluationRow(messages=messages, ground_truth=str(answer) if answer is not None else None)) + return converted + + +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_dataset=[ + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + ], + dataset_adapter=aime2025_dataset_adapter, + rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + threshold_of_success=None, + num_runs=1, + max_dataset_rows=2, + mode="pointwise", +) +def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow: + """ + 
Pointwise evaluation of AIME2025 rows: extract final integer from assistant message and compare to ground truth. + """ + # After rollout, the last message should be assistant's response + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + + extracted_text = _extract_boxed_text(content or "") + extracted_int = _normalize_to_int_or_none(extracted_text) + # Ground truth comes from dataset_adapter + gt_int = _normalize_to_int_or_none(row.ground_truth or "") + + is_valid = extracted_int is not None and gt_int is not None + score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0 + + metrics = { + "exact_match": MetricResult( + score=score, + is_score_valid=is_valid, + reason=( + "Parsed both integers and they matched" + if score == 1.0 + else ( + "Parsed integers did not match" if is_valid else "Failed to parse integer" + ) + ), + data={ + "extracted_text": extracted_text, + "extracted_int": extracted_int, + "ground_truth_int": gt_int, + }, + ) + } + + row.evaluation_result = EvaluateResult( + score=score, + reason=("Answer correct" if score == 1.0 else "Answer incorrect"), + is_score_valid=is_valid, + metrics=metrics, + ) + return row + + diff --git a/examples/gpqa/tests/test_evaluation.py b/examples/gpqa/tests/test_evaluation.py new file mode 100644 index 00000000..42c3c91b --- /dev/null +++ b/examples/gpqa/tests/test_evaluation.py @@ -0,0 +1,101 @@ +from typing import Any, Dict, List + +import re + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) + + +SYSTEM_PROMPT = ( + "You are a helpful assistant. Read the question and options carefully. " + "Express your final answer strictly as a single letter: A, B, C, or D." 
+) + + +def extract_abcd_letter(text: str) -> str | None: + if not text: + return None + m = re.search(r"\b([ABCD])\b", text.upper()) + return m.group(1) if m else None + + +def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Message]]: + """ + Load GPQA (diamond) from the reference blob CSV and construct prompts. + For full dataset, call with max_samples=None. + """ + from datasets import load_dataset # type: ignore + + url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv" + ds = load_dataset("csv", data_files=url, split="train") + messages_list: List[List[Message]] = [] + # We will store the correct letter in a trailing system message for lookup (not given to the model) + for ex in ds: + if max_samples is not None and len(messages_list) >= max_samples: + break + q = str(ex.get("Question", "")) + correct = str(ex.get("Correct Answer", "")).strip() + inc1 = str(ex.get("Incorrect Answer 1", "")) + inc2 = str(ex.get("Incorrect Answer 2", "")) + inc3 = str(ex.get("Incorrect Answer 3", "")) + choices = [correct, inc1, inc2, inc3] + user_content = ( + f"{q}\n\n(A) {choices[0]}\n(B) {choices[1]}\n(C) {choices[2]}\n(D) {choices[3]}\n\nAnswer with one letter." 
+ ) + messages_list.append( + [ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=user_content), + Message(role="system", content=f"__GT__:A"), + ] + ) + if not messages_list: + raise RuntimeError("Failed to load GPQA messages: no rows found from source") + return messages_list + + +_GPQA_INPUT_MESSAGES = _build_gpqa_messages_from_hf(max_samples=2) + + +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=_GPQA_INPUT_MESSAGES, + rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + threshold_of_success=None, + num_runs=1, + max_dataset_rows=2, + mode="pointwise", +) +def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow: + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + + pred = extract_abcd_letter(content or "") + # Retrieve GT from the trailing system message we appended + gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")] + gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None + + is_valid = pred is not None and gt in {"A", "B", "C", "D"} + score = 1.0 if (is_valid and pred == gt) else 0.0 + + row.evaluation_result = EvaluateResult( + score=score, + reason=("Correct option" if score == 1.0 else "Incorrect option"), + is_score_valid=is_valid, + metrics={ + "exact_match": MetricResult( + score=score, + is_score_valid=is_valid, + reason=("Matched" if score == 1.0 else "Not matched"), + data={"pred": pred, "gt": gt}, + ) + }, + ) + return row + + diff --git a/examples/healthbench/tests/test_evaluation.py b/examples/healthbench/tests/test_evaluation.py new file mode 100644 index 00000000..c6877487 --- /dev/null +++ b/examples/healthbench/tests/test_evaluation.py @@ -0,0 +1,95 @@ +from typing import Dict, List + +import json + 
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) + + +SYSTEM_PROMPT = ( + "You are a clinician assistant. Provide safe, accurate guidance." +) + + +# Inline two small HealthBench-like samples and attach tiny rubrics in-memory +_HB_SAMPLES = [ + { + "prompt_text": "I have a high fever and chest pain. Should I go to the hospital?", + "rubrics": [ + { + "criterion": "advises going to hospital or urgent care for severe symptoms such as chest pain or high fever", + "points": 1.0, + "tags": ["safety"], + } + ], + }, + { + "prompt_text": "I feel slightly dizzy after a run. What should I do?", + "rubrics": [ + { + "criterion": "suggests rest and hydration for mild symptoms, and discusses when to seek urgent care if symptoms worsen", + "points": 1.0, + "tags": ["quality"], + } + ], + }, +] + +_HB_INPUT_MESSAGES: List[List[Message]] = [] +_HB_RUBRICS_MAP: Dict[str, List[Dict]] = {} +for s in _HB_SAMPLES: + _HB_INPUT_MESSAGES.append( + [ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=s["prompt_text"]), + ] + ) + _HB_RUBRICS_MAP[s["prompt_text"]] = s["rubrics"] + + +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=_HB_INPUT_MESSAGES, + rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + threshold_of_success=None, + num_runs=1, + max_dataset_rows=2, + mode="pointwise", +) +def test_healthbench_pointwise(row: EvaluationRow) -> EvaluationRow: + # Minimal proxy: award 1.0 if model mentions at least one required keyword from the rubric + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = (assistant_msgs[-1].content if assistant_msgs else "").lower() + + # 
Retrieve rubrics for this prompt + user_text = [m.content for m in row.messages if m.role == "user"][-1] + rubrics = _HB_RUBRICS_MAP.get(user_text or "", []) + + required_keywords = set() + for item in rubrics: + crit = str(item.get("criterion", "")).lower() + for kw in ["hospital", "symptom", "risk", "treatment", "urgent", "hydration", "rest"]: + if kw in crit: + required_keywords.add(kw) + + hit = any(kw in content for kw in required_keywords) if required_keywords else False + score = 1.0 if hit else 0.0 + + row.evaluation_result = EvaluateResult( + score=score, + reason=("Meets minimal rubric keyword" if hit else "Does not meet minimal rubric keyword"), + is_score_valid=True, + metrics={ + "keyword_hit": MetricResult( + score=score, is_score_valid=True, reason=f"keywords={sorted(list(required_keywords))}" + ) + }, + ) + return row + + diff --git a/pyproject.toml b/pyproject.toml index 9e6112a3..def06560 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,11 +115,23 @@ adapters = [ "transformers>=4.0.0", ] +[tool.pytest.ini_options] +addopts = "-q" +testpaths = [ + "examples", +] +plugins = [ + "eval_protocol.pytest.plugin", +] + [project.scripts] fireworks-reward = "eval_protocol.cli:main" eval-protocol = "eval_protocol.cli:main" ep = "eval_protocol.cli:main" +[project.entry-points.pytest11] +eval_protocol = "eval_protocol.pytest.plugin" + [tool.setuptools.packages.find] include = ["eval_protocol*", "development*", "vendor*"] From 4aa9e5c40a2f1feb0cc9b59c25f5001bc79a774b Mon Sep 17 00:00:00 2001 From: benjibc Date: Sun, 10 Aug 2025 05:24:48 +0000 Subject: [PATCH 2/3] evaluation with aggregated scores --- development/RUNNING_EVALUATIONS.md | 80 +++++++++++ .../default_single_turn_rollout_process.py | 26 +++- eval_protocol/pytest/evaluation_test.py | 132 ++++++++++++++++-- eval_protocol/pytest/plugin.py | 23 +++ .../tests/test_evaluation.py | 5 +- 5 files changed, 254 insertions(+), 12 deletions(-) create mode 100644 development/RUNNING_EVALUATIONS.md diff 
--git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md new file mode 100644 index 00000000..4f1832a3 --- /dev/null +++ b/development/RUNNING_EVALUATIONS.md @@ -0,0 +1,80 @@ +# Running AIME/GPQA Evaluations in CI and Locally + +This guide explains how to run the AIME2025 and GPQA evaluations using the +pytest-based `evaluation_test` decorator, how to control dataset size and +concurrency, how to select effort presets, and how to print/persist results +for CI dashboards/artifacts. + +## Objectives +- Simple pass/fail: ensure evaluation configs don’t regress. +- Comparable metrics: capture aggregated accuracy across runs/rows. +- CI-friendly outputs: print summary lines to logs and save JSON artifacts. + +## Prerequisites +- `FIREWORKS_API_KEY` set in the environment +- Install SDK: `pip install -e .[dev]` + +## Controls +- Row limit + - Default `max_dataset_rows=2` in each test decorator for quick CI. + - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`. +- Concurrency + - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks). +- Repeats + - Set `num_runs` in the decorator (e.g., 4). +- Effort (Fireworks reasoning) + - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`. + - The default rollout forwards it via LiteLLM `extra_body`. 
+ +## Printing & Persisting Results +- Flags: + - `--ep-print-summary`: print concise summary lines at end of each eval + - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp +- Example GitHub Actions snippet: +```yaml +- name: Run AIME low effort (full) + run: | + cd python-sdk + pytest --ep-max-rows=all --ep-print-summary \ + --ep-summary-json=outputs/aime_low.json \ + -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q +- name: Upload AIME results + uses: actions/upload-artifact@v4 + with: + name: aime2025-low-summary + path: python-sdk/outputs/aime_low.json +``` + +## Examples +### AIME (Low Effort, Full, Repeats=4, Concurrency=4) +```bash +cd python-sdk +pytest --ep-max-rows=all --ep-print-summary \ + --ep-summary-json=outputs/aime_low.json \ + -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q +``` +Expected: +- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...` +- JSON artifact at `outputs/aime_low.json` +- For `.../gpt-oss-120b`, low-effort pass rate should be ~≥ 0.50 when repeated + +For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to +`rollout_input_params` in the test decorator and rerun with a different JSON path. + +### GPQA (Diamond, Low Effort) +```bash +cd python-sdk +pytest --ep-max-rows=all --ep-print-summary \ + --ep-summary-json=outputs/gpqa_low.json \ + -q examples/gpqa/tests/test_evaluation.py -q +``` +Adjust repeats/concurrency/effort in the test decorator similarly to AIME. + +## Pass/Fail Signals +- If `threshold_of_success` is set in a test, it will fail when aggregated score < threshold. +- Otherwise, printing and writing artifacts occur and the run succeeds for CI. + +## Tips +- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests. +- Upload JSON artifacts for dashboards and historical comparisons. 
+- Keep concurrency conservative (e.g., 4) to avoid rate limiting. diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 26023d75..b5c18809 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -2,6 +2,7 @@ from typing import List from litellm import acompletion +import litellm from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall from eval_protocol.dataset_logger import default_logger @@ -14,6 +15,15 @@ async def default_single_turn_rollout_processor( ) -> List[EvaluationRow]: """Generate a single response from any supported model provider using LiteLLM.""" + # Explicitly disable LiteLLM caching to avoid reused responses across runs + try: + litellm.cache = None + # Some versions expose a helper; ignore if unavailable + if hasattr(litellm, "disable_cache"): + litellm.disable_cache() # type: ignore[call-arg] + except Exception: + pass + async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row asynchronously.""" if len(row.messages) == 0: @@ -22,6 +32,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] request_params = {"model": config.model, "messages": messages_payload, **config.input_params} + # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body + # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}} + if "reasoning" in config.input_params: + request_params.setdefault("extra_body", {}) + request_params["extra_body"]["reasoning"] = config.input_params["reasoning"] if row.tools is not None: request_params["tools"] = row.tools @@ -57,8 +72,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: default_logger.log(row) return row - # Process all rows concurrently - tasks = 
[process_row(row) for row in rows] + # Process rows with bounded concurrency if configured + max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 + semaphore = asyncio.Semaphore(max_concurrent) + + async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: + async with semaphore: + return await process_row(r) + + tasks = [_sem_wrapper(row) for row in rows] dataset = list(await asyncio.gather(*tasks)) return dataset diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index a28bbca5..04da1f03 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -1,6 +1,8 @@ import inspect import os -import os +import copy +import math +import statistics from typing import Any, Callable, Dict, List, Optional import pytest @@ -91,11 +93,11 @@ def decorator( if mode == "pointwise": # Pointwise mode: function should accept messages and other row-level params if "row" not in sig.parameters: - raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'") + raise ValueError("In pointwise mode, your eval function must have a parameter named 'row'") # validate that "Row" is of type EvaluationRow if sig.parameters["row"].annotation is not EvaluationRow: - raise ValueError(f"In pointwise mode, the 'row' parameter must be of type EvaluationRow") + raise ValueError("In pointwise mode, the 'row' parameter must be of type EvaluationRow") # validate that the function has a return type of EvaluationRow if sig.return_annotation is not EvaluationRow: @@ -107,7 +109,7 @@ def decorator( # validate that "Rows" is of type List[EvaluationRow] if sig.parameters["rows"].annotation is not List[EvaluationRow]: - raise ValueError(f"In batch mode, the 'rows' parameter must be of type List[EvaluationRow]") + raise ValueError("In batch mode, the 'rows' parameter must be of type List[EvaluationRow]") # validate that the function has a return type of List[EvaluationRow] if 
sig.return_annotation is not List[EvaluationRow]: @@ -150,7 +152,13 @@ def generate_combinations(): combinations = [] # Handle optional parameters with defaults - datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None] # type: ignore + # Treat multiple dataset paths as a single combined dataset rather than + # parameterizing over each path separately. This produces one summary + # that reflects the aggregate of all provided files (e.g., AIME I+II). + if input_dataset is not None: + datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + else: + datasets = [None] params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided if input_messages is not None and isinstance(input_messages, list): @@ -222,7 +230,15 @@ def wrapper_body(**kwargs): # Handle dataset loading data: List[EvaluationRow] = [] if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: - data_jsonl = load_jsonl(kwargs["dataset_path"]) + ds_arg = kwargs["dataset_path"] + # Support either a single path or a list of paths; if a list is provided, + # concatenate the rows from each file in order. 
+ if isinstance(ds_arg, list): + data_jsonl = [] + for p in ds_arg: + data_jsonl.extend(load_jsonl(p)) + else: + data_jsonl = load_jsonl(ds_arg) # Apply env override for max rows if present effective_max_rows = _parse_ep_max_rows(max_dataset_rows) if effective_max_rows is not None: @@ -270,7 +286,7 @@ def wrapper_body(**kwargs): row.pid = os.getpid() default_logger.log(row) - # Now run the rollout processor with metadata-initialized data + # Prepare rollout processor config once; we will generate fresh outputs per run config = RolloutProcessorConfig( model=model_name, input_params=input_params, @@ -279,9 +295,12 @@ def wrapper_body(**kwargs): server_script_path=server_script_path, steps=steps, ) - input_dataset = execute_function(rollout_processor, rows=data, config=config) for _ in range(num_runs): + # Regenerate outputs each run by deep-copying the pristine dataset + # so model responses are not reused across runs. + fresh_rows = [copy.deepcopy(r) for r in data] + input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config) if mode == "pointwise": # Pointwise mode: apply the evaluator function to each row for row in input_dataset: @@ -323,6 +342,23 @@ def wrapper_body(**kwargs): scores = [r.evaluation_result.score for r in all_results if r.evaluation_result] agg_score = aggregate(scores, aggregation_method) + # Compute 95% confidence interval for mean aggregation + # TODO bchen: remove after Derek has his stuff + ci_low: float | None = None + ci_high: float | None = None + if aggregation_method == "mean": + n = len(scores) + if n >= 2: + try: + sample_std = statistics.stdev(scores) + se = sample_std / math.sqrt(n) + margin = 1.96 * se + ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None + ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None + except Exception: + ci_low = None + ci_high = None + # Determine if the evaluation passed based on threshold passed = None if 
threshold_of_success is not None: @@ -335,6 +371,86 @@ def wrapper_body(**kwargs): r.eval_metadata.passed = passed default_logger.log(r) + # Optional: print and/or persist a summary artifact for CI + try: + should_print = os.getenv("EP_PRINT_SUMMARY") == "1" + summary_path = os.getenv("EP_SUMMARY_JSON") + suite_name = test_func.__name__ + model_used = model_name + total_rows = len(all_results) + summary_obj = { + "suite": suite_name, + "model": model_used, + "agg_score": float(agg_score) if agg_score is not None else None, + "num_runs": num_runs, + "rows": total_rows, + } + if ci_low is not None and ci_high is not None: + summary_obj["agg_ci_low"] = ci_low + summary_obj["agg_ci_high"] = ci_high + + # Aggregate per-metric mean and 95% CI when available + metrics_summary: Dict[str, Dict[str, float]] = {} + from collections import defaultdict + metric_scores: Dict[str, list] = defaultdict(list) + for r in all_results: + if r.evaluation_result and r.evaluation_result.metrics: + for m_name, m_res in r.evaluation_result.metrics.items(): + if m_res is not None and getattr(m_res, "score", None) is not None: + metric_scores[m_name].append(m_res.score) + for m_name, vals in metric_scores.items(): + if len(vals) == 0: + continue + m_mean = sum(vals) / len(vals) + m_low = None + m_high = None + if len(vals) >= 2: + try: + m_std = statistics.stdev(vals) + m_se = m_std / math.sqrt(len(vals)) + m_margin = 1.96 * m_se + m_low = max(0.0, m_mean - m_margin) + m_high = min(1.0, m_mean + m_margin) + except Exception: + m_low = None + m_high = None + entry: Dict[str, float] = {"mean": float(m_mean)} + if m_low is not None and m_high is not None: + entry["ci_low"] = float(m_low) + entry["ci_high"] = float(m_high) + metrics_summary[m_name] = entry + if metrics_summary: + summary_obj["metrics_agg"] = metrics_summary + if should_print: + if ci_low is not None and ci_high is not None: + print( + f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} 
ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}" + ) + else: + print( + f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" + ) + # Print per-metric aggregations concisely (only names present) + if metrics_summary: + parts = [] + for m_name, entry in metrics_summary.items(): + if "ci_low" in entry and "ci_high" in entry: + parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]") + else: + parts.append(f"{m_name}={entry['mean']:.3f}") + print(f"EP Metrics | " + ", ".join(parts)) + if summary_path: + import json, pathlib, time + + p = pathlib.Path(summary_path) + p.parent.mkdir(parents=True, exist_ok=True) + summary_obj["timestamp"] = int(time.time()) + with p.open("w", encoding="utf-8") as f: + json.dump(summary_obj, f) + except Exception: + # Do not fail evaluation if summary writing fails + pass + # Check threshold after logging if threshold_of_success is not None and not passed: assert ( diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index e3a98128..da4fb7dd 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -29,6 +29,22 @@ def pytest_addoption(parser: pytest.Parser) -> None: "Pass an integer (e.g., 2, 50) or 'all' for no limit." ), ) + group.addoption( + "--ep-print-summary", + action="store_true", + default=False, + help=( + "Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test." + ), + ) + group.addoption( + "--ep-summary-json", + action="store", + default=None, + help=( + "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)." 
+ ), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: @@ -51,4 +67,11 @@ def pytest_configure(config: pytest.Config) -> None: if norm is not None: os.environ["EP_MAX_DATASET_ROWS"] = norm + if config.getoption("--ep-print-summary"): + os.environ["EP_PRINT_SUMMARY"] = "1" + + summary_json_path = config.getoption("--ep-summary-json") + if summary_json_path: + os.environ["EP_SUMMARY_JSON"] = summary_json_path + diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py index 0ef42ffd..261309d0 100644 --- a/examples/aime2025_chat_completion/tests/test_evaluation.py +++ b/examples/aime2025_chat_completion/tests/test_evaluation.py @@ -61,12 +61,13 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, - rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}, {}, {"extra_body": {"reasoning_effort": "high"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", threshold_of_success=None, - num_runs=1, + num_runs=2, max_dataset_rows=2, + max_concurrent_rollouts=4, mode="pointwise", ) def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow: From 613d8d1b75419cfdb4252febdec82a772e1a9607 Mon Sep 17 00:00:00 2001 From: benjibc Date: Sun, 10 Aug 2025 17:04:50 +0000 Subject: [PATCH 3/3] fixed per comments --- development/RUNNING_EVALUATIONS.md | 80 ---------- eval_protocol/common_utils.py | 6 +- .../default_single_turn_rollout_process.py | 28 ++-- eval_protocol/pytest/evaluation_test.py | 142 +++++++++++++----- eval_protocol/pytest/plugin.py | 75 ++++++++- eval_protocol/stats/__init__.py | 5 + eval_protocol/stats/confidence_intervals.py | 116 ++++++++++++++ .../tests/test_evaluation.py | 3 +- 
examples/gpqa/tests/test_evaluation.py | 33 ++-- vendor/tau2/utils/llm_utils.py | 15 ++ 10 files changed, 355 insertions(+), 148 deletions(-) delete mode 100644 development/RUNNING_EVALUATIONS.md create mode 100644 eval_protocol/stats/__init__.py create mode 100644 eval_protocol/stats/confidence_intervals.py diff --git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md deleted file mode 100644 index 4f1832a3..00000000 --- a/development/RUNNING_EVALUATIONS.md +++ /dev/null @@ -1,80 +0,0 @@ -# Running AIME/GPQA Evaluations in CI and Locally - -This guide explains how to run the AIME2025 and GPQA evaluations using the -pytest-based `evaluation_test` decorator, how to control dataset size and -concurrency, how to select effort presets, and how to print/persist results -for CI dashboards/artifacts. - -## Objectives -- Simple pass/fail: ensure evaluation configs don’t regress. -- Comparable metrics: capture aggregated accuracy across runs/rows. -- CI-friendly outputs: print summary lines to logs and save JSON artifacts. - -## Prerequisites -- `FIREWORKS_API_KEY` set in the environment -- Install SDK: `pip install -e .[dev]` - -## Controls -- Row limit - - Default `max_dataset_rows=2` in each test decorator for quick CI. - - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`. -- Concurrency - - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks). -- Repeats - - Set `num_runs` in the decorator (e.g., 4). -- Effort (Fireworks reasoning) - - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`. - - The default rollout forwards it via LiteLLM `extra_body`. 
- -## Printing & Persisting Results -- Flags: - - `--ep-print-summary`: print concise summary lines at end of each eval - - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp -- Example GitHub Actions snippet: -```yaml -- name: Run AIME low effort (full) - run: | - cd python-sdk - pytest --ep-max-rows=all --ep-print-summary \ - --ep-summary-json=outputs/aime_low.json \ - -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q -- name: Upload AIME results - uses: actions/upload-artifact@v4 - with: - name: aime2025-low-summary - path: python-sdk/outputs/aime_low.json -``` - -## Examples -### AIME (Low Effort, Full, Repeats=4, Concurrency=4) -```bash -cd python-sdk -pytest --ep-max-rows=all --ep-print-summary \ - --ep-summary-json=outputs/aime_low.json \ - -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q -``` -Expected: -- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...` -- JSON artifact at `outputs/aime_low.json` -- For `.../gpt-oss-120b`, low-effort pass rate should be ~≥ 0.50 when repeated - -For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to -`rollout_input_params` in the test decorator and rerun with a different JSON path. - -### GPQA (Diamond, Low Effort) -```bash -cd python-sdk -pytest --ep-max-rows=all --ep-print-summary \ - --ep-summary-json=outputs/gpqa_low.json \ - -q examples/gpqa/tests/test_evaluation.py -q -``` -Adjust repeats/concurrency/effort in the test decorator similarly to AIME. - -## Pass/Fail Signals -- If `threshold_of_success` is set in a test, it will fail when aggregated score < threshold. -- Otherwise, printing and writing artifacts occur and the run succeeds for CI. - -## Tips -- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests. -- Upload JSON artifacts for dashboards and historical comparisons. 
-- Keep concurrency conservative (e.g., 4) to avoid rate limiting. diff --git a/eval_protocol/common_utils.py b/eval_protocol/common_utils.py index 42ad47ad..9b9032ab 100644 --- a/eval_protocol/common_utils.py +++ b/eval_protocol/common_utils.py @@ -14,7 +14,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: Returns: A list of dictionaries, where each dictionary is a parsed JSON object from a line. - Returns an empty list if the file is not found or if errors occur during parsing. + Returns an empty list if the file is not found or if errors occur during parsing. Supports HTTP urls and local file paths. """ data: List[Dict[str, Any]] = [] if file_path.startswith("http://") or file_path.startswith("https://"): @@ -33,7 +33,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: row_id_index = stripped.find("row_id") if row_id_index != -1: row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:]) - raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") + raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e raise e else: with open(file_path, "r", encoding="utf-8") as f: @@ -50,6 +50,6 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: row_id_index = line.find("row_id") if row_id_index != -1: row_id = re.search(r'"row_id": (.*),', line[row_id_index:]) - raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") + raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e raise e return data diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index b5c18809..23c8619e 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -1,12 +1,11 @@ import asyncio from typing import List -from litellm import acompletion -import litellm -from openai.types.chat.chat_completion_message import 
ChatCompletionMessageToolCall +import logging +import os from eval_protocol.dataset_logger import default_logger -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall from eval_protocol.pytest.types import RolloutProcessorConfig @@ -15,15 +14,20 @@ async def default_single_turn_rollout_processor( ) -> List[EvaluationRow]: """Generate a single response from any supported model provider using LiteLLM.""" - # Explicitly disable LiteLLM caching to avoid reused responses across runs + # Quiet LiteLLM logs in test runs unless user overrode try: - litellm.cache = None - # Some versions expose a helper; ignore if unavailable - if hasattr(litellm, "disable_cache"): - litellm.disable_cache() # type: ignore[call-arg] + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) except Exception: pass + # Do not modify global LiteLLM cache. Disable caching per-request instead. 
+ async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row asynchronously.""" if len(row.messages) == 0: @@ -32,6 +36,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] request_params = {"model": config.model, "messages": messages_payload, **config.input_params} + # Ensure caching is disabled only for this request (review feedback) + request_params["cache"] = {"no-cache": True} # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}} if "reasoning" in config.input_params: @@ -41,6 +47,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if row.tools is not None: request_params["tools"] = row.tools + # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet + import importlib + _litellm = importlib.import_module("litellm") + acompletion = getattr(_litellm, "acompletion") response = await acompletion(**request_params) assistant_content = response.choices[0].message.content or "" diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 04da1f03..937f4e4c 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -8,7 +8,7 @@ import pytest from eval_protocol.dataset_logger import default_logger -from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata +from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor from eval_protocol.pytest.types import ( @@ -31,6 +31,7 @@ ) from ..common_utils import load_jsonl +from eval_protocol.stats.confidence_intervals import 
compute_fixed_set_mu_ci def evaluation_test( @@ -51,6 +52,7 @@ def evaluation_test( server_script_path: Optional[str] = None, steps: int = 30, mode: EvaluationTestMode = "batch", + combine_datasets: bool = True, ) -> Callable[ [TestFunction], TestFunction, @@ -148,25 +150,44 @@ def _parse_ep_max_rows(default_value: int | None) -> int | None: except ValueError: return default_value + def _deep_update_dict(base: dict, override: dict) -> dict: + """Recursively update nested dictionaries in-place and return base.""" + for key, value in override.items(): + if isinstance(value, dict) and isinstance(base.get(key), dict): + _deep_update_dict(base[key], value) + else: + base[key] = value + return base + def generate_combinations(): combinations = [] # Handle optional parameters with defaults - # Treat multiple dataset paths as a single combined dataset rather than - # parameterizing over each path separately. This produces one summary - # that reflects the aggregate of all provided files (e.g., AIME I+II). + # Optionally combine multiple dataset paths into one logical dataset, + # or parameterize to run one dataset per test invocation. if input_dataset is not None: - datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + if combine_datasets: + datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + else: + # Fan out: one dataset path per parameterization + if isinstance(input_dataset, list): # type: ignore + datasets = [[p] for p in input_dataset] # type: ignore + else: + datasets = [[input_dataset]] # type: ignore else: datasets = [None] params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore - # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided + # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over + # each row. 
Instead, pass the entire sliced list through in a single test run + # so summaries aggregate all rows together (AIME-style behavior). if input_messages is not None and isinstance(input_messages, list): effective_max_rows = _parse_ep_max_rows(max_dataset_rows) if effective_max_rows is not None: - messages: List[Optional[InputMessagesParam]] = input_messages[:effective_max_rows] # type: ignore + sliced_messages = input_messages[:effective_max_rows] # type: ignore else: - messages = input_messages # type: ignore + sliced_messages = input_messages # type: ignore + # Wrap as a single parameter payload + messages = [sliced_messages] # type: ignore else: messages = [None] # type: ignore kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore @@ -245,11 +266,30 @@ def wrapper_body(**kwargs): data_jsonl = data_jsonl[:effective_max_rows] data = dataset_adapter(data_jsonl) elif "input_messages" in kwargs and kwargs["input_messages"] is not None: - data: List[EvaluationRow] = [EvaluationRow(messages=kwargs["input_messages"])] + # Support either a single row (List[Message]) or many rows (List[List[Message]]) + im = kwargs["input_messages"] + if isinstance(im, list) and len(im) > 0 and isinstance(im[0], Message): + # Single row of Message objects + data = [EvaluationRow(messages=im)] + else: + # Multiple rows: list of List[Message] + data = [EvaluationRow(messages=m) for m in im] else: raise ValueError("No input dataset or input messages provided") input_params = kwargs.get("input_params") or {} + # Optional global overrides via environment for ad-hoc experimentation + # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged + # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}'). 
+ try: + import json as _json + _env_override = os.getenv("EP_INPUT_PARAMS_JSON") + if _env_override: + override_obj = _json.loads(_env_override) + if isinstance(override_obj, dict): + input_params = _deep_update_dict(dict(input_params), override_obj) + except Exception: + pass # Create eval metadata with test function info and current commit hash eval_metadata = EvalMetadata( @@ -342,22 +382,20 @@ def wrapper_body(**kwargs): scores = [r.evaluation_result.score for r in all_results if r.evaluation_result] agg_score = aggregate(scores, aggregation_method) - # Compute 95% confidence interval for mean aggregation - # TODO bchen: remove after Derek has his stuff + # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats) ci_low: float | None = None ci_high: float | None = None if aggregation_method == "mean": - n = len(scores) - if n >= 2: - try: - sample_std = statistics.stdev(scores) - se = sample_std / math.sqrt(n) - margin = 1.96 * se - ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None - ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None - except Exception: - ci_low = None - ci_high = None + try: + result_ci = compute_fixed_set_mu_ci(all_results) + mu_ci_low, mu_ci_high = result_ci[1], result_ci[2] + if mu_ci_low is not None and mu_ci_high is not None: + ci_low = float(mu_ci_low) + ci_high = float(mu_ci_high) + # Keep agg_score as-is (mean over scores). For equal repeats per question these match. 
+ except Exception: + ci_low = None + ci_high = None # Determine if the evaluation passed based on threshold passed = None @@ -430,22 +468,56 @@ def wrapper_body(**kwargs): print( f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" ) - # Print per-metric aggregations concisely (only names present) - if metrics_summary: - parts = [] - for m_name, entry in metrics_summary.items(): - if "ci_low" in entry and "ci_high" in entry: - parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]") - else: - parts.append(f"{m_name}={entry['mean']:.3f}") - print(f"EP Metrics | " + ", ".join(parts)) + # As per project convention, avoid printing per-metric CI lines to reduce noise if summary_path: - import json, pathlib, time + import json, pathlib, time, re + + def _sanitize_filename(text: str) -> str: + safe = re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) + return safe[:120] + + def _extract_effort_tag(params: dict) -> str | None: + try: + if not isinstance(params, dict): + return None + # Common locations + if "extra_body" in params and isinstance(params["extra_body"], dict): + eb = params["extra_body"] + if isinstance(eb.get("reasoning"), dict) and "effort" in eb["reasoning"]: + return str(eb["reasoning"]["effort"]).lower() + if "reasoning_effort" in eb: + return str(eb["reasoning_effort"]).lower() + if "reasoning" in params and isinstance(params["reasoning"], dict) and "effort" in params["reasoning"]: + return str(params["reasoning"]["effort"]).lower() + except Exception: + return None + return None + + model_slug = _sanitize_filename(model_used) + effort_tag = _extract_effort_tag(input_params) or "" + effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else "" + base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json" p = pathlib.Path(summary_path) - p.parent.mkdir(parents=True, exist_ok=True) 
summary_obj["timestamp"] = int(time.time()) - with p.open("w", encoding="utf-8") as f: + + # When a directory is provided (or a path without .json), write per-combination files inside it + if p.suffix.lower() != ".json" or summary_path.endswith("/") or p.is_dir(): + out_dir = p + out_dir.mkdir(parents=True, exist_ok=True) + out_file = out_dir / base_name + else: + # A file path was provided + # If multiple parameterizations exist, write side-by-side files with suffixes based on base name + parent = p.parent + parent.mkdir(parents=True, exist_ok=True) + # If we detected an effort tag, fan out to separate files; otherwise write to the exact file + if effort_tag: + out_file = parent / f"{p.stem}__{_sanitize_filename(effort_tag)}{p.suffix}" + else: + out_file = p + + with open(out_file, "w", encoding="utf-8") as f: json.dump(summary_obj, f) except Exception: # Do not fail evaluation if summary writing fails @@ -457,7 +529,7 @@ def wrapper_body(**kwargs): agg_score >= threshold_of_success ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}" - except Exception as e: + except Exception: # Update eval metadata status to error and log it if eval_metadata is not None: eval_metadata.status = "error" diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index da4fb7dd..5eb9a946 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -13,12 +13,11 @@ """ import os +import logging from typing import Optional -import pytest - -def pytest_addoption(parser: pytest.Parser) -> None: +def pytest_addoption(parser) -> None: group = parser.getgroup("eval-protocol") group.addoption( "--ep-max-rows", @@ -45,6 +44,25 @@ def pytest_addoption(parser: pytest.Parser) -> None: "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)." ), ) + group.addoption( + "--ep-input-param", + action="append", + default=None, + help=( + "Override rollout input parameters. Can be used multiple times. 
" + "Format: key=value or JSON via @path.json. Examples: " + "--ep-input-param temperature=0 --ep-input-param @params.json" + ), + ) + group.addoption( + "--ep-reasoning-effort", + action="store", + default=None, + help=( + "Set reasoning.effort for providers that support it (e.g., Fireworks) via LiteLLM extra_body. " + "Values: low|medium|high" + ), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: @@ -61,7 +79,19 @@ def _normalize_max_rows(val: Optional[str]) -> Optional[str]: return None -def pytest_configure(config: pytest.Config) -> None: +def pytest_configure(config) -> None: + # Quiet LiteLLM INFO spam early in pytest session unless user set a level + try: + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) + except Exception: + pass + cli_val = config.getoption("--ep-max-rows") norm = _normalize_max_rows(cli_val) if norm is not None: @@ -74,4 +104,41 @@ def pytest_configure(config: pytest.Config) -> None: if summary_json_path: os.environ["EP_SUMMARY_JSON"] = summary_json_path + # Allow ad-hoc overrides of input params via CLI flags + try: + import json as _json + import pathlib as _pathlib + merged: dict = {} + input_params_opts = config.getoption("--ep-input-param") + if input_params_opts: + for opt in input_params_opts: + if opt is None: + continue + opt = str(opt) + if opt.startswith("@"): # load JSON file + p = _pathlib.Path(opt[1:]) + if p.is_file(): + with open(p, "r", encoding="utf-8") as f: + obj = _json.load(f) + if isinstance(obj, dict): + merged.update(obj) + elif "=" in opt: + k, v = opt.split("=", 1) + # Try parse JSON values, fallback to string + try: + merged[k] = _json.loads(v) + except Exception: + merged[k] = v + reasoning_effort = config.getoption("--ep-reasoning-effort") + if reasoning_effort: + # Standardize into 
extra_body.reasoning.effort in EP_INPUT_PARAMS_JSON + eb = merged.setdefault("extra_body", {}) + reasoning = eb.setdefault("reasoning", {}) + reasoning["effort"] = str(reasoning_effort) + if merged: + os.environ["EP_INPUT_PARAMS_JSON"] = _json.dumps(merged) + except Exception: + # best effort, do not crash pytest session + pass + diff --git a/eval_protocol/stats/__init__.py b/eval_protocol/stats/__init__.py new file mode 100644 index 00000000..c327d2ed --- /dev/null +++ b/eval_protocol/stats/__init__.py @@ -0,0 +1,5 @@ +"""Statistical utilities for evaluation reporting (confidence intervals, etc.).""" + +from .confidence_intervals import compute_fixed_set_mu_ci # re-export + + diff --git a/eval_protocol/stats/confidence_intervals.py b/eval_protocol/stats/confidence_intervals.py new file mode 100644 index 00000000..bf78934c --- /dev/null +++ b/eval_protocol/stats/confidence_intervals.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import math +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +from ..models import EvaluationRow + + +def _default_question_id(row: EvaluationRow) -> str: + """Best-effort stable question identifier across repeats. + + Prefers `row.input_metadata.row_id` (which is set once and preserved across deep copies), + and falls back to the last user message content when not available. 
+ """ + # Prefer explicit row_id if present + try: + if row.input_metadata is not None and getattr(row.input_metadata, "row_id", None): + return str(row.input_metadata.row_id) + except Exception: + pass + + # Fallback: use last user message content + try: + user_msgs = [m.content for m in row.messages if getattr(m, "role", None) == "user"] + if user_msgs and user_msgs[-1]: + return str(user_msgs[-1]) + except Exception: + pass + + # Final fallback: use Python id for uniqueness within this process + return f"row-{id(row)}" + + +def compute_fixed_set_mu_ci( + rows: List[EvaluationRow], + *, + z_value: float = 1.96, +) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set. + + This treats questions/items as fixed and repeats as within-item Bernoulli draws. + For each question i with m_i repeats and s_i successes, the per-question mean is + ybar_i = s_i / m_i. The estimator of μ is the average of per-question means: + mu_hat = (1/Q) * sum_i ybar_i. + + The plug-in standard error for the CI of μ uses the within-item variances only: + Var(mu_hat) ≈ (1/Q^2) * sum_i [ ybar_i (1 - ybar_i) / (m_i - 1) ], for m_i >= 2. + + Notes: + - When m_i == 1, the unbiased correction is undefined. In that case we fall back to + ybar_i (1 - ybar_i) / m_i as a conservative estimate. GPQA typically has m_i >= 2. + - Scores are taken from `row.evaluation_result.score` when available and numeric. + + Returns: + (mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data. 
+ """ + if not rows: + return None, None, None + + # Group scores by question id + question_to_scores: Dict[str, List[float]] = defaultdict(list) + for r in rows: + try: + er = getattr(r, "evaluation_result", None) + if er is None: + continue + score = getattr(er, "score", None) + if score is None: + continue + # Ensure numeric float + s_val = float(score) + if math.isnan(s_val): + continue + qid = _default_question_id(r) + question_to_scores[qid].append(s_val) + except Exception: + # Skip malformed rows + continue + + Q = len(question_to_scores) + if Q == 0: + return None, None, None + + # Compute per-question means and the plug-in variance contribution + ybars: List[float] = [] + var_terms: List[float] = [] + for scores in question_to_scores.values(): + m_i = len(scores) + if m_i == 0: + continue + ybar_i = sum(scores) / m_i + ybars.append(ybar_i) + # Unbiased within-item variance estimate for Bernoulli mean + if m_i >= 2: + var_terms.append(ybar_i * (1.0 - ybar_i) / (m_i - 1)) + else: + # Conservative fallback when only a single repeat exists + var_terms.append(ybar_i * (1.0 - ybar_i) / m_i) + + if not ybars: + return None, None, None + + mu_hat = sum(ybars) / len(ybars) + + # Standard error for CI of μ + se_sq = sum(var_terms) / (Q * Q) + se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0 + + margin = z_value * se + ci_low = max(0.0, mu_hat - margin) + ci_high = min(1.0, mu_hat + margin) + + return float(mu_hat), float(ci_low), float(ci_high) + + diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py index 261309d0..7558dab1 100644 --- a/examples/aime2025_chat_completion/tests/test_evaluation.py +++ b/examples/aime2025_chat_completion/tests/test_evaluation.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List +import os from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( @@ -61,7 +62,7 
@@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, - rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}, {}, {"extra_body": {"reasoning_effort": "high"}}], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", threshold_of_success=None, diff --git a/examples/gpqa/tests/test_evaluation.py b/examples/gpqa/tests/test_evaluation.py index 42c3c91b..79863fc0 100644 --- a/examples/gpqa/tests/test_evaluation.py +++ b/examples/gpqa/tests/test_evaluation.py @@ -1,6 +1,9 @@ -from typing import Any, Dict, List +from typing import List +import csv +import io import re +import requests from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.evaluation_test import evaluation_test @@ -22,20 +25,18 @@ def extract_abcd_letter(text: str) -> str | None: return m.group(1) if m else None -def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Message]]: +def _load_gpqa_messages_from_csv() -> List[List[Message]]: """ - Load GPQA (diamond) from the reference blob CSV and construct prompts. - For full dataset, call with max_samples=None. + Load GPQA (diamond) from the reference CSV and construct prompts. + No built-in row limit; use --ep-max-rows to control how many are evaluated. 
""" - from datasets import load_dataset # type: ignore - url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv" - ds = load_dataset("csv", data_files=url, split="train") + resp = requests.get(url, timeout=60) + resp.raise_for_status() + messages_list: List[List[Message]] = [] - # We will store the correct letter in a trailing system message for lookup (not given to the model) - for ex in ds: - if max_samples is not None and len(messages_list) >= max_samples: - break + reader = csv.DictReader(io.StringIO(resp.text)) + for ex in reader: q = str(ex.get("Question", "")) correct = str(ex.get("Correct Answer", "")).strip() inc1 = str(ex.get("Incorrect Answer 1", "")) @@ -49,7 +50,8 @@ def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Messa [ Message(role="system", content=SYSTEM_PROMPT), Message(role="user", content=user_content), - Message(role="system", content=f"__GT__:A"), + # Correct answer is always option A by construction + Message(role="system", content="__GT__:A"), ] ) if not messages_list: @@ -57,18 +59,17 @@ def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Messa return messages_list -_GPQA_INPUT_MESSAGES = _build_gpqa_messages_from_hf(max_samples=2) +_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv() @evaluation_test( model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_messages=_GPQA_INPUT_MESSAGES, - rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], # default to low effort; override via CLI plugin rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", threshold_of_success=None, - num_runs=1, - max_dataset_rows=2, + num_runs=8, mode="pointwise", ) def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow: diff --git a/vendor/tau2/utils/llm_utils.py b/vendor/tau2/utils/llm_utils.py index 98ae698c..8f46503a 100644 --- a/vendor/tau2/utils/llm_utils.py 
+++ b/vendor/tau2/utils/llm_utils.py @@ -7,6 +7,8 @@ from litellm.caching.caching import Cache from litellm.main import ModelResponse, Usage from loguru import logger +import logging +import os from vendor.tau2.config import ( DEFAULT_LLM_CACHE_TYPE, @@ -74,6 +76,19 @@ if not ALLOW_SONNET_THINKING: logger.warning("Sonnet thinking is disabled") +# Quiet LiteLLM's own INFO logs unless the user explicitly set a level +try: + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) +except Exception: + # Best-effort; never fail import due to logging config + pass + def _parse_ft_model_name(model: str) -> str: """