diff --git a/eval_protocol/benchmarks/registry.py b/eval_protocol/benchmarks/registry.py index 31840fd1..5cb181fb 100644 --- a/eval_protocol/benchmarks/registry.py +++ b/eval_protocol/benchmarks/registry.py @@ -12,7 +12,7 @@ from eval_protocol.benchmarks.registry import export_benchmark - @export_benchmark("aime25_low") + @export_benchmark("aime25") @evaluation_test(...) def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow: ... @@ -20,7 +20,7 @@ def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow: Programmatic run: from eval_protocol.benchmarks.registry import get_benchmark_runner - get_benchmark_runner("aime25_low")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json") + get_benchmark_runner("aime25")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json") """ from __future__ import annotations diff --git a/eval_protocol/benchmarks/run.py b/eval_protocol/benchmarks/run.py index 9195666f..9e8e293d 100644 --- a/eval_protocol/benchmarks/run.py +++ b/eval_protocol/benchmarks/run.py @@ -3,10 +3,10 @@ Usage: - python -m eval_protocol.benchmarks.run aime25_low \ + python -m eval_protocol.benchmarks.run aime25 \ --model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \ --print-summary \ - --out artifacts/aime25_low.json \ + --out artifacts/aime25.json \ --max-rows 50 \ --reasoning-effort low """ @@ -14,7 +14,6 @@ from __future__ import annotations import argparse -from typing import Any from importlib import import_module import pkgutil @@ -60,7 +59,7 @@ def main() -> int: # Fallback: if nothing registered yet and a known suite was requested, try explicit import if not list_benchmarks(): known_map = { - "aime25_low": "eval_protocol.benchmarks.suites.aime25", + "aime25": "eval_protocol.benchmarks.suites.aime25", } forced = known_map.get(args.name) if forced: @@ -73,7 +72,7 @@ def main() -> int: if args.max_rows is not None: try: max_rows = int(args.max_rows) - except Exception: + except ValueError: max_rows = str(args.max_rows) # Build input params override if needed ip_override = {} diff --git a/examples/aime2025_chat_completion/README.md b/examples/aime2025_chat_completion/README.md deleted file mode 100644 index dbe79527..00000000 --- a/examples/aime2025_chat_completion/README.md +++ /dev/null @@ -1,24 +0,0 @@ -## AIME2025 Chat Completion Example - -This example reproduces gpt-oss's AIME2025 chat completion evaluation inside Eval Protocol. - -### What it does -- Loads AIME2025 questions from Hugging Face -- Prompts a reasoning-capable chat-completions model -- Extracts the final integer answer from \boxed{...} -- Scores exact-match vs. the ground-truth integer - -### Quick run (pytest, CI-friendly) -The evaluation is implemented as a pytest `evaluation_test` under `tests/`. Run it directly: - -```bash -pytest -q examples/aime2025_chat_completion/tests/test_evaluation.py -q -``` - -Environment variables expected: -- `FIREWORKS_API_KEY` - -To scale up, adjust parameters in the decorator (e.g., `threshold_of_success`, `max_dataset_rows`). - - - diff --git a/examples/aime2025_chat_completion/__init__.py b/examples/aime2025_chat_completion/__init__.py deleted file mode 100644 index 8bcaacfb..00000000 --- a/examples/aime2025_chat_completion/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -__all__ = ["main"] - - - diff --git a/examples/aime2025_chat_completion/main.py b/examples/aime2025_chat_completion/main.py deleted file mode 100644 index 92c6dd83..00000000 --- a/examples/aime2025_chat_completion/main.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -Eval Protocol example: AIME2025 chat completion evaluation - -This example mirrors gpt-oss's AIME 2025 evaluation using OpenAI-compatible -chat completions. It evaluates whether the assistant's final answer matches the -ground-truth integer, extracting answers from \\boxed{...} or fallback digits. -""" - -import re -from typing import Any, Dict, List, Optional, Union - -from eval_protocol import EvaluateResult, MetricResult, reward_function -from eval_protocol.models import Message - - -def _extract_boxed_text(text: str) -> str: - """ - Extract the last occurrence of a boxed answer (\\boxed{...} or \\framebox{...}). - If none found, fall back to the last integer found in the text. - """ - if not text: - return "" - - pattern_boxed = r"boxed{(.*?)}|framebox{(.*?)}" - matches = re.findall(pattern_boxed, text, re.DOTALL) - if matches: - # Iterate from the end to prioritize the final boxed answer - for match in matches[::-1]: - for group in match: - if group: - return group.split(",")[-1].strip() - - # Fallback: last integer in the text - matches_digits = re.findall(r"\d+", text, re.DOTALL) - if matches_digits: - return matches_digits[-1] - return "" - - -def _normalize_to_int_or_none(s: str) -> Optional[int]: - if s is None: - return None - # Only take leading digits - m = re.match(r"\d+", str(s).strip()) - if not m: - return None - try: - return int(m.group(0)) - except ValueError: - return None - - -@reward_function(id="aime2025_exact_match") -def evaluate( - messages: Union[List[Message], List[Dict[str, Any]]], - ground_truth: Optional[str] = None, - **kwargs, -) -> EvaluateResult: - """ - Score 1.0 if extracted final answer equals the ground-truth integer, else 0.0. - """ - if not messages: - return EvaluateResult( - score=0.0, - reason="No messages provided", - is_score_valid=False, - metrics={ - "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages") - }, - ) - - last_msg = messages[-1] - content = last_msg["content"] if isinstance(last_msg, dict) else (last_msg.content or "") - - extracted_text = _extract_boxed_text(content) - extracted_int = _normalize_to_int_or_none(extracted_text) - gt_int = _normalize_to_int_or_none(ground_truth if ground_truth is not None else "") - - is_valid = extracted_int is not None and gt_int is not None - score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0 - - metrics: Dict[str, MetricResult] = { - "exact_match": MetricResult( - score=score, - is_score_valid=is_valid, - reason=( - "Parsed both integers and they matched" - if score == 1.0 - else ( - "Parsed integers did not match" - if is_valid - else "Failed to parse integer from prediction or ground truth" - ) - ), - data={ - "extracted_text": extracted_text, - "extracted_int": extracted_int, - "ground_truth_int": gt_int, - }, - ) - } - - return EvaluateResult( - score=score, - reason=("Answer correct" if score == 1.0 else "Answer incorrect"), - is_score_valid=is_valid, - metrics=metrics, - ) - -