Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions eval_protocol/benchmarks/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@

from eval_protocol.benchmarks.registry import export_benchmark

@export_benchmark("aime25_low")
@export_benchmark("aime25")
@evaluation_test(...)
def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
...

Programmatic run:

from eval_protocol.benchmarks.registry import get_benchmark_runner
get_benchmark_runner("aime25_low")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
get_benchmark_runner("aime25")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
"""

from __future__ import annotations
Expand Down
9 changes: 4 additions & 5 deletions eval_protocol/benchmarks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,17 @@

Usage:

python -m eval_protocol.benchmarks.run aime25_low \
python -m eval_protocol.benchmarks.run aime25 \
--model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \
--print-summary \
--out artifacts/aime25_low.json \
--out artifacts/aime25.json \
--max-rows 50 \
--reasoning-effort low
"""

from __future__ import annotations

import argparse
from typing import Any

from importlib import import_module
import pkgutil
Expand Down Expand Up @@ -60,7 +59,7 @@ def main() -> int:
# Fallback: if nothing registered yet and a known suite was requested, try explicit import
if not list_benchmarks():
known_map = {
"aime25_low": "eval_protocol.benchmarks.suites.aime25",
"aime25": "eval_protocol.benchmarks.suites.aime25",
}
forced = known_map.get(args.name)
if forced:
Expand All @@ -73,7 +72,7 @@ def main() -> int:
if args.max_rows is not None:
try:
max_rows = int(args.max_rows)
except Exception:
except ValueError:
max_rows = str(args.max_rows)
# Build input params override if needed
ip_override = {}
Expand Down
24 changes: 0 additions & 24 deletions examples/aime2025_chat_completion/README.md

This file was deleted.

4 changes: 0 additions & 4 deletions examples/aime2025_chat_completion/__init__.py

This file was deleted.

110 changes: 0 additions & 110 deletions examples/aime2025_chat_completion/main.py

This file was deleted.

Loading