diff --git a/eval_protocol/benchmarks/registry.py b/eval_protocol/benchmarks/registry.py
index 31840fd1..5cb181fb 100644
--- a/eval_protocol/benchmarks/registry.py
+++ b/eval_protocol/benchmarks/registry.py
@@ -12,7 +12,7 @@
 
     from eval_protocol.benchmarks.registry import export_benchmark
 
-    @export_benchmark("aime25_low")
+    @export_benchmark("aime25")
     @evaluation_test(...)
     def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
         ...
@@ -20,7 +20,7 @@ def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
 Programmatic run:
 
     from eval_protocol.benchmarks.registry import get_benchmark_runner
-    get_benchmark_runner("aime25_low")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
+    get_benchmark_runner("aime25")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
 """
 
 from __future__ import annotations
diff --git a/eval_protocol/benchmarks/run.py b/eval_protocol/benchmarks/run.py
index 9195666f..9e8e293d 100644
--- a/eval_protocol/benchmarks/run.py
+++ b/eval_protocol/benchmarks/run.py
@@ -3,10 +3,10 @@
 
 Usage:
 
-  python -m eval_protocol.benchmarks.run aime25_low \
+  python -m eval_protocol.benchmarks.run aime25 \
     --model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \
     --print-summary \
-    --out artifacts/aime25_low.json \
+    --out artifacts/aime25.json \
     --max-rows 50 \
     --reasoning-effort low
 """
@@ -14,7 +14,6 @@
 from __future__ import annotations
 
 import argparse
-from typing import Any
 
 from importlib import import_module
 import pkgutil
@@ -60,7 +59,7 @@ def main() -> int:
     # Fallback: if nothing registered yet and a known suite was requested, try explicit import
     if not list_benchmarks():
         known_map = {
-            "aime25_low": "eval_protocol.benchmarks.suites.aime25",
+            "aime25": "eval_protocol.benchmarks.suites.aime25",
         }
         forced = known_map.get(args.name)
         if forced:
@@ -73,7 +72,7 @@ def main() -> int:
     if args.max_rows is not None:
         try:
             max_rows = int(args.max_rows)
-        except Exception:
+        except ValueError:
             max_rows = str(args.max_rows)
     # Build input params override if needed
     ip_override = {}
diff --git a/examples/aime2025_chat_completion/README.md b/examples/aime2025_chat_completion/README.md
deleted file mode 100644
index dbe79527..00000000
--- a/examples/aime2025_chat_completion/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-## AIME2025 Chat Completion Example
-
-This example reproduces gpt-oss's AIME2025 chat completion evaluation inside Eval Protocol.
-
-### What it does
-- Loads AIME2025 questions from Hugging Face
-- Prompts a reasoning-capable chat-completions model
-- Extracts the final integer answer from \boxed{...}
-- Scores exact-match vs. the ground-truth integer
-
-### Quick run (pytest, CI-friendly)
-The evaluation is implemented as a pytest `evaluation_test` under `tests/`. Run it directly:
-
-```bash
-pytest -q examples/aime2025_chat_completion/tests/test_evaluation.py -q
-```
-
-Environment variables expected:
-- `FIREWORKS_API_KEY`
-
-To scale up, adjust parameters in the decorator (e.g., `threshold_of_success`, `max_dataset_rows`).
-
-
-
diff --git a/examples/aime2025_chat_completion/__init__.py b/examples/aime2025_chat_completion/__init__.py
deleted file mode 100644
index 8bcaacfb..00000000
--- a/examples/aime2025_chat_completion/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-__all__ = ["main"]
-
-
-
diff --git a/examples/aime2025_chat_completion/main.py b/examples/aime2025_chat_completion/main.py
deleted file mode 100644
index 92c6dd83..00000000
--- a/examples/aime2025_chat_completion/main.py
+++ /dev/null
@@ -1,110 +0,0 @@
-"""
-Eval Protocol example: AIME2025 chat completion evaluation
-
-This example mirrors gpt-oss's AIME 2025 evaluation using OpenAI-compatible
-chat completions. It evaluates whether the assistant's final answer matches the
-ground-truth integer, extracting answers from \\boxed{...} or fallback digits.
-"""
-
-import re
-from typing import Any, Dict, List, Optional, Union
-
-from eval_protocol import EvaluateResult, MetricResult, reward_function
-from eval_protocol.models import Message
-
-
-def _extract_boxed_text(text: str) -> str:
-    """
-    Extract the last occurrence of a boxed answer (\\boxed{...} or \\framebox{...}).
-    If none found, fall back to the last integer found in the text.
-    """
-    if not text:
-        return ""
-
-    pattern_boxed = r"boxed{(.*?)}|framebox{(.*?)}"
-    matches = re.findall(pattern_boxed, text, re.DOTALL)
-    if matches:
-        # Iterate from the end to prioritize the final boxed answer
-        for match in matches[::-1]:
-            for group in match:
-                if group:
-                    return group.split(",")[-1].strip()
-
-    # Fallback: last integer in the text
-    matches_digits = re.findall(r"\d+", text, re.DOTALL)
-    if matches_digits:
-        return matches_digits[-1]
-    return ""
-
-
-def _normalize_to_int_or_none(s: str) -> Optional[int]:
-    if s is None:
-        return None
-    # Only take leading digits
-    m = re.match(r"\d+", str(s).strip())
-    if not m:
-        return None
-    try:
-        return int(m.group(0))
-    except ValueError:
-        return None
-
-
-@reward_function(id="aime2025_exact_match")
-def evaluate(
-    messages: Union[List[Message], List[Dict[str, Any]]],
-    ground_truth: Optional[str] = None,
-    **kwargs,
-) -> EvaluateResult:
-    """
-    Score 1.0 if extracted final answer equals the ground-truth integer, else 0.0.
-    """
-    if not messages:
-        return EvaluateResult(
-            score=0.0,
-            reason="No messages provided",
-            is_score_valid=False,
-            metrics={
-                "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages")
-            },
-        )
-
-    last_msg = messages[-1]
-    content = last_msg["content"] if isinstance(last_msg, dict) else (last_msg.content or "")
-
-    extracted_text = _extract_boxed_text(content)
-    extracted_int = _normalize_to_int_or_none(extracted_text)
-    gt_int = _normalize_to_int_or_none(ground_truth if ground_truth is not None else "")
-
-    is_valid = extracted_int is not None and gt_int is not None
-    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
-
-    metrics: Dict[str, MetricResult] = {
-        "exact_match": MetricResult(
-            score=score,
-            is_score_valid=is_valid,
-            reason=(
-                "Parsed both integers and they matched"
-                if score == 1.0
-                else (
-                    "Parsed integers did not match"
-                    if is_valid
-                    else "Failed to parse integer from prediction or ground truth"
-                )
-            ),
-            data={
-                "extracted_text": extracted_text,
-                "extracted_int": extracted_int,
-                "ground_truth_int": gt_int,
-            },
-        )
-    }
-
-    return EvaluateResult(
-        score=score,
-        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
-        is_score_valid=is_valid,
-        metrics=metrics,
-    )
-
-