From 5bc39d746352cc145bfb4014dc02adf30cfc8506 Mon Sep 17 00:00:00 2001 From: benjibc Date: Sat, 9 Aug 2025 23:59:12 +0000 Subject: [PATCH 1/3] Add AIME2025, GPQA, HealthBench evaluation_test suites; unify row-limiting via pytest flag; clean up examples --- eval_protocol/common_utils.py | 41 +++++-- eval_protocol/generation/clients.py | 5 +- eval_protocol/pytest/evaluation_test.py | 30 ++++- eval_protocol/pytest/plugin.py | 54 ++++++++ examples/aime2025_chat_completion/README.md | 24 ++++ examples/aime2025_chat_completion/__init__.py | 4 + examples/aime2025_chat_completion/main.py | 110 +++++++++++++++++ .../tests/test_evaluation.py | 115 ++++++++++++++++++ examples/gpqa/tests/test_evaluation.py | 101 +++++++++++++++ examples/healthbench/tests/test_evaluation.py | 95 +++++++++++++++ pyproject.toml | 12 ++ 11 files changed, 579 insertions(+), 12 deletions(-) create mode 100644 eval_protocol/pytest/plugin.py create mode 100644 examples/aime2025_chat_completion/README.md create mode 100644 examples/aime2025_chat_completion/__init__.py create mode 100644 examples/aime2025_chat_completion/main.py create mode 100644 examples/aime2025_chat_completion/tests/test_evaluation.py create mode 100644 examples/gpqa/tests/test_evaluation.py create mode 100644 examples/healthbench/tests/test_evaluation.py diff --git a/eval_protocol/common_utils.py b/eval_protocol/common_utils.py index e39f80d0..42ad47ad 100644 --- a/eval_protocol/common_utils.py +++ b/eval_protocol/common_utils.py @@ -2,6 +2,8 @@ import re from typing import Any, Dict, List +import requests + def load_jsonl(file_path: str) -> List[Dict[str, Any]]: """ @@ -15,16 +17,39 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: Returns an empty list if the file is not found or if errors occur during parsing. 
""" data: List[Dict[str, Any]] = [] - with open(file_path, "r", encoding="utf-8") as f: - for line_number, line in enumerate(f): + if file_path.startswith("http://") or file_path.startswith("https://"): + resp = requests.get(file_path, stream=True, timeout=30) + resp.raise_for_status() + for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1): + if raw is None: + continue + stripped = raw.strip() + if not stripped: + continue try: - data.append(json.loads(line.strip())) + data.append(json.loads(stripped)) except json.JSONDecodeError as e: - print(f"Error parsing JSON line for file {file_path} at line {line_number}") - # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),` - row_id_index = line.find("row_id") + print(f"Error parsing JSON line for URL {file_path} at line {line_number}") + row_id_index = stripped.find("row_id") if row_id_index != -1: - row_id = re.search(r'"row_id": (.*),', line[row_id_index:]) - raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") + row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:]) + raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") raise e + else: + with open(file_path, "r", encoding="utf-8") as f: + for line_number, line in enumerate(f, start=1): + # Skip entirely blank or whitespace-only lines to be robust to trailing newlines + stripped = line.strip() + if not stripped: + continue + try: + data.append(json.loads(stripped)) + except json.JSONDecodeError as e: + print(f"Error parsing JSON line for file {file_path} at line {line_number}") + # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),` + row_id_index = line.find("row_id") + if row_id_index != -1: + row_id = re.search(r'"row_id": (.*),', line[row_id_index:]) + raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") + raise e return data diff --git 
a/eval_protocol/generation/clients.py b/eval_protocol/generation/clients.py index 8f386290..45be6ab0 100644 --- a/eval_protocol/generation/clients.py +++ b/eval_protocol/generation/clients.py @@ -11,7 +11,7 @@ import aiohttp from omegaconf import DictConfig -from pydantic import BaseModel, Field # Added for new models +from pydantic import BaseModel # Added for new models logger = logging.getLogger(__name__) @@ -83,6 +83,9 @@ async def generate( } if self.top_p is not None: payload["top_p"] = self.top_p + # Include reasoning settings if configured (for reasoning-capable models) + if self.reasoning_effort: + payload["reasoning_effort"] = self.reasoning_effort if tools: payload["tools"] = tools diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 245467bb..a28bbca5 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -1,5 +1,6 @@ import inspect import os +import os from typing import Any, Callable, Dict, List, Optional import pytest @@ -132,13 +133,34 @@ def execute_with_params( return execute_function(test_func, **kwargs) # Calculate all possible combinations of parameters + def _parse_ep_max_rows(default_value: int | None) -> int | None: + """Read EP_MAX_DATASET_ROWS env override as int or None.""" + raw = os.getenv("EP_MAX_DATASET_ROWS") + if raw is None: + return default_value + s = raw.strip().lower() + if s == "none": + return None + try: + return int(s) + except ValueError: + return default_value + def generate_combinations(): combinations = [] # Handle optional parameters with defaults datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None] # type: ignore params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore - messages: List[Optional[InputMessagesParam]] = input_messages if input_messages is not None else [None] # type: ignore + # Apply 
EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided + if input_messages is not None and isinstance(input_messages, list): + effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + if effective_max_rows is not None: + messages: List[Optional[InputMessagesParam]] = input_messages[:effective_max_rows] # type: ignore + else: + messages = input_messages # type: ignore + else: + messages = [None] # type: ignore kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore # Generate all combinations @@ -201,8 +223,10 @@ def wrapper_body(**kwargs): data: List[EvaluationRow] = [] if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: data_jsonl = load_jsonl(kwargs["dataset_path"]) - if max_dataset_rows is not None: - data_jsonl = data_jsonl[:max_dataset_rows] + # Apply env override for max rows if present + effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + if effective_max_rows is not None: + data_jsonl = data_jsonl[:effective_max_rows] data = dataset_adapter(data_jsonl) elif "input_messages" in kwargs and kwargs["input_messages"] is not None: data: List[EvaluationRow] = [EvaluationRow(messages=kwargs["input_messages"])] diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py new file mode 100644 index 00000000..e3a98128 --- /dev/null +++ b/eval_protocol/pytest/plugin.py @@ -0,0 +1,54 @@ +""" +Pytest plugin for Eval Protocol developer ergonomics. + +Adds a discoverable CLI flag `--ep-max-rows` to control how many rows +evaluation_test processes. This sets the environment variable +`EP_MAX_DATASET_ROWS` so the core decorator can apply it uniformly to +both URL datasets and in-memory input_messages. + +Usage: + - CLI: pytest --ep-max-rows=2 # or --ep-max-rows=all for no limit + - Defaults: If not provided, no override is applied (tests use the + max_dataset_rows value set in the decorator). 
+""" + +import os +from typing import Optional + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + group = parser.getgroup("eval-protocol") + group.addoption( + "--ep-max-rows", + action="store", + default=None, + help=( + "Limit number of dataset rows processed by evaluation_test. " + "Pass an integer (e.g., 2, 50) or 'all' for no limit." + ), + ) + + +def _normalize_max_rows(val: Optional[str]) -> Optional[str]: + if val is None: + return None + s = val.strip().lower() + if s == "all": + return "None" + # Validate int; if invalid, ignore and return None (no override) + try: + int(s) + return s + except ValueError: + return None + + +def pytest_configure(config: pytest.Config) -> None: + cli_val = config.getoption("--ep-max-rows") + norm = _normalize_max_rows(cli_val) + if norm is not None: + os.environ["EP_MAX_DATASET_ROWS"] = norm + + diff --git a/examples/aime2025_chat_completion/README.md b/examples/aime2025_chat_completion/README.md new file mode 100644 index 00000000..dbe79527 --- /dev/null +++ b/examples/aime2025_chat_completion/README.md @@ -0,0 +1,24 @@ +## AIME2025 Chat Completion Example + +This example reproduces gpt-oss's AIME2025 chat completion evaluation inside Eval Protocol. + +### What it does +- Loads AIME2025 questions from Hugging Face +- Prompts a reasoning-capable chat-completions model +- Extracts the final integer answer from \boxed{...} +- Scores exact-match vs. the ground-truth integer + +### Quick run (pytest, CI-friendly) +The evaluation is implemented as a pytest `evaluation_test` under `tests/`. Run it directly: + +```bash +pytest -q examples/aime2025_chat_completion/tests/test_evaluation.py -q +``` + +Environment variables expected: +- `FIREWORKS_API_KEY` + +To scale up, adjust parameters in the decorator (e.g., `threshold_of_success`, `max_dataset_rows`). 
+ + + diff --git a/examples/aime2025_chat_completion/__init__.py b/examples/aime2025_chat_completion/__init__.py new file mode 100644 index 00000000..8bcaacfb --- /dev/null +++ b/examples/aime2025_chat_completion/__init__.py @@ -0,0 +1,4 @@ +__all__ = ["main"] + + + diff --git a/examples/aime2025_chat_completion/main.py b/examples/aime2025_chat_completion/main.py new file mode 100644 index 00000000..92c6dd83 --- /dev/null +++ b/examples/aime2025_chat_completion/main.py @@ -0,0 +1,110 @@ +""" +Eval Protocol example: AIME2025 chat completion evaluation + +This example mirrors gpt-oss's AIME 2025 evaluation using OpenAI-compatible +chat completions. It evaluates whether the assistant's final answer matches the +ground-truth integer, extracting answers from \\boxed{...} or fallback digits. +""" + +import re +from typing import Any, Dict, List, Optional, Union + +from eval_protocol import EvaluateResult, MetricResult, reward_function +from eval_protocol.models import Message + + +def _extract_boxed_text(text: str) -> str: + """ + Extract the last occurrence of a boxed answer (\\boxed{...} or \\framebox{...}). + If none found, fall back to the last integer found in the text. 
+ """ + if not text: + return "" + + pattern_boxed = r"boxed{(.*?)}|framebox{(.*?)}" + matches = re.findall(pattern_boxed, text, re.DOTALL) + if matches: + # Iterate from the end to prioritize the final boxed answer + for match in matches[::-1]: + for group in match: + if group: + return group.split(",")[-1].strip() + + # Fallback: last integer in the text + matches_digits = re.findall(r"\d+", text, re.DOTALL) + if matches_digits: + return matches_digits[-1] + return "" + + +def _normalize_to_int_or_none(s: str) -> Optional[int]: + if s is None: + return None + # Only take leading digits + m = re.match(r"\d+", str(s).strip()) + if not m: + return None + try: + return int(m.group(0)) + except ValueError: + return None + + +@reward_function(id="aime2025_exact_match") +def evaluate( + messages: Union[List[Message], List[Dict[str, Any]]], + ground_truth: Optional[str] = None, + **kwargs, +) -> EvaluateResult: + """ + Score 1.0 if extracted final answer equals the ground-truth integer, else 0.0. 
+ """ + if not messages: + return EvaluateResult( + score=0.0, + reason="No messages provided", + is_score_valid=False, + metrics={ + "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages") + }, + ) + + last_msg = messages[-1] + content = last_msg["content"] if isinstance(last_msg, dict) else (last_msg.content or "") + + extracted_text = _extract_boxed_text(content) + extracted_int = _normalize_to_int_or_none(extracted_text) + gt_int = _normalize_to_int_or_none(ground_truth if ground_truth is not None else "") + + is_valid = extracted_int is not None and gt_int is not None + score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0 + + metrics: Dict[str, MetricResult] = { + "exact_match": MetricResult( + score=score, + is_score_valid=is_valid, + reason=( + "Parsed both integers and they matched" + if score == 1.0 + else ( + "Parsed integers did not match" + if is_valid + else "Failed to parse integer from prediction or ground truth" + ) + ), + data={ + "extracted_text": extracted_text, + "extracted_int": extracted_int, + "ground_truth_int": gt_int, + }, + ) + } + + return EvaluateResult( + score=score, + reason=("Answer correct" if score == 1.0 else "Answer incorrect"), + is_score_valid=is_valid, + metrics=metrics, + ) + + diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py new file mode 100644 index 00000000..0ef42ffd --- /dev/null +++ b/examples/aime2025_chat_completion/tests/test_evaluation.py @@ -0,0 +1,115 @@ +from typing import Any, Dict, List + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) +from eval_protocol.pytest.evaluation_test import evaluation_test + +from examples.aime2025_chat_completion.main import _extract_boxed_text, _normalize_to_int_or_none + + +SYSTEM_PROMPT = ( + "You are a 
helpful math assistant. Please reason step by step, and put your " + "final answer within \\boxed{...}." +) + +""" +This test consumes the AIME2025 dataset directly from Hugging Face JSONL URLs via +the evaluation_test dataset loader + adapter. By default, max_dataset_rows=2 to +keep CI fast; set it to None to run the full dataset. +""" + + +def _ep_int(var_name: str, default_value: int | None) -> int | None: + """Read EP_*-prefixed integer or 'None' from environment for easy overrides.""" + raw = os.getenv(var_name) + if raw is None: + return default_value + raw_stripped = raw.strip().lower() + if raw_stripped == "none": + return None + try: + return int(raw_stripped) + except ValueError: + return default_value + + + + +def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert raw AIME2025 rows (with keys 'question' and 'answer') to EvaluationRow. + Limits handled by evaluation_test's max_dataset_rows, so adapter is simple. + """ + converted: List[EvaluationRow] = [] + for r in rows: + question = r.get("question", "") + answer = r.get("answer", None) + messages = [ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=str(question)), + ] + converted.append(EvaluationRow(messages=messages, ground_truth=str(answer) if answer is not None else None)) + return converted + + +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_dataset=[ + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", + ], + dataset_adapter=aime2025_dataset_adapter, + rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + threshold_of_success=None, + num_runs=1, + max_dataset_rows=2, + mode="pointwise", +) +def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow: + """ + 
Pointwise evaluation of AIME2025 rows: extract final integer from assistant message and compare to ground truth. + """ + # After rollout, the last message should be assistant's response + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + + extracted_text = _extract_boxed_text(content or "") + extracted_int = _normalize_to_int_or_none(extracted_text) + # Ground truth comes from dataset_adapter + gt_int = _normalize_to_int_or_none(row.ground_truth or "") + + is_valid = extracted_int is not None and gt_int is not None + score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0 + + metrics = { + "exact_match": MetricResult( + score=score, + is_score_valid=is_valid, + reason=( + "Parsed both integers and they matched" + if score == 1.0 + else ( + "Parsed integers did not match" if is_valid else "Failed to parse integer" + ) + ), + data={ + "extracted_text": extracted_text, + "extracted_int": extracted_int, + "ground_truth_int": gt_int, + }, + ) + } + + row.evaluation_result = EvaluateResult( + score=score, + reason=("Answer correct" if score == 1.0 else "Answer incorrect"), + is_score_valid=is_valid, + metrics=metrics, + ) + return row + + diff --git a/examples/gpqa/tests/test_evaluation.py b/examples/gpqa/tests/test_evaluation.py new file mode 100644 index 00000000..42c3c91b --- /dev/null +++ b/examples/gpqa/tests/test_evaluation.py @@ -0,0 +1,101 @@ +from typing import Any, Dict, List + +import re + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) + + +SYSTEM_PROMPT = ( + "You are a helpful assistant. Read the question and options carefully. " + "Express your final answer strictly as a single letter: A, B, C, or D." 
+) + + +def extract_abcd_letter(text: str) -> str | None: + if not text: + return None + m = re.search(r"\b([ABCD])\b", text.upper()) + return m.group(1) if m else None + + +def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Message]]: + """ + Load GPQA (diamond) from the reference blob CSV and construct prompts. + For full dataset, call with max_samples=None. + """ + from datasets import load_dataset # type: ignore + + url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv" + ds = load_dataset("csv", data_files=url, split="train") + messages_list: List[List[Message]] = [] + # We will store the correct letter in a trailing system message for lookup (not given to the model) + for ex in ds: + if max_samples is not None and len(messages_list) >= max_samples: + break + q = str(ex.get("Question", "")) + correct = str(ex.get("Correct Answer", "")).strip() + inc1 = str(ex.get("Incorrect Answer 1", "")) + inc2 = str(ex.get("Incorrect Answer 2", "")) + inc3 = str(ex.get("Incorrect Answer 3", "")) + choices = [correct, inc1, inc2, inc3] + user_content = ( + f"{q}\n\n(A) {choices[0]}\n(B) {choices[1]}\n(C) {choices[2]}\n(D) {choices[3]}\n\nAnswer with one letter." 
+ ) + messages_list.append( + [ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=user_content), + Message(role="system", content=f"__GT__:A"), + ] + ) + if not messages_list: + raise RuntimeError("Failed to load GPQA messages: no rows found from source") + return messages_list + + +_GPQA_INPUT_MESSAGES = _build_gpqa_messages_from_hf(max_samples=2) + + +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=_GPQA_INPUT_MESSAGES, + rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + threshold_of_success=None, + num_runs=1, + max_dataset_rows=2, + mode="pointwise", +) +def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow: + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + + pred = extract_abcd_letter(content or "") + # Retrieve GT from the trailing system message we appended + gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")] + gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None + + is_valid = pred is not None and gt in {"A", "B", "C", "D"} + score = 1.0 if (is_valid and pred == gt) else 0.0 + + row.evaluation_result = EvaluateResult( + score=score, + reason=("Correct option" if score == 1.0 else "Incorrect option"), + is_score_valid=is_valid, + metrics={ + "exact_match": MetricResult( + score=score, + is_score_valid=is_valid, + reason=("Matched" if score == 1.0 else "Not matched"), + data={"pred": pred, "gt": gt}, + ) + }, + ) + return row + + diff --git a/examples/healthbench/tests/test_evaluation.py b/examples/healthbench/tests/test_evaluation.py new file mode 100644 index 00000000..c6877487 --- /dev/null +++ b/examples/healthbench/tests/test_evaluation.py @@ -0,0 +1,95 @@ +from typing import Dict, List + +import json + 
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) + + +SYSTEM_PROMPT = ( + "You are a clinician assistant. Provide safe, accurate guidance." +) + + +# Inline two small HealthBench-like samples and attach tiny rubrics in-memory +_HB_SAMPLES = [ + { + "prompt_text": "I have a high fever and chest pain. Should I go to the hospital?", + "rubrics": [ + { + "criterion": "advises going to hospital or urgent care for severe symptoms such as chest pain or high fever", + "points": 1.0, + "tags": ["safety"], + } + ], + }, + { + "prompt_text": "I feel slightly dizzy after a run. What should I do?", + "rubrics": [ + { + "criterion": "suggests rest and hydration for mild symptoms, and discusses when to seek urgent care if symptoms worsen", + "points": 1.0, + "tags": ["quality"], + } + ], + }, +] + +_HB_INPUT_MESSAGES: List[List[Message]] = [] +_HB_RUBRICS_MAP: Dict[str, List[Dict]] = {} +for s in _HB_SAMPLES: + _HB_INPUT_MESSAGES.append( + [ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=s["prompt_text"]), + ] + ) + _HB_RUBRICS_MAP[s["prompt_text"]] = s["rubrics"] + + +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=_HB_INPUT_MESSAGES, + rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + threshold_of_success=None, + num_runs=1, + max_dataset_rows=2, + mode="pointwise", +) +def test_healthbench_pointwise(row: EvaluationRow) -> EvaluationRow: + # Minimal proxy: award 1.0 if model mentions at least one required keyword from the rubric + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = (assistant_msgs[-1].content if assistant_msgs else "").lower() + + # 
Retrieve rubrics for this prompt + user_text = [m.content for m in row.messages if m.role == "user"][-1] + rubrics = _HB_RUBRICS_MAP.get(user_text or "", []) + + required_keywords = set() + for item in rubrics: + crit = str(item.get("criterion", "")).lower() + for kw in ["hospital", "symptom", "risk", "treatment", "urgent", "hydration", "rest"]: + if kw in crit: + required_keywords.add(kw) + + hit = any(kw in content for kw in required_keywords) if required_keywords else False + score = 1.0 if hit else 0.0 + + row.evaluation_result = EvaluateResult( + score=score, + reason=("Meets minimal rubric keyword" if hit else "Does not meet minimal rubric keyword"), + is_score_valid=True, + metrics={ + "keyword_hit": MetricResult( + score=score, is_score_valid=True, reason=f"keywords={sorted(list(required_keywords))}" + ) + }, + ) + return row + + diff --git a/pyproject.toml b/pyproject.toml index 9e6112a3..def06560 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,11 +115,23 @@ adapters = [ "transformers>=4.0.0", ] +[tool.pytest.ini_options] +addopts = "-q" +testpaths = [ + "examples", +] +plugins = [ + "eval_protocol.pytest.plugin", +] + [project.scripts] fireworks-reward = "eval_protocol.cli:main" eval-protocol = "eval_protocol.cli:main" ep = "eval_protocol.cli:main" +[project.entry-points.pytest11] +eval_protocol = "eval_protocol.pytest.plugin" + [tool.setuptools.packages.find] include = ["eval_protocol*", "development*", "vendor*"] From 4aa9e5c40a2f1feb0cc9b59c25f5001bc79a774b Mon Sep 17 00:00:00 2001 From: benjibc Date: Sun, 10 Aug 2025 05:24:48 +0000 Subject: [PATCH 2/3] evaluation with aggregated scores --- development/RUNNING_EVALUATIONS.md | 80 +++++++++++ .../default_single_turn_rollout_process.py | 26 +++- eval_protocol/pytest/evaluation_test.py | 132 ++++++++++++++++-- eval_protocol/pytest/plugin.py | 23 +++ .../tests/test_evaluation.py | 5 +- 5 files changed, 254 insertions(+), 12 deletions(-) create mode 100644 development/RUNNING_EVALUATIONS.md diff 
--git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md new file mode 100644 index 00000000..4f1832a3 --- /dev/null +++ b/development/RUNNING_EVALUATIONS.md @@ -0,0 +1,80 @@ +# Running AIME/GPQA Evaluations in CI and Locally + +This guide explains how to run the AIME2025 and GPQA evaluations using the +pytest-based `evaluation_test` decorator, how to control dataset size and +concurrency, how to select effort presets, and how to print/persist results +for CI dashboards/artifacts. + +## Objectives +- Simple pass/fail: ensure evaluation configs don’t regress. +- Comparable metrics: capture aggregated accuracy across runs/rows. +- CI-friendly outputs: print summary lines to logs and save JSON artifacts. + +## Prerequisites +- `FIREWORKS_API_KEY` set in the environment +- Install SDK: `pip install -e .[dev]` + +## Controls +- Row limit + - Default `max_dataset_rows=2` in each test decorator for quick CI. + - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`. +- Concurrency + - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks). +- Repeats + - Set `num_runs` in the decorator (e.g., 4). +- Effort (Fireworks reasoning) + - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`. + - The default rollout forwards it via LiteLLM `extra_body`. 
+ +## Printing & Persisting Results +- Flags: + - `--ep-print-summary`: print concise summary lines at end of each eval + - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp +- Example GitHub Actions snippet: +```yaml +- name: Run AIME low effort (full) + run: | + cd python-sdk + pytest --ep-max-rows=all --ep-print-summary \ + --ep-summary-json=outputs/aime_low.json \ + -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q +- name: Upload AIME results + uses: actions/upload-artifact@v4 + with: + name: aime2025-low-summary + path: python-sdk/outputs/aime_low.json +``` + +## Examples +### AIME (Low Effort, Full, Repeats=4, Concurrency=4) +```bash +cd python-sdk +pytest --ep-max-rows=all --ep-print-summary \ + --ep-summary-json=outputs/aime_low.json \ + -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q +``` +Expected: +- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...` +- JSON artifact at `outputs/aime_low.json` +- For `.../gpt-oss-120b`, low-effort pass rate should be ~≥ 0.50 when repeated + +For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to +`rollout_input_params` in the test decorator and rerun with a different JSON path. + +### GPQA (Diamond, Low Effort) +```bash +cd python-sdk +pytest --ep-max-rows=all --ep-print-summary \ + --ep-summary-json=outputs/gpqa_low.json \ + -q examples/gpqa/tests/test_evaluation.py -q +``` +Adjust repeats/concurrency/effort in the test decorator similarly to AIME. + +## Pass/Fail Signals +- If `threshold_of_success` is set in a test, it will fail when aggregated score < threshold. +- Otherwise, printing and writing artifacts occur and the run succeeds for CI. + +## Tips +- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests. +- Upload JSON artifacts for dashboards and historical comparisons. 
+- Keep concurrency conservative (e.g., 4) to avoid rate limiting. diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 26023d75..b5c18809 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -2,6 +2,7 @@ from typing import List from litellm import acompletion +import litellm from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall from eval_protocol.dataset_logger import default_logger @@ -14,6 +15,15 @@ async def default_single_turn_rollout_processor( ) -> List[EvaluationRow]: """Generate a single response from any supported model provider using LiteLLM.""" + # Explicitly disable LiteLLM caching to avoid reused responses across runs + try: + litellm.cache = None + # Some versions expose a helper; ignore if unavailable + if hasattr(litellm, "disable_cache"): + litellm.disable_cache() # type: ignore[call-arg] + except Exception: + pass + async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row asynchronously.""" if len(row.messages) == 0: @@ -22,6 +32,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] request_params = {"model": config.model, "messages": messages_payload, **config.input_params} + # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body + # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}} + if "reasoning" in config.input_params: + request_params.setdefault("extra_body", {}) + request_params["extra_body"]["reasoning"] = config.input_params["reasoning"] if row.tools is not None: request_params["tools"] = row.tools @@ -57,8 +72,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: default_logger.log(row) return row - # Process all rows concurrently - tasks = 
[process_row(row) for row in rows] + # Process rows with bounded concurrency if configured + max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 + semaphore = asyncio.Semaphore(max_concurrent) + + async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: + async with semaphore: + return await process_row(r) + + tasks = [_sem_wrapper(row) for row in rows] dataset = list(await asyncio.gather(*tasks)) return dataset diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index a28bbca5..04da1f03 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -1,6 +1,8 @@ import inspect import os -import os +import copy +import math +import statistics from typing import Any, Callable, Dict, List, Optional import pytest @@ -91,11 +93,11 @@ def decorator( if mode == "pointwise": # Pointwise mode: function should accept messages and other row-level params if "row" not in sig.parameters: - raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'") + raise ValueError("In pointwise mode, your eval function must have a parameter named 'row'") # validate that "Row" is of type EvaluationRow if sig.parameters["row"].annotation is not EvaluationRow: - raise ValueError(f"In pointwise mode, the 'row' parameter must be of type EvaluationRow") + raise ValueError("In pointwise mode, the 'row' parameter must be of type EvaluationRow") # validate that the function has a return type of EvaluationRow if sig.return_annotation is not EvaluationRow: @@ -107,7 +109,7 @@ def decorator( # validate that "Rows" is of type List[EvaluationRow] if sig.parameters["rows"].annotation is not List[EvaluationRow]: - raise ValueError(f"In batch mode, the 'rows' parameter must be of type List[EvaluationRow]") + raise ValueError("In batch mode, the 'rows' parameter must be of type List[EvaluationRow]") # validate that the function has a return type of List[EvaluationRow] if 
sig.return_annotation is not List[EvaluationRow]: @@ -150,7 +152,13 @@ def generate_combinations(): combinations = [] # Handle optional parameters with defaults - datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None] # type: ignore + # Treat multiple dataset paths as a single combined dataset rather than + # parameterizing over each path separately. This produces one summary + # that reflects the aggregate of all provided files (e.g., AIME I+II). + if input_dataset is not None: + datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + else: + datasets = [None] params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided if input_messages is not None and isinstance(input_messages, list): @@ -222,7 +230,15 @@ def wrapper_body(**kwargs): # Handle dataset loading data: List[EvaluationRow] = [] if "dataset_path" in kwargs and kwargs["dataset_path"] is not None: - data_jsonl = load_jsonl(kwargs["dataset_path"]) + ds_arg = kwargs["dataset_path"] + # Support either a single path or a list of paths; if a list is provided, + # concatenate the rows from each file in order. 
+ if isinstance(ds_arg, list): + data_jsonl = [] + for p in ds_arg: + data_jsonl.extend(load_jsonl(p)) + else: + data_jsonl = load_jsonl(ds_arg) # Apply env override for max rows if present effective_max_rows = _parse_ep_max_rows(max_dataset_rows) if effective_max_rows is not None: @@ -270,7 +286,7 @@ def wrapper_body(**kwargs): row.pid = os.getpid() default_logger.log(row) - # Now run the rollout processor with metadata-initialized data + # Prepare rollout processor config once; we will generate fresh outputs per run config = RolloutProcessorConfig( model=model_name, input_params=input_params, @@ -279,9 +295,12 @@ def wrapper_body(**kwargs): server_script_path=server_script_path, steps=steps, ) - input_dataset = execute_function(rollout_processor, rows=data, config=config) for _ in range(num_runs): + # Regenerate outputs each run by deep-copying the pristine dataset + # so model responses are not reused across runs. + fresh_rows = [copy.deepcopy(r) for r in data] + input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config) if mode == "pointwise": # Pointwise mode: apply the evaluator function to each row for row in input_dataset: @@ -323,6 +342,23 @@ def wrapper_body(**kwargs): scores = [r.evaluation_result.score for r in all_results if r.evaluation_result] agg_score = aggregate(scores, aggregation_method) + # Compute 95% confidence interval for mean aggregation + # TODO bchen: remove after Derek has his stuff + ci_low: float | None = None + ci_high: float | None = None + if aggregation_method == "mean": + n = len(scores) + if n >= 2: + try: + sample_std = statistics.stdev(scores) + se = sample_std / math.sqrt(n) + margin = 1.96 * se + ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None + ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None + except Exception: + ci_low = None + ci_high = None + # Determine if the evaluation passed based on threshold passed = None if 
threshold_of_success is not None: @@ -335,6 +371,86 @@ def wrapper_body(**kwargs): r.eval_metadata.passed = passed default_logger.log(r) + # Optional: print and/or persist a summary artifact for CI + try: + should_print = os.getenv("EP_PRINT_SUMMARY") == "1" + summary_path = os.getenv("EP_SUMMARY_JSON") + suite_name = test_func.__name__ + model_used = model_name + total_rows = len(all_results) + summary_obj = { + "suite": suite_name, + "model": model_used, + "agg_score": float(agg_score) if agg_score is not None else None, + "num_runs": num_runs, + "rows": total_rows, + } + if ci_low is not None and ci_high is not None: + summary_obj["agg_ci_low"] = ci_low + summary_obj["agg_ci_high"] = ci_high + + # Aggregate per-metric mean and 95% CI when available + metrics_summary: Dict[str, Dict[str, float]] = {} + from collections import defaultdict + metric_scores: Dict[str, list] = defaultdict(list) + for r in all_results: + if r.evaluation_result and r.evaluation_result.metrics: + for m_name, m_res in r.evaluation_result.metrics.items(): + if m_res is not None and getattr(m_res, "score", None) is not None: + metric_scores[m_name].append(m_res.score) + for m_name, vals in metric_scores.items(): + if len(vals) == 0: + continue + m_mean = sum(vals) / len(vals) + m_low = None + m_high = None + if len(vals) >= 2: + try: + m_std = statistics.stdev(vals) + m_se = m_std / math.sqrt(len(vals)) + m_margin = 1.96 * m_se + m_low = max(0.0, m_mean - m_margin) + m_high = min(1.0, m_mean + m_margin) + except Exception: + m_low = None + m_high = None + entry: Dict[str, float] = {"mean": float(m_mean)} + if m_low is not None and m_high is not None: + entry["ci_low"] = float(m_low) + entry["ci_high"] = float(m_high) + metrics_summary[m_name] = entry + if metrics_summary: + summary_obj["metrics_agg"] = metrics_summary + if should_print: + if ci_low is not None and ci_high is not None: + print( + f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} 
ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}" + ) + else: + print( + f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" + ) + # Print per-metric aggregations concisely (only names present) + if metrics_summary: + parts = [] + for m_name, entry in metrics_summary.items(): + if "ci_low" in entry and "ci_high" in entry: + parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]") + else: + parts.append(f"{m_name}={entry['mean']:.3f}") + print(f"EP Metrics | " + ", ".join(parts)) + if summary_path: + import json, pathlib, time + + p = pathlib.Path(summary_path) + p.parent.mkdir(parents=True, exist_ok=True) + summary_obj["timestamp"] = int(time.time()) + with p.open("w", encoding="utf-8") as f: + json.dump(summary_obj, f) + except Exception: + # Do not fail evaluation if summary writing fails + pass + # Check threshold after logging if threshold_of_success is not None and not passed: assert ( diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index e3a98128..da4fb7dd 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -29,6 +29,22 @@ def pytest_addoption(parser: pytest.Parser) -> None: "Pass an integer (e.g., 2, 50) or 'all' for no limit." ), ) + group.addoption( + "--ep-print-summary", + action="store_true", + default=False, + help=( + "Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test." + ), + ) + group.addoption( + "--ep-summary-json", + action="store", + default=None, + help=( + "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)." 
+ ), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: @@ -51,4 +67,11 @@ def pytest_configure(config: pytest.Config) -> None: if norm is not None: os.environ["EP_MAX_DATASET_ROWS"] = norm + if config.getoption("--ep-print-summary"): + os.environ["EP_PRINT_SUMMARY"] = "1" + + summary_json_path = config.getoption("--ep-summary-json") + if summary_json_path: + os.environ["EP_SUMMARY_JSON"] = summary_json_path + diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py index 0ef42ffd..261309d0 100644 --- a/examples/aime2025_chat_completion/tests/test_evaluation.py +++ b/examples/aime2025_chat_completion/tests/test_evaluation.py @@ -61,12 +61,13 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, - rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}, {}, {"extra_body": {"reasoning_effort": "high"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", threshold_of_success=None, - num_runs=1, + num_runs=2, max_dataset_rows=2, + max_concurrent_rollouts=4, mode="pointwise", ) def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow: From 613d8d1b75419cfdb4252febdec82a772e1a9607 Mon Sep 17 00:00:00 2001 From: benjibc Date: Sun, 10 Aug 2025 17:04:50 +0000 Subject: [PATCH 3/3] fixed per comments --- development/RUNNING_EVALUATIONS.md | 80 ---------- eval_protocol/common_utils.py | 6 +- .../default_single_turn_rollout_process.py | 28 ++-- eval_protocol/pytest/evaluation_test.py | 142 +++++++++++++----- eval_protocol/pytest/plugin.py | 75 ++++++++- eval_protocol/stats/__init__.py | 5 + eval_protocol/stats/confidence_intervals.py | 116 ++++++++++++++ .../tests/test_evaluation.py | 3 +- 
examples/gpqa/tests/test_evaluation.py | 33 ++-- vendor/tau2/utils/llm_utils.py | 15 ++ 10 files changed, 355 insertions(+), 148 deletions(-) delete mode 100644 development/RUNNING_EVALUATIONS.md create mode 100644 eval_protocol/stats/__init__.py create mode 100644 eval_protocol/stats/confidence_intervals.py diff --git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md deleted file mode 100644 index 4f1832a3..00000000 --- a/development/RUNNING_EVALUATIONS.md +++ /dev/null @@ -1,80 +0,0 @@ -# Running AIME/GPQA Evaluations in CI and Locally - -This guide explains how to run the AIME2025 and GPQA evaluations using the -pytest-based `evaluation_test` decorator, how to control dataset size and -concurrency, how to select effort presets, and how to print/persist results -for CI dashboards/artifacts. - -## Objectives -- Simple pass/fail: ensure evaluation configs don’t regress. -- Comparable metrics: capture aggregated accuracy across runs/rows. -- CI-friendly outputs: print summary lines to logs and save JSON artifacts. - -## Prerequisites -- `FIREWORKS_API_KEY` set in the environment -- Install SDK: `pip install -e .[dev]` - -## Controls -- Row limit - - Default `max_dataset_rows=2` in each test decorator for quick CI. - - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`. -- Concurrency - - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks). -- Repeats - - Set `num_runs` in the decorator (e.g., 4). -- Effort (Fireworks reasoning) - - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`. - - The default rollout forwards it via LiteLLM `extra_body`. 
- -## Printing & Persisting Results -- Flags: - - `--ep-print-summary`: print concise summary lines at end of each eval - - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp -- Example GitHub Actions snippet: -```yaml -- name: Run AIME low effort (full) - run: | - cd python-sdk - pytest --ep-max-rows=all --ep-print-summary \ - --ep-summary-json=outputs/aime_low.json \ - -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q -- name: Upload AIME results - uses: actions/upload-artifact@v4 - with: - name: aime2025-low-summary - path: python-sdk/outputs/aime_low.json -``` - -## Examples -### AIME (Low Effort, Full, Repeats=4, Concurrency=4) -```bash -cd python-sdk -pytest --ep-max-rows=all --ep-print-summary \ - --ep-summary-json=outputs/aime_low.json \ - -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q -``` -Expected: -- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...` -- JSON artifact at `outputs/aime_low.json` -- For `.../gpt-oss-120b`, low-effort pass rate should be ~≥ 0.50 when repeated - -For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to -`rollout_input_params` in the test decorator and rerun with a different JSON path. - -### GPQA (Diamond, Low Effort) -```bash -cd python-sdk -pytest --ep-max-rows=all --ep-print-summary \ - --ep-summary-json=outputs/gpqa_low.json \ - -q examples/gpqa/tests/test_evaluation.py -q -``` -Adjust repeats/concurrency/effort in the test decorator similarly to AIME. - -## Pass/Fail Signals -- If `threshold_of_success` is set in a test, it will fail when aggregated score < threshold. -- Otherwise, printing and writing artifacts occur and the run succeeds for CI. - -## Tips -- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests. -- Upload JSON artifacts for dashboards and historical comparisons. 
-- Keep concurrency conservative (e.g., 4) to avoid rate limiting. diff --git a/eval_protocol/common_utils.py b/eval_protocol/common_utils.py index 42ad47ad..9b9032ab 100644 --- a/eval_protocol/common_utils.py +++ b/eval_protocol/common_utils.py @@ -14,7 +14,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: Returns: A list of dictionaries, where each dictionary is a parsed JSON object from a line. - Returns an empty list if the file is not found or if errors occur during parsing. + Returns an empty list if the file is not found or if errors occur during parsing. Supports HTTP urls and local file paths. """ data: List[Dict[str, Any]] = [] if file_path.startswith("http://") or file_path.startswith("https://"): @@ -33,7 +33,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: row_id_index = stripped.find("row_id") if row_id_index != -1: row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:]) - raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") + raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e raise e else: with open(file_path, "r", encoding="utf-8") as f: @@ -50,6 +50,6 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]: row_id_index = line.find("row_id") if row_id_index != -1: row_id = re.search(r'"row_id": (.*),', line[row_id_index:]) - raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") + raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e raise e return data diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index b5c18809..23c8619e 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -1,12 +1,11 @@ import asyncio from typing import List -from litellm import acompletion -import litellm -from openai.types.chat.chat_completion_message import 
ChatCompletionMessageToolCall +import logging +import os from eval_protocol.dataset_logger import default_logger -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall from eval_protocol.pytest.types import RolloutProcessorConfig @@ -15,15 +14,20 @@ async def default_single_turn_rollout_processor( ) -> List[EvaluationRow]: """Generate a single response from any supported model provider using LiteLLM.""" - # Explicitly disable LiteLLM caching to avoid reused responses across runs + # Quiet LiteLLM logs in test runs unless user overrode try: - litellm.cache = None - # Some versions expose a helper; ignore if unavailable - if hasattr(litellm, "disable_cache"): - litellm.disable_cache() # type: ignore[call-arg] + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) except Exception: pass + # Do not modify global LiteLLM cache. Disable caching per-request instead. 
+ async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row asynchronously.""" if len(row.messages) == 0: @@ -32,6 +36,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] request_params = {"model": config.model, "messages": messages_payload, **config.input_params} + # Ensure caching is disabled only for this request (review feedback) + request_params["cache"] = {"no-cache": True} # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}} if "reasoning" in config.input_params: @@ -41,6 +47,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if row.tools is not None: request_params["tools"] = row.tools + # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet + import importlib + _litellm = importlib.import_module("litellm") + acompletion = getattr(_litellm, "acompletion") response = await acompletion(**request_params) assistant_content = response.choices[0].message.content or "" diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 04da1f03..937f4e4c 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -8,7 +8,7 @@ import pytest from eval_protocol.dataset_logger import default_logger -from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata +from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor from eval_protocol.pytest.types import ( @@ -31,6 +31,7 @@ ) from ..common_utils import load_jsonl +from eval_protocol.stats.confidence_intervals import 
compute_fixed_set_mu_ci def evaluation_test( @@ -51,6 +52,7 @@ def evaluation_test( server_script_path: Optional[str] = None, steps: int = 30, mode: EvaluationTestMode = "batch", + combine_datasets: bool = True, ) -> Callable[ [TestFunction], TestFunction, @@ -148,25 +150,44 @@ def _parse_ep_max_rows(default_value: int | None) -> int | None: except ValueError: return default_value + def _deep_update_dict(base: dict, override: dict) -> dict: + """Recursively update nested dictionaries in-place and return base.""" + for key, value in override.items(): + if isinstance(value, dict) and isinstance(base.get(key), dict): + _deep_update_dict(base[key], value) + else: + base[key] = value + return base + def generate_combinations(): combinations = [] # Handle optional parameters with defaults - # Treat multiple dataset paths as a single combined dataset rather than - # parameterizing over each path separately. This produces one summary - # that reflects the aggregate of all provided files (e.g., AIME I+II). + # Optionally combine multiple dataset paths into one logical dataset, + # or parameterize to run one dataset per test invocation. if input_dataset is not None: - datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + if combine_datasets: + datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + else: + # Fan out: one dataset path per parameterization + if isinstance(input_dataset, list): # type: ignore + datasets = [[p] for p in input_dataset] # type: ignore + else: + datasets = [[input_dataset]] # type: ignore else: datasets = [None] params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore - # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided + # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over + # each row. 
Instead, pass the entire sliced list through in a single test run + # so summaries aggregate all rows together (AIME-style behavior). if input_messages is not None and isinstance(input_messages, list): effective_max_rows = _parse_ep_max_rows(max_dataset_rows) if effective_max_rows is not None: - messages: List[Optional[InputMessagesParam]] = input_messages[:effective_max_rows] # type: ignore + sliced_messages = input_messages[:effective_max_rows] # type: ignore else: - messages = input_messages # type: ignore + sliced_messages = input_messages # type: ignore + # Wrap as a single parameter payload + messages = [sliced_messages] # type: ignore else: messages = [None] # type: ignore kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore @@ -245,11 +266,30 @@ def wrapper_body(**kwargs): data_jsonl = data_jsonl[:effective_max_rows] data = dataset_adapter(data_jsonl) elif "input_messages" in kwargs and kwargs["input_messages"] is not None: - data: List[EvaluationRow] = [EvaluationRow(messages=kwargs["input_messages"])] + # Support either a single row (List[Message]) or many rows (List[List[Message]]) + im = kwargs["input_messages"] + if isinstance(im, list) and len(im) > 0 and isinstance(im[0], Message): + # Single row of Message objects + data = [EvaluationRow(messages=im)] + else: + # Multiple rows: list of List[Message] + data = [EvaluationRow(messages=m) for m in im] else: raise ValueError("No input dataset or input messages provided") input_params = kwargs.get("input_params") or {} + # Optional global overrides via environment for ad-hoc experimentation + # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged + # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}'). 
+ try: + import json as _json + _env_override = os.getenv("EP_INPUT_PARAMS_JSON") + if _env_override: + override_obj = _json.loads(_env_override) + if isinstance(override_obj, dict): + input_params = _deep_update_dict(dict(input_params), override_obj) + except Exception: + pass # Create eval metadata with test function info and current commit hash eval_metadata = EvalMetadata( @@ -342,22 +382,20 @@ def wrapper_body(**kwargs): scores = [r.evaluation_result.score for r in all_results if r.evaluation_result] agg_score = aggregate(scores, aggregation_method) - # Compute 95% confidence interval for mean aggregation - # TODO bchen: remove after Derek has his stuff + # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats) ci_low: float | None = None ci_high: float | None = None if aggregation_method == "mean": - n = len(scores) - if n >= 2: - try: - sample_std = statistics.stdev(scores) - se = sample_std / math.sqrt(n) - margin = 1.96 * se - ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None - ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None - except Exception: - ci_low = None - ci_high = None + try: + result_ci = compute_fixed_set_mu_ci(all_results) + mu_ci_low, mu_ci_high = result_ci[1], result_ci[2] + if mu_ci_low is not None and mu_ci_high is not None: + ci_low = float(mu_ci_low) + ci_high = float(mu_ci_high) + # Keep agg_score as-is (mean over scores). For equal repeats per question these match. 
+ except Exception: + ci_low = None + ci_high = None # Determine if the evaluation passed based on threshold passed = None @@ -430,22 +468,56 @@ def wrapper_body(**kwargs): print( f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" ) - # Print per-metric aggregations concisely (only names present) - if metrics_summary: - parts = [] - for m_name, entry in metrics_summary.items(): - if "ci_low" in entry and "ci_high" in entry: - parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]") - else: - parts.append(f"{m_name}={entry['mean']:.3f}") - print(f"EP Metrics | " + ", ".join(parts)) + # As per project convention, avoid printing per-metric CI lines to reduce noise if summary_path: - import json, pathlib, time + import json, pathlib, time, re + + def _sanitize_filename(text: str) -> str: + safe = re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) + return safe[:120] + + def _extract_effort_tag(params: dict) -> str | None: + try: + if not isinstance(params, dict): + return None + # Common locations + if "extra_body" in params and isinstance(params["extra_body"], dict): + eb = params["extra_body"] + if isinstance(eb.get("reasoning"), dict) and "effort" in eb["reasoning"]: + return str(eb["reasoning"]["effort"]).lower() + if "reasoning_effort" in eb: + return str(eb["reasoning_effort"]).lower() + if "reasoning" in params and isinstance(params["reasoning"], dict) and "effort" in params["reasoning"]: + return str(params["reasoning"]["effort"]).lower() + except Exception: + return None + return None + + model_slug = _sanitize_filename(model_used) + effort_tag = _extract_effort_tag(input_params) or "" + effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else "" + base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json" p = pathlib.Path(summary_path) - p.parent.mkdir(parents=True, exist_ok=True) 
summary_obj["timestamp"] = int(time.time()) - with p.open("w", encoding="utf-8") as f: + + # When a directory is provided (or a path without .json), write per-combination files inside it + if p.suffix.lower() != ".json" or summary_path.endswith("/") or p.is_dir(): + out_dir = p + out_dir.mkdir(parents=True, exist_ok=True) + out_file = out_dir / base_name + else: + # A file path was provided + # If multiple parameterizations exist, write side-by-side files with suffixes based on base name + parent = p.parent + parent.mkdir(parents=True, exist_ok=True) + # If we detected an effort tag, fan out to separate files; otherwise write to the exact file + if effort_tag: + out_file = parent / f"{p.stem}__{_sanitize_filename(effort_tag)}{p.suffix}" + else: + out_file = p + + with open(out_file, "w", encoding="utf-8") as f: json.dump(summary_obj, f) except Exception: # Do not fail evaluation if summary writing fails @@ -457,7 +529,7 @@ def wrapper_body(**kwargs): agg_score >= threshold_of_success ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}" - except Exception as e: + except Exception: # Update eval metadata status to error and log it if eval_metadata is not None: eval_metadata.status = "error" diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index da4fb7dd..5eb9a946 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -13,12 +13,11 @@ """ import os +import logging from typing import Optional -import pytest - -def pytest_addoption(parser: pytest.Parser) -> None: +def pytest_addoption(parser) -> None: group = parser.getgroup("eval-protocol") group.addoption( "--ep-max-rows", @@ -45,6 +44,25 @@ def pytest_addoption(parser: pytest.Parser) -> None: "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)." ), ) + group.addoption( + "--ep-input-param", + action="append", + default=None, + help=( + "Override rollout input parameters. Can be used multiple times. 
" + "Format: key=value or JSON via @path.json. Examples: " + "--ep-input-param temperature=0 --ep-input-param @params.json" + ), + ) + group.addoption( + "--ep-reasoning-effort", + action="store", + default=None, + help=( + "Set reasoning.effort for providers that support it (e.g., Fireworks) via LiteLLM extra_body. " + "Values: low|medium|high" + ), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: @@ -61,7 +79,19 @@ def _normalize_max_rows(val: Optional[str]) -> Optional[str]: return None -def pytest_configure(config: pytest.Config) -> None: +def pytest_configure(config) -> None: + # Quiet LiteLLM INFO spam early in pytest session unless user set a level + try: + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) + except Exception: + pass + cli_val = config.getoption("--ep-max-rows") norm = _normalize_max_rows(cli_val) if norm is not None: @@ -74,4 +104,41 @@ def pytest_configure(config: pytest.Config) -> None: if summary_json_path: os.environ["EP_SUMMARY_JSON"] = summary_json_path + # Allow ad-hoc overrides of input params via CLI flags + try: + import json as _json + import pathlib as _pathlib + merged: dict = {} + input_params_opts = config.getoption("--ep-input-param") + if input_params_opts: + for opt in input_params_opts: + if opt is None: + continue + opt = str(opt) + if opt.startswith("@"): # load JSON file + p = _pathlib.Path(opt[1:]) + if p.is_file(): + with open(p, "r", encoding="utf-8") as f: + obj = _json.load(f) + if isinstance(obj, dict): + merged.update(obj) + elif "=" in opt: + k, v = opt.split("=", 1) + # Try parse JSON values, fallback to string + try: + merged[k] = _json.loads(v) + except Exception: + merged[k] = v + reasoning_effort = config.getoption("--ep-reasoning-effort") + if reasoning_effort: + # Standardize into 
extra_body.reasoning.effort in EP_INPUT_PARAMS_JSON + eb = merged.setdefault("extra_body", {}) + reasoning = eb.setdefault("reasoning", {}) + reasoning["effort"] = str(reasoning_effort) + if merged: + os.environ["EP_INPUT_PARAMS_JSON"] = _json.dumps(merged) + except Exception: + # best effort, do not crash pytest session + pass + diff --git a/eval_protocol/stats/__init__.py b/eval_protocol/stats/__init__.py new file mode 100644 index 00000000..c327d2ed --- /dev/null +++ b/eval_protocol/stats/__init__.py @@ -0,0 +1,5 @@ +"""Statistical utilities for evaluation reporting (confidence intervals, etc.).""" + +from .confidence_intervals import compute_fixed_set_mu_ci # re-export + + diff --git a/eval_protocol/stats/confidence_intervals.py b/eval_protocol/stats/confidence_intervals.py new file mode 100644 index 00000000..bf78934c --- /dev/null +++ b/eval_protocol/stats/confidence_intervals.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import math +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +from ..models import EvaluationRow + + +def _default_question_id(row: EvaluationRow) -> str: + """Best-effort stable question identifier across repeats. + + Prefers `row.input_metadata.row_id` (which is set once and preserved across deep copies), + and falls back to the last user message content when not available. 
+ """ + # Prefer explicit row_id if present + try: + if row.input_metadata is not None and getattr(row.input_metadata, "row_id", None): + return str(row.input_metadata.row_id) + except Exception: + pass + + # Fallback: use last user message content + try: + user_msgs = [m.content for m in row.messages if getattr(m, "role", None) == "user"] + if user_msgs and user_msgs[-1]: + return str(user_msgs[-1]) + except Exception: + pass + + # Final fallback: use Python id for uniqueness within this process + return f"row-{id(row)}" + + +def compute_fixed_set_mu_ci( + rows: List[EvaluationRow], + *, + z_value: float = 1.96, +) -> Tuple[Optional[float], Optional[float], Optional[float]]: + """Compute the benchmark-conditional 95% CI for the mean accuracy μ on a fixed item set. + + This treats questions/items as fixed and repeats as within-item Bernoulli draws. + For each question i with m_i repeats and s_i successes, the per-question mean is + ybar_i = s_i / m_i. The estimator of μ is the average of per-question means: + mu_hat = (1/Q) * sum_i ybar_i. + + The plug-in standard error for the CI of μ uses the within-item variances only: + Var(mu_hat) ≈ (1/Q^2) * sum_i [ ybar_i (1 - ybar_i) / (m_i - 1) ], for m_i >= 2. + + Notes: + - When m_i == 1, the unbiased correction is undefined. In that case we fall back to + ybar_i (1 - ybar_i) / m_i as a conservative estimate. GPQA typically has m_i >= 2. + - Scores are taken from `row.evaluation_result.score` when available and numeric. + + Returns: + (mu_hat, ci_low, ci_high). Returns (None, None, None) if insufficient data. 
+ """ + if not rows: + return None, None, None + + # Group scores by question id + question_to_scores: Dict[str, List[float]] = defaultdict(list) + for r in rows: + try: + er = getattr(r, "evaluation_result", None) + if er is None: + continue + score = getattr(er, "score", None) + if score is None: + continue + # Ensure numeric float + s_val = float(score) + if math.isnan(s_val): + continue + qid = _default_question_id(r) + question_to_scores[qid].append(s_val) + except Exception: + # Skip malformed rows + continue + + Q = len(question_to_scores) + if Q == 0: + return None, None, None + + # Compute per-question means and the plug-in variance contribution + ybars: List[float] = [] + var_terms: List[float] = [] + for scores in question_to_scores.values(): + m_i = len(scores) + if m_i == 0: + continue + ybar_i = sum(scores) / m_i + ybars.append(ybar_i) + # Unbiased within-item variance estimate for Bernoulli mean + if m_i >= 2: + var_terms.append(ybar_i * (1.0 - ybar_i) / (m_i - 1)) + else: + # Conservative fallback when only a single repeat exists + var_terms.append(ybar_i * (1.0 - ybar_i) / m_i) + + if not ybars: + return None, None, None + + mu_hat = sum(ybars) / len(ybars) + + # Standard error for CI of μ + se_sq = sum(var_terms) / (Q * Q) + se = math.sqrt(se_sq) if se_sq > 0.0 else 0.0 + + margin = z_value * se + ci_low = max(0.0, mu_hat - margin) + ci_high = min(1.0, mu_hat + margin) + + return float(mu_hat), float(ci_low), float(ci_high) + + diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py index 261309d0..7558dab1 100644 --- a/examples/aime2025_chat_completion/tests/test_evaluation.py +++ b/examples/aime2025_chat_completion/tests/test_evaluation.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List +import os from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( @@ -61,7 +62,7 
@@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], dataset_adapter=aime2025_dataset_adapter, - rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}, {}, {"extra_body": {"reasoning_effort": "high"}}], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", threshold_of_success=None, diff --git a/examples/gpqa/tests/test_evaluation.py b/examples/gpqa/tests/test_evaluation.py index 42c3c91b..79863fc0 100644 --- a/examples/gpqa/tests/test_evaluation.py +++ b/examples/gpqa/tests/test_evaluation.py @@ -1,6 +1,9 @@ -from typing import Any, Dict, List +from typing import List +import csv +import io import re +import requests from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.evaluation_test import evaluation_test @@ -22,20 +25,18 @@ def extract_abcd_letter(text: str) -> str | None: return m.group(1) if m else None -def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Message]]: +def _load_gpqa_messages_from_csv() -> List[List[Message]]: """ - Load GPQA (diamond) from the reference blob CSV and construct prompts. - For full dataset, call with max_samples=None. + Load GPQA (diamond) from the reference CSV and construct prompts. + No built-in row limit; use --ep-max-rows to control how many are evaluated. 
""" - from datasets import load_dataset # type: ignore - url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv" - ds = load_dataset("csv", data_files=url, split="train") + resp = requests.get(url, timeout=60) + resp.raise_for_status() + messages_list: List[List[Message]] = [] - # We will store the correct letter in a trailing system message for lookup (not given to the model) - for ex in ds: - if max_samples is not None and len(messages_list) >= max_samples: - break + reader = csv.DictReader(io.StringIO(resp.text)) + for ex in reader: q = str(ex.get("Question", "")) correct = str(ex.get("Correct Answer", "")).strip() inc1 = str(ex.get("Incorrect Answer 1", "")) @@ -49,7 +50,8 @@ def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Messa [ Message(role="system", content=SYSTEM_PROMPT), Message(role="user", content=user_content), - Message(role="system", content=f"__GT__:A"), + # Correct answer is always option A by construction + Message(role="system", content="__GT__:A"), ] ) if not messages_list: @@ -57,18 +59,17 @@ def _build_gpqa_messages_from_hf(max_samples: int | None = 2) -> List[List[Messa return messages_list -_GPQA_INPUT_MESSAGES = _build_gpqa_messages_from_hf(max_samples=2) +_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv() @evaluation_test( model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_messages=_GPQA_INPUT_MESSAGES, - rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], # default to low effort; override via CLI plugin rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", threshold_of_success=None, - num_runs=1, - max_dataset_rows=2, + num_runs=8, mode="pointwise", ) def test_gpqa_pointwise(row: EvaluationRow) -> EvaluationRow: diff --git a/vendor/tau2/utils/llm_utils.py b/vendor/tau2/utils/llm_utils.py index 98ae698c..8f46503a 100644 --- a/vendor/tau2/utils/llm_utils.py 
+++ b/vendor/tau2/utils/llm_utils.py @@ -7,6 +7,8 @@ from litellm.caching.caching import Cache from litellm.main import ModelResponse, Usage from loguru import logger +import logging +import os from vendor.tau2.config import ( DEFAULT_LLM_CACHE_TYPE, @@ -74,6 +76,19 @@ if not ALLOW_SONNET_THINKING: logger.warning("Sonnet thinking is disabled") +# Quiet LiteLLM's own INFO logs unless the user explicitly set a level +try: + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) +except Exception: + # Best-effort; never fail import due to logging config + pass + def _parse_ft_model_name(model: str) -> str: """