"""
Eval Protocol example: AIME2025 chat completion evaluation

This example mirrors gpt-oss's AIME 2025 evaluation using OpenAI-compatible
chat completions. It checks whether the assistant's final answer matches the
ground-truth integer, extracting answers from \\boxed{...} with a fallback to
the last integer in the text.
"""

import re
from typing import Any, Dict, List, Optional, Union

from eval_protocol import EvaluateResult, MetricResult, reward_function
from eval_protocol.models import Message


def _extract_boxed_text(text: str) -> str:
    """
    Extract the last occurrence of a boxed answer (\\boxed{...} or \\framebox{...}).
    If none is found, fall back to the last integer in the text.
    """
    if not text:
        return ""

    pattern_boxed = r"boxed\{(.*?)\}|framebox\{(.*?)\}"
    matches = re.findall(pattern_boxed, text, re.DOTALL)
    if matches:
        # Iterate from the end to prioritize the final boxed answer
        for match in matches[::-1]:
            for group in match:
                if group:
                    return group.split(",")[-1].strip()

    # Fallback: last integer in the text
    matches_digits = re.findall(r"\d+", text)
    if matches_digits:
        return matches_digits[-1]
    return ""


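# Illustrative behavior (assumed examples, not from the original source):
#   _extract_boxed_text(r"so the answer is \boxed{204}")  -> "204"
#   _extract_boxed_text("The answer is 42.")              -> "42"  (digit fallback)
#   _extract_boxed_text("no digits here")                 -> ""

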
def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
    if s is None:
        return None
    # Only take the leading digits
    m = re.match(r"\d+", str(s).strip())
    if not m:
        return None
    try:
        return int(m.group(0))
    except ValueError:
        return None


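# Illustrative behavior (assumed examples):
#   _normalize_to_int_or_none(" 070 points")  -> 70
#   _normalize_to_int_or_none("abc")          -> None
#   _normalize_to_int_or_none(None)           -> None

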
@reward_function(id="aime2025_exact_match")
def evaluate(
    messages: Union[List[Message], List[Dict[str, Any]]],
    ground_truth: Optional[str] = None,
    **kwargs,
) -> EvaluateResult:
    """
    Score 1.0 if the extracted final answer equals the ground-truth integer, else 0.0.
    """
    if not messages:
        return EvaluateResult(
            score=0.0,
            reason="No messages provided",
            is_score_valid=False,
            metrics={
                "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages")
            },
        )

    last_msg = messages[-1]
    # Support both dict-style and Message-object entries; treat missing content as "".
    content = (last_msg.get("content") or "") if isinstance(last_msg, dict) else (last_msg.content or "")

    extracted_text = _extract_boxed_text(content)
    extracted_int = _normalize_to_int_or_none(extracted_text)
    gt_int = _normalize_to_int_or_none(ground_truth)

    is_valid = extracted_int is not None and gt_int is not None
    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0

    metrics: Dict[str, MetricResult] = {
        "exact_match": MetricResult(
            score=score,
            is_score_valid=is_valid,
            reason=(
                "Parsed both integers and they matched"
                if score == 1.0
                else (
                    "Parsed integers did not match"
                    if is_valid
                    else "Failed to parse an integer from the prediction or the ground truth"
                )
            ),
            data={
                "extracted_text": extracted_text,
                "extracted_int": extracted_int,
                "ground_truth_int": gt_int,
            },
        )
    }

    return EvaluateResult(
        score=score,
        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
        is_score_valid=is_valid,
        metrics=metrics,
    )


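# Minimal local smoke test (a sketch under assumptions: it presumes the
# @reward_function decorator keeps the function directly callable with these
# keyword arguments; the messages below are illustrative, not a real dataset row).
if __name__ == "__main__":
    demo_messages = [
        {"role": "user", "content": "AIME 2025 Problem: ..."},
        {"role": "assistant", "content": r"Working through it, the answer is \boxed{204}."},
    ]
    result = evaluate(messages=demo_messages, ground_truth="204")
    print(result.score)  # expected 1.0 if parsing succeeds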