Skip to content

Commit ffcb08d

Browse files
author
Dylan Huang
committed
Merge branch 'main' into show-aggregated-metrics-in-ui
# Conflicts:
#	eval_protocol/pytest/evaluation_test.py
2 parents 949acba + 55005a1 commit ffcb08d

File tree

16 files changed

+1066
-42
lines changed

16 files changed

+1066
-42
lines changed

eval_protocol/common_utils.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import re
33
from typing import Any, Dict, List
44

5+
import requests
6+
57

68
def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
79
"""
@@ -12,19 +14,42 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
1214
1315
Returns:
1416
A list of dictionaries, where each dictionary is a parsed JSON object from a line.
15-
Returns an empty list if the file is not found or if errors occur during parsing.
17+
Returns an empty list if the file is not found or if errors occur during parsing. Supports HTTP urls and local file paths.
1618
"""
1719
data: List[Dict[str, Any]] = []
18-
with open(file_path, "r", encoding="utf-8") as f:
19-
for line_number, line in enumerate(f):
20+
if file_path.startswith("http://") or file_path.startswith("https://"):
21+
resp = requests.get(file_path, stream=True, timeout=30)
22+
resp.raise_for_status()
23+
for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1):
24+
if raw is None:
25+
continue
26+
stripped = raw.strip()
27+
if not stripped:
28+
continue
2029
try:
21-
data.append(json.loads(line.strip()))
30+
data.append(json.loads(stripped))
2231
except json.JSONDecodeError as e:
23-
print(f"Error parsing JSON line for file {file_path} at line {line_number}")
24-
# attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
25-
row_id_index = line.find("row_id")
32+
print(f"Error parsing JSON line for URL {file_path} at line {line_number}")
33+
row_id_index = stripped.find("row_id")
2634
if row_id_index != -1:
27-
row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
28-
raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
35+
row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
36+
raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
2937
raise e
38+
else:
39+
with open(file_path, "r", encoding="utf-8") as f:
40+
for line_number, line in enumerate(f, start=1):
41+
# Skip entirely blank or whitespace-only lines to be robust to trailing newlines
42+
stripped = line.strip()
43+
if not stripped:
44+
continue
45+
try:
46+
data.append(json.loads(stripped))
47+
except json.JSONDecodeError as e:
48+
print(f"Error parsing JSON line for file {file_path} at line {line_number}")
49+
# attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
50+
row_id_index = line.find("row_id")
51+
if row_id_index != -1:
52+
row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
53+
raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
54+
raise e
3055
return data

eval_protocol/generation/clients.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import aiohttp
1313
from omegaconf import DictConfig
14-
from pydantic import BaseModel, Field # Added for new models
14+
from pydantic import BaseModel # Added for new models
1515

1616
logger = logging.getLogger(__name__)
1717

@@ -83,6 +83,9 @@ async def generate(
8383
}
8484
if self.top_p is not None:
8585
payload["top_p"] = self.top_p
86+
# Include reasoning settings if configured (for reasoning-capable models)
87+
if self.reasoning_effort:
88+
payload["reasoning_effort"] = self.reasoning_effort
8689

8790
if tools:
8891
payload["tools"] = tools

eval_protocol/mcp/execution/manager.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ async def _execute_with_semaphore(idx):
163163
evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
164164
evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
165165
evaluation_rows[idx].tools = shared_tool_schema
166-
evaluation_rows[idx].usage = trajectory.usage
166+
evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
167167
evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
168168
model=policy.model_id,
169169
temperature=getattr(policy, "temperature", None),
@@ -260,8 +260,6 @@ async def _execute_rollout(
260260
{"role": "user", "content": user_prompt},
261261
]
262262

263-
usage_stats_list: List[CompletionUsage] = []
264-
265263
logger.info(f"🎯 Starting rollout {rollout_idx} in thread {threading.current_thread().name}")
266264

267265
# Run rollout loop for this specific environment
@@ -299,6 +297,12 @@ async def _execute_rollout(
299297
while not turn_completed and not trajectory.terminated:
300298
tool_calls, usage_stats = await policy(tool_schema, rollout_idx, conversation_history)
301299

300+
# calc llm usage stats happened in this turn if there is any
301+
if usage_stats:
302+
trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
303+
trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
304+
trajectory.usage["total_tokens"] += usage_stats.total_tokens
305+
302306
# If no tool call is generated, turn is finished
303307
if len(tool_calls) == 1:
304308
# If there's a user simulator, no tool call means the policy is ready to provide final response on this turn
@@ -308,6 +312,8 @@ async def _execute_rollout(
308312
# If there's no user simulator, no tool call means policy failed and we should terminate the rollout
309313
elif tool_calls[0].tool_name in ["_playback_terminate", "_no_tool_call"]:
310314
trajectory.terminated = True
315+
trajectory.termination_reason = TerminationReason.ERROR
316+
trajectory.control_plane_summary.update({"error_message": "No expected tool call"})
311317
break
312318

313319
# Execute each tool call sequentially
@@ -373,10 +379,6 @@ async def _execute_rollout(
373379
if observation is not None:
374380
current_observation = observation
375381

376-
# calc llm usage stats happened in this turn if there is aany
377-
if usage_stats:
378-
usage_stats_list.append(usage_stats)
379-
380382
# With user simulator, increment step after an entire conversation step
381383
if user_simulator is not None:
382384
step += 1
@@ -409,7 +411,9 @@ async def _execute_rollout(
409411
# tool indicates rollout should be terminated, call policy one last time to get the final response
410412
_, usage_stats = await policy(tool_schema, rollout_idx, conversation_history)
411413
if usage_stats:
412-
usage_stats_list.append(usage_stats)
414+
trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
415+
trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
416+
trajectory.usage["total_tokens"] += usage_stats.total_tokens
413417

414418
# Add final control plane summary
415419
trajectory.control_plane_summary.update(
@@ -460,11 +464,6 @@ async def _execute_rollout(
460464
msg["control_plane_step"]["termination_reason"] = trajectory.termination_reason
461465
break
462466

463-
for usage_stats in usage_stats_list:
464-
trajectory.usage["prompt_tokens"] += usage_stats.prompt_tokens
465-
trajectory.usage["completion_tokens"] += usage_stats.completion_tokens
466-
trajectory.usage["total_tokens"] += usage_stats.total_tokens
467-
468467
logger.info(
469468
f"✅ Rollout {rollout_idx} completed: {trajectory.steps} steps, reward: {trajectory.total_reward:.2f}, termination: {trajectory.termination_reason}, in thread {threading.current_thread().name}"
470469
)

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import asyncio
22
from typing import List
33

4-
from litellm import acompletion
5-
from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
4+
import logging
5+
import os
66

77
from eval_protocol.dataset_logger import default_logger
8-
from eval_protocol.models import EvaluationRow, Message
8+
from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
99
from eval_protocol.pytest.types import RolloutProcessorConfig
1010

1111

@@ -14,6 +14,20 @@ async def default_single_turn_rollout_processor(
1414
) -> List[EvaluationRow]:
1515
"""Generate a single response from any supported model provider using LiteLLM."""
1616

17+
# Quiet LiteLLM logs in test runs unless user overrode
18+
try:
19+
if os.environ.get("LITELLM_LOG") is None:
20+
os.environ["LITELLM_LOG"] = "ERROR"
21+
_llog = logging.getLogger("LiteLLM")
22+
_llog.setLevel(logging.CRITICAL)
23+
_llog.propagate = False
24+
for _h in list(_llog.handlers):
25+
_llog.removeHandler(_h)
26+
except Exception:
27+
pass
28+
29+
# Do not modify global LiteLLM cache. Disable caching per-request instead.
30+
1731
async def process_row(row: EvaluationRow) -> EvaluationRow:
1832
"""Process a single row asynchronously."""
1933
if len(row.messages) == 0:
@@ -22,10 +36,21 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
2236
messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
2337

2438
request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
39+
# Ensure caching is disabled only for this request (review feedback)
40+
request_params["cache"] = {"no-cache": True}
41+
# Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
42+
# Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
43+
if "reasoning" in config.input_params:
44+
request_params.setdefault("extra_body", {})
45+
request_params["extra_body"]["reasoning"] = config.input_params["reasoning"]
2546

2647
if row.tools is not None:
2748
request_params["tools"] = row.tools
2849

50+
# Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
51+
import importlib
52+
_litellm = importlib.import_module("litellm")
53+
acompletion = getattr(_litellm, "acompletion")
2954
response = await acompletion(**request_params)
3055

3156
assistant_content = response.choices[0].message.content or ""
@@ -57,8 +82,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
5782
default_logger.log(row)
5883
return row
5984

60-
# Process all rows concurrently
61-
tasks = [process_row(row) for row in rows]
85+
# Process rows with bounded concurrency if configured
86+
max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
87+
semaphore = asyncio.Semaphore(max_concurrent)
88+
89+
async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
90+
async with semaphore:
91+
return await process_row(r)
92+
93+
tasks = [_sem_wrapper(row) for row in rows]
6294
dataset = list(await asyncio.gather(*tasks))
6395

6496
return dataset

0 commit comments

Comments
 (0)