Skip to content

Commit b73b5aa

Browse files
authored
Merge branch 'main' into derekx/standard-deviation
2 parents 9a9088e + 38a4444 commit b73b5aa

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+1150
-461
lines changed

eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,17 @@ def __init__(self, db_path: Optional[str] = None, store: Optional[SqliteEvaluati
2222
self._store = SqliteEvaluationRowStore(self.db_path)
2323

2424
def log(self, row: "EvaluationRow") -> None:
25-
row_id = row.input_metadata.row_id
2625
data = row.model_dump(exclude_none=True, mode="json")
27-
self._store.upsert_row(row_id=row_id, data=data)
26+
self._store.upsert_row(data=data)
2827
try:
2928
event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
3029
except Exception as e:
3130
# Avoid breaking storage due to event emission issues
3231
logger.error(f"Failed to emit row_upserted event: {e}")
3332
pass
3433

35-
def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
34+
def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
3635
from eval_protocol.models import EvaluationRow
3736

38-
results = self._store.read_rows(row_id=row_id)
37+
results = self._store.read_rows(rollout_id=rollout_id)
3938
return [EvaluationRow(**data) for data in results]

eval_protocol/dataset_logger/sqlite_evaluation_row_store.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class SqliteEvaluationRowStore:
1111
"""
1212
Lightweight reusable SQLite store for evaluation rows.
1313
14-
Stores arbitrary row data as JSON keyed by a unique string `row_id`.
14+
Stores arbitrary row data as JSON keyed by a unique string `rollout_id`.
1515
"""
1616

1717
def __init__(self, db_path: str):
@@ -24,7 +24,7 @@ class Meta:
2424
database = self._db
2525

2626
class EvaluationRow(BaseModel): # type: ignore
27-
row_id = CharField(unique=True)
27+
rollout_id = CharField(unique=True)
2828
data = JSONField()
2929

3030
self._EvaluationRow = EvaluationRow
@@ -36,22 +36,25 @@ class EvaluationRow(BaseModel): # type: ignore
3636
def db_path(self) -> str:
3737
return self._db_path
3838

39-
def upsert_row(self, row_id: str, data: dict) -> None:
40-
if self._EvaluationRow.select().where(self._EvaluationRow.row_id == row_id).exists():
41-
self._EvaluationRow.update(data=data).where(self._EvaluationRow.row_id == row_id).execute()
39+
def upsert_row(self, data: dict) -> None:
40+
rollout_id = data["rollout_id"]
41+
if "rollout_id" not in data:
42+
raise ValueError("rollout_id is required to upsert a row")
43+
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
44+
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
4245
else:
43-
self._EvaluationRow.create(row_id=row_id, data=data)
46+
self._EvaluationRow.create(rollout_id=rollout_id, data=data)
4447

45-
def read_rows(self, row_id: Optional[str] = None) -> List[dict]:
46-
if row_id is None:
48+
def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
49+
if rollout_id is None:
4750
query = self._EvaluationRow.select().dicts()
4851
else:
49-
query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.row_id == row_id)
52+
query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.rollout_id == rollout_id)
5053
results = list(query)
5154
return [result["data"] for result in results]
5255

53-
def delete_row(self, row_id: str) -> int:
54-
return self._EvaluationRow.delete().where(self._EvaluationRow.row_id == row_id).execute()
56+
def delete_row(self, rollout_id: str) -> int:
57+
return self._EvaluationRow.delete().where(self._EvaluationRow.rollout_id == rollout_id).execute()
5558

5659
def delete_all_rows(self) -> int:
5760
return self._EvaluationRow.delete().execute()

eval_protocol/mcp/client/connection.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -539,10 +539,10 @@ async def close_session(self, session: MCPSession) -> None:
539539
await session._exit_stack.aclose()
540540
except asyncio.CancelledError:
541541
# Handle cancellation gracefully (especially important for Python 3.12)
542-
logger.debug(f"Session {session.session_id} close was cancelled")
542+
logger.error(f"Session {session.session_id} close was cancelled")
543543
except Exception as e:
544544
# Hitting this error, probably because of use of threads: "Attempted to exit cancel scope in a different task than it was entered in"
545-
logger.debug(f"Error closing session {session.session_id}: {e}")
545+
logger.error(f"Error closing session {session.session_id}: {e}")
546546
finally:
547547
session._exit_stack = None
548548
session._mcp_session = None

eval_protocol/mcp/execution/base_policy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ async def _generate_live_tool_calls(
220220
return mcp_tool_calls, usage_stats
221221
else:
222222
# No tool calls in response - this is normal when episode ends or LLM provides only text
223-
logger.info(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
223+
logger.debug(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
224224
return [
225225
MCPToolCall(
226226
tool_name="_no_tool_call",

eval_protocol/mcp/execution/manager.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,12 @@ async def execute_rollouts(
9797

9898
async def _execute_with_semaphore(idx):
9999
async with semaphore:
100-
return await self._execute_rollout(
100+
result = await self._execute_rollout(
101101
envs, policy, idx, steps, openai_logger, recording_mode, playback_mode, start_time
102102
)
103103

104+
return result
105+
104106
tasks = [_execute_with_semaphore(i) for i in range(envs.n)]
105107
# exceptions will be try catched inside single _execute_rollout
106108
trajectories = await asyncio.gather(*tasks)
@@ -112,9 +114,6 @@ async def _execute_with_semaphore(idx):
112114

113115
shared_tool_schema = envs.tool_schemas
114116

115-
# Clean up
116-
await envs.close()
117-
118117
# Enhanced reporting with control plane info
119118
successful = sum(1 for traj in trajectories if traj.total_reward > 0)
120119
terminated_by_control_plane = sum(
@@ -175,8 +174,11 @@ async def _execute_with_semaphore(idx):
175174
TerminationReason.USER_STOP,
176175
}:
177176
evaluation_rows[idx].rollout_status.status = "finished"
178-
elif trajectory.termination_reason == TerminationReason.MAX_STEPS:
177+
elif trajectory.termination_reason in {TerminationReason.MAX_STEPS, TerminationReason.INTERRUPTED}:
179178
evaluation_rows[idx].rollout_status.status = "stopped"
179+
evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
180+
"termination_reason", trajectory.termination_reason
181+
)
180182
else:
181183
evaluation_rows[idx].rollout_status.status = "error"
182184
evaluation_rows[idx].rollout_status.error_message = trajectory.control_plane_summary.get(
@@ -226,6 +228,7 @@ async def _execute_rollout(
226228
"total_tokens": 0,
227229
},
228230
)
231+
failure_reason = None
229232
try:
230233
current_observation, tool_schema = await envs.reset(session)
231234
system_prompt = dataset_row.system_prompt
@@ -311,8 +314,7 @@ async def _execute_rollout(
311314
# If there's no user simulator, no tool call means policy failed and we should terminate the rollout
312315
elif tool_calls[0].tool_name in ["_playback_terminate", "_no_tool_call"]:
313316
trajectory.terminated = True
314-
trajectory.termination_reason = TerminationReason.ERROR
315-
trajectory.control_plane_summary.update({"error_message": "No expected tool call"})
317+
trajectory.termination_reason = TerminationReason.INTERRUPTED
316318
break
317319

318320
# Execute each tool call sequentially
@@ -466,11 +468,26 @@ async def _execute_rollout(
466468
logger.info(
467469
f"✅ Rollout {rollout_idx} completed: {trajectory.steps} steps, reward: {trajectory.total_reward:.2f}, termination: {trajectory.termination_reason}, in thread {threading.current_thread().name}"
468470
)
471+
472+
except asyncio.CancelledError:
473+
logger.error(f"🚨 AsyncIO Cancel Error in roll out {rollout_idx}", exc_info=True)
474+
failure_reason = "asyncio context cancelled"
469475
except Exception as e:
470476
logger.error(f"🚨 Error in rollout {rollout_idx}: {e}", exc_info=True)
471-
trajectory.terminated = True
472-
trajectory.termination_reason = TerminationReason.ERROR
473-
trajectory.control_plane_summary.update({"error_message": str(e)})
477+
failure_reason = str(e)
478+
finally:
479+
if failure_reason:
480+
trajectory.terminated = True
481+
trajectory.termination_reason = TerminationReason.ERROR
482+
trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
483+
try:
484+
await envs.connection_manager.reset_session(session)
485+
except:
486+
logger.error(f"Error resetting session {session.session_id}")
487+
try:
488+
await envs.connection_manager.close_session(session)
489+
except:
490+
logger.error(f"Error closing session {session.session_id}")
474491
return trajectory
475492

476493
async def _get_control_plane_status(self, session) -> Optional[Dict[str, Any]]:

eval_protocol/mcp/session/manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ async def reset(self, session: MCPSession) -> Tuple[Any, List[Dict]]:
5858
5959
This is thread-safe and can be called from worker threads.
6060
"""
61+
await self.connection_manager.initialize_session(session)
6162
# Get available tools from MCP server
6263
tool_schemas = await self.connection_manager.discover_tools(session)
6364

eval_protocol/mcp_env.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b")
1818
1919
# Create environments with evaluation_rows configuration
20-
envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
20+
envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
2121
2222
# Execute tool-calling rollouts
2323
evaluation_rows = await ep.rollout(envs, policy=policy, steps=512)
@@ -86,18 +86,17 @@ async def reset_mcp_sessions(envs: GeneralMCPVectorEnv):
8686
Reset mcp server sessions
8787
"""
8888
tasks = [envs.connection_manager.reset_session(session) for session in envs.sessions]
89-
await asyncio.gather(*tasks)
89+
await asyncio.gather(*tasks, return_exceptions=True)
9090

9191

92-
async def make(
92+
def make(
9393
env_spec: str,
9494
evaluation_rows: Optional[List[EvaluationRow]] = None,
9595
dataset: Optional[List[Dict]] = None,
9696
n: Optional[int] = None,
9797
seeds: Optional[List[int]] = None,
9898
model_id: str = "unknown",
9999
user_prompt_formatter: Optional[Callable] = None,
100-
reset_sessions: bool = False,
101100
) -> GeneralMCPVectorEnv:
102101
"""
103102
Create general MCP environments driven by evaluation_rows configuration.
@@ -110,20 +109,19 @@ async def make(
110109
seeds: List of seeds (for backward compatibility)
111110
model_id: Model identifier
112111
user_prompt_formatter: Optional callback for formatting user prompts
113-
reset_sessions: Whether to reset sessions before returning the environment
114112
115113
Returns:
116114
General MCP environment that works with any MCP server
117115
118116
Example:
119117
# EvaluationRow approach (preferred)
120-
envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
118+
envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
121119
122120
# Dataset approach (backward compatibility)
123-
envs = await ep.make("http://localhost:8000/mcp", dataset=dataset)
121+
envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
124122
125123
# Legacy approach (backward compatibility)
126-
envs = await ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
124+
envs = ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
127125
"""
128126
# Parse environment specification - make sure URL format is correct
129127
base_url = env_spec
@@ -236,12 +234,6 @@ async def make(
236234
sessions.append(session)
237235

238236
mcp_envs = GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
239-
tasks = [mcp_envs.connection_manager.initialize_session(session) for session in sessions]
240-
await asyncio.gather(*tasks)
241-
242-
if reset_sessions:
243-
await reset_mcp_sessions(mcp_envs)
244-
245237
return mcp_envs
246238

247239

eval_protocol/pytest/default_agent_rollout_processor.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from openai.types.chat import ChatCompletionContentPartTextParam, ChatCompletionMessage, ChatCompletionToolParam
99
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
1010

11-
from eval_protocol.dataset_logger import default_logger
11+
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
1212
from eval_protocol.mcp.execution.policy import LiteLLMPolicy
1313
from eval_protocol.mcp.mcp_multi_client import MCPMultiClient
1414
from eval_protocol.models import EvaluationRow, Message
@@ -20,12 +20,13 @@ class Agent:
2020
A really simple agent that calls the model until no more tool calls are needed.
2121
"""
2222

23-
def __init__(self, model: str, row: EvaluationRow, config_path: str):
23+
def __init__(self, model: str, row: EvaluationRow, config_path: str, logger: DatasetLogger):
2424
self.model = model
2525
self.evaluation_row: EvaluationRow = row
2626
self._policy = LiteLLMPolicy(model_id=model)
2727
self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
2828
self.tools: Union[List[ChatCompletionToolParam], NotGiven] = NOT_GIVEN
29+
self.logger: DatasetLogger = logger
2930

3031
async def setup(self):
3132
if self.mcp_client:
@@ -42,7 +43,7 @@ def messages(self) -> list[Message]:
4243

4344
def append_message_and_log(self, message: Message):
4445
self.messages.append(message)
45-
default_logger.log(self.evaluation_row)
46+
self.logger.log(self.evaluation_row)
4647

4748
async def call_agent(self) -> str:
4849
"""
@@ -116,7 +117,7 @@ async def default_agent_rollout_processor(
116117
) -> List[EvaluationRow]:
117118
dataset: Dataset = []
118119
for row in rows:
119-
agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path)
120+
agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
120121
await agent.setup()
121122
await agent.call_agent()
122123
dataset.append(agent.evaluation_row)

eval_protocol/pytest/default_mcp_gym_rollout_processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ async def default_mcp_gym_rollout_processor(
226226
)
227227

228228
# Create MCP environments directly from evaluation_rows
229-
envs = await ep.make(
229+
envs = ep.make(
230230
"http://localhost:9700/mcp/",
231231
evaluation_rows=rows,
232232
model_id=policy.model_id,

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import asyncio
2-
from typing import List
3-
42
import logging
53
import os
4+
from typing import List
65

7-
from eval_protocol.dataset_logger import default_logger
8-
from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
6+
from eval_protocol.models import ChatCompletionMessageToolCall, EvaluationRow, Message
97
from eval_protocol.pytest.types import RolloutProcessorConfig
108

119

@@ -49,6 +47,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
4947

5048
# Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
5149
import importlib
50+
5251
_litellm = importlib.import_module("litellm")
5352
acompletion = getattr(_litellm, "acompletion")
5453
response = await acompletion(**request_params)
@@ -79,7 +78,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
7978
]
8079

8180
row.messages = messages
82-
default_logger.log(row)
81+
config.logger.log(row)
8382
return row
8483

8584
# Process rows with bounded concurrency if configured

0 commit comments

Comments (0)