From 5f865e198d1374be63fe9454a9caf77f7b8b5e36 Mon Sep 17 00:00:00 2001
From: Benny Chen <youfychenbc5000@gmail.com>
Date: Tue, 12 Aug 2025 22:04:22 -0700
Subject: [PATCH] remove http rollout old code

---
 development/notes/frozen_lake_context.md      |  33 --
 development/notes/frozen_lake_plan.md         |  93 ---
 development/notes/http_rollout.md             |  38 --
 eval_protocol/agent/orchestrator.py           |   3 -
 eval_protocol/agent/resources/__init__.py     |  25 -
 .../agent/resources/http_rollout_protocol.py  |  85 ---
 .../agent/resources/http_rollout_resource.py  | 325 -----------
 examples/frozen_lake/README.md                | 152 -----
 examples/frozen_lake/analyze_trajectory.py    | 273 ---------
 examples/frozen_lake/client/dataset.jsonl     |   3 -
 examples/frozen_lake/client/reward.py         | 126 ----
 examples/frozen_lake/client/task_def.yaml     |  46 --
 .../gymnasium_frozen_lake_server.py           | 281 ---------
 examples/frozen_lake/run_full_evaluation.sh   | 318 ----------
 examples/frozen_lake/server/README.md         | 187 ------
 .../frozen_lake/server/http_rollout_server.py | 104 ----
 tests/test_frozen_lake_http_server.py         | 269 ---------
 tests/test_frozen_lake_seed_evaluation.py     | 541 ------------------
 18 files changed, 2902 deletions(-)
 delete mode 100644 development/notes/frozen_lake_context.md
 delete mode 100644 development/notes/frozen_lake_plan.md
 delete mode 100644 development/notes/http_rollout.md
 delete mode 100644 eval_protocol/agent/resources/http_rollout_protocol.py
 delete mode 100644 eval_protocol/agent/resources/http_rollout_resource.py
 delete mode 100644 examples/frozen_lake/README.md
 delete mode 100755 examples/frozen_lake/analyze_trajectory.py
 delete mode 100644 examples/frozen_lake/client/dataset.jsonl
 delete mode 100644 examples/frozen_lake/client/reward.py
 delete mode 100644 examples/frozen_lake/client/task_def.yaml
 delete mode 100644 examples/frozen_lake/gymnasium_frozen_lake_server.py
 delete mode 100755 examples/frozen_lake/run_full_evaluation.sh
 delete mode 100644 examples/frozen_lake/server/README.md
 delete mode 100644 examples/frozen_lake/server/http_rollout_server.py
 delete mode 100644 tests/test_frozen_lake_http_server.py
 delete mode 100644 tests/test_frozen_lake_seed_evaluation.py

diff --git a/development/notes/frozen_lake_context.md b/development/notes/frozen_lake_context.md
deleted file mode 100644
index 49c85152..00000000
--- a/development/notes/frozen_lake_context.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Frozen Lake Implementation Context
-
-This document provides context for the implementation of the Frozen Lake example within the `eval-protocol` framework.
-
-## High-Level Goal
-
-The primary objective is to create a robust and reproducible reinforcement learning environment for the Frozen Lake game. This involves allowing an LLM-based agent to interact with the game, and critically, enabling a data-driven approach to evaluations where initial conditions (like random seeds) are controlled by a dataset.
-
-## Core Components
-
-The implementation is distributed across several key files:
-
--   **`examples/frozen_lake/client/dataset.jsonl`**: The source of truth for evaluation runs. Each line defines a scenario, specifying the `seed` for the environment's initial state.
--   **`examples/frozen_lake/client/task_def.yaml`**: The main configuration file for the task. It points to the dataset and defines how many rollouts to perform for each sample in the dataset.
--   **`examples/frozen_lake/server/http_rollout_server.py`**: A FastAPI server that wraps the Frozen Lake game logic, exposing it via an HTTP API that the `eval-protocol` agent can interact with.
--   **`examples/frozen_lake/gymnasium_frozen_lake_server.py`**: The core game logic, which wraps the official `gymnasium` Frozen Lake environment. It is responsible for accepting a `seed` to create a deterministic starting state.
--   **`examples/frozen_lake/client/reward.py`**: A reward function that evaluates the agent's performance based on the outcome of the game (e.g., reaching the goal).
--   **`eval_protocol/agent/`**: The core agent framework, including the `TaskManager` and `Orchestrator`, which together manage the data-driven execution of rollouts based on the task definition and dataset.
-
-## Data-Driven Rollout Flow
-
-The evaluation process follows a clear, data-driven flow:
-
-1.  The **TaskManager** reads the `task_def.yaml`.
-2.  It loads the scenarios from the specified `dataset.jsonl` file.
-3.  For each scenario (i.e., each `seed` in the dataset), it schedules `num_rollouts_per_sample` rollouts.
-4.  For each individual rollout, the **Orchestrator** is invoked with the specific `seed`.
-5.  The **Orchestrator** passes the `seed` to the **HttpRolloutResource**.
-6.  The **HttpRolloutResource** sends the `seed` in a request to the `/start_episode` endpoint of the **http_rollout_server**.
-7.  The server uses the `seed` to initialize the **GymnasiumFrozenLakeGame** in a deterministic state.
-8.  The agent then plays the game, and the final outcome is evaluated by the reward function.
-
-This architecture ensures that evaluations are reproducible and that the agent's performance can be measured across a controlled set of initial conditions.
diff --git a/development/notes/frozen_lake_plan.md b/development/notes/frozen_lake_plan.md
deleted file mode 100644
index 2ae3005f..00000000
--- a/development/notes/frozen_lake_plan.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Frozen Lake Example Plan: Data-Driven Rollouts
-
-This document outlines the plan for refactoring the Frozen Lake example to use a data-driven evaluation workflow. The goal is to make the system more robust, extensible, and aligned with standard practices in reinforcement learning research.
-
-The core principle is to treat the initial conditions of an environment (like a random seed) as data. Each row in a dataset will define a specific scenario, and the framework will run a configurable number of rollouts for each scenario.
-
-### 1. The Dataset (`dataset.jsonl`)
-
-The foundation of this new approach is a dataset file that defines the experimental conditions.
-
--   **Action:** Create a new dataset file at `examples/frozen_lake/client/dataset.jsonl`.
--   **Format:** Each line in the file will be a JSON object representing a single experimental sample. Initially, this will just contain a unique `id` and a `seed`.
--   **Example Content:**
-    ```json
-    {"id": "run_001", "seed": 42}
-    {"id": "run_002", "seed": 123}
-    {"id": "run_003", "seed": 555}
-    {"id": "run_004", "seed": 678}
-    ```
-
-### 2. The Task Definition (`task_def.yaml`)
-
-The task definition will be updated to reference the dataset and specify how many rollouts (`N`) to perform for each sample.
-
--   **File to Modify:** `examples/frozen_lake/client/task_def.yaml`
--   **Changes:**
-    -   Remove the old `num_rollouts` field.
-    -   Add `dataset_path` to point to our new `dataset.jsonl` file.
-    -   Add `num_rollouts_per_sample` to define `N`.
--   **Example:**
-    ```yaml
-    name: "frozen_lake_http_rollout"
-    description: "Evaluate an agent's ability to navigate a Frozen Lake environment via HTTP rollout"
-
-    # Data-driven configuration
-    dataset_path: "examples/frozen_lake/client/dataset.jsonl"
-    num_rollouts_per_sample: 5 # This is 'N', the number of rollouts per seed
-
-    # Resource configuration remains the same
-    resource_type: "http_rollout"
-    # ... (rest of the file)
-    ```
-
-### 3. Core Framework Modifications
-
-The following changes will plumb the `seed` from the dataset through the framework to the game environment.
-
-1.  **Data Model (`eval_protocol/models.py`):**
-    -   Update `TaskDefinitionModel` to include `dataset_path: Optional[str]` and `num_rollouts_per_sample: int`.
-
-2.  **TaskManager (`eval_protocol/agent/task_manager.py`):**
-    -   Modify the `execute_tasks` method to load samples from the `dataset_path`.
-    -   For each sample, generate `num_rollouts_per_sample` rollout jobs.
-    -   Pass the sample data (containing the `seed`) for each job down to the `Orchestrator`.
-
-3.  **Orchestrator (`eval_protocol/agent/orchestrator.py`):**
-    -   Modify `execute_task_poc` to accept `sample_data` as a parameter.
-    -   Pass this data to the resource's `initialize` method: `await episode_resource.initialize(**sample_data)`.
-
-4.  **HTTP Rollout Resource (`eval_protocol/agent/resources/http_rollout_resource.py`):**
-    -   The `initialize` method will accept `**kwargs`.
-    -   These `kwargs` (the `sample_data`) will be sent as the JSON body of the POST request to the `/start_episode` endpoint.
-
-5.  **HTTP Rollout Server & Protocol:**
-    -   The `/start_episode` endpoint in `examples/frozen_lake/server/http_rollout_server.py` will be updated to accept a JSON request body.
-    -   It will pass the entire request body as keyword arguments to the `GymnasiumFrozenLakeGame` constructor: `game = FrozenLakeGame(**request_data)`.
-    -   The `StartEpisodeRequest` model in `eval_protocol/agent/resources/http_rollout_protocol.py` will be updated to allow arbitrary extra fields.
-
-6.  **Gymnasium Game (`examples/frozen_lake/gymnasium_frozen_lake_server.py`):**
-    -   The `__init__` method of `GymnasiumFrozenLakeGame` will be changed to accept `**kwargs`.
-    -   The `reset` method will use the `seed` from these arguments to initialize the environment deterministically: `self.env.reset(seed=self.seed)`.
-
-### 4. Visualization of the Flow
-
-```mermaid
-sequenceDiagram
-    participant TaskManager
-    participant Orchestrator
-    participant Resource as HttpRolloutResource
-    participant Server as http_rollout_server
-    participant Game as GymnasiumFrozenLakeGame
-
-    TaskManager->>TaskManager: Reads dataset.jsonl
-    TaskManager->>Orchestrator: execute_task_poc(sample_data={"seed": 42})
-    Orchestrator->>Resource: fork()
-    Orchestrator->>Resource: initialize(**sample_data)
-    Resource->>Server: POST /start_episode (body={"seed": 42})
-    Server->>Game: __init__(**{"seed": 42})
-    Game->>Game: self.env.reset(seed=42)
-    Game-->>Server: observation
-    Server-->>Resource: {episode_id, observation}
-    Resource-->>Orchestrator: (initialization complete)
-    Orchestrator->>Orchestrator: (proceeds with agent interaction)
diff --git a/development/notes/http_rollout.md b/development/notes/http_rollout.md
deleted file mode 100644
index f81678d9..00000000
--- a/development/notes/http_rollout.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# Remote Rollout Server API
-
-Eval Protocol can collect reinforcement learning trajectories from an external HTTP service.
-The service exposes three simple endpoints used by `RemoteHttpRolloutClient`:
-
-## `POST /start_episode`
-Returns an `episode_id` and the initial observation.
-
-## `POST /step`
-Request body:
-```json
-{
-  "episode_id": "string",
-  "action": {"any": "payload"}
-}
-```
-Returns a JSON object:
-```json
-{
-  "observation": {"any": "payload"},
-  "is_done": false
-}
-```
-representing the new observation after the action and whether the episode has ended.
-
-## `POST /end_episode`
-Request body:
-```json
-{"episode_id": "string"}
-```
-Signals that the episode is complete.
-
-The Eval Protocol pipeline is responsible for invoking an
-OpenAI-compatible API between steps and feeding the resulting assistant messages
-back into the rollout. This illustrates how an environment can interact with an
-LLM at every step while keeping model calls in the pipeline.
-
-A concrete example of this is the [Frozen Lake Example](./frozen_lake_plan.md), which uses a remote HTTP rollout server to play the Frozen Lake game.
diff --git a/eval_protocol/agent/orchestrator.py b/eval_protocol/agent/orchestrator.py
index 410baf5f..2f737e2c 100644
--- a/eval_protocol/agent/orchestrator.py
+++ b/eval_protocol/agent/orchestrator.py
@@ -57,7 +57,6 @@ class ChatCompletionMessageToolCall:
     BFCLSimAPIResource,
     DockerResource,
     FileSystemResource,
-    HttpRolloutResource,
     PythonStateResource,
     SQLResource,
 )
@@ -244,8 +243,6 @@ def _get_resource_class(self, resource_type_name: str) -> Type[ForkableResource]
             "FileSystemResource": FileSystemResource,
             "DockerResource": DockerResource,
             "BFCLSimAPIResource": BFCLSimAPIResource,  # Add BFCLSimAPIResource to mapping
-            "HttpRolloutResource": HttpRolloutResource,  # Add HttpRolloutResource to mapping
-            "http_rollout": HttpRolloutResource,  # Allow lowercase alias for convenience
         }
         resource_class = mapping.get(resource_type_name)
 
diff --git a/eval_protocol/agent/resources/__init__.py b/eval_protocol/agent/resources/__init__.py
index 7f5a03b0..852e1597 100644
--- a/eval_protocol/agent/resources/__init__.py
+++ b/eval_protocol/agent/resources/__init__.py
@@ -7,20 +7,6 @@
 from .bfcl_sim_api_resource import BFCLSimAPIResource
 from .docker_resource import DockerResource
 from .filesystem_resource import FileSystemResource
-
-# HTTP Rollout Protocol types for server implementations
-from .http_rollout_protocol import (
-    EndEpisodeRequest,
-    EndEpisodeResponse,
-    GameObservation,
-    HealthResponse,
-    HttpRolloutConfig,
-    StartEpisodeRequest,
-    StartEpisodeResponse,
-    StepRequest,
-    StepResponse,
-)
-from .http_rollout_resource import HttpRolloutResource
 from .python_state_resource import PythonStateResource
 from .sql_resource import SQLResource
 
@@ -30,15 +16,4 @@
     "FileSystemResource",
     "DockerResource",
     "BFCLSimAPIResource",
-    "HttpRolloutResource",
-    # HTTP Rollout Protocol
-    "HttpRolloutConfig",
-    "StartEpisodeRequest",
-    "StartEpisodeResponse",
-    "StepRequest",
-    "StepResponse",
-    "EndEpisodeRequest",
-    "EndEpisodeResponse",
-    "HealthResponse",
-    "GameObservation",
 ]
diff --git a/eval_protocol/agent/resources/http_rollout_protocol.py b/eval_protocol/agent/resources/http_rollout_protocol.py
deleted file mode 100644
index d6992d0f..00000000
--- a/eval_protocol/agent/resources/http_rollout_protocol.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""
-HTTP Rollout Protocol - Standardized types for HTTP rollout communication.
-
-This module defines the standard request/response models for HTTP rollout servers
-and clients, ensuring consistent communication across different implementations.
-"""
-
-from typing import Any, Dict, List, Optional
-
-from pydantic import BaseModel
-
-
-class StartEpisodeRequest(BaseModel):
-    """Request to start a new episode."""
-
-    class Config:
-        extra = "allow"  # Allow arbitrary extra fields (like seed)
-
-
-class StartEpisodeResponse(BaseModel):
-    """Response from starting a new episode."""
-
-    episode_id: str
-    observation: Dict[str, Any]
-
-
-class StepRequest(BaseModel):
-    """Request to take a step in the environment."""
-
-    episode_id: str
-    action: Any  # Can be int, str, dict, etc. depending on environment
-
-
-class StepResponse(BaseModel):
-    """Response from taking a step in the environment."""
-
-    observation: Dict[str, Any]
-    is_done: bool
-    info: Optional[Dict[str, Any]] = None
-
-
-class EndEpisodeRequest(BaseModel):
-    """Request to end an episode."""
-
-    episode_id: str
-
-
-class EndEpisodeResponse(BaseModel):
-    """Response from ending an episode."""
-
-    message: str
-
-
-class HealthResponse(BaseModel):
-    """Response from health check endpoint."""
-
-    status: str
-    game: Optional[str] = None
-    version: Optional[str] = None
-
-
-class HttpRolloutConfig(BaseModel):
-    """Configuration for HTTP rollout resource."""
-
-    base_url: str
-    start_episode_endpoint: str = "/start_episode"
-    step_endpoint: str = "/step"
-    end_episode_endpoint: str = "/end_episode"
-    health_endpoint: str = "/health"
-    timeout: float = 30.0
-    max_retries: int = 3
-
-
-# Observation structure for game environments
-class GameObservation(BaseModel):
-    """Standard observation structure for game environments."""
-
-    position: Optional[List[int]] = None
-    current_cell: Optional[str] = None
-    done: bool = False
-    won: bool = False
-    visual: Optional[str] = None
-    message: Optional[str] = None
-    step_count: Optional[int] = None
-    max_steps: Optional[int] = None
diff --git a/eval_protocol/agent/resources/http_rollout_resource.py b/eval_protocol/agent/resources/http_rollout_resource.py
deleted file mode 100644
index 04be93e0..00000000
--- a/eval_protocol/agent/resources/http_rollout_resource.py
+++ /dev/null
@@ -1,325 +0,0 @@
-"""
-HTTP Rollout Resource implementation for the agent evaluation framework.
-
-This resource bridges the HTTP rollout protocol with the ForkableResource interface,
-allowing HTTP-based environments to be used in agent evaluations.
-"""
-
-import json
-import uuid
-from typing import Any, Dict, List, Optional
-
-import httpx
-
-from ..resource_abc import ForkableResource
-from .http_rollout_protocol import (
-    EndEpisodeRequest,
-    GameObservation,
-    HttpRolloutConfig,
-    StartEpisodeRequest,
-    StartEpisodeResponse,
-    StepRequest,
-    StepResponse,
-)
-
-
-class HttpRolloutResource(ForkableResource):
-    """
-    A ForkableResource implementation that communicates with HTTP rollout servers.
-
-    This resource allows the agent evaluation framework to interact with
-    HTTP-based environments through a standardized rollout protocol.
-    """
-
-    def __init__(self):
-        """Initialize the HTTP rollout resource."""
-        super().__init__()
-        self.config: Optional[HttpRolloutConfig] = None
-        self.episode_id: Optional[str] = None
-        self.current_observation: Optional[Dict[str, Any]] = None
-        self.is_episode_active = False
-        self.client: Optional[httpx.Client] = None
-
-        # Set up logging
-        import logging
-
-        self.logger = logging.getLogger(f"{self.__class__.__name__}")
-
-    async def setup(self, config: Dict[str, Any]) -> None:
-        """
-        Set up the resource with the provided configuration.
-
-        Args:
-            config: Configuration dictionary from the task definition
-        """
-        self.config = HttpRolloutConfig(**config)
-        self.client = httpx.Client(timeout=self.config.timeout)
-
-    async def fork(self) -> "HttpRolloutResource":
-        """
-        Create a new independent instance of this resource.
-
-        For HTTP rollout, forking means creating a new resource instance
-        that will start its own episode when initialized.
-        """
-        if not self.config:
-            raise RuntimeError("Resource not set up. Call setup() first.")
-
-        # Create a new instance with the same config
-        new_resource = HttpRolloutResource()
-        await new_resource.setup(self.config.model_dump())
-        return new_resource
-
-    async def get_state(self) -> Dict[str, Any]:
-        """
-        Get the current state of the resource.
-
-        Returns the current observation and episode metadata.
-        """
-        return {
-            "episode_id": self.episode_id,
-            "observation": self.current_observation,
-            "is_episode_active": self.is_episode_active,
-            "type": "http_rollout",
-        }
-
-    async def initialize(self, **kwargs) -> None:
-        """
-        Initialize the resource by starting a new episode.
-        Passes any provided kwargs (like seed) to the server in the request body.
-        """
-        try:
-            url = f"{self.config.base_url}{self.config.start_episode_endpoint}"
-
-            # Include any sample data (like seed) in the request body
-            if kwargs:
-                self.logger.info(f"Sending initialization data to server: {kwargs}")
-                response = self.client.post(url, json=kwargs)
-            else:
-                response = self.client.post(url)
-            response.raise_for_status()
-
-            episode_data = response.json()
-            self.episode_id = episode_data["episode_id"]
-            self.current_observation = episode_data["observation"]
-            self.is_episode_active = True
-
-        except Exception as e:
-            raise RuntimeError(f"Failed to start HTTP rollout episode: {e}")
-
-    async def get_initial_state_description(self) -> str:
-        """
-        Get a formatted description of the initial game state for the agent.
-        Uses the observation from start_episode to build the prompt.
-        """
-        # Start episode to get current game state
-        if not self.is_episode_active:
-            await self.initialize()
-
-        if not self.current_observation:
-            return "No initial state available."
-
-        obs = self.current_observation
-
-        # Build comprehensive game prompt
-        content = """🎮 FROZEN LAKE GAME - AUTONOMOUS PLAY MODE
-
-🎯 OBJECTIVE: Navigate from S to G without hitting H
-
-📋 GAME RULES: S=start, F=safe, H=hole(death), G=goal(win)
-
-🤖 AUTONOMOUS MODE INSTRUCTIONS:
-- You are playing this game AUTONOMOUSLY until completion
-- KEEP MAKING MOVES using the step tool until you reach G or hit H
-- DO NOT ask for user input or wait for confirmation
-- DO NOT stop after one move - continue until the game ends
-- Each move should be followed immediately by another move
-- Game only ends when you reach G (win) or hit H (lose)
-
-🎮 ACTION: Use step tool with: "left", "right", "up", or "down"
-
-⚡ START NOW - Make your first move and continue until the game is complete!"""
-
-        description_parts = [content]
-
-        if obs.get("message"):
-            description_parts.append(f"\nEnvironment: {obs['message']}")
-
-        if obs.get("visual"):
-            description_parts.append(f"\nGame Board:\n{obs['visual']}")
-
-        if obs.get("position"):
-            description_parts.append(f"\nStarting Position: {obs['position']}")
-
-        description_parts.append("\nGame Rules:")
-        description_parts.append("- S = Start position")
-        description_parts.append("- F = Frozen (safe to step on)")
-        description_parts.append("- H = Hole (game over if you step here)")
-        description_parts.append("- G = Goal (reach this to win)")
-        description_parts.append("- [X] = Your current position")
-
-        return "\n".join(description_parts)
-
-    async def cleanup(self) -> None:
-        """
-        Clean up the resource by ending the current episode.
-        """
-        if self.is_episode_active and self.episode_id:
-            try:
-                url = f"{self.config.base_url}{self.config.end_episode_endpoint}"
-                response = self.client.post(url, json={"episode_id": self.episode_id})
-                response.raise_for_status()
-
-            except Exception as e:
-                # Log but don't raise - cleanup should be best effort
-                print(f"Warning: Failed to properly end episode {self.episode_id}: {e}")
-
-            finally:
-                self.episode_id = None
-                self.current_observation = None
-                self.is_episode_active = False
-
-        # Close the HTTP client
-        self.client.close()
-
-    async def get_tools_spec(self) -> List[Dict[str, Any]]:
-        """
-        Get the list of available tools for this resource.
-
-        For HTTP rollout, this returns the 'step' tool that allows
-        the agent to take actions in the environment.
-        """
-        return [
-            {
-                "name": "step",
-                "description": "Take a step in the Frozen Lake game by choosing a direction to move",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "action": {
-                            "type": "string",
-                            "enum": ["left", "down", "right", "up"],
-                            "description": "The direction to move in the game: 'left', 'down', 'right', or 'up'",
-                        }
-                    },
-                    "required": ["action"],
-                },
-            }
-        ]
-
-    async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any:
-        """
-        Execute a tool call on this resource.
-
-        For HTTP rollout, this handles the 'step' tool by sending
-        the action to the HTTP rollout server.
-        """
-        if not self.is_episode_active or not self.episode_id:
-            # If no active episode, start one first
-            await self.initialize()
-
-        if action_name == "step":
-            action = action_params.get("action")
-            return await self._handle_step_tool(action)
-        else:
-            raise ValueError(f"Unknown action: {action_name}")
-
-    async def get_observation(self) -> Any:
-        """
-        Get the current observation from the environment.
-        """
-        if self.current_observation:
-            return self.current_observation
-        else:
-            return {"message": "No observation available. Start an episode first."}
-
-    async def checkpoint(self) -> Dict[str, Any]:
-        """
-        Create a checkpoint of the current resource state.
-
-        For HTTP rollout, this saves the episode ID and current observation.
-        """
-        return {
-            "episode_id": self.episode_id,
-            "current_observation": self.current_observation,
-            "is_episode_active": self.is_episode_active,
-        }
-
-    async def restore(self, state_data: Dict[str, Any]) -> None:
-        """
-        Restore the resource state from a checkpoint.
-
-        Note: This is limited for HTTP rollout since we can't restore
-        arbitrary server-side state.
-        """
-        self.episode_id = state_data.get("episode_id")
-        self.current_observation = state_data.get("current_observation")
-        self.is_episode_active = state_data.get("is_episode_active", False)
-
-    async def close(self) -> None:
-        """
-        Clean up and close the resource.
-        """
-        await self.cleanup()
-
-    async def _handle_step_tool(self, action: Any) -> Dict[str, Any]:
-        """
-        Handle the 'step' tool by sending an action to the HTTP rollout server.
-        """
-        try:
-            # Convert string action to integer for the server
-            action_map = {"left": 0, "down": 1, "right": 2, "up": 3}
-
-            if isinstance(action, str):
-                if action.lower() not in action_map:
-                    raise ValueError(f"Invalid action '{action}'. Must be one of: left, down, right, up")
-                numeric_action = action_map[action.lower()]
-            else:
-                # Backward compatibility with numeric actions
-                numeric_action = action
-
-            url = f"{self.config.base_url}{self.config.step_endpoint}"
-            step_data = {"episode_id": self.episode_id, "action": numeric_action}
-
-            response = self.client.post(url, json=step_data)
-            response.raise_for_status()
-
-            step_result = response.json()
-            self.current_observation = step_result["observation"]
-
-            # If the episode is done, mark it as inactive
-            if step_result.get("is_done", False):
-                self.is_episode_active = False
-
-            # Format the response for the agent
-            observation = step_result["observation"]
-            message = observation.get("message", "")
-            visual = observation.get("visual", "")
-
-            # Create a comprehensive response
-            response_content = []
-            if message:
-                response_content.append(f"Environment: {message}")
-            if visual:
-                response_content.append(f"Visual State:\n{visual}")
-
-            # Add structured data
-            response_content.append(f"Position: {observation.get('position', 'unknown')}")
-            response_content.append(f"Done: {step_result.get('is_done', False)}")
-
-            if step_result.get("is_done", False):
-                won = observation.get("won", False)
-                response_content.append(f"Result: {'Victory!' if won else 'Game Over'}")
-
-            return {"content": [{"type": "text", "text": "\n".join(response_content)}]}
-
-        except Exception as e:
-            raise RuntimeError(f"Failed to execute step: {e}")
-
-    def __del__(self):
-        """Ensure cleanup on deletion."""
-        if hasattr(self, "client") and self.client:
-            try:
-                self.client.close()
-            except Exception:
-                pass  # Ignore cleanup errors during deletion
diff --git a/examples/frozen_lake/README.md b/examples/frozen_lake/README.md
deleted file mode 100644
index 803591a7..00000000
--- a/examples/frozen_lake/README.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# Frozen Lake Agent Evaluation
-
-This example demonstrates LLM agent evaluation on the Frozen Lake game using eval-protocol's HTTP rollout framework. The agent must navigate from start (S) to goal (G) while avoiding holes (H).
-
-## Quick Start
-
-### Setup
-```bash
-# For Fireworks AI
-export FIREWORKS_API_KEY="your_fireworks_api_key"
-export MODEL_AGENT="fireworks/accounts/fireworks/models/qwen3-235b-a22b"
-
-# For OpenAI
-export OPENAI_API_KEY="your_openai_api_key"
-export MODEL_AGENT="openai/gpt-4o-mini"
-
-# For other providers, set appropriate API key and MODEL_AGENT
-```
-
-### Run Evaluation
-```bash
-# Batch evaluation (8 parallel rollouts) - recommended
-eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml
-
-# Single rollout for debugging
-eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml --num-rollouts 1
-
-# Custom batch size
-eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml --num-rollouts 16
-```
-
-### Output
-```bash
-Task 'frozen_lake_http_rollout' batch results:
-  - Rollouts: 6/8 successful
-  - Success rate: 75.00%
-  - Average score: 0.7500 ± 0.4330
-  - Trajectory data saved to: client/evaluation_logs/trajectory_frozen_lake_http_rollout_20250610_143052.jsonl
-```
-
-
-## Architecture
-
-```
-┌─────────────────┐    HTTP     ┌──────────────────┐
-│ Client Side     │ ◄─────────► │ Server Side      │
-│ (eval-protocol)    │  Rollout    │ (Game Env)       │
-│                 │             │                  │
-│ • Agent Eval    │             │ • Game Logic     │
-│ • Reward Func   │             │ • State Mgmt     │
-│ • Trajectory    │             │ • HTTP API       │
-└─────────────────┘             └──────────────────┘
-```
-
-## Project Structure
-
-```
-frozen_lake/
-├── README.md                    # This overview
-├── server/                      # Game Environment (HTTP API)
-│   ├── README.md               # Server documentation
-│   └── http_rollout_server.py  # FastAPI game server
-└── client/                     # Agent Evaluation
-    ├── task_def.yaml           # Task configuration (works with any model)
-    ├── reward.py               # Reward function
-    └── evaluation_logs/        # Generated results & trajectories
-        ├── trajectory_*.jsonl  # Conversation histories + tool calls
-        └── *_reeval_*.jsonl    # Re-evaluation results
-```
-
-## Game Rules
-
-**Objective:** Navigate from S to G without falling into holes (H)
-
-```
-[S] F  F  F
- F  H  F  H
- F  F  F  H
- H  F  F  G
-```
-
-**Actions:** `"left"`, `"right"`, `"up"`, `"down"`
-
-## Trajectory Data Format
-
-Each trajectory JSONL file contains:
-
-```json
-{"type": "summary", "task_id": "frozen_lake_http_rollout", "num_rollouts": 8, "success_rate": 0.75, "avg_score": 0.75}
-{"type": "individual_result", "rollout_index": 0, "score": 1.0, "conversation_messages": [...], "reward_function_inputs": {...}}
-```
-
-### Conversation Messages
-Complete OpenAI format conversation history:
-- User prompts
-- Assistant responses with reasoning
-- Tool calls (game actions)
-- Tool results (game observations)
-
-### Reward Function Inputs
-Exact parameters passed to reward functions:
-- `messages`: Full conversation history
-- `state`: Game state and successful function calls
-- `task_achieved`: Success/failure status
-- `ground_truth`: Reference data (if available)
-
-## Customization
-
-### Custom Reward Functions
-Create new reward functions and test them on existing trajectories:
-
-```python
-# my_rewards.py
-from eval_protocol.typed_interface import reward_function
-from eval_protocol.models import EvaluateResult, MetricResult
-
-@reward_function
-def efficiency_reward(messages, state=None, **kwargs):
-    # Count steps taken
-    step_count = len(state.get("successful_func_calls", [[]])[0])
-
-    # Reward fewer steps
-    efficiency_score = max(0.0, 1.0 - (step_count - 4) * 0.1)
-
-    return EvaluateResult(
-        score=efficiency_score,
-        reason=f"Efficiency reward: {step_count} steps",
-        metrics={"efficiency": MetricResult(score=efficiency_score, reason="Step efficiency")}
-    )
-```
-
-
-## Model Performance
-
-| Model | Success Rate | Average Score | Best Strategy |
-|-------|-------------|---------------|---------------|
-| qwen3-235b-a22b | 75-100% | 0.75-1.0 | down→down→right→right→down→right |
-| gpt-4o-mini | 0-25% | 0.0-0.25 | Often fails at holes |
-
-## Troubleshooting
-
-- **Connection errors**: Server auto-starts, check port conflicts
-- **API key issues**: Verify MODEL_AGENT and API key are set
-- **Empty trajectories**: Check `client/evaluation_logs/` directory
-- **Re-evaluation errors**: Ensure reward function module path is correct
-
-## Next Steps
-
-1. **Run the example**: Start with single rollout, then batch evaluation
-2. **Analyze trajectories**: Examine generated JSONL files
-3. **Create custom rewards**: Implement your own scoring functions
-4. **Compare approaches**: Use re-evaluation to test different strategies
diff --git a/examples/frozen_lake/analyze_trajectory.py b/examples/frozen_lake/analyze_trajectory.py
deleted file mode 100755
index fe6a90c0..00000000
--- a/examples/frozen_lake/analyze_trajectory.py
+++ /dev/null
@@ -1,273 +0,0 @@
-#!/usr/bin/env python3
-"""
-Agent Trajectory Analyzer for Frozen Lake HTTP Rollout Evaluation
-
-This script parses the evaluation logs and creates a human-readable
-trajectory showing the agent's decision making process.
-"""
-
-import json
-import re
-import sys
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-
-def extract_tool_calls_from_log(log_content: str) -> List[Dict[str, Any]]:
-    """Extract tool calls and their results from the log."""
-    tool_calls = []
-
-    # Find all tool call patterns
-    tool_call_pattern = r"Attempting tool call: (\w+)\((.*?)\)"
-    tool_result_pattern = r"Tool '(\w+)' result: (.*?)(?=INFO:|DEBUG:|ERROR:|$)"
-
-    tool_calls_matches = re.finditer(tool_call_pattern, log_content, re.DOTALL)
-
-    for match in tool_calls_matches:
-        tool_name = match.group(1)
-        tool_args = match.group(2)
-
-        # Try to parse the arguments as JSON
-        try:
-            args_dict = json.loads(tool_args)
-        except (json.JSONDecodeError, TypeError, ValueError):
-            args_dict = {"raw": tool_args}
-
-        tool_call = {"tool_name": tool_name, "arguments": args_dict, "result": None}
-
-        # Find the corresponding result
-        result_pattern = rf"Tool '{tool_name}' result: (.*?)(?=INFO:|DEBUG:|ERROR:|$)"
-        result_match = re.search(result_pattern, log_content[match.end() :], re.DOTALL)
-
-        if result_match:
-            result_text = result_match.group(1).strip()
-            # Try to parse as JSON
-            try:
-                tool_call["result"] = json.loads(result_text)
-            except (json.JSONDecodeError, TypeError, ValueError):
-                tool_call["result"] = {"raw": result_text}
-
-        tool_calls.append(tool_call)
-
-    return tool_calls
-
-
-def extract_agent_messages(log_content: str) -> List[Dict[str, Any]]:
-    """Extract the agent's reasoning and responses."""
-    messages = []
-
-    # Find OpenAI response messages
-    response_pattern = r"OpenAI response message: ChatCompletionMessage\((.*?)\)"
-
-    for match in re.finditer(response_pattern, log_content, re.DOTALL):
-        message_str = match.group(1)
-
-        # Extract thinking content
-        think_match = re.search(r"content='(.*?)', refusal=", message_str, re.DOTALL)
-        if think_match:
-            thinking = think_match.group(1)
-
-            # Extract <think> tags
-            think_content_match = re.search(r"<think>(.*?)</think>", thinking, re.DOTALL)
-            if think_content_match:
-                thinking_content = think_content_match.group(1).strip()
-            else:
-                thinking_content = thinking
-
-            messages.append({"type": "thinking", "content": thinking_content})
-
-        # Extract tool calls from the message
-        tool_calls_match = re.search(r"tool_calls=\[(.*?)\]", message_str, re.DOTALL)
-        if tool_calls_match:
-            messages.append({"type": "tool_calls", "content": tool_calls_match.group(1)})
-
-    return messages
-
-
-def extract_game_state_changes(log_content: str) -> List[Dict[str, Any]]:
-    """Extract game state changes from the environment responses."""
-    states = []
-
-    # Find environment responses
-    env_pattern = r"Environment: (.*?)(?=\\n|Position:|Done:)"
-    visual_pattern = r"Visual State:\\n(.*?)(?=\\nPosition:|\\nDone:)"
-    position_pattern = r"Position: (\[.*?\])"
-    done_pattern = r"Done: (True|False)"
-
-    # Find all environment messages
-    env_matches = re.finditer(
-        r"Tool 'step' result:.*?Environment: (.*?)\\nVisual State:\\n(.*?)\\nPosition: (\[.*?\])\\nDone: (True|False)",
-        log_content,
-        re.DOTALL,
-    )
-
-    for i, match in enumerate(env_matches):
-        env_message = match.group(1)
-        visual_state = match.group(2)
-        position = match.group(3)
-        done = match.group(4) == "True"
-
-        states.append(
-            {
-                "step": i + 1,
-                "message": env_message,
-                "visual_state": visual_state.replace("\\n", "\n"),
-                "position": position,
-                "done": done,
-            }
-        )
-
-    return states
-
-
-def create_trajectory_report(log_file: str) -> str:
-    """Create a detailed trajectory report."""
-
-    with open(log_file, "r") as f:
-        log_content = f.read()
-
-    tool_calls = extract_tool_calls_from_log(log_content)
-    agent_messages = extract_agent_messages(log_content)
-    game_states = extract_game_state_changes(log_content)
-
-    report = []
-    report.append("FROZEN LAKE AGENT TRAJECTORY ANALYSIS")
-    report.append("=" * 50)
-    report.append("")
-
-    # Summary
-    report.append(f"📊 SUMMARY:")
-    report.append(f"• Total tool calls: {len(tool_calls)}")
-    report.append(f"• Total reasoning steps: {len([m for m in agent_messages if m['type'] == 'thinking'])}")
-    report.append(f"• Game state changes: {len(game_states)}")
-    report.append("")
-
-    # Detailed trajectory
-    report.append("🎮 DETAILED TRAJECTORY:")
-    report.append("-" * 30)
-    report.append("")
-
-    for i, tool_call in enumerate(tool_calls):
-        step_num = i + 1
-        report.append(f"STEP {step_num}: {tool_call['tool_name'].upper()}")
-        report.append(f"Arguments: {tool_call['arguments']}")
-
-        # Add corresponding game state if available
-        if i < len(game_states):
-            state = game_states[i]
-            report.append(f"Result: {state['message']}")
-            report.append(f"Position: {state['position']}")
-            report.append(f"Visual State:")
-            for line in state["visual_state"].split("\n"):
-                if line.strip():
-                    report.append(f"  {line}")
-            report.append(f"Game Done: {state['done']}")
-
-        report.append("")
-
-    # Agent reasoning analysis
-    report.append("🧠 AGENT REASONING:")
-    report.append("-" * 20)
-    report.append("")
-
-    thinking_messages = [m for m in agent_messages if m["type"] == "thinking"]
-    for i, message in enumerate(thinking_messages[:3]):  # Show first 3 reasoning steps
-        report.append(f"REASONING STEP {i+1}:")
-        # Truncate long reasoning for readability
-        content = message["content"]
-        if len(content) > 500:
-            content = content[:500] + "...[truncated]"
-        report.append(content)
-        report.append("")
-
-    if len(thinking_messages) > 3:
-        report.append(f"... and {len(thinking_messages) - 3} more reasoning steps")
-        report.append("")
-
-    # Game progression analysis
-    report.append("📍 GAME PROGRESSION:")
-    report.append("-" * 20)
-    report.append("")
-
-    positions = []
-    for state in game_states:
-        try:
-            pos = eval(state["position"])  # Convert string representation to list
-            positions.append(pos)
-        except (SyntaxError, NameError, TypeError, ValueError):
-            positions.append(state["position"])
-
-    if positions:
-        report.append("Path taken:")
-        for i, pos in enumerate(positions):
-            if i == 0:
-                report.append(f"  Start: {pos}")
-            else:
-                prev_pos = positions[i - 1]
-                direction = get_direction(prev_pos, pos)
-                report.append(f"  Step {i}: {prev_pos} → {pos} ({direction})")
-
-        # Final position
-        if positions:
-            final_pos = positions[-1]
-            # Check if reached goal (typically at [3,3])
-            if final_pos == [3, 3]:
-                report.append(f"  🎉 GOAL REACHED at {final_pos}!")
-            else:
-                report.append(f"  Final position: {final_pos}")
-
-    return "\n".join(report)
-
-
-def get_direction(from_pos: List[int], to_pos: List[int]) -> str:
-    """Determine the direction of movement."""
-    if len(from_pos) != 2 or len(to_pos) != 2:
-        return "unknown"
-
-    row_diff = to_pos[0] - from_pos[0]
-    col_diff = to_pos[1] - from_pos[1]
-
-    if row_diff == 0 and col_diff == 1:
-        return "RIGHT"
-    elif row_diff == 0 and col_diff == -1:
-        return "LEFT"
-    elif row_diff == 1 and col_diff == 0:
-        return "DOWN"
-    elif row_diff == -1 and col_diff == 0:
-        return "UP"
-    else:
-        return f"DIAGONAL({row_diff},{col_diff})"
-
-
-def main():
-    if len(sys.argv) != 2:
-        print("Usage: python analyze_trajectory.py <log_file>")
-        sys.exit(1)
-
-    log_file = sys.argv[1]
-
-    if not Path(log_file).exists():
-        print(f"Error: Log file {log_file} not found")
-        sys.exit(1)
-
-    try:
-        report = create_trajectory_report(log_file)
-
-        # Save to analysis file
-        analysis_file = str(Path(log_file).with_suffix(".analysis.txt"))
-        with open(analysis_file, "w") as f:
-            f.write(report)
-
-        print(report)
-        print(f"\n📄 Analysis saved to: {analysis_file}")
-
-    except Exception as e:
-        print(f"Error analyzing trajectory: {e}")
-        import traceback
-
-        traceback.print_exc()
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/frozen_lake/client/dataset.jsonl b/examples/frozen_lake/client/dataset.jsonl
deleted file mode 100644
index 27c5c684..00000000
--- a/examples/frozen_lake/client/dataset.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-{"id": "run_001", "seed": 42}
-{"id": "run_002", "seed": 123}
-{"id": "run_003", "seed": 999}
diff --git a/examples/frozen_lake/client/reward.py b/examples/frozen_lake/client/reward.py
deleted file mode 100644
index 7239b8aa..00000000
--- a/examples/frozen_lake/client/reward.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""
-Reward function for the Frozen Lake evaluation task.
-"""
-
-from typing import List, Optional
-
-from eval_protocol.models import EvaluateResult, Message, MetricResult, StepOutput
-from eval_protocol.typed_interface import reward_function
-
-
-@reward_function
-def frozen_lake_reward(messages: List[Message], state=None, **kwargs) -> EvaluateResult:
-    """
-    Evaluate the final message list for a success string in the Frozen Lake game.
-
-    Args:
-        messages: List of conversation messages
-        state: State dictionary containing trajectory data
-        **kwargs: Additional keyword arguments
-
-    Returns:
-        EvaluateResult with score 1.0 for success, 0.0 for failure
-    """
-    # Check if the last message (from the game) contains success indicators
-    if not messages:
-        return EvaluateResult(
-            score=0.0,
-            reason="No messages provided",
-            metrics={"success": MetricResult(score=0.0, reason="No messages provided")},
-        )
-
-    # Check all messages (especially tool responses) for game outcome
-    def extract_content_from_message(msg):
-        """Extract text content from a message, handling JSON-encoded tool responses."""
-        content = msg.content
-        if content and isinstance(content, str):
-            try:
-                # Try to parse JSON content from tool responses
-                import json
-
-                parsed_content = json.loads(content)
-                if isinstance(parsed_content, dict) and "content" in parsed_content:
-                    # Extract text from tool response format
-                    content_list = parsed_content["content"]
-                    if isinstance(content_list, list) and len(content_list) > 0:
-                        text_item = content_list[0]
-                        if isinstance(text_item, dict) and "text" in text_item:
-                            content = text_item["text"]
-            except (json.JSONDecodeError, KeyError, IndexError, TypeError):
-                # If parsing fails, use original content
-                pass
-        return content.lower() if content else ""
-
-    # Check for success/failure indicators in all messages
-    success_indicators = [
-        "you win",
-        "you reached the goal",
-        "congratulations",
-        "success",
-        "goal reached",
-        "you made it",
-        "victory",
-    ]
-
-    failure_indicators = ["you lose", "game over", "you fell", "hole"]
-
-    is_success = False
-    is_failure = False
-    winning_message = ""
-    losing_message = ""
-
-    # Check all messages for game outcome indicators
-    for msg in messages:
-        content = extract_content_from_message(msg)
-
-        # Check for success
-        for indicator in success_indicators:
-            if indicator in content:
-                is_success = True
-                winning_message = content[:100]
-                break
-
-        # Check for failure
-        for indicator in failure_indicators:
-            if indicator in content:
-                is_failure = True
-                losing_message = content[:100]
-                break
-
-    # Determine the score (success takes precedence over failure)
-    if is_success:
-        score = 1.0
-        reason = "Successfully reached the goal in Frozen Lake"
-    elif is_failure:
-        score = 0.0
-        reason = "Failed to reach the goal (fell into hole or other failure)"
-    else:
-        # If no clear success/failure indicator, check if game is still ongoing
-        score = 0.0
-        reason = "Game outcome unclear or still in progress"
-
-    metrics = {"success": MetricResult(score=score, reason=reason, is_score_valid=True)}
-
-    # Extract trajectory data if available
-    step_outputs = None
-    if state and "successful_func_calls" in state:
-        successful_calls = state["successful_func_calls"]
-        step_outputs = []
-
-        # Convert function calls to StepOutput format
-        step_index = 0
-        for turn_calls in successful_calls:
-            for call in turn_calls:
-                # Extract action from function call arguments
-                action = call.get("args", {}).get("action", "unknown")
-                step_outputs.append(
-                    StepOutput(
-                        step_index=step_index,
-                        action=action,
-                        base_reward=(0.1 if action != "unknown" else 0.0),  # Small reward for valid actions
-                        reason=f"Agent took action: {action}",
-                    )
-                )
-                step_index += 1
-
-    return EvaluateResult(score=score, reason=reason, metrics=metrics, step_outputs=step_outputs)
diff --git a/examples/frozen_lake/client/task_def.yaml b/examples/frozen_lake/client/task_def.yaml
deleted file mode 100644
index 62ee9540..00000000
--- a/examples/frozen_lake/client/task_def.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-name: "frozen_lake_http_rollout"
-description: "Evaluate an agent's ability to navigate a Frozen Lake environment via HTTP rollout"
-
-# Data-driven configuration
-dataset_path: "examples/frozen_lake/client/dataset.jsonl"
-num_rollouts_per_sample: 1  # This is 'N', the number of rollouts per seed
-
-# Resource configuration - connects to the game server
-resource_type: "http_rollout"
-base_resource_config:
-  base_url: "http://localhost:8080"  # Will be dynamically updated by TaskManager
-  timeout: 30.0
-
-# Resource server configuration - automatically managed by TaskManager
-resource_server:
-  start_command: "python examples/frozen_lake/server/http_rollout_server.py --port {port}"
-  health_check_url: "http://localhost:{port}/health"
-
-# Reward function - the only client-side logic needed
-reward_function_path: "examples.frozen_lake.client.reward.frozen_lake_reward"
-
-# Initial user message - gets extended with game state from the server
-messages:
-  - role: "user"
-    content: "Start playing the game!"
-
-# Evaluation configuration
-poc_max_turns: 20
-
-# Generation configuration
-generation:
-  enabled: true
-  _target_: eval_protocol.generation.generate_responses
-  model_name: "accounts/fireworks/models/qwen3-235b-a22b"
-  temperature: 0.0
-  max_new_tokens: 500
-  batch_size: 1
-  cache:
-    enabled: true
-
-# Reward Function Configuration
-reward:
-  function_path: "examples.frozen_lake.client.reward.frozen_lake_reward"
-
-# Note: The TaskManager will automatically start the server on a free port
-# and update the base_url accordingly for parallel execution support
diff --git a/examples/frozen_lake/gymnasium_frozen_lake_server.py b/examples/frozen_lake/gymnasium_frozen_lake_server.py
deleted file mode 100644
index 6a99153e..00000000
--- a/examples/frozen_lake/gymnasium_frozen_lake_server.py
+++ /dev/null
@@ -1,281 +0,0 @@
-"""
-Gymnasium-based Frozen Lake game server implementation.
-
-This implementation wraps the official Gymnasium FrozenLake-v1 environment
-and provides the same interface as the hand-rolled implementation for
-seamless integration with the HTTP rollout server.
-"""
-
-from typing import Dict, Optional, Tuple, Union
-
-import gymnasium as gym
-import numpy as np
-from gymnasium.envs.toy_text.frozen_lake import generate_random_map
-
-
-class GymnasiumFrozenLakeGame:
-    """
-    Gymnasium-based Frozen Lake game implementation.
-
-    This class wraps the Gymnasium FrozenLake-v1 environment and provides
-    a compatible interface with the hand-rolled implementation.
-
-    The game is played on a 4x4 grid where:
-    - S: Starting position
-    - F: Frozen surface (safe to walk on)
-    - H: Hole (game over if you fall in)
-    - G: Goal (reach this to win)
-
-    Actions:
-    - 0: Left
-    - 1: Down
-    - 2: Right
-    - 3: Up
-    """
-
-    def __init__(
-        self,
-        map_name: str = "4x4",
-        is_slippery: bool = False,
-        render_mode: Optional[str] = None,
-        seed: Optional[int] = None,
-        **kwargs,
-    ):
-        """
-        Initialize the Gymnasium Frozen Lake game.
-
-        Args:
-            map_name: Map size ("4x4" or "8x8") - only used if seed is None
-            is_slippery: Whether the ice is slippery (stochastic environment)
-            render_mode: Rendering mode for Gymnasium environment
-            seed: Random seed for reproducible map generation and behavior
-            **kwargs: Additional keyword arguments (ignored for compatibility)
-        """
-        self.map_name = map_name
-        self.is_slippery = is_slippery
-        self.seed = seed
-
-        # Create the Gymnasium environment
-        if seed is not None:
-            # Use random map generation with seed for reproducible boards
-            size = 4 if map_name == "4x4" else 8
-            desc = generate_random_map(size=size, p=0.8, seed=seed)
-            self.env = gym.make(
-                "FrozenLake-v1",
-                desc=desc,
-                is_slippery=is_slippery,
-                render_mode=render_mode,
-            )
-        else:
-            # Use fixed predefined maps
-            self.env = gym.make(
-                "FrozenLake-v1",
-                map_name=map_name,
-                is_slippery=is_slippery,
-                render_mode=render_mode,
-            )
-
-        # Get environment properties
-        self.desc = self.env.unwrapped.desc
-        self.nrow, self.ncol = self.desc.shape
-        self.nS = self.env.observation_space.n
-        self.nA = self.env.action_space.n
-
-        # Find start and goal positions
-        self.start_pos = None
-        self.goal_pos = None
-        for i in range(self.nrow):
-            for j in range(self.ncol):
-                if self.desc[i, j] == b"S":
-                    self.start_pos = (i, j)
-                elif self.desc[i, j] == b"G":
-                    self.goal_pos = (i, j)
-
-        # Initialize state tracking
-        self.current_state: Optional[int] = None
-        self.current_pos: Optional[Tuple[int, int]] = None
-        self.done = False
-        self.won = False
-
-        self.reset()
-
-    def _state_to_pos(self, state: int) -> Tuple[int, int]:
-        """Convert state number to (row, col) position."""
-        return state // self.ncol, state % self.ncol
-
-    def _pos_to_state(self, row: int, col: int) -> int:
-        """Convert (row, col) position to state number."""
-        return row * self.ncol + col
-
-    def reset(self) -> Dict:
-        """Reset the game to the starting position."""
-        if self.seed is not None:
-            self.current_state, _ = self.env.reset(seed=self.seed)
-        else:
-            self.current_state, _ = self.env.reset()
-        self.current_pos = self._state_to_pos(self.current_state)
-        self.done = False
-        self.won = False
-        return self._get_observation()
-
-    def step(self, action: Union[int, str]) -> Tuple[Dict, bool]:
-        """
-        Take a step in the environment.
-
-        Args:
-            action: Action to take. Can be:
-                - Integer: 0=left, 1=down, 2=right, 3=up
-                - String: "left", "down", "right", "up"
-
-        Returns:
-            Tuple of (observation, done)
-        """
-        if self.done:
-            return self._get_observation(), True
-
-        # Convert string action to integer if needed
-        if isinstance(action, str):
-            action_map = {"left": 0, "down": 1, "right": 2, "up": 3}
-            if action.lower() not in action_map:
-                raise ValueError(f"Invalid action '{action}'. Must be one of: left, down, right, up")
-            numeric_action = action_map[action.lower()]
-        else:
-            numeric_action = action
-
-        if not (0 <= numeric_action < self.nA):
-            raise ValueError(f"Invalid action: {numeric_action}. Must be 0-{self.nA-1}")
-
-        # Take the step in the Gymnasium environment
-        new_state, reward, terminated, truncated, info = self.env.step(numeric_action)
-
-        # Update our state tracking
-        self.current_state = new_state
-        self.current_pos = self._state_to_pos(new_state)
-        self.done = terminated or truncated
-        self.won = reward > 0  # In FrozenLake, reward=1 for reaching goal, 0 otherwise
-
-        return self._get_observation(), self.done
-
-    def _get_observation(self) -> Dict:
-        """Get the current observation."""
-        if self.current_pos is None:
-            raise RuntimeError("Game not initialized")
-        row, col = self.current_pos
-        cell = self.desc[row, col].decode("utf-8")
-
-        # Create a visual representation
-        visual = []
-        for i in range(self.nrow):
-            row_str = ""
-            for j in range(self.ncol):
-                if (i, j) == self.current_pos:
-                    row_str += "[" + self.desc[i, j].decode("utf-8") + "]"
-                else:
-                    row_str += " " + self.desc[i, j].decode("utf-8") + " "
-            visual.append(row_str)
-
-        obs = {
-            "position": self.current_pos,
-            "current_cell": cell,
-            "done": self.done,
-            "won": self.won,
-            "visual": "\n".join(visual),
-            "message": self._get_message(),
-            "state": self.current_state,  # Add the Gymnasium state for compatibility
-        }
-
-        return obs
-
-    def _get_message(self) -> str:
-        """Get a descriptive message about the current state."""
-        if self.done:
-            if self.won:
-                return "Congratulations! You reached the goal! You win!"
-            else:
-                return "Oh no! You fell into a hole. Game over."
-        else:
-            if self.current_pos is None:
-                return "Game not initialized"
-            row, col = self.current_pos
-            cell = self.desc[row, col].decode("utf-8")
-            return f"You are at position ({row}, {col}) on a {cell} cell. Choose your next move carefully."
-
-    def close(self):
-        """Close the Gymnasium environment."""
-        self.env.close()
-
-    def render(self, mode: str = "human"):
-        """Render the environment using Gymnasium's rendering."""
-        return self.env.render()
-
-    def get_action_meanings(self):
-        """Get human-readable action meanings."""
-        return ["Left", "Down", "Right", "Up"]
-
-    def get_action_space_info(self):
-        """Get information about the action space."""
-        return {
-            "type": "Discrete",
-            "n": int(self.nA),  # Convert numpy int to Python int
-            "actions": {0: "left", 1: "down", 2: "right", 3: "up"},
-        }
-
-    def get_observation_space_info(self):
-        """Get information about the observation space."""
-        return {
-            "type": "Discrete",
-            "n": int(self.nS),  # Convert numpy int to Python int
-            "shape": (
-                int(self.nrow),
-                int(self.ncol),
-            ),  # Convert numpy ints to Python ints
-            "description": "State number representing position on grid",
-        }
-
-    def get_environment_info(self):
-        """Get comprehensive environment information."""
-        return {
-            "name": "FrozenLake-v1",
-            "map_name": self.map_name,
-            "is_slippery": self.is_slippery,
-            "nrow": int(self.nrow),  # Convert numpy int to Python int
-            "ncol": int(self.ncol),  # Convert numpy int to Python int
-            "action_space": self.get_action_space_info(),
-            "observation_space": self.get_observation_space_info(),
-            "description": [[cell.decode("utf-8") for cell in row] for row in self.desc],  # Convert to strings
-        }
-
-
-# Backward compatibility: alias the old class name to the new one
-FrozenLakeGame = GymnasiumFrozenLakeGame
-
-
-if __name__ == "__main__":
-    """Test the Gymnasium implementation."""
-    print("Testing Gymnasium FrozenLake implementation...")
-
-    # Test with deterministic environment
-    game = GymnasiumFrozenLakeGame(is_slippery=False)
-    print(f"Environment info: {game.get_environment_info()}")
-
-    obs = game.reset()
-    print(f"Initial observation: {obs}")
-
-    # Test both string and numeric actions
-    test_actions = ["down", "down", "right", "right", "down", "right"]
-
-    for i, action in enumerate(test_actions):
-        print(f"\nStep {i+1}: Taking action '{action}'")
-        obs, done = game.step(action)
-        print(f"Position: {obs['position']}, Done: {done}, Won: {obs['won']}")
-        print(f"Message: {obs['message']}")
-
-        if done:
-            if obs["won"]:
-                print("🎉 Success! Reached the goal!")
-            else:
-                print("💀 Failed! Fell into a hole!")
-            break
-
-    game.close()
-    print("\nTest completed!")
diff --git a/examples/frozen_lake/run_full_evaluation.sh b/examples/frozen_lake/run_full_evaluation.sh
deleted file mode 100755
index c2984c89..00000000
--- a/examples/frozen_lake/run_full_evaluation.sh
+++ /dev/null
@@ -1,318 +0,0 @@
-#!/bin/bash
-
-# Complete Frozen Lake HTTP Rollout Evaluation Script
-# This script demonstrates the full end-to-end HTTP rollout evaluation
-
-set -e  # Exit on any error
-
-# Configuration
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-HTTP_ROLLOUT_SERVER_PORT=8082
-MAX_WAIT_TIME=30
-
-# PID files to track server processes
-HTTP_ROLLOUT_PID_FILE="/tmp/http_rollout_server.pid"
-
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
-
-log() {
-    echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
-}
-
-warn() {
-    echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
-}
-
-error() {
-    echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
-}
-
-info() {
-    echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}"
-}
-
-# Cleanup function
-cleanup() {
-    log "Cleaning up servers..."
-
-    # Kill HTTP rollout server
-    if [ -f "$HTTP_ROLLOUT_PID_FILE" ]; then
-        HTTP_ROLLOUT_PID=$(cat "$HTTP_ROLLOUT_PID_FILE")
-        if kill -0 "$HTTP_ROLLOUT_PID" 2>/dev/null; then
-            log "Stopping HTTP rollout server (PID: $HTTP_ROLLOUT_PID)"
-            kill "$HTTP_ROLLOUT_PID" 2>/dev/null || true
-            sleep 2
-            kill -9 "$HTTP_ROLLOUT_PID" 2>/dev/null || true
-        fi
-        rm -f "$HTTP_ROLLOUT_PID_FILE"
-    fi
-
-    # Kill any remaining Python processes for our servers
-    pkill -f "http_rollout_server.py" 2>/dev/null || true
-
-    log "Cleanup complete"
-}
-
-# Set up signal handlers
-trap cleanup EXIT INT TERM
-
-# Function to wait for a server to be ready
-wait_for_server() {
-    local url=$1
-    local name=$2
-    local max_wait=$3
-
-    log "Waiting for $name to be ready at $url..."
-
-    for i in $(seq 1 $max_wait); do
-        if curl -s -f "$url" > /dev/null 2>&1; then
-            log "$name is ready!"
-            return 0
-        fi
-        sleep 1
-    done
-
-    error "$name failed to start within $max_wait seconds"
-    return 1
-}
-
-# Check prerequisites
-check_prerequisites() {
-    info "Checking prerequisites..."
-
-    # Check if reward-kit is available
-    if ! python -c "import eval_protocol" 2>/dev/null; then
-        error "reward-kit not installed or not in Python path"
-        exit 1
-    fi
-
-    # Check if required files exist
-    if [ ! -f "$SCRIPT_DIR/client/task_def.yaml" ]; then
-        error "task_def.yaml not found in $SCRIPT_DIR"
-        exit 1
-    fi
-
-    if [ ! -f "$SCRIPT_DIR/server/http_rollout_server.py" ]; then
-        error "http_rollout_server.py not found in $SCRIPT_DIR"
-        exit 1
-    fi
-
-    # Check if FIREWORKS_API_KEY is set
-    if [ -z "$FIREWORKS_API_KEY" ]; then
-        warn "FIREWORKS_API_KEY environment variable is not set"
-        warn "The evaluation will fail at the API call stage, but the infrastructure will be tested"
-        info "To run with a real model, set: export FIREWORKS_API_KEY=your_api_key"
-    else
-        info "FIREWORKS_API_KEY is set (length: ${#FIREWORKS_API_KEY})"
-    fi
-
-    info "Prerequisites check complete"
-}
-
-# Main execution
-main() {
-    echo ""
-    echo "========================================"
-    echo "🎮 FROZEN LAKE HTTP ROLLOUT EVALUATION"
-    echo "========================================"
-    echo ""
-
-    check_prerequisites
-
-    # Change to the script directory
-    cd "$SCRIPT_DIR"
-
-    # Check if ports are available
-    if lsof -Pi :$HTTP_ROLLOUT_SERVER_PORT -sTCP:LISTEN -t >/dev/null; then
-        error "Port $HTTP_ROLLOUT_SERVER_PORT is already in use"
-        exit 1
-    fi
-
-    # Start HTTP rollout server
-    log "Starting HTTP rollout server on port $HTTP_ROLLOUT_SERVER_PORT..."
-    python server/http_rollout_server.py --port $HTTP_ROLLOUT_SERVER_PORT &
-    HTTP_ROLLOUT_PID=$!
-    echo $HTTP_ROLLOUT_PID > "$HTTP_ROLLOUT_PID_FILE"
-
-    # Wait for servers to be ready
-    wait_for_server "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/health" "HTTP rollout server" $MAX_WAIT_TIME
-
-    # Test the HTTP rollout server
-    info "Testing HTTP rollout server..."
-
-    # Test start episode
-    EPISODE_DATA=$(curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/start_episode")
-    EPISODE_ID=$(echo "$EPISODE_DATA" | python -c "import sys, json; print(json.load(sys.stdin)['episode_id'])")
-    info "Started episode: $EPISODE_ID"
-
-    # Test step
-    STEP_DATA=$(curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/step" \
-        -H "Content-Type: application/json" \
-        -d "{\"episode_id\": \"$EPISODE_ID\", \"action\": 2}")
-    info "Step result: $STEP_DATA"
-
-    # End episode
-    curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/end_episode" \
-        -H "Content-Type: application/json" \
-        -d "{\"episode_id\": \"$EPISODE_ID\"}" > /dev/null
-    info "Episode ended successfully"
-
-    # Run the evaluation
-    log "Starting agent evaluation..."
-    cd "$REPO_ROOT"
-
-    # Set model configuration
-    export MODEL_AGENT="fireworks/accounts/fireworks/models/qwen3-235b-a22b"
-
-    # Create logs directory
-    LOG_DIR="$SCRIPT_DIR/evaluation_logs"
-    mkdir -p "$LOG_DIR"
-    TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
-    FULL_LOG_FILE="$LOG_DIR/full_evaluation_${TIMESTAMP}.log"
-    TRAJECTORY_LOG_FILE="$LOG_DIR/agent_trajectory_${TIMESTAMP}.log"
-
-    # Run the evaluation with detailed logging
-    info "Executing: python -m eval_protocol.cli agent-eval --task-def examples/frozen_lake/client/task_def.yaml"
-    info "Full logs will be saved to: $FULL_LOG_FILE"
-    info "Agent trajectory will be extracted to: $TRAJECTORY_LOG_FILE"
-
-    # Capture all output and filter agent trajectory
-    python -m eval_protocol.cli agent-eval --task-def examples/frozen_lake/client/task_def.yaml 2>&1 | tee "$FULL_LOG_FILE"
-
-    # Extract agent trajectory and tool calls
-    log "Extracting agent trajectory for review..."
-
-    # Create a detailed trajectory log
-    cat > "$TRAJECTORY_LOG_FILE" << 'EOF'
-FROZEN LAKE AGENT EVALUATION TRAJECTORY
-======================================
-
-This log contains the complete agent decision-making process including:
-- User prompts
-- Agent reasoning (thinking)
-- Tool calls made by the agent
-- Environment responses
-- Agent reactions to environment feedback
-
-======================================
-
-EOF
-
-    # Extract the relevant trajectory information
-    grep -A 5 -B 5 "User Turn\|Inner Step\|Tool.*result\|OpenAI response\|Calling OpenAI\|tool calls" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true
-
-    echo "" >> "$TRAJECTORY_LOG_FILE"
-    echo "======================================" >> "$TRAJECTORY_LOG_FILE"
-    echo "DETAILED MESSAGES HISTORY" >> "$TRAJECTORY_LOG_FILE"
-    echo "======================================" >> "$TRAJECTORY_LOG_FILE"
-    echo "" >> "$TRAJECTORY_LOG_FILE"
-
-    # Extract the complete conversation flow
-    grep -A 20 "messages_FULL_HISTORY" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true
-
-    echo "" >> "$TRAJECTORY_LOG_FILE"
-    echo "======================================" >> "$TRAJECTORY_LOG_FILE"
-    echo "TOOL CALLS AND RESPONSES" >> "$TRAJECTORY_LOG_FILE"
-    echo "======================================" >> "$TRAJECTORY_LOG_FILE"
-    echo "" >> "$TRAJECTORY_LOG_FILE"
-
-    # Extract tool call details
-    grep -A 10 -B 2 "tool_calls\|Tool.*result\|step.*action" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true
-
-    # Run the trajectory analyzer
-    cd "$SCRIPT_DIR"
-    if [ -f "analyze_trajectory.py" ] && [ -f "$FULL_LOG_FILE" ]; then
-        info "Running trajectory analysis..."
-        python analyze_trajectory.py "$FULL_LOG_FILE" > "${LOG_DIR}/trajectory_analysis_${TIMESTAMP}.txt" 2>&1 || true
-    fi
-
-    echo ""
-    echo "========================================"
-    echo "✅ EVALUATION INFRASTRUCTURE COMPLETE"
-    echo "========================================"
-    echo ""
-    info "HTTP rollout support has been successfully implemented!"
-    echo ""
-    echo "Key achievements:"
-    echo "• ✅ HttpRolloutResource implemented and integrated"
-    echo "• ✅ Fireworks model support added to orchestrator"
-    echo "• ✅ Tool calling protocol working correctly"
-    echo "• ✅ HTTP rollout server communication verified"
-    echo "• ✅ Complete evaluation framework functional"
-    echo ""
-
-    echo "📋 EVALUATION LOGS SAVED:"
-    echo "• Full evaluation log: $FULL_LOG_FILE"
-    echo "• Agent trajectory log: $TRAJECTORY_LOG_FILE"
-
-    ANALYSIS_FILE="${LOG_DIR}/trajectory_analysis_${TIMESTAMP}.txt"
-    if [ -f "$ANALYSIS_FILE" ]; then
-        echo "• Trajectory analysis: $ANALYSIS_FILE"
-    fi
-    echo ""
-
-    echo "📊 AGENT TRAJECTORY SUMMARY:"
-    if [ -f "$FULL_LOG_FILE" ]; then
-        # Show a quick summary of tool calls
-        TOOL_CALL_COUNT=$(grep -c "Attempting tool call: step" "$FULL_LOG_FILE" || echo "0")
-        echo "• Total tool calls made: $TOOL_CALL_COUNT"
-
-        # Show quick trajectory analysis if available
-        if [ -f "$ANALYSIS_FILE" ]; then
-            echo ""
-            echo "Quick trajectory preview:"
-            head -20 "$ANALYSIS_FILE" | tail -15
-            echo ""
-            echo "📖 Full trajectory analysis:"
-            echo "   cat $ANALYSIS_FILE"
-        else
-            echo "• Review detailed trajectory in: $TRAJECTORY_LOG_FILE"
-
-            # Show the first few tool calls for quick review
-            echo ""
-            echo "First few tool calls:"
-            grep -m 3 -A 2 "Attempting tool call: step" "$FULL_LOG_FILE" | head -9 || true
-        fi
-    fi
-    echo ""
-
-    if [ -z "$FIREWORKS_API_KEY" ]; then
-        echo "To run with actual LLM inference:"
-        echo "1. Set FIREWORKS_API_KEY environment variable"
-        echo "2. Re-run this script"
-    else
-        echo "🎉 Ready for production use with LLM inference!"
-        echo ""
-        echo "📖 To review the agent's decision making:"
-        echo "   cat $TRAJECTORY_LOG_FILE"
-    fi
-    echo ""
-}
-
-# Check for help flag
-if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
-    echo "Frozen Lake HTTP Rollout Evaluation"
-    echo ""
-    echo "This script demonstrates the complete HTTP rollout evaluation infrastructure:"
-    echo "1. Starts HTTP rollout server for Frozen Lake game"
-    echo "2. Tests the HTTP rollout protocol"
-    echo "3. Runs the agent evaluation framework"
-    echo "4. Shows tool calling and resource integration working"
-    echo ""
-    echo "Prerequisites:"
-    echo "- reward-kit installed and configured"
-    echo "- FIREWORKS_API_KEY environment variable (optional for infrastructure testing)"
-    echo ""
-    echo "Usage: $0"
-    echo ""
-    exit 0
-fi
-
-# Run main function
-main "$@"
diff --git a/examples/frozen_lake/server/README.md b/examples/frozen_lake/server/README.md
deleted file mode 100644
index 08d3f7eb..00000000
--- a/examples/frozen_lake/server/README.md
+++ /dev/null
@@ -1,187 +0,0 @@
-# Frozen Lake Game Server
-
-This is the **server-side implementation** of the Frozen Lake game environment that provides an HTTP API for agent evaluation.
-
-## Overview
-
-This server implements the HTTP Rollout Protocol that allows external evaluation frameworks (like eval-protocol) to interact with the game environment through standardized endpoints.
-
-## API Endpoints
-
-### `POST /start_episode`
-Initializes a new game episode.
-
-**Response:**
-```json
-{
-  "episode_id": "uuid-string",
-  "observation": {
-    "position": [0, 0],
-    "current_cell": "S",
-    "done": false,
-    "won": false,
-    "visual": "[S] F  F  F \n F  H  F  H \n F  F  F  H \n H  F  F  G ",
-    "message": "You are at position (0, 0) on a S cell. Choose your next move carefully."
-  }
-}
-```
-
-### `POST /step`
-Executes an action in the game.
-
-**Request:**
-```json
-{
-  "episode_id": "uuid-string",
-  "action": 2
-}
-```
-
-**Action Values:**
-- `0` = Left
-- `1` = Down
-- `2` = Right
-- `3` = Up
-
-**Response:**
-```json
-{
-  "observation": {
-    "position": [0, 1],
-    "current_cell": "F",
-    "done": false,
-    "won": false,
-    "visual": " S [F] F  F \n F  H  F  H \n F  F  F  H \n H  F  F  G ",
-    "message": "You are at position (0, 1) on a F cell. Choose your next move carefully."
-  },
-  "is_done": false
-}
-```
-
-### `POST /end_episode`
-Cleans up a completed episode.
-
-**Request:**
-```json
-{
-  "episode_id": "uuid-string"
-}
-```
-
-**Response:**
-```json
-{
-  "message": "Episode ended successfully"
-}
-```
-
-### `GET /health`
-Health check endpoint.
-
-**Response:**
-```json
-{
-  "status": "healthy",
-  "game": "frozen_lake"
-}
-```
-
-## Game Logic
-
-### Board Layout
-```
-S F F F
-F H F H
-F F F H
-H F F G
-```
-
-### Game Rules
-- **S**: Starting position (safe)
-- **F**: Frozen lake (safe to step on)
-- **H**: Hole (game over if stepped on)
-- **G**: Goal (win condition)
-
-### Win/Loss Conditions
-- **Win**: Reach the goal position (G)
-- **Loss**: Step on a hole (H) or exceed maximum steps
-
-## Running the Server
-
-### Prerequisites
-- Python 3.8+
-- FastAPI
-- Uvicorn
-
-### Installation
-```bash
-pip install fastapi uvicorn
-```
-
-### Start Server
-```bash
-python http_rollout_server.py
-```
-
-The server will start on `http://localhost:8080`
-
-### Configuration
-Environment variables:
-- `PORT`: Server port (default: 8080)
-- `HOST`: Server host (default: 0.0.0.0)
-
-## Integration Notes
-
-This server is designed to work with any HTTP rollout-compatible evaluation framework. The client side handles:
-- Action translation (string → numeric)
-- State interpretation
-- Reward calculation
-- Episode management
-
-## Customization
-
-### Different Board Layouts
-Modify the `FROZEN_LAKE_MAP` constant:
-```python
-FROZEN_LAKE_MAP = [
-    "SFFF",
-    "FHFH",
-    "FFFH",
-    "HFFG"
-]
-```
-
-### Game Variants
-- Slippery surfaces (movement uncertainty)
-- Larger boards
-- Dynamic obstacles
-- Multi-goal scenarios
-
-## Development
-
-### Testing the API
-```bash
-# Health check
-curl http://localhost:8080/health
-
-# Start episode
-curl -X POST http://localhost:8080/start_episode
-
-# Take action
-curl -X POST http://localhost:8080/step \
-  -H "Content-Type: application/json" \
-  -d '{"episode_id": "your-episode-id", "action": 2}'
-```
-
-### Logging
-The server logs all API interactions for debugging and monitoring.
-
-## Protocol Compliance
-
-This implementation follows the HTTP Rollout Protocol specification:
-- Stateless episode management
-- Structured observation format
-- Standardized error handling
-- Health monitoring endpoint
-
-Game environment developers can use this as a reference implementation for creating their own HTTP rollout-compatible environments.
diff --git a/examples/frozen_lake/server/http_rollout_server.py b/examples/frozen_lake/server/http_rollout_server.py
deleted file mode 100644
index afca6959..00000000
--- a/examples/frozen_lake/server/http_rollout_server.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""
-HTTP rollout server for Frozen Lake game.
-
-This server implements the standard HTTP rollout protocol using the reward-kit
-library's standardized types for consistent client/server communication.
-"""
-
-import os
-import sys
-import uuid
-from typing import Dict
-
-from fastapi import FastAPI, HTTPException
-
-# Add parent directory to path to import gymnasium frozen lake server
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame as FrozenLakeGame
-
-# Additional models for responses
-from pydantic import BaseModel
-
-# Import standardized HTTP rollout protocol types from reward-kit
-from eval_protocol.agent.resources import (
-    EndEpisodeRequest,
-    EndEpisodeResponse,
-    HealthResponse,
-    StartEpisodeRequest,
-    StartEpisodeResponse,
-    StepRequest,
-    StepResponse,
-)
-
-# FastAPI app
-app = FastAPI(title="Frozen Lake HTTP Rollout Server")
-
-# Store active episodes
-episodes: Dict[str, FrozenLakeGame] = {}
-
-
-@app.post("/start_episode", response_model=StartEpisodeResponse)
-async def start_episode(
-    request: StartEpisodeRequest = StartEpisodeRequest(),
-) -> StartEpisodeResponse:
-    """Start a new episode of the Frozen Lake game."""
-    episode_id = str(uuid.uuid4())
-
-    # Extract request data to pass to the game constructor
-    request_data = request.dict() if hasattr(request, "dict") else request.model_dump()
-
-    # Create Gymnasium-based game with configuration from request
-    # Default values maintained for backward compatibility
-    game_config = {
-        "map_name": "4x4",
-        "is_slippery": True,  # Enable stochastic behavior to demonstrate seed effect
-        "render_mode": None,
-    }
-    # Override with any values from the request (like seed)
-    game_config.update(request_data)
-
-    game = FrozenLakeGame(**game_config)
-    observation = game.reset()
-    episodes[episode_id] = game
-
-    return StartEpisodeResponse(episode_id=episode_id, observation=observation)
-
-
-@app.post("/step", response_model=StepResponse)
-async def step(req: StepRequest) -> StepResponse:
-    """Take a step in the specified episode."""
-    if req.episode_id not in episodes:
-        raise HTTPException(status_code=404, detail="Episode not found")
-
-    game = episodes[req.episode_id]
-    observation, is_done = game.step(req.action)
-
-    return StepResponse(observation=observation, is_done=is_done)
-
-
-@app.post("/end_episode", response_model=EndEpisodeResponse)
-async def end_episode(req: EndEpisodeRequest) -> EndEpisodeResponse:
-    """End the specified episode."""
-    if req.episode_id not in episodes:
-        raise HTTPException(status_code=404, detail="Episode not found")
-
-    del episodes[req.episode_id]
-    return EndEpisodeResponse(message=f"Episode {req.episode_id} ended successfully")
-
-
-@app.get("/health", response_model=HealthResponse)
-async def health_check() -> HealthResponse:
-    """Health check endpoint."""
-    return HealthResponse(status="healthy", game="frozen_lake_gymnasium")
-
-
-if __name__ == "__main__":
-    import argparse
-
-    import uvicorn
-
-    parser = argparse.ArgumentParser(description="Frozen Lake HTTP Rollout Server")
-    parser.add_argument("--port", type=int, default=8080, help="Port to run the server on")
-    args = parser.parse_args()
-
-    uvicorn.run(app, host="0.0.0.0", port=args.port)
diff --git a/tests/test_frozen_lake_http_server.py b/tests/test_frozen_lake_http_server.py
deleted file mode 100644
index ebe7104f..00000000
--- a/tests/test_frozen_lake_http_server.py
+++ /dev/null
@@ -1,269 +0,0 @@
-"""
-Tests for FrozenLake HTTP rollout server seed handling.
-
-This module tests the HTTP server's ability to accept and use seed parameters
-to create reproducible game environments.
-"""
-
-import json
-from unittest.mock import MagicMock, patch
-
-import pytest
-from fastapi.testclient import TestClient
-
-from examples.frozen_lake.gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame
-
-# Import the server components
-from examples.frozen_lake.server.http_rollout_server import app
-
-
-class TestFrozenLakeHttpServer:
-    """Tests for the FrozenLake HTTP rollout server."""
-
-    def setup_method(self):
-        """Set up test client."""
-        self.client = TestClient(app)
-
-    def test_start_episode_without_seed(self):
-        """Test starting episode without seed uses default behavior."""
-        response = self.client.post("/start_episode")
-
-        assert response.status_code == 200
-        data = response.json()
-
-        assert "episode_id" in data
-        assert "observation" in data
-
-        # Should have standard game state
-        observation = data["observation"]
-        assert "position" in observation
-        assert "current_cell" in observation
-        assert "visual" in observation
-        assert observation["position"] == [0, 0]  # Start position
-        assert observation["current_cell"] == "S"  # Start cell
-
-    def test_start_episode_with_seed(self):
-        """Test starting episode with seed parameter."""
-        seed_value = 42
-        request_data = {"seed": seed_value}
-
-        response = self.client.post("/start_episode", json=request_data)
-
-        assert response.status_code == 200
-        data = response.json()
-
-        assert "episode_id" in data
-        assert "observation" in data
-
-        # Should still start at position (0,0)
-        observation = data["observation"]
-        assert observation["position"] == [0, 0]
-        assert observation["current_cell"] == "S"
-
-    def test_different_seeds_create_different_episodes(self):
-        """Test that different seeds create episodes with different board layouts."""
-        # Start episode with seed 42
-        response1 = self.client.post("/start_episode", json={"seed": 42})
-        assert response1.status_code == 200
-        data1 = response1.json()
-        episode_id1 = data1["episode_id"]
-        visual1 = data1["observation"]["visual"]
-
-        # Start episode with seed 123
-        response2 = self.client.post("/start_episode", json={"seed": 123})
-        assert response2.status_code == 200
-        data2 = response2.json()
-        episode_id2 = data2["episode_id"]
-        visual2 = data2["observation"]["visual"]
-
-        # Episodes should have different IDs
-        assert episode_id1 != episode_id2
-
-        # Board layouts should be different (high probability with different seeds)
-        assert visual1 != visual2, "Different seeds should create different board layouts"
-
-    def test_same_seed_creates_identical_episodes(self):
-        """Test that same seed creates episodes with identical board layouts."""
-        seed_value = 999
-
-        # Start two episodes with same seed
-        response1 = self.client.post("/start_episode", json={"seed": seed_value})
-        assert response1.status_code == 200
-        data1 = response1.json()
-        visual1 = data1["observation"]["visual"]
-
-        response2 = self.client.post("/start_episode", json={"seed": seed_value})
-        assert response2.status_code == 200
-        data2 = response2.json()
-        visual2 = data2["observation"]["visual"]
-
-        # Board layouts should be identical
-        assert visual1 == visual2, "Same seed should create identical board layouts"
-
-    def test_start_episode_with_additional_parameters(self):
-        """Test that server accepts additional parameters beyond seed."""
-        request_data = {"seed": 42, "custom_param": "test_value", "id": "test_run_001"}
-
-        response = self.client.post("/start_episode", json=request_data)
-
-        assert response.status_code == 200
-        data = response.json()
-
-        # Should work normally despite extra parameters
-        assert "episode_id" in data
-        assert "observation" in data
-
-    def test_step_action_in_seeded_episode(self):
-        """Test taking actions in a seeded episode."""
-        # Start episode with seed
-        response = self.client.post("/start_episode", json={"seed": 42})
-        assert response.status_code == 200
-        episode_id = response.json()["episode_id"]
-
-        # Take a step action
-        step_response = self.client.post("/step", json={"episode_id": episode_id, "action": "right"})
-
-        assert step_response.status_code == 200
-        step_data = step_response.json()
-
-        assert "observation" in step_data
-        assert "is_done" in step_data
-
-        # Position should have changed (unless blocked)
-        observation = step_data["observation"]
-        assert "position" in observation
-        assert "current_cell" in observation
-
-    def test_health_endpoint(self):
-        """Test the health check endpoint."""
-        response = self.client.get("/health")
-
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "healthy"
-
-    def test_episode_cleanup_on_completion(self):
-        """Test that episodes are properly tracked and can be cleaned up."""
-        # Start an episode
-        response = self.client.post("/start_episode", json={"seed": 42})
-        assert response.status_code == 200
-        episode_id = response.json()["episode_id"]
-
-        # Episode should be trackable via step endpoint
-        step_response = self.client.post("/step", json={"episode_id": episode_id, "action": "right"})
-        assert step_response.status_code == 200
-
-    def test_invalid_episode_id_handling(self):
-        """Test handling of invalid episode IDs."""
-        # Try to step with non-existent episode ID
-        response = self.client.post("/step", json={"episode_id": "non_existent_episode", "action": "right"})
-
-        # Should return an error (400 or 404)
-        assert response.status_code in [400, 404]
-
-    def test_server_configuration_with_slippery_environment(self):
-        """Test that server is configured with slippery environment for seed demonstration."""
-        # Mock the game creation to verify configuration
-        with patch("examples.frozen_lake.server.http_rollout_server.FrozenLakeGame") as mock_game_class:
-            mock_game_instance = MagicMock()
-            mock_game_instance.reset.return_value = {
-                "position": [0, 0],
-                "current_cell": "S",
-                "visual": "test_visual",
-                "done": False,
-            }
-            mock_game_class.return_value = mock_game_instance
-
-            response = self.client.post("/start_episode", json={"seed": 42})
-
-            # Verify game was created with correct configuration
-            mock_game_class.assert_called_once()
-            call_kwargs = mock_game_class.call_args[1]
-
-            # Should include slippery=True and the seed
-            assert call_kwargs.get("is_slippery") is True
-            assert call_kwargs.get("seed") == 42
-
-
-class TestGymnasiumFrozenLakeIntegration:
-    """Integration tests between HTTP server and GymnasiumFrozenLakeGame."""
-
-    def test_gymnasium_integration_with_seeds(self):
-        """Test that the HTTP server correctly integrates with GymnasiumFrozenLakeGame seeds."""
-        client = TestClient(app)
-
-        # Test multiple seeds to ensure they work through the HTTP interface
-        seeds = [42, 123, 999]
-        board_layouts = []
-
-        for seed in seeds:
-            response = client.post("/start_episode", json={"seed": seed})
-            assert response.status_code == 200
-
-            observation = response.json()["observation"]
-            board_layouts.append(observation["visual"])
-
-        # All board layouts should be different
-        unique_layouts = set(board_layouts)
-        assert len(unique_layouts) == len(seeds), "Each seed should produce a unique board layout"
-
-    def test_episode_state_consistency(self):
-        """Test that episode state remains consistent within a single game."""
-        client = TestClient(app)
-
-        # Start episode with specific seed
-        response = client.post("/start_episode", json={"seed": 42})
-        assert response.status_code == 200
-
-        episode_id = response.json()["episode_id"]
-        initial_visual = response.json()["observation"]["visual"]
-
-        # Take a few actions and verify board layout doesn't change
-        for action in ["right", "down", "left"]:
-            step_response = client.post("/step", json={"episode_id": episode_id, "action": action})
-            assert step_response.status_code == 200
-
-            observation = step_response.json()["observation"]
-            # Visual board should remain the same (only position marker changes)
-            current_visual = observation["visual"]
-
-            # Board structure should be preserved (same letters, different position marker)
-            # Extract just the board cells without position markers
-            initial_cells = initial_visual.replace("[", "").replace("]", "")
-            current_cells = current_visual.replace("[", "").replace("]", "")
-
-            # The underlying board structure should be identical
-            assert len(initial_cells) == len(current_cells), "Board size should remain constant"
-
-    def test_seed_parameter_propagation(self):
-        """Test that seed parameter correctly propagates to the game engine."""
-        # This test verifies the complete data flow from HTTP request to game creation
-
-        with patch("examples.frozen_lake.server.http_rollout_server.FrozenLakeGame") as mock_game_class:
-            mock_game_instance = MagicMock()
-            mock_game_instance.reset.return_value = {
-                "position": [0, 0],
-                "current_cell": "S",
-                "visual": "mocked_visual",
-                "done": False,
-                "won": False,
-                "message": "test_message",
-            }
-            mock_game_class.return_value = mock_game_instance
-
-            client = TestClient(app)
-
-            # Send request with seed
-            seed_value = 1337
-            response = client.post("/start_episode", json={"seed": seed_value})
-
-            assert response.status_code == 200
-
-            # Verify the game was created with the correct seed
-            mock_game_class.assert_called_once()
-            call_kwargs = mock_game_class.call_args[1]
-            assert call_kwargs["seed"] == seed_value
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tests/test_frozen_lake_seed_evaluation.py b/tests/test_frozen_lake_seed_evaluation.py
deleted file mode 100644
index 54bcf1c6..00000000
--- a/tests/test_frozen_lake_seed_evaluation.py
+++ /dev/null
@@ -1,541 +0,0 @@
-"""
-Tests for seed-based reproducible evaluation in FrozenLake example.
-
-This module tests the complete data-driven evaluation pipeline including:
-- Seed-based map generation for reproducible game boards
-- Data-driven evaluation infrastructure in TaskManager
-- Protocol enhancements for sample data passing
-- End-to-end integration tests
-"""
-
-import json
-import tempfile
-from pathlib import Path
-from typing import Any, Dict
-from unittest.mock import AsyncMock, MagicMock, patch
-
-import pytest
-import pytest_asyncio
-
-from eval_protocol.agent.orchestrator import Orchestrator
-from eval_protocol.agent.resources.http_rollout_protocol import StartEpisodeRequest
-from eval_protocol.agent.resources.http_rollout_resource import HttpRolloutResource
-from eval_protocol.agent.task_manager import TaskManager
-from eval_protocol.models import TaskDefinitionModel
-
-# Import components under test
-from examples.frozen_lake.gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame
-
-
-class TestSeedBasedMapGeneration:
-    """Tests for seed-based reproducible map generation in GymnasiumFrozenLakeGame."""
-
-    def test_seed_generates_different_maps(self):
-        """Test that different seeds generate different map layouts."""
-        seed1 = 42
-        seed2 = 123
-
-        game1 = GymnasiumFrozenLakeGame(seed=seed1)
-        game2 = GymnasiumFrozenLakeGame(seed=seed2)
-
-        # Get map descriptions
-        map1 = game1.desc.tolist()
-        map2 = game2.desc.tolist()
-
-        # Maps should be different
-        assert map1 != map2, "Different seeds should generate different maps"
-
-        # Both should be 4x4 by default
-        assert len(map1) == 4 and len(map1[0]) == 4
-        assert len(map2) == 4 and len(map2[0]) == 4
-
-        game1.close()
-        game2.close()
-
-    def test_same_seed_generates_identical_maps(self):
-        """Test that the same seed generates identical map layouts."""
-        seed = 42
-
-        game1 = GymnasiumFrozenLakeGame(seed=seed)
-        game2 = GymnasiumFrozenLakeGame(seed=seed)
-
-        # Get map descriptions
-        map1 = game1.desc.tolist()
-        map2 = game2.desc.tolist()
-
-        # Maps should be identical
-        assert map1 == map2, "Same seed should generate identical maps"
-
-        game1.close()
-        game2.close()
-
-    def test_no_seed_uses_fixed_map(self):
-        """Test that no seed uses the fixed predefined map."""
-        game1 = GymnasiumFrozenLakeGame()  # No seed
-        game2 = GymnasiumFrozenLakeGame()  # No seed
-
-        # Get map descriptions
-        map1 = game1.desc.tolist()
-        map2 = game2.desc.tolist()
-
-        # Maps should be identical (both using fixed 4x4 map)
-        assert map1 == map2, "No seed should use identical fixed maps"
-
-        # Should be the standard 4x4 FrozenLake map
-        expected_map = [
-            [b"S", b"F", b"F", b"F"],
-            [b"F", b"H", b"F", b"H"],
-            [b"F", b"F", b"F", b"H"],
-            [b"H", b"F", b"F", b"G"],
-        ]
-        assert map1 == expected_map, "Should use standard 4x4 map when no seed"
-
-        game1.close()
-        game2.close()
-
-    def test_seed_with_8x8_map(self):
-        """Test seed-based generation with 8x8 map size."""
-        seed = 999
-
-        game = GymnasiumFrozenLakeGame(map_name="8x8", seed=seed)
-
-        # Should be 8x8
-        assert game.desc.shape == (8, 8)
-
-        # Should have start and goal
-        flat_map = game.desc.flatten()
-        assert b"S" in flat_map
-        assert b"G" in flat_map
-
-        game.close()
-
-    def test_seed_affects_reset_behavior(self):
-        """Test that seed affects the reset behavior for stochastic environments."""
-        seed = 42
-
-        # Test with slippery environment
-        game = GymnasiumFrozenLakeGame(seed=seed, is_slippery=True)
-
-        # Reset multiple times - should get same initial state
-        state1 = game.reset()
-        state2 = game.reset()
-
-        # Initial position should be consistent
-        assert state1["position"] == state2["position"] == (0, 0)
-        assert state1["current_cell"] == state2["current_cell"] == "S"
-
-        game.close()
-
-    def test_map_has_valid_path(self):
-        """Test that generated maps always have a valid path from start to goal."""
-        # Test multiple seeds to ensure path validity
-        for seed in [42, 123, 999, 1337]:
-            game = GymnasiumFrozenLakeGame(seed=seed)
-
-            # Should have exactly one start and one goal
-            flat_map = game.desc.flatten()
-            start_count = sum(1 for cell in flat_map if cell == b"S")
-            goal_count = sum(1 for cell in flat_map if cell == b"G")
-
-            assert start_count == 1, f"Should have exactly one start for seed {seed}"
-            assert goal_count == 1, f"Should have exactly one goal for seed {seed}"
-
-            # Start should be at (0,0) and goal should exist
-            assert game.desc[0, 0] == b"S", f"Start should be at (0,0) for seed {seed}"
-            assert game.start_pos == (
-                0,
-                0,
-            ), f"Start position should be (0,0) for seed {seed}"
-            assert game.goal_pos is not None, f"Goal position should exist for seed {seed}"
-
-            game.close()
-
-
-class TestStartEpisodeRequest:
-    """Tests for the enhanced StartEpisodeRequest protocol."""
-
-    def test_start_episode_request_accepts_arbitrary_fields(self):
-        """Test that StartEpisodeRequest accepts arbitrary fields like seed."""
-        # Should accept seed and other fields
-        request = StartEpisodeRequest(seed=42, custom_field="test_value")
-
-        # Access via model_dump or dict
-        if hasattr(request, "model_dump"):
-            data = request.model_dump()
-        else:
-            data = request.dict()
-
-        assert data["seed"] == 42
-        assert data["custom_field"] == "test_value"
-
-    def test_start_episode_request_empty(self):
-        """Test that StartEpisodeRequest works with no extra fields."""
-        request = StartEpisodeRequest()
-
-        # Should work without errors
-        if hasattr(request, "model_dump"):
-            data = request.model_dump()
-        else:
-            data = request.dict()
-
-        # Should be empty dict or have no extra fields
-        assert isinstance(data, dict)
-
-
-class TestDataDrivenTaskDefinition:
-    """Tests for data-driven evaluation fields in TaskDefinitionModel."""
-
-    def test_task_definition_with_dataset_path(self):
-        """Test TaskDefinitionModel with dataset_path field."""
-        task_def_dict = {
-            "name": "test_task",
-            "description": "Test task",
-            "resource_type": "http_rollout",
-            "base_resource_config": {"base_url": "http://localhost:8080"},
-            "reward_function_path": "test.reward",
-            "dataset_path": "test_dataset.jsonl",
-            "num_rollouts_per_sample": 3,
-        }
-
-        task_def = TaskDefinitionModel(**task_def_dict)
-
-        assert task_def.dataset_path == "test_dataset.jsonl"
-        assert task_def.num_rollouts_per_sample == 3
-
-    def test_task_definition_without_dataset_path(self):
-        """Test TaskDefinitionModel without dataset_path (traditional evaluation)."""
-        task_def_dict = {
-            "name": "test_task",
-            "description": "Test task",
-            "resource_type": "http_rollout",
-            "base_resource_config": {"base_url": "http://localhost:8080"},
-            "reward_function_path": "test.reward",
-            "num_rollouts": 5,
-        }
-
-        task_def = TaskDefinitionModel(**task_def_dict)
-
-        assert task_def.dataset_path is None
-        assert task_def.num_rollouts_per_sample == 1  # Default value
-        assert task_def.num_rollouts == 5
-
-    def test_num_rollouts_per_sample_validation(self):
-        """Test that num_rollouts_per_sample must be >= 1."""
-        task_def_dict = {
-            "name": "test_task",
-            "description": "Test task",
-            "resource_type": "http_rollout",
-            "base_resource_config": {"base_url": "http://localhost:8080"},
-            "reward_function_path": "test.reward",
-            "dataset_path": "test.jsonl",
-            "num_rollouts_per_sample": 0,  # Invalid
-        }
-
-        with pytest.raises(ValueError):
-            TaskDefinitionModel(**task_def_dict)
-
-
-@pytest.mark.asyncio
-class TestTaskManagerDataDrivenEvaluation:
-    """Tests for data-driven evaluation functionality in TaskManager."""
-
-    def test_load_dataset_samples_valid_jsonl(self):
-        """Test loading valid JSONL dataset."""
-        # Create temporary JSONL file
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
-            f.write('{"id": "sample1", "seed": 42}\n')
-            f.write('{"id": "sample2", "seed": 123}\n')
-            f.write('{"id": "sample3", "seed": 999}\n')
-            temp_file = f.name
-
-        try:
-            task_manager = TaskManager()
-            samples = task_manager._load_dataset_samples(temp_file)
-
-            assert len(samples) == 3
-            assert samples[0] == {"id": "sample1", "seed": 42}
-            assert samples[1] == {"id": "sample2", "seed": 123}
-            assert samples[2] == {"id": "sample3", "seed": 999}
-        finally:
-            Path(temp_file).unlink()
-
-    def test_load_dataset_samples_invalid_json(self):
-        """Test loading JSONL with invalid JSON lines."""
-        # Create temporary JSONL file with some invalid lines
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
-            f.write('{"id": "sample1", "seed": 42}\n')
-            f.write("invalid json line\n")  # Invalid JSON
-            f.write('{"id": "sample2", "seed": 123}\n')
-            temp_file = f.name
-
-        try:
-            task_manager = TaskManager()
-            samples = task_manager._load_dataset_samples(temp_file)
-
-            # Should skip invalid line and load valid ones
-            assert len(samples) == 2
-            assert samples[0] == {"id": "sample1", "seed": 42}
-            assert samples[1] == {"id": "sample2", "seed": 123}
-        finally:
-            Path(temp_file).unlink()
-
-    def test_load_dataset_samples_nonexistent_file(self):
-        """Test loading from nonexistent file."""
-        task_manager = TaskManager()
-        samples = task_manager._load_dataset_samples("nonexistent_file.jsonl")
-
-        assert samples == []
-
-    def test_load_dataset_samples_empty_file(self):
-        """Test loading from empty file."""
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
-            # Empty file
-            temp_file = f.name
-
-        try:
-            task_manager = TaskManager()
-            samples = task_manager._load_dataset_samples(temp_file)
-
-            assert samples == []
-        finally:
-            Path(temp_file).unlink()
-
-    def test_load_dataset_samples_relative_path(self):
-        """Test loading dataset with relative path."""
-        # Create a temporary directory and file
-        with tempfile.TemporaryDirectory() as temp_dir:
-            dataset_file = Path(temp_dir) / "test_dataset.jsonl"
-            with open(dataset_file, "w") as f:
-                f.write('{"id": "test", "seed": 42}\n')
-
-            task_manager = TaskManager()
-
-            # Test with absolute path since temp dir is not relative to cwd
-            absolute_path = str(dataset_file)
-            samples = task_manager._load_dataset_samples(absolute_path)
-
-            assert len(samples) == 1
-            assert samples[0] == {"id": "test", "seed": 42}
-
-
-@pytest.mark.asyncio
-class TestHttpRolloutResourceInitialization:
-    """Tests for HttpRolloutResource initialization with sample data."""
-
-    async def test_initialize_with_kwargs(self):
-        """Test that initialize method sends kwargs in POST request."""
-        # Mock the HTTP client
-        mock_client = MagicMock()
-        mock_response = MagicMock()
-        mock_response.raise_for_status.return_value = None
-        mock_response.json.return_value = {
-            "episode_id": "test_episode",
-            "observation": {},
-        }
-        mock_client.post.return_value = mock_response
-
-        # Create resource with mock client
-        config = {
-            "base_url": "http://localhost:8080",
-            "start_episode_endpoint": "/start_episode",
-        }
-
-        resource = HttpRolloutResource()
-        await resource.setup(config)
-        resource.client = mock_client  # Replace with mock
-
-        # Initialize with sample data
-        sample_data = {"seed": 42, "custom_param": "test_value"}
-        await resource.initialize(**sample_data)
-
-        # Verify POST was called with correct parameters
-        mock_client.post.assert_called_once_with("http://localhost:8080/start_episode", json=sample_data)
-
-    async def test_initialize_without_kwargs(self):
-        """Test that initialize method works without kwargs."""
-        # Mock the HTTP client
-        mock_client = MagicMock()
-        mock_response = MagicMock()
-        mock_response.raise_for_status.return_value = None
-        mock_response.json.return_value = {
-            "episode_id": "test_episode",
-            "observation": {},
-        }
-        mock_client.post.return_value = mock_response
-
-        # Create resource with mock client
-        config = {
-            "base_url": "http://localhost:8080",
-            "start_episode_endpoint": "/start_episode",
-        }
-
-        resource = HttpRolloutResource()
-        await resource.setup(config)
-        resource.client = mock_client  # Replace with mock
-
-        # Initialize without sample data
-        await resource.initialize()
-
-        # Verify POST was called without json parameter
-        mock_client.post.assert_called_once_with("http://localhost:8080/start_episode")
-
-
-@pytest.mark.asyncio
-class TestOrchestratorSampleDataPassing:
-    """Tests for sample data passing in Orchestrator."""
-
-    async def test_execute_task_poc_with_sample_data(self):
-        """Test that execute_task_poc passes sample data to resource initialization."""
-        # Create a minimal task definition
-        task_def_dict = {
-            "name": "test_task",
-            "description": "Test task",
-            "resource_type": "test_resource",
-            "base_resource_config": {},
-            "reward_function_path": "test.reward",
-            "messages": [{"role": "user", "content": "test"}],
-        }
-        task_def = TaskDefinitionModel(**task_def_dict)
-
-        # Mock the base resource
-        mock_resource = AsyncMock()
-        mock_resource.fork.return_value = AsyncMock()
-        mock_episode_resource = mock_resource.fork.return_value
-        mock_episode_resource.initialize = AsyncMock()
-
-        # Create orchestrator
-        orchestrator = Orchestrator(task_definition=task_def)
-        orchestrator.base_resource = mock_resource
-
-        # Mock execute_task_poc to just test the sample data passing logic
-        async def mock_execute_task_poc(sample_data=None):
-            if sample_data:
-                # Simulate the resource initialization that would happen
-                episode_resource = await orchestrator.base_resource.fork()
-                await episode_resource.initialize(**sample_data)
-            return {"score": 1.0}
-
-        with patch.object(orchestrator, "execute_task_poc", side_effect=mock_execute_task_poc):
-            sample_data = {"seed": 42, "test_param": "value"}
-            await orchestrator.execute_task_poc(sample_data=sample_data)
-
-            # Verify that episode resource was initialized with sample data
-            mock_episode_resource.initialize.assert_called_once_with(**sample_data)
-
-    async def test_execute_task_poc_without_sample_data(self):
-        """Test that execute_task_poc works without sample data."""
-        # Create a minimal task definition
-        task_def_dict = {
-            "name": "test_task",
-            "description": "Test task",
-            "resource_type": "test_resource",
-            "base_resource_config": {},
-            "reward_function_path": "test.reward",
-            "messages": [{"role": "user", "content": "test"}],
-        }
-        task_def = TaskDefinitionModel(**task_def_dict)
-
-        # Mock the base resource
-        mock_resource = AsyncMock()
-        mock_resource.fork.return_value = AsyncMock()
-        mock_episode_resource = mock_resource.fork.return_value
-        mock_episode_resource.initialize = AsyncMock()
-
-        # Create orchestrator
-        orchestrator = Orchestrator(task_definition=task_def)
-        orchestrator.base_resource = mock_resource
-
-        # Mock execute_task_poc to just test the sample data passing logic
-        async def mock_execute_task_poc(sample_data=None):
-            if sample_data:
-                # Simulate the resource initialization that would happen
-                episode_resource = await orchestrator.base_resource.fork()
-                await episode_resource.initialize(**sample_data)
-            return {"score": 1.0}
-
-        with patch.object(orchestrator, "execute_task_poc", side_effect=mock_execute_task_poc):
-            await orchestrator.execute_task_poc(sample_data=None)
-
-            # Verify that episode resource was not initialized (no sample_data)
-            mock_episode_resource.initialize.assert_not_called()
-
-
-@pytest.mark.asyncio
-class TestEndToEndDataDrivenEvaluation:
-    """Integration tests for end-to-end data-driven evaluation."""
-
-    async def test_data_driven_task_execution_flow(self):
-        """Test the complete flow of data-driven task execution."""
-        # Create temporary dataset file
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
-            f.write('{"id": "run_001", "seed": 42}\n')
-            f.write('{"id": "run_002", "seed": 123}\n')
-            temp_dataset = f.name
-
-        try:
-            # Create task definition with dataset
-            task_def_dict = {
-                "name": "frozen_lake_test",
-                "description": "Test frozen lake with seeds",
-                "resource_type": "http_rollout",
-                "base_resource_config": {"base_url": "http://localhost:8080"},
-                "reward_function_path": "test.reward",
-                "dataset_path": temp_dataset,
-                "num_rollouts_per_sample": 1,
-                "messages": [{"role": "user", "content": "test"}],
-            }
-
-            task_manager = TaskManager()
-            task_manager.register_task("test_task", TaskDefinitionModel(**task_def_dict))
-
-            # Mock the orchestrator execution
-            with patch.object(task_manager, "_execute_data_driven_rollouts") as mock_execute:
-                mock_execute.return_value = [
-                    {"score": 1.0, "sample_data": {"id": "run_001", "seed": 42}},
-                    {"score": 0.0, "sample_data": {"id": "run_002", "seed": 123}},
-                ]
-
-                # Execute tasks
-                results = await task_manager.execute_tasks(["test_task"], max_concurrency=1)
-
-                # Verify data-driven execution was called
-                mock_execute.assert_called_once()
-                call_args = mock_execute.call_args
-                samples = call_args[0][1]  # Second argument is samples
-
-                assert len(samples) == 2
-                assert samples[0] == {"id": "run_001", "seed": 42}
-                assert samples[1] == {"id": "run_002", "seed": 123}
-
-        finally:
-            Path(temp_dataset).unlink()
-
-    def test_frozen_lake_dataset_format_validation(self):
-        """Test that the actual frozen lake dataset has correct format."""
-        dataset_path = Path("examples/frozen_lake/client/dataset.jsonl")
-
-        if dataset_path.exists():
-            with open(dataset_path, "r") as f:
-                lines = f.readlines()
-
-            # Should have at least one sample
-            assert len(lines) > 0, "Dataset should not be empty"
-
-            for i, line in enumerate(lines):
-                line = line.strip()
-                if not line:
-                    continue
-
-                try:
-                    sample = json.loads(line)
-                except json.JSONDecodeError:
-                    pytest.fail(f"Invalid JSON on line {i+1}: {line}")
-
-                # Each sample should have id and seed
-                assert "id" in sample, f"Sample {i+1} missing 'id' field"
-                assert "seed" in sample, f"Sample {i+1} missing 'seed' field"
-                assert isinstance(sample["seed"], int), f"Sample {i+1} seed should be integer"
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])