From 5f865e198d1374be63fe9454a9caf77f7b8b5e36 Mon Sep 17 00:00:00 2001 From: Benny Chen Date: Tue, 12 Aug 2025 22:04:22 -0700 Subject: [PATCH] remove http rollout old code --- development/notes/frozen_lake_context.md | 33 -- development/notes/frozen_lake_plan.md | 93 --- development/notes/http_rollout.md | 38 -- eval_protocol/agent/orchestrator.py | 3 - eval_protocol/agent/resources/__init__.py | 25 - .../agent/resources/http_rollout_protocol.py | 85 --- .../agent/resources/http_rollout_resource.py | 325 ----------- examples/frozen_lake/README.md | 152 ----- examples/frozen_lake/analyze_trajectory.py | 273 --------- examples/frozen_lake/client/dataset.jsonl | 3 - examples/frozen_lake/client/reward.py | 126 ---- examples/frozen_lake/client/task_def.yaml | 46 -- .../gymnasium_frozen_lake_server.py | 281 --------- examples/frozen_lake/run_full_evaluation.sh | 318 ---------- examples/frozen_lake/server/README.md | 187 ------ .../frozen_lake/server/http_rollout_server.py | 104 ---- tests/test_frozen_lake_http_server.py | 269 --------- tests/test_frozen_lake_seed_evaluation.py | 541 ------------------ 18 files changed, 2902 deletions(-) delete mode 100644 development/notes/frozen_lake_context.md delete mode 100644 development/notes/frozen_lake_plan.md delete mode 100644 development/notes/http_rollout.md delete mode 100644 eval_protocol/agent/resources/http_rollout_protocol.py delete mode 100644 eval_protocol/agent/resources/http_rollout_resource.py delete mode 100644 examples/frozen_lake/README.md delete mode 100755 examples/frozen_lake/analyze_trajectory.py delete mode 100644 examples/frozen_lake/client/dataset.jsonl delete mode 100644 examples/frozen_lake/client/reward.py delete mode 100644 examples/frozen_lake/client/task_def.yaml delete mode 100644 examples/frozen_lake/gymnasium_frozen_lake_server.py delete mode 100755 examples/frozen_lake/run_full_evaluation.sh delete mode 100644 examples/frozen_lake/server/README.md delete mode 100644 examples/frozen_lake/server/http_rollout_server.py delete mode 100644 tests/test_frozen_lake_http_server.py delete mode 100644 tests/test_frozen_lake_seed_evaluation.py diff --git a/development/notes/frozen_lake_context.md b/development/notes/frozen_lake_context.md deleted file mode 100644 index 49c85152..00000000 --- a/development/notes/frozen_lake_context.md +++ /dev/null @@ -1,33 +0,0 @@ -# Frozen Lake Implementation Context - -This document provides context for the implementation of the Frozen Lake example within the `eval-protocol` framework. - -## High-Level Goal - -The primary objective is to create a robust and reproducible reinforcement learning environment for the Frozen Lake game. This involves allowing an LLM-based agent to interact with the game, and critically, enabling a data-driven approach to evaluations where initial conditions (like random seeds) are controlled by a dataset. - -## Core Components - -The implementation is distributed across several key files: - -- **`examples/frozen_lake/client/dataset.jsonl`**: The source of truth for evaluation runs. Each line defines a scenario, specifying the `seed` for the environment's initial state. -- **`examples/frozen_lake/client/task_def.yaml`**: The main configuration file for the task. It points to the dataset and defines how many rollouts to perform for each sample in the dataset. -- **`examples/frozen_lake/server/http_rollout_server.py`**: A FastAPI server that wraps the Frozen Lake game logic, exposing it via an HTTP API that the `eval-protocol` agent can interact with. -- **`examples/frozen_lake/gymnasium_frozen_lake_server.py`**: The core game logic, which wraps the official `gymnasium` Frozen Lake environment. It is responsible for accepting a `seed` to create a deterministic starting state. -- **`examples/frozen_lake/client/reward.py`**: A reward function that evaluates the agent's performance based on the outcome of the game (e.g., reaching the goal). -- **`eval_protocol/agent/`**: The core agent framework, including the `TaskManager` and `Orchestrator`, which together manage the data-driven execution of rollouts based on the task definition and dataset. - -## Data-Driven Rollout Flow - -The evaluation process follows a clear, data-driven flow: - -1. The **TaskManager** reads the `task_def.yaml`. -2. It loads the scenarios from the specified `dataset.jsonl` file. -3. For each scenario (i.e., each `seed` in the dataset), it schedules `num_rollouts_per_sample` rollouts. -4. For each individual rollout, the **Orchestrator** is invoked with the specific `seed`. -5. The **Orchestrator** passes the `seed` to the **HttpRolloutResource**. -6. The **HttpRolloutResource** sends the `seed` in a request to the `/start_episode` endpoint of the **http_rollout_server**. -7. The server uses the `seed` to initialize the **GymnasiumFrozenLakeGame** in a deterministic state. -8. The agent then plays the game, and the final outcome is evaluated by the reward function. - -This architecture ensures that evaluations are reproducible and that the agent's performance can be measured across a controlled set of initial conditions. diff --git a/development/notes/frozen_lake_plan.md b/development/notes/frozen_lake_plan.md deleted file mode 100644 index 2ae3005f..00000000 --- a/development/notes/frozen_lake_plan.md +++ /dev/null @@ -1,93 +0,0 @@ -# Frozen Lake Example Plan: Data-Driven Rollouts - -This document outlines the plan for refactoring the Frozen Lake example to use a data-driven evaluation workflow. The goal is to make the system more robust, extensible, and aligned with standard practices in reinforcement learning research. - -The core principle is to treat the initial conditions of an environment (like a random seed) as data. Each row in a dataset will define a specific scenario, and the framework will run a configurable number of rollouts for each scenario. - -### 1. The Dataset (`dataset.jsonl`) - -The foundation of this new approach is a dataset file that defines the experimental conditions. - -- **Action:** Create a new dataset file at `examples/frozen_lake/client/dataset.jsonl`. -- **Format:** Each line in the file will be a JSON object representing a single experimental sample. Initially, this will just contain a unique `id` and a `seed`. -- **Example Content:** - ```json - {"id": "run_001", "seed": 42} - {"id": "run_002", "seed": 123} - {"id": "run_003", "seed": 555} - {"id": "run_004", "seed": 678} - ``` - -### 2. The Task Definition (`task_def.yaml`) - -The task definition will be updated to reference the dataset and specify how many rollouts (`N`) to perform for each sample. - -- **File to Modify:** `examples/frozen_lake/client/task_def.yaml` -- **Changes:** - - Remove the old `num_rollouts` field. - - Add `dataset_path` to point to our new `dataset.jsonl` file. - - Add `num_rollouts_per_sample` to define `N`. -- **Example:** - ```yaml - name: "frozen_lake_http_rollout" - description: "Evaluate an agent's ability to navigate a Frozen Lake environment via HTTP rollout" - - # Data-driven configuration - dataset_path: "examples/frozen_lake/client/dataset.jsonl" - num_rollouts_per_sample: 5 # This is 'N', the number of rollouts per seed - - # Resource configuration remains the same - resource_type: "http_rollout" - # ... (rest of the file) - ``` - -### 3. Core Framework Modifications - -The following changes will plumb the `seed` from the dataset through the framework to the game environment. - -1. **Data Model (`eval_protocol/models.py`):** - - Update `TaskDefinitionModel` to include `dataset_path: Optional[str]` and `num_rollouts_per_sample: int`. - -2. **TaskManager (`eval_protocol/agent/task_manager.py`):** - - Modify the `execute_tasks` method to load samples from the `dataset_path`. - - For each sample, generate `num_rollouts_per_sample` rollout jobs. - - Pass the sample data (containing the `seed`) for each job down to the `Orchestrator`. - -3. **Orchestrator (`eval_protocol/agent/orchestrator.py`):** - - Modify `execute_task_poc` to accept `sample_data` as a parameter. - - Pass this data to the resource's `initialize` method: `await episode_resource.initialize(**sample_data)`. - -4. **HTTP Rollout Resource (`eval_protocol/agent/resources/http_rollout_resource.py`):** - - The `initialize` method will accept `**kwargs`. - - These `kwargs` (the `sample_data`) will be sent as the JSON body of the POST request to the `/start_episode` endpoint. - -5. **HTTP Rollout Server & Protocol:** - - The `/start_episode` endpoint in `examples/frozen_lake/server/http_rollout_server.py` will be updated to accept a JSON request body. - - It will pass the entire request body as keyword arguments to the `GymnasiumFrozenLakeGame` constructor: `game = FrozenLakeGame(**request_data)`. - - The `StartEpisodeRequest` model in `eval_protocol/agent/resources/http_rollout_protocol.py` will be updated to allow arbitrary extra fields. - -6. **Gymnasium Game (`examples/frozen_lake/gymnasium_frozen_lake_server.py`):** - - The `__init__` method of `GymnasiumFrozenLakeGame` will be changed to accept `**kwargs`. - - The `reset` method will use the `seed` from these arguments to initialize the environment deterministically: `self.env.reset(seed=self.seed)`. - -### 4. Visualization of the Flow - -```mermaid -sequenceDiagram - participant TaskManager - participant Orchestrator - participant Resource as HttpRolloutResource - participant Server as http_rollout_server - participant Game as GymnasiumFrozenLakeGame - - TaskManager->>TaskManager: Reads dataset.jsonl - TaskManager->>Orchestrator: execute_task_poc(sample_data={"seed": 42}) - Orchestrator->>Resource: fork() - Orchestrator->>Resource: initialize(**sample_data) - Resource->>Server: POST /start_episode (body={"seed": 42}) - Server->>Game: __init__(**{"seed": 42}) - Game->>Game: self.env.reset(seed=42) - Game-->>Server: observation - Server-->>Resource: {episode_id, observation} - Resource-->>Orchestrator: (initialization complete) - Orchestrator->>Orchestrator: (proceeds with agent interaction) diff --git a/development/notes/http_rollout.md b/development/notes/http_rollout.md deleted file mode 100644 index f81678d9..00000000 --- a/development/notes/http_rollout.md +++ /dev/null @@ -1,38 +0,0 @@ -# Remote Rollout Server API - -Eval Protocol can collect reinforcement learning trajectories from an external HTTP service. -The service exposes three simple endpoints used by `RemoteHttpRolloutClient`: - -## `POST /start_episode` -Returns an `episode_id` and the initial observation. - -## `POST /step` -Request body: -```json -{ - "episode_id": "string", - "action": {"any": "payload"} -} -``` -Returns a JSON object: -```json -{ - "observation": {"any": "payload"}, - "is_done": false -} -``` -representing the new observation after the action and whether the episode has ended. - -## `POST /end_episode` -Request body: -```json -{"episode_id": "string"} -``` -Signals that the episode is complete. - -The Eval Protocol pipeline is responsible for invoking an -OpenAI-compatible API between steps and feeding the resulting assistant messages -back into the rollout. This illustrates how an environment can interact with an -LLM at every step while keeping model calls in the pipeline. - -A concrete example of this is the [Frozen Lake Example](./frozen_lake_plan.md), which uses a remote HTTP rollout server to play the Frozen Lake game. diff --git a/eval_protocol/agent/orchestrator.py b/eval_protocol/agent/orchestrator.py index 410baf5f..2f737e2c 100644 --- a/eval_protocol/agent/orchestrator.py +++ b/eval_protocol/agent/orchestrator.py @@ -57,7 +57,6 @@ class ChatCompletionMessageToolCall: BFCLSimAPIResource, DockerResource, FileSystemResource, - HttpRolloutResource, PythonStateResource, SQLResource, ) @@ -244,8 +243,6 @@ def _get_resource_class(self, resource_type_name: str) -> Type[ForkableResource] "FileSystemResource": FileSystemResource, "DockerResource": DockerResource, "BFCLSimAPIResource": BFCLSimAPIResource, # Add BFCLSimAPIResource to mapping - "HttpRolloutResource": HttpRolloutResource, # Add HttpRolloutResource to mapping - "http_rollout": HttpRolloutResource, # Allow lowercase alias for convenience } resource_class = mapping.get(resource_type_name) diff --git a/eval_protocol/agent/resources/__init__.py b/eval_protocol/agent/resources/__init__.py index 7f5a03b0..852e1597 100644 --- a/eval_protocol/agent/resources/__init__.py +++ b/eval_protocol/agent/resources/__init__.py @@ -7,20 +7,6 @@ from .bfcl_sim_api_resource import BFCLSimAPIResource from .docker_resource import DockerResource from .filesystem_resource import FileSystemResource - -# HTTP Rollout Protocol types for server implementations -from .http_rollout_protocol import ( - EndEpisodeRequest, - EndEpisodeResponse, - GameObservation, - HealthResponse, - HttpRolloutConfig, - StartEpisodeRequest, - StartEpisodeResponse, - StepRequest, - StepResponse, -) -from .http_rollout_resource import HttpRolloutResource from .python_state_resource import PythonStateResource from .sql_resource import SQLResource @@ -30,15 +16,4 @@ "FileSystemResource", "DockerResource", "BFCLSimAPIResource", - "HttpRolloutResource", - # HTTP Rollout Protocol - "HttpRolloutConfig", - "StartEpisodeRequest", - "StartEpisodeResponse", - "StepRequest", - "StepResponse", - "EndEpisodeRequest", - "EndEpisodeResponse", - "HealthResponse", - "GameObservation", ] diff --git a/eval_protocol/agent/resources/http_rollout_protocol.py b/eval_protocol/agent/resources/http_rollout_protocol.py deleted file mode 100644 index d6992d0f..00000000 --- a/eval_protocol/agent/resources/http_rollout_protocol.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -HTTP Rollout Protocol - Standardized types for HTTP rollout communication. - -This module defines the standard request/response models for HTTP rollout servers -and clients, ensuring consistent communication across different implementations. -""" - -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel - - -class StartEpisodeRequest(BaseModel): - """Request to start a new episode.""" - - class Config: - extra = "allow" # Allow arbitrary extra fields (like seed) - - -class StartEpisodeResponse(BaseModel): - """Response from starting a new episode.""" - - episode_id: str - observation: Dict[str, Any] - - -class StepRequest(BaseModel): - """Request to take a step in the environment.""" - - episode_id: str - action: Any # Can be int, str, dict, etc. depending on environment - - -class StepResponse(BaseModel): - """Response from taking a step in the environment.""" - - observation: Dict[str, Any] - is_done: bool - info: Optional[Dict[str, Any]] = None - - -class EndEpisodeRequest(BaseModel): - """Request to end an episode.""" - - episode_id: str - - -class EndEpisodeResponse(BaseModel): - """Response from ending an episode.""" - - message: str - - -class HealthResponse(BaseModel): - """Response from health check endpoint.""" - - status: str - game: Optional[str] = None - version: Optional[str] = None - - -class HttpRolloutConfig(BaseModel): - """Configuration for HTTP rollout resource.""" - - base_url: str - start_episode_endpoint: str = "/start_episode" - step_endpoint: str = "/step" - end_episode_endpoint: str = "/end_episode" - health_endpoint: str = "/health" - timeout: float = 30.0 - max_retries: int = 3 - - -# Observation structure for game environments -class GameObservation(BaseModel): - """Standard observation structure for game environments.""" - - position: Optional[List[int]] = None - current_cell: Optional[str] = None - done: bool = False - won: bool = False - visual: Optional[str] = None - message: Optional[str] = None - step_count: Optional[int] = None - max_steps: Optional[int] = None diff --git a/eval_protocol/agent/resources/http_rollout_resource.py b/eval_protocol/agent/resources/http_rollout_resource.py deleted file mode 100644 index 04be93e0..00000000 --- a/eval_protocol/agent/resources/http_rollout_resource.py +++ /dev/null @@ -1,325 +0,0 @@ -""" -HTTP Rollout Resource implementation for the agent evaluation framework. - -This resource bridges the HTTP rollout protocol with the ForkableResource interface, -allowing HTTP-based environments to be used in agent evaluations. -""" - -import json -import uuid -from typing import Any, Dict, List, Optional - -import httpx - -from ..resource_abc import ForkableResource -from .http_rollout_protocol import ( - EndEpisodeRequest, - GameObservation, - HttpRolloutConfig, - StartEpisodeRequest, - StartEpisodeResponse, - StepRequest, - StepResponse, -) - - -class HttpRolloutResource(ForkableResource): - """ - A ForkableResource implementation that communicates with HTTP rollout servers. - - This resource allows the agent evaluation framework to interact with - HTTP-based environments through a standardized rollout protocol. - """ - - def __init__(self): - """Initialize the HTTP rollout resource.""" - super().__init__() - self.config: Optional[HttpRolloutConfig] = None - self.episode_id: Optional[str] = None - self.current_observation: Optional[Dict[str, Any]] = None - self.is_episode_active = False - self.client: Optional[httpx.Client] = None - - # Set up logging - import logging - - self.logger = logging.getLogger(f"{self.__class__.__name__}") - - async def setup(self, config: Dict[str, Any]) -> None: - """ - Set up the resource with the provided configuration. - - Args: - config: Configuration dictionary from the task definition - """ - self.config = HttpRolloutConfig(**config) - self.client = httpx.Client(timeout=self.config.timeout) - - async def fork(self) -> "HttpRolloutResource": - """ - Create a new independent instance of this resource. - - For HTTP rollout, forking means creating a new resource instance - that will start its own episode when initialized. - """ - if not self.config: - raise RuntimeError("Resource not set up. Call setup() first.") - - # Create a new instance with the same config - new_resource = HttpRolloutResource() - await new_resource.setup(self.config.model_dump()) - return new_resource - - async def get_state(self) -> Dict[str, Any]: - """ - Get the current state of the resource. - - Returns the current observation and episode metadata. - """ - return { - "episode_id": self.episode_id, - "observation": self.current_observation, - "is_episode_active": self.is_episode_active, - "type": "http_rollout", - } - - async def initialize(self, **kwargs) -> None: - """ - Initialize the resource by starting a new episode. - Passes any provided kwargs (like seed) to the server in the request body. - """ - try: - url = f"{self.config.base_url}{self.config.start_episode_endpoint}" - - # Include any sample data (like seed) in the request body - if kwargs: - self.logger.info(f"Sending initialization data to server: {kwargs}") - response = self.client.post(url, json=kwargs) - else: - response = self.client.post(url) - response.raise_for_status() - - episode_data = response.json() - self.episode_id = episode_data["episode_id"] - self.current_observation = episode_data["observation"] - self.is_episode_active = True - - except Exception as e: - raise RuntimeError(f"Failed to start HTTP rollout episode: {e}") - - async def get_initial_state_description(self) -> str: - """ - Get a formatted description of the initial game state for the agent. - Uses the observation from start_episode to build the prompt. - """ - # Start episode to get current game state - if not self.is_episode_active: - await self.initialize() - - if not self.current_observation: - return "No initial state available." - - obs = self.current_observation - - # Build comprehensive game prompt - content = """๐ŸŽฎ FROZEN LAKE GAME - AUTONOMOUS PLAY MODE - -๐ŸŽฏ OBJECTIVE: Navigate from S to G without hitting H - -๐Ÿ“‹ GAME RULES: S=start, F=safe, H=hole(death), G=goal(win) - -๐Ÿค– AUTONOMOUS MODE INSTRUCTIONS: -- You are playing this game AUTONOMOUSLY until completion -- KEEP MAKING MOVES using the step tool until you reach G or hit H -- DO NOT ask for user input or wait for confirmation -- DO NOT stop after one move - continue until the game ends -- Each move should be followed immediately by another move -- Game only ends when you reach G (win) or hit H (lose) - -๐ŸŽฎ ACTION: Use step tool with: "left", "right", "up", or "down" - -โšก START NOW - Make your first move and continue until the game is complete!""" - - description_parts = [content] - - if obs.get("message"): - description_parts.append(f"\nEnvironment: {obs['message']}") - - if obs.get("visual"): - description_parts.append(f"\nGame Board:\n{obs['visual']}") - - if obs.get("position"): - description_parts.append(f"\nStarting Position: {obs['position']}") - - description_parts.append("\nGame Rules:") - description_parts.append("- S = Start position") - description_parts.append("- F = Frozen (safe to step on)") - description_parts.append("- H = Hole (game over if you step here)") - description_parts.append("- G = Goal (reach this to win)") - description_parts.append("- [X] = Your current position") - - return "\n".join(description_parts) - - async def cleanup(self) -> None: - """ - Clean up the resource by ending the current episode. - """ - if self.is_episode_active and self.episode_id: - try: - url = f"{self.config.base_url}{self.config.end_episode_endpoint}" - response = self.client.post(url, json={"episode_id": self.episode_id}) - response.raise_for_status() - - except Exception as e: - # Log but don't raise - cleanup should be best effort - print(f"Warning: Failed to properly end episode {self.episode_id}: {e}") - - finally: - self.episode_id = None - self.current_observation = None - self.is_episode_active = False - - # Close the HTTP client - self.client.close() - - async def get_tools_spec(self) -> List[Dict[str, Any]]: - """ - Get the list of available tools for this resource. - - For HTTP rollout, this returns the 'step' tool that allows - the agent to take actions in the environment. - """ - return [ - { - "name": "step", - "description": "Take a step in the Frozen Lake game by choosing a direction to move", - "parameters": { - "type": "object", - "properties": { - "action": { - "type": "string", - "enum": ["left", "down", "right", "up"], - "description": "The direction to move in the game: 'left', 'down', 'right', or 'up'", - } - }, - "required": ["action"], - }, - } - ] - - async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any: - """ - Execute a tool call on this resource. - - For HTTP rollout, this handles the 'step' tool by sending - the action to the HTTP rollout server. - """ - if not self.is_episode_active or not self.episode_id: - # If no active episode, start one first - await self.initialize() - - if action_name == "step": - action = action_params.get("action") - return await self._handle_step_tool(action) - else: - raise ValueError(f"Unknown action: {action_name}") - - async def get_observation(self) -> Any: - """ - Get the current observation from the environment. - """ - if self.current_observation: - return self.current_observation - else: - return {"message": "No observation available. Start an episode first."} - - async def checkpoint(self) -> Dict[str, Any]: - """ - Create a checkpoint of the current resource state. - - For HTTP rollout, this saves the episode ID and current observation. - """ - return { - "episode_id": self.episode_id, - "current_observation": self.current_observation, - "is_episode_active": self.is_episode_active, - } - - async def restore(self, state_data: Dict[str, Any]) -> None: - """ - Restore the resource state from a checkpoint. - - Note: This is limited for HTTP rollout since we can't restore - arbitrary server-side state. - """ - self.episode_id = state_data.get("episode_id") - self.current_observation = state_data.get("current_observation") - self.is_episode_active = state_data.get("is_episode_active", False) - - async def close(self) -> None: - """ - Clean up and close the resource. - """ - await self.cleanup() - - async def _handle_step_tool(self, action: Any) -> Dict[str, Any]: - """ - Handle the 'step' tool by sending an action to the HTTP rollout server. - """ - try: - # Convert string action to integer for the server - action_map = {"left": 0, "down": 1, "right": 2, "up": 3} - - if isinstance(action, str): - if action.lower() not in action_map: - raise ValueError(f"Invalid action '{action}'. Must be one of: left, down, right, up") - numeric_action = action_map[action.lower()] - else: - # Backward compatibility with numeric actions - numeric_action = action - - url = f"{self.config.base_url}{self.config.step_endpoint}" - step_data = {"episode_id": self.episode_id, "action": numeric_action} - - response = self.client.post(url, json=step_data) - response.raise_for_status() - - step_result = response.json() - self.current_observation = step_result["observation"] - - # If the episode is done, mark it as inactive - if step_result.get("is_done", False): - self.is_episode_active = False - - # Format the response for the agent - observation = step_result["observation"] - message = observation.get("message", "") - visual = observation.get("visual", "") - - # Create a comprehensive response - response_content = [] - if message: - response_content.append(f"Environment: {message}") - if visual: - response_content.append(f"Visual State:\n{visual}") - - # Add structured data - response_content.append(f"Position: {observation.get('position', 'unknown')}") - response_content.append(f"Done: {step_result.get('is_done', False)}") - - if step_result.get("is_done", False): - won = observation.get("won", False) - response_content.append(f"Result: {'Victory!' if won else 'Game Over'}") - - return {"content": [{"type": "text", "text": "\n".join(response_content)}]} - - except Exception as e: - raise RuntimeError(f"Failed to execute step: {e}") - - def __del__(self): - """Ensure cleanup on deletion.""" - if hasattr(self, "client") and self.client: - try: - self.client.close() - except Exception: - pass # Ignore cleanup errors during deletion diff --git a/examples/frozen_lake/README.md b/examples/frozen_lake/README.md deleted file mode 100644 index 803591a7..00000000 --- a/examples/frozen_lake/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Frozen Lake Agent Evaluation - -This example demonstrates LLM agent evaluation on the Frozen Lake game using eval-protocol's HTTP rollout framework. The agent must navigate from start (S) to goal (G) while avoiding holes (H). - -## Quick Start - -### Setup -```bash -# For Fireworks AI -export FIREWORKS_API_KEY="your_fireworks_api_key" -export MODEL_AGENT="fireworks/accounts/fireworks/models/qwen3-235b-a22b" - -# For OpenAI -export OPENAI_API_KEY="your_openai_api_key" -export MODEL_AGENT="openai/gpt-4o-mini" - -# For other providers, set appropriate API key and MODEL_AGENT -``` - -### Run Evaluation -```bash -# Batch evaluation (8 parallel rollouts) - recommended -eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml - -# Single rollout for debugging -eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml --num-rollouts 1 - -# Custom batch size -eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml --num-rollouts 16 -``` - -### Output -```bash -Task 'frozen_lake_http_rollout' batch results: - - Rollouts: 6/8 successful - - Success rate: 75.00% - - Average score: 0.7500 ยฑ 0.4330 - - Trajectory data saved to: client/evaluation_logs/trajectory_frozen_lake_http_rollout_20250610_143052.jsonl -``` - - -## Architecture - -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” HTTP โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ Client Side โ”‚ โ—„โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–บ โ”‚ Server Side โ”‚ -โ”‚ (eval-protocol) โ”‚ Rollout โ”‚ (Game Env) โ”‚ -โ”‚ โ”‚ โ”‚ โ”‚ -โ”‚ โ€ข Agent Eval โ”‚ โ”‚ โ€ข Game Logic โ”‚ -โ”‚ โ€ข Reward Func โ”‚ โ”‚ โ€ข State Mgmt โ”‚ -โ”‚ โ€ข Trajectory โ”‚ โ”‚ โ€ข HTTP API โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ -``` - -## Project Structure - -``` -frozen_lake/ -โ”œโ”€โ”€ README.md # This overview -โ”œโ”€โ”€ server/ # Game Environment (HTTP API) -โ”‚ โ”œโ”€โ”€ README.md # Server documentation -โ”‚ โ””โ”€โ”€ http_rollout_server.py # FastAPI game server -โ””โ”€โ”€ client/ # Agent Evaluation - โ”œโ”€โ”€ task_def.yaml # Task configuration (works with any model) - โ”œโ”€โ”€ reward.py # Reward function - โ””โ”€โ”€ evaluation_logs/ # Generated results & trajectories - โ”œโ”€โ”€ trajectory_*.jsonl # Conversation histories + tool calls - โ””โ”€โ”€ *_reeval_*.jsonl # Re-evaluation results -``` - -## Game Rules - -**Objective:** Navigate from S to G without falling into holes (H) - -``` -[S] F F F - F H F H - F F F H - H F F G -``` - -**Actions:** `"left"`, `"right"`, `"up"`, `"down"` - -## Trajectory Data Format - -Each trajectory JSONL file contains: - -```json -{"type": "summary", "task_id": "frozen_lake_http_rollout", "num_rollouts": 8, "success_rate": 0.75, "avg_score": 0.75} -{"type": "individual_result", "rollout_index": 0, "score": 1.0, "conversation_messages": [...], "reward_function_inputs": {...}} -``` - -### Conversation Messages -Complete OpenAI format conversation history: -- User prompts -- Assistant responses with reasoning -- Tool calls (game actions) -- Tool results (game observations) - -### Reward Function Inputs -Exact parameters passed to reward functions: -- `messages`: Full conversation history -- `state`: Game state and successful function calls -- `task_achieved`: Success/failure status -- `ground_truth`: Reference data (if available) - -## Customization - -### Custom Reward Functions -Create new reward functions and test them on existing trajectories: - -```python -# my_rewards.py -from eval_protocol.typed_interface import reward_function -from eval_protocol.models import EvaluateResult, MetricResult - -@reward_function -def efficiency_reward(messages, state=None, **kwargs): - # Count steps taken - step_count = len(state.get("successful_func_calls", [[]])[0]) - - # Reward fewer steps - efficiency_score = max(0.0, 1.0 - (step_count - 4) * 0.1) - - return EvaluateResult( - score=efficiency_score, - reason=f"Efficiency reward: {step_count} steps", - metrics={"efficiency": MetricResult(score=efficiency_score, reason="Step efficiency")} - ) -``` - - -## Model Performance - -| Model | Success Rate | Average Score | Best Strategy | -|-------|-------------|---------------|---------------| -| qwen3-235b-a22b | 75-100% | 0.75-1.0 | downโ†’downโ†’rightโ†’rightโ†’downโ†’right | -| gpt-4o-mini | 0-25% | 0.0-0.25 | Often fails at holes | - -## Troubleshooting - -- **Connection errors**: Server auto-starts, check port conflicts -- **API key issues**: Verify MODEL_AGENT and API key are set -- **Empty trajectories**: Check `client/evaluation_logs/` directory -- **Re-evaluation errors**: Ensure reward function module path is correct - -## Next Steps - -1. **Run the example**: Start with single rollout, then batch evaluation -2. **Analyze trajectories**: Examine generated JSONL files -3. **Create custom rewards**: Implement your own scoring functions -4. **Compare approaches**: Use re-evaluation to test different strategies diff --git a/examples/frozen_lake/analyze_trajectory.py b/examples/frozen_lake/analyze_trajectory.py deleted file mode 100755 index fe6a90c0..00000000 --- a/examples/frozen_lake/analyze_trajectory.py +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/env python3 -""" -Agent Trajectory Analyzer for Frozen Lake HTTP Rollout Evaluation - -This script parses the evaluation logs and creates a human-readable -trajectory showing the agent's decision making process. -""" - -import json -import re -import sys -from pathlib import Path -from typing import Any, Dict, List, Optional - - -def extract_tool_calls_from_log(log_content: str) -> List[Dict[str, Any]]: - """Extract tool calls and their results from the log.""" - tool_calls = [] - - # Find all tool call patterns - tool_call_pattern = r"Attempting tool call: (\w+)\((.*?)\)" - tool_result_pattern = r"Tool '(\w+)' result: (.*?)(?=INFO:|DEBUG:|ERROR:|$)" - - tool_calls_matches = re.finditer(tool_call_pattern, log_content, re.DOTALL) - - for match in tool_calls_matches: - tool_name = match.group(1) - tool_args = match.group(2) - - # Try to parse the arguments as JSON - try: - args_dict = json.loads(tool_args) - except (json.JSONDecodeError, TypeError, ValueError): - args_dict = {"raw": tool_args} - - tool_call = {"tool_name": tool_name, "arguments": args_dict, "result": None} - - # Find the corresponding result - result_pattern = rf"Tool '{tool_name}' result: (.*?)(?=INFO:|DEBUG:|ERROR:|$)" - result_match = re.search(result_pattern, log_content[match.end() :], re.DOTALL) - - if result_match: - result_text = result_match.group(1).strip() - # Try to parse as JSON - try: - tool_call["result"] = json.loads(result_text) - except (json.JSONDecodeError, TypeError, ValueError): - tool_call["result"] = {"raw": result_text} - - tool_calls.append(tool_call) - - return tool_calls - - -def extract_agent_messages(log_content: str) -> List[Dict[str, Any]]: - """Extract the agent's reasoning and responses.""" - messages = [] - - # Find OpenAI response messages - response_pattern = r"OpenAI response message: ChatCompletionMessage\((.*?)\)" - - for match in re.finditer(response_pattern, log_content, re.DOTALL): - message_str = match.group(1) - - # Extract thinking content - think_match = re.search(r"content='(.*?)', refusal=", message_str, re.DOTALL) - if think_match: - thinking = think_match.group(1) - - # Extract tags - think_content_match = re.search(r"(.*?)", thinking, re.DOTALL) - if think_content_match: - thinking_content = think_content_match.group(1).strip() - else: - thinking_content = thinking - - messages.append({"type": "thinking", "content": thinking_content}) - - # Extract tool calls from the message - tool_calls_match = re.search(r"tool_calls=\[(.*?)\]", message_str, re.DOTALL) - if tool_calls_match: - messages.append({"type": "tool_calls", "content": tool_calls_match.group(1)}) - - return messages - - -def extract_game_state_changes(log_content: str) -> List[Dict[str, Any]]: - """Extract game state changes from the environment responses.""" - states = [] - - # Find environment responses - env_pattern = r"Environment: (.*?)(?=\\n|Position:|Done:)" - visual_pattern = r"Visual State:\\n(.*?)(?=\\nPosition:|\\nDone:)" - position_pattern = r"Position: (\[.*?\])" - done_pattern = r"Done: (True|False)" - - # Find all environment messages - env_matches = re.finditer( - r"Tool 'step' result:.*?Environment: (.*?)\\nVisual State:\\n(.*?)\\nPosition: (\[.*?\])\\nDone: (True|False)", - log_content, - re.DOTALL, - ) - - for i, match in enumerate(env_matches): - env_message = match.group(1) - visual_state = match.group(2) - position = match.group(3) - done = match.group(4) == "True" - - states.append( - { - "step": i + 1, - "message": env_message, - "visual_state": visual_state.replace("\\n", "\n"), - "position": position, - "done": done, - } - ) - - return states - - -def create_trajectory_report(log_file: str) -> str: - """Create a detailed trajectory report.""" - - with open(log_file, "r") as f: - log_content = f.read() - - tool_calls = extract_tool_calls_from_log(log_content) - agent_messages = extract_agent_messages(log_content) - game_states = extract_game_state_changes(log_content) - - report = [] - report.append("FROZEN LAKE AGENT TRAJECTORY ANALYSIS") - report.append("=" * 50) - report.append("") - - # Summary - report.append(f"๐Ÿ“Š SUMMARY:") - report.append(f"โ€ข Total tool calls: {len(tool_calls)}") - report.append(f"โ€ข Total reasoning steps: {len([m for m in agent_messages if m['type'] == 'thinking'])}") - report.append(f"โ€ข Game state changes: {len(game_states)}") - report.append("") - - # Detailed trajectory - report.append("๐ŸŽฎ DETAILED TRAJECTORY:") - report.append("-" * 30) - report.append("") - - for i, tool_call in enumerate(tool_calls): - step_num = i + 1 - report.append(f"STEP {step_num}: {tool_call['tool_name'].upper()}") - report.append(f"Arguments: {tool_call['arguments']}") - - # Add corresponding game state if available - if i < len(game_states): - state = game_states[i] - report.append(f"Result: {state['message']}") - report.append(f"Position: {state['position']}") - report.append(f"Visual State:") - for line in state["visual_state"].split("\n"): - if line.strip(): - report.append(f" {line}") - report.append(f"Game Done: {state['done']}") - - report.append("") - - # Agent reasoning analysis - report.append("๐Ÿง  AGENT REASONING:") - report.append("-" * 20) - report.append("") - - thinking_messages = [m for m in agent_messages if m["type"] == "thinking"] - for i, message in enumerate(thinking_messages[:3]): # Show first 3 reasoning steps - report.append(f"REASONING STEP {i+1}:") - # Truncate long reasoning for readability - content = message["content"] - if len(content) > 500: - content = content[:500] + "...[truncated]" - report.append(content) - report.append("") - - if len(thinking_messages) > 3: - report.append(f"... and {len(thinking_messages) - 3} more reasoning steps") - report.append("") - - # Game progression analysis - report.append("๐Ÿ“ GAME PROGRESSION:") - report.append("-" * 20) - report.append("") - - positions = [] - for state in game_states: - try: - pos = eval(state["position"]) # Convert string representation to list - positions.append(pos) - except (SyntaxError, NameError, TypeError, ValueError): - positions.append(state["position"]) - - if positions: - report.append("Path taken:") - for i, pos in enumerate(positions): - if i == 0: - report.append(f" Start: {pos}") - else: - prev_pos = positions[i - 1] - direction = get_direction(prev_pos, pos) - report.append(f" Step {i}: {prev_pos} โ†’ {pos} ({direction})") - - # Final position - if positions: - final_pos = positions[-1] - # Check if reached goal (typically at [3,3]) - if final_pos == [3, 3]: - report.append(f" ๐ŸŽ‰ GOAL REACHED at {final_pos}!") - else: - report.append(f" Final position: {final_pos}") - - return "\n".join(report) - - -def get_direction(from_pos: List[int], to_pos: List[int]) -> str: - """Determine the direction of movement.""" - if len(from_pos) != 2 or len(to_pos) != 2: - return "unknown" - - row_diff = to_pos[0] - from_pos[0] - col_diff = to_pos[1] - from_pos[1] - - if row_diff == 0 and col_diff == 1: - return "RIGHT" - elif row_diff == 0 and col_diff == -1: - return "LEFT" - elif row_diff == 1 and col_diff == 0: - return "DOWN" - elif row_diff == -1 and col_diff == 0: - return "UP" - else: - return f"DIAGONAL({row_diff},{col_diff})" - - -def main(): - if len(sys.argv) != 2: - print("Usage: python analyze_trajectory.py ") - sys.exit(1) - - log_file = sys.argv[1] - - if not Path(log_file).exists(): - print(f"Error: Log file {log_file} not found") - sys.exit(1) - - try: - report = create_trajectory_report(log_file) - - # Save to analysis file - analysis_file = str(Path(log_file).with_suffix(".analysis.txt")) - with open(analysis_file, "w") as f: - f.write(report) - - print(report) - print(f"\n๐Ÿ“„ Analysis saved to: {analysis_file}") - - except Exception as e: - print(f"Error analyzing trajectory: {e}") - import traceback - - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/frozen_lake/client/dataset.jsonl b/examples/frozen_lake/client/dataset.jsonl deleted file mode 100644 index 27c5c684..00000000 --- a/examples/frozen_lake/client/dataset.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{"id": "run_001", "seed": 42} -{"id": "run_002", "seed": 123} -{"id": "run_003", "seed": 999} diff --git a/examples/frozen_lake/client/reward.py b/examples/frozen_lake/client/reward.py deleted file mode 100644 index 7239b8aa..00000000 --- a/examples/frozen_lake/client/reward.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Reward function for the Frozen Lake evaluation task. -""" - -from typing import List, Optional - -from eval_protocol.models import EvaluateResult, Message, MetricResult, StepOutput -from eval_protocol.typed_interface import reward_function - - -@reward_function -def frozen_lake_reward(messages: List[Message], state=None, **kwargs) -> EvaluateResult: - """ - Evaluate the final message list for a success string in the Frozen Lake game. - - Args: - messages: List of conversation messages - state: State dictionary containing trajectory data - **kwargs: Additional keyword arguments - - Returns: - EvaluateResult with score 1.0 for success, 0.0 for failure - """ - # Check if the last message (from the game) contains success indicators - if not messages: - return EvaluateResult( - score=0.0, - reason="No messages provided", - metrics={"success": MetricResult(score=0.0, reason="No messages provided")}, - ) - - # Check all messages (especially tool responses) for game outcome - def extract_content_from_message(msg): - """Extract text content from a message, handling JSON-encoded tool responses.""" - content = msg.content - if content and isinstance(content, str): - try: - # Try to parse JSON content from tool responses - import json - - parsed_content = json.loads(content) - if isinstance(parsed_content, dict) and "content" in parsed_content: - # Extract text from tool response format - content_list = parsed_content["content"] - if isinstance(content_list, list) and len(content_list) > 0: - text_item = content_list[0] - if isinstance(text_item, dict) and "text" in text_item: - content = text_item["text"] - except (json.JSONDecodeError, KeyError, IndexError, TypeError): - # If parsing fails, use original content - pass - return content.lower() if content else "" - - # Check for success/failure indicators in all messages - success_indicators = [ - "you win", - "you reached the goal", - "congratulations", - "success", - "goal reached", - "you made it", - "victory", - ] - - failure_indicators = ["you lose", "game over", "you fell", "hole"] - - is_success = False - is_failure = False - winning_message = "" - losing_message = "" - - # Check all messages for game outcome indicators - for msg in messages: - content = extract_content_from_message(msg) - - # Check for success - for indicator in success_indicators: - if indicator in content: - is_success = True - winning_message = content[:100] - break - - # Check for failure - for indicator in failure_indicators: - if indicator in content: - is_failure = True - losing_message = content[:100] - break - - # Determine the score (success takes precedence over failure) - if is_success: - score = 1.0 - reason = "Successfully reached the goal in Frozen Lake" - elif is_failure: - score = 0.0 - reason = "Failed to reach the goal (fell into hole or other failure)" - else: - # If no clear success/failure indicator, check if game is still ongoing - score = 0.0 - reason = "Game outcome unclear or still in progress" - - metrics = {"success": MetricResult(score=score, reason=reason, is_score_valid=True)} - - # Extract trajectory data if available - step_outputs = None - if state and "successful_func_calls" in state: - successful_calls = state["successful_func_calls"] - step_outputs = [] - - # Convert function calls to StepOutput format - step_index = 0 - for turn_calls in successful_calls: - for call in turn_calls: - # Extract action from function call arguments - action = call.get("args", {}).get("action", "unknown") - step_outputs.append( - StepOutput( - step_index=step_index, - action=action, - base_reward=(0.1 if action != "unknown" else 0.0), # Small reward for valid actions - reason=f"Agent took action: {action}", - ) - ) - step_index += 1 - - return EvaluateResult(score=score, reason=reason, metrics=metrics, step_outputs=step_outputs) diff --git a/examples/frozen_lake/client/task_def.yaml b/examples/frozen_lake/client/task_def.yaml deleted file mode 100644 index 62ee9540..00000000 --- a/examples/frozen_lake/client/task_def.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: "frozen_lake_http_rollout" -description: "Evaluate an agent's ability to navigate a Frozen Lake environment via HTTP rollout" - -# Data-driven configuration -dataset_path: "examples/frozen_lake/client/dataset.jsonl" -num_rollouts_per_sample: 1 # This is 'N', the number of rollouts per seed - -# Resource configuration - connects to the game server -resource_type: "http_rollout" -base_resource_config: - base_url: "http://localhost:8080" # Will be dynamically updated by TaskManager - timeout: 30.0 - -# Resource server configuration - automatically managed by TaskManager -resource_server: - start_command: "python examples/frozen_lake/server/http_rollout_server.py --port {port}" - health_check_url: "http://localhost:{port}/health" - -# Reward function - the only client-side logic needed -reward_function_path: "examples.frozen_lake.client.reward.frozen_lake_reward" - -# Initial user message - gets extended with game state from the server -messages: - - role: "user" - content: "Start playing the game!" - -# Evaluation configuration -poc_max_turns: 20 - -# Generation configuration -generation: - enabled: true - _target_: eval_protocol.generation.generate_responses - model_name: "accounts/fireworks/models/qwen3-235b-a22b" - temperature: 0.0 - max_new_tokens: 500 - batch_size: 1 - cache: - enabled: true - -# Reward Function Configuration -reward: - function_path: "examples.frozen_lake.client.reward.frozen_lake_reward" - -# Note: The TaskManager will automatically start the server on a free port -# and update the base_url accordingly for parallel execution support diff --git a/examples/frozen_lake/gymnasium_frozen_lake_server.py b/examples/frozen_lake/gymnasium_frozen_lake_server.py deleted file mode 100644 index 6a99153e..00000000 --- a/examples/frozen_lake/gymnasium_frozen_lake_server.py +++ /dev/null @@ -1,281 +0,0 @@ -""" -Gymnasium-based Frozen Lake game server implementation. - -This implementation wraps the official Gymnasium FrozenLake-v1 environment -and provides the same interface as the hand-rolled implementation for -seamless integration with the HTTP rollout server. -""" - -from typing import Dict, Optional, Tuple, Union - -import gymnasium as gym -import numpy as np -from gymnasium.envs.toy_text.frozen_lake import generate_random_map - - -class GymnasiumFrozenLakeGame: - """ - Gymnasium-based Frozen Lake game implementation. - - This class wraps the Gymnasium FrozenLake-v1 environment and provides - a compatible interface with the hand-rolled implementation. - - The game is played on a 4x4 grid where: - - S: Starting position - - F: Frozen surface (safe to walk on) - - H: Hole (game over if you fall in) - - G: Goal (reach this to win) - - Actions: - - 0: Left - - 1: Down - - 2: Right - - 3: Up - """ - - def __init__( - self, - map_name: str = "4x4", - is_slippery: bool = False, - render_mode: Optional[str] = None, - seed: Optional[int] = None, - **kwargs, - ): - """ - Initialize the Gymnasium Frozen Lake game. - - Args: - map_name: Map size ("4x4" or "8x8") - only used if seed is None - is_slippery: Whether the ice is slippery (stochastic environment) - render_mode: Rendering mode for Gymnasium environment - seed: Random seed for reproducible map generation and behavior - **kwargs: Additional keyword arguments (ignored for compatibility) - """ - self.map_name = map_name - self.is_slippery = is_slippery - self.seed = seed - - # Create the Gymnasium environment - if seed is not None: - # Use random map generation with seed for reproducible boards - size = 4 if map_name == "4x4" else 8 - desc = generate_random_map(size=size, p=0.8, seed=seed) - self.env = gym.make( - "FrozenLake-v1", - desc=desc, - is_slippery=is_slippery, - render_mode=render_mode, - ) - else: - # Use fixed predefined maps - self.env = gym.make( - "FrozenLake-v1", - map_name=map_name, - is_slippery=is_slippery, - render_mode=render_mode, - ) - - # Get environment properties - self.desc = self.env.unwrapped.desc - self.nrow, self.ncol = self.desc.shape - self.nS = self.env.observation_space.n - self.nA = self.env.action_space.n - - # Find start and goal positions - self.start_pos = None - self.goal_pos = None - for i in range(self.nrow): - for j in range(self.ncol): - if self.desc[i, j] == b"S": - self.start_pos = (i, j) - elif self.desc[i, j] == b"G": - self.goal_pos = (i, j) - - # Initialize state tracking - self.current_state: Optional[int] = None - self.current_pos: Optional[Tuple[int, int]] = None - self.done = False - self.won = False - - self.reset() - - def _state_to_pos(self, state: int) -> Tuple[int, int]: - """Convert state number to (row, col) position.""" - return state // self.ncol, state % self.ncol - - def _pos_to_state(self, row: int, col: int) -> int: - """Convert (row, col) position to state number.""" - return row * self.ncol + col - - def reset(self) -> Dict: - """Reset the game to the starting position.""" - if self.seed is not None: - self.current_state, _ = self.env.reset(seed=self.seed) - else: - self.current_state, _ = self.env.reset() - self.current_pos = self._state_to_pos(self.current_state) - self.done = False - self.won = False - return self._get_observation() - - def step(self, action: Union[int, str]) -> Tuple[Dict, bool]: - """ - Take a step in the environment. - - Args: - action: Action to take. Can be: - - Integer: 0=left, 1=down, 2=right, 3=up - - String: "left", "down", "right", "up" - - Returns: - Tuple of (observation, done) - """ - if self.done: - return self._get_observation(), True - - # Convert string action to integer if needed - if isinstance(action, str): - action_map = {"left": 0, "down": 1, "right": 2, "up": 3} - if action.lower() not in action_map: - raise ValueError(f"Invalid action '{action}'. Must be one of: left, down, right, up") - numeric_action = action_map[action.lower()] - else: - numeric_action = action - - if not (0 <= numeric_action < self.nA): - raise ValueError(f"Invalid action: {numeric_action}. Must be 0-{self.nA-1}") - - # Take the step in the Gymnasium environment - new_state, reward, terminated, truncated, info = self.env.step(numeric_action) - - # Update our state tracking - self.current_state = new_state - self.current_pos = self._state_to_pos(new_state) - self.done = terminated or truncated - self.won = reward > 0 # In FrozenLake, reward=1 for reaching goal, 0 otherwise - - return self._get_observation(), self.done - - def _get_observation(self) -> Dict: - """Get the current observation.""" - if self.current_pos is None: - raise RuntimeError("Game not initialized") - row, col = self.current_pos - cell = self.desc[row, col].decode("utf-8") - - # Create a visual representation - visual = [] - for i in range(self.nrow): - row_str = "" - for j in range(self.ncol): - if (i, j) == self.current_pos: - row_str += "[" + self.desc[i, j].decode("utf-8") + "]" - else: - row_str += " " + self.desc[i, j].decode("utf-8") + " " - visual.append(row_str) - - obs = { - "position": self.current_pos, - "current_cell": cell, - "done": self.done, - "won": self.won, - "visual": "\n".join(visual), - "message": self._get_message(), - "state": self.current_state, # Add the Gymnasium state for compatibility - } - - return obs - - def _get_message(self) -> str: - """Get a descriptive message about the current state.""" - if self.done: - if self.won: - return "Congratulations! You reached the goal! You win!" - else: - return "Oh no! You fell into a hole. Game over." - else: - if self.current_pos is None: - return "Game not initialized" - row, col = self.current_pos - cell = self.desc[row, col].decode("utf-8") - return f"You are at position ({row}, {col}) on a {cell} cell. Choose your next move carefully." - - def close(self): - """Close the Gymnasium environment.""" - self.env.close() - - def render(self, mode: str = "human"): - """Render the environment using Gymnasium's rendering.""" - return self.env.render() - - def get_action_meanings(self): - """Get human-readable action meanings.""" - return ["Left", "Down", "Right", "Up"] - - def get_action_space_info(self): - """Get information about the action space.""" - return { - "type": "Discrete", - "n": int(self.nA), # Convert numpy int to Python int - "actions": {0: "left", 1: "down", 2: "right", 3: "up"}, - } - - def get_observation_space_info(self): - """Get information about the observation space.""" - return { - "type": "Discrete", - "n": int(self.nS), # Convert numpy int to Python int - "shape": ( - int(self.nrow), - int(self.ncol), - ), # Convert numpy ints to Python ints - "description": "State number representing position on grid", - } - - def get_environment_info(self): - """Get comprehensive environment information.""" - return { - "name": "FrozenLake-v1", - "map_name": self.map_name, - "is_slippery": self.is_slippery, - "nrow": int(self.nrow), # Convert numpy int to Python int - "ncol": int(self.ncol), # Convert numpy int to Python int - "action_space": self.get_action_space_info(), - "observation_space": self.get_observation_space_info(), - "description": [[cell.decode("utf-8") for cell in row] for row in self.desc], # Convert to strings - } - - -# Backward compatibility: alias the old class name to the new one -FrozenLakeGame = GymnasiumFrozenLakeGame - - -if __name__ == "__main__": - """Test the Gymnasium implementation.""" - print("Testing Gymnasium FrozenLake implementation...") - - # Test with deterministic environment - game = GymnasiumFrozenLakeGame(is_slippery=False) - print(f"Environment info: {game.get_environment_info()}") - - obs = game.reset() - print(f"Initial observation: {obs}") - - # Test both string and numeric actions - test_actions = ["down", "down", "right", "right", "down", "right"] - - for i, action in enumerate(test_actions): - print(f"\nStep {i+1}: Taking action '{action}'") - obs, done = game.step(action) - print(f"Position: {obs['position']}, Done: {done}, Won: {obs['won']}") - print(f"Message: {obs['message']}") - - if done: - if obs["won"]: - print("๐ŸŽ‰ Success! Reached the goal!") - else: - print("๐Ÿ’€ Failed! Fell into a hole!") - break - - game.close() - print("\nTest completed!") diff --git a/examples/frozen_lake/run_full_evaluation.sh b/examples/frozen_lake/run_full_evaluation.sh deleted file mode 100755 index c2984c89..00000000 --- a/examples/frozen_lake/run_full_evaluation.sh +++ /dev/null @@ -1,318 +0,0 @@ -#!/bin/bash - -# Complete Frozen Lake HTTP Rollout Evaluation Script -# This script demonstrates the full end-to-end HTTP rollout evaluation - -set -e # Exit on any error - -# Configuration -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -HTTP_ROLLOUT_SERVER_PORT=8082 -MAX_WAIT_TIME=30 - -# PID files to track server processes -HTTP_ROLLOUT_PID_FILE="/tmp/http_rollout_server.pid" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -log() { - echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" -} - -warn() { - echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" -} - -error() { - echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" -} - -info() { - echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}" -} - -# Cleanup function -cleanup() { - log "Cleaning up servers..." - - # Kill HTTP rollout server - if [ -f "$HTTP_ROLLOUT_PID_FILE" ]; then - HTTP_ROLLOUT_PID=$(cat "$HTTP_ROLLOUT_PID_FILE") - if kill -0 "$HTTP_ROLLOUT_PID" 2>/dev/null; then - log "Stopping HTTP rollout server (PID: $HTTP_ROLLOUT_PID)" - kill "$HTTP_ROLLOUT_PID" 2>/dev/null || true - sleep 2 - kill -9 "$HTTP_ROLLOUT_PID" 2>/dev/null || true - fi - rm -f "$HTTP_ROLLOUT_PID_FILE" - fi - - # Kill any remaining Python processes for our servers - pkill -f "http_rollout_server.py" 2>/dev/null || true - - log "Cleanup complete" -} - -# Set up signal handlers -trap cleanup EXIT INT TERM - -# Function to wait for a server to be ready -wait_for_server() { - local url=$1 - local name=$2 - local max_wait=$3 - - log "Waiting for $name to be ready at $url..." - - for i in $(seq 1 $max_wait); do - if curl -s -f "$url" > /dev/null 2>&1; then - log "$name is ready!" - return 0 - fi - sleep 1 - done - - error "$name failed to start within $max_wait seconds" - return 1 -} - -# Check prerequisites -check_prerequisites() { - info "Checking prerequisites..." - - # Check if reward-kit is available - if ! python -c "import eval_protocol" 2>/dev/null; then - error "reward-kit not installed or not in Python path" - exit 1 - fi - - # Check if required files exist - if [ ! -f "$SCRIPT_DIR/client/task_def.yaml" ]; then - error "task_def.yaml not found in $SCRIPT_DIR" - exit 1 - fi - - if [ ! -f "$SCRIPT_DIR/server/http_rollout_server.py" ]; then - error "http_rollout_server.py not found in $SCRIPT_DIR" - exit 1 - fi - - # Check if FIREWORKS_API_KEY is set - if [ -z "$FIREWORKS_API_KEY" ]; then - warn "FIREWORKS_API_KEY environment variable is not set" - warn "The evaluation will fail at the API call stage, but the infrastructure will be tested" - info "To run with a real model, set: export FIREWORKS_API_KEY=your_api_key" - else - info "FIREWORKS_API_KEY is set (length: ${#FIREWORKS_API_KEY})" - fi - - info "Prerequisites check complete" -} - -# Main execution -main() { - echo "" - echo "========================================" - echo "๐ŸŽฎ FROZEN LAKE HTTP ROLLOUT EVALUATION" - echo "========================================" - echo "" - - check_prerequisites - - # Change to the script directory - cd "$SCRIPT_DIR" - - # Check if ports are available - if lsof -Pi :$HTTP_ROLLOUT_SERVER_PORT -sTCP:LISTEN -t >/dev/null; then - error "Port $HTTP_ROLLOUT_SERVER_PORT is already in use" - exit 1 - fi - - # Start HTTP rollout server - log "Starting HTTP rollout server on port $HTTP_ROLLOUT_SERVER_PORT..." - python server/http_rollout_server.py --port $HTTP_ROLLOUT_SERVER_PORT & - HTTP_ROLLOUT_PID=$! - echo $HTTP_ROLLOUT_PID > "$HTTP_ROLLOUT_PID_FILE" - - # Wait for servers to be ready - wait_for_server "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/health" "HTTP rollout server" $MAX_WAIT_TIME - - # Test the HTTP rollout server - info "Testing HTTP rollout server..." - - # Test start episode - EPISODE_DATA=$(curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/start_episode") - EPISODE_ID=$(echo "$EPISODE_DATA" | python -c "import sys, json; print(json.load(sys.stdin)['episode_id'])") - info "Started episode: $EPISODE_ID" - - # Test step - STEP_DATA=$(curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/step" \ - -H "Content-Type: application/json" \ - -d "{\"episode_id\": \"$EPISODE_ID\", \"action\": 2}") - info "Step result: $STEP_DATA" - - # End episode - curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/end_episode" \ - -H "Content-Type: application/json" \ - -d "{\"episode_id\": \"$EPISODE_ID\"}" > /dev/null - info "Episode ended successfully" - - # Run the evaluation - log "Starting agent evaluation..." - cd "$REPO_ROOT" - - # Set model configuration - export MODEL_AGENT="fireworks/accounts/fireworks/models/qwen3-235b-a22b" - - # Create logs directory - LOG_DIR="$SCRIPT_DIR/evaluation_logs" - mkdir -p "$LOG_DIR" - TIMESTAMP=$(date +"%Y%m%d_%H%M%S") - FULL_LOG_FILE="$LOG_DIR/full_evaluation_${TIMESTAMP}.log" - TRAJECTORY_LOG_FILE="$LOG_DIR/agent_trajectory_${TIMESTAMP}.log" - - # Run the evaluation with detailed logging - info "Executing: python -m eval_protocol.cli agent-eval --task-def examples/frozen_lake/client/task_def.yaml" - info "Full logs will be saved to: $FULL_LOG_FILE" - info "Agent trajectory will be extracted to: $TRAJECTORY_LOG_FILE" - - # Capture all output and filter agent trajectory - python -m eval_protocol.cli agent-eval --task-def examples/frozen_lake/client/task_def.yaml 2>&1 | tee "$FULL_LOG_FILE" - - # Extract agent trajectory and tool calls - log "Extracting agent trajectory for review..." - - # Create a detailed trajectory log - cat > "$TRAJECTORY_LOG_FILE" << 'EOF' -FROZEN LAKE AGENT EVALUATION TRAJECTORY -====================================== - -This log contains the complete agent decision-making process including: -- User prompts -- Agent reasoning (thinking) -- Tool calls made by the agent -- Environment responses -- Agent reactions to environment feedback - -====================================== - -EOF - - # Extract the relevant trajectory information - grep -A 5 -B 5 "User Turn\|Inner Step\|Tool.*result\|OpenAI response\|Calling OpenAI\|tool calls" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true - - echo "" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "DETAILED MESSAGES HISTORY" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "" >> "$TRAJECTORY_LOG_FILE" - - # Extract the complete conversation flow - grep -A 20 "messages_FULL_HISTORY" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true - - echo "" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "TOOL CALLS AND RESPONSES" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "" >> "$TRAJECTORY_LOG_FILE" - - # Extract tool call details - grep -A 10 -B 2 "tool_calls\|Tool.*result\|step.*action" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true - - # Run the trajectory analyzer - cd "$SCRIPT_DIR" - if [ -f "analyze_trajectory.py" ] && [ -f "$FULL_LOG_FILE" ]; then - info "Running trajectory analysis..." - python analyze_trajectory.py "$FULL_LOG_FILE" > "${LOG_DIR}/trajectory_analysis_${TIMESTAMP}.txt" 2>&1 || true - fi - - echo "" - echo "========================================" - echo "โœ… EVALUATION INFRASTRUCTURE COMPLETE" - echo "========================================" - echo "" - info "HTTP rollout support has been successfully implemented!" - echo "" - echo "Key achievements:" - echo "โ€ข โœ… HttpRolloutResource implemented and integrated" - echo "โ€ข โœ… Fireworks model support added to orchestrator" - echo "โ€ข โœ… Tool calling protocol working correctly" - echo "โ€ข โœ… HTTP rollout server communication verified" - echo "โ€ข โœ… Complete evaluation framework functional" - echo "" - - echo "๐Ÿ“‹ EVALUATION LOGS SAVED:" - echo "โ€ข Full evaluation log: $FULL_LOG_FILE" - echo "โ€ข Agent trajectory log: $TRAJECTORY_LOG_FILE" - - ANALYSIS_FILE="${LOG_DIR}/trajectory_analysis_${TIMESTAMP}.txt" - if [ -f "$ANALYSIS_FILE" ]; then - echo "โ€ข Trajectory analysis: $ANALYSIS_FILE" - fi - echo "" - - echo "๐Ÿ“Š AGENT TRAJECTORY SUMMARY:" - if [ -f "$FULL_LOG_FILE" ]; then - # Show a quick summary of tool calls - TOOL_CALL_COUNT=$(grep -c "Attempting tool call: step" "$FULL_LOG_FILE" || echo "0") - echo "โ€ข Total tool calls made: $TOOL_CALL_COUNT" - - # Show quick trajectory analysis if available - if [ -f "$ANALYSIS_FILE" ]; then - echo "" - echo "Quick trajectory preview:" - head -20 "$ANALYSIS_FILE" | tail -15 - echo "" - echo "๐Ÿ“– Full trajectory analysis:" - echo " cat $ANALYSIS_FILE" - else - echo "โ€ข Review detailed trajectory in: $TRAJECTORY_LOG_FILE" - - # Show the first few tool calls for quick review - echo "" - echo "First few tool calls:" - grep -m 3 -A 2 "Attempting tool call: step" "$FULL_LOG_FILE" | head -9 || true - fi - fi - echo "" - - if [ -z "$FIREWORKS_API_KEY" ]; then - echo "To run with actual LLM inference:" - echo "1. Set FIREWORKS_API_KEY environment variable" - echo "2. Re-run this script" - else - echo "๐ŸŽ‰ Ready for production use with LLM inference!" - echo "" - echo "๐Ÿ“– To review the agent's decision making:" - echo " cat $TRAJECTORY_LOG_FILE" - fi - echo "" -} - -# Check for help flag -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - echo "Frozen Lake HTTP Rollout Evaluation" - echo "" - echo "This script demonstrates the complete HTTP rollout evaluation infrastructure:" - echo "1. Starts HTTP rollout server for Frozen Lake game" - echo "2. Tests the HTTP rollout protocol" - echo "3. Runs the agent evaluation framework" - echo "4. Shows tool calling and resource integration working" - echo "" - echo "Prerequisites:" - echo "- reward-kit installed and configured" - echo "- FIREWORKS_API_KEY environment variable (optional for infrastructure testing)" - echo "" - echo "Usage: $0" - echo "" - exit 0 -fi - -# Run main function -main "$@" diff --git a/examples/frozen_lake/server/README.md b/examples/frozen_lake/server/README.md deleted file mode 100644 index 08d3f7eb..00000000 --- a/examples/frozen_lake/server/README.md +++ /dev/null @@ -1,187 +0,0 @@ -# Frozen Lake Game Server - -This is the **server-side implementation** of the Frozen Lake game environment that provides an HTTP API for agent evaluation. - -## Overview - -This server implements the HTTP Rollout Protocol that allows external evaluation frameworks (like eval-protocol) to interact with the game environment through standardized endpoints. - -## API Endpoints - -### `POST /start_episode` -Initializes a new game episode. - -**Response:** -```json -{ - "episode_id": "uuid-string", - "observation": { - "position": [0, 0], - "current_cell": "S", - "done": false, - "won": false, - "visual": "[S] F F F \n F H F H \n F F F H \n H F F G ", - "message": "You are at position (0, 0) on a S cell. Choose your next move carefully." - } -} -``` - -### `POST /step` -Executes an action in the game. - -**Request:** -```json -{ - "episode_id": "uuid-string", - "action": 2 -} -``` - -**Action Values:** -- `0` = Left -- `1` = Down -- `2` = Right -- `3` = Up - -**Response:** -```json -{ - "observation": { - "position": [0, 1], - "current_cell": "F", - "done": false, - "won": false, - "visual": " S [F] F F \n F H F H \n F F F H \n H F F G ", - "message": "You are at position (0, 1) on a F cell. Choose your next move carefully." - }, - "is_done": false -} -``` - -### `POST /end_episode` -Cleans up a completed episode. - -**Request:** -```json -{ - "episode_id": "uuid-string" -} -``` - -**Response:** -```json -{ - "message": "Episode ended successfully" -} -``` - -### `GET /health` -Health check endpoint. - -**Response:** -```json -{ - "status": "healthy", - "game": "frozen_lake" -} -``` - -## Game Logic - -### Board Layout -``` -S F F F -F H F H -F F F H -H F F G -``` - -### Game Rules -- **S**: Starting position (safe) -- **F**: Frozen lake (safe to step on) -- **H**: Hole (game over if stepped on) -- **G**: Goal (win condition) - -### Win/Loss Conditions -- **Win**: Reach the goal position (G) -- **Loss**: Step on a hole (H) or exceed maximum steps - -## Running the Server - -### Prerequisites -- Python 3.8+ -- FastAPI -- Uvicorn - -### Installation -```bash -pip install fastapi uvicorn -``` - -### Start Server -```bash -python http_rollout_server.py -``` - -The server will start on `http://localhost:8080` - -### Configuration -Environment variables: -- `PORT`: Server port (default: 8080) -- `HOST`: Server host (default: 0.0.0.0) - -## Integration Notes - -This server is designed to work with any HTTP rollout-compatible evaluation framework. The client side handles: -- Action translation (string โ†’ numeric) -- State interpretation -- Reward calculation -- Episode management - -## Customization - -### Different Board Layouts -Modify the `FROZEN_LAKE_MAP` constant: -```python -FROZEN_LAKE_MAP = [ - "SFFF", - "FHFH", - "FFFH", - "HFFG" -] -``` - -### Game Variants -- Slippery surfaces (movement uncertainty) -- Larger boards -- Dynamic obstacles -- Multi-goal scenarios - -## Development - -### Testing the API -```bash -# Health check -curl http://localhost:8080/health - -# Start episode -curl -X POST http://localhost:8080/start_episode - -# Take action -curl -X POST http://localhost:8080/step \ - -H "Content-Type: application/json" \ - -d '{"episode_id": "your-episode-id", "action": 2}' -``` - -### Logging -The server logs all API interactions for debugging and monitoring. - -## Protocol Compliance - -This implementation follows the HTTP Rollout Protocol specification: -- Stateless episode management -- Structured observation format -- Standardized error handling -- Health monitoring endpoint - -Game environment developers can use this as a reference implementation for creating their own HTTP rollout-compatible environments. diff --git a/examples/frozen_lake/server/http_rollout_server.py b/examples/frozen_lake/server/http_rollout_server.py deleted file mode 100644 index afca6959..00000000 --- a/examples/frozen_lake/server/http_rollout_server.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -HTTP rollout server for Frozen Lake game. - -This server implements the standard HTTP rollout protocol using the reward-kit -library's standardized types for consistent client/server communication. -""" - -import os -import sys -import uuid -from typing import Dict - -from fastapi import FastAPI, HTTPException - -# Add parent directory to path to import gymnasium frozen lake server -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame as FrozenLakeGame - -# Additional models for responses -from pydantic import BaseModel - -# Import standardized HTTP rollout protocol types from reward-kit -from eval_protocol.agent.resources import ( - EndEpisodeRequest, - EndEpisodeResponse, - HealthResponse, - StartEpisodeRequest, - StartEpisodeResponse, - StepRequest, - StepResponse, -) - -# FastAPI app -app = FastAPI(title="Frozen Lake HTTP Rollout Server") - -# Store active episodes -episodes: Dict[str, FrozenLakeGame] = {} - - -@app.post("/start_episode", response_model=StartEpisodeResponse) -async def start_episode( - request: StartEpisodeRequest = StartEpisodeRequest(), -) -> StartEpisodeResponse: - """Start a new episode of the Frozen Lake game.""" - episode_id = str(uuid.uuid4()) - - # Extract request data to pass to the game constructor - request_data = request.dict() if hasattr(request, "dict") else request.model_dump() - - # Create Gymnasium-based game with configuration from request - # Default values maintained for backward compatibility - game_config = { - "map_name": "4x4", - "is_slippery": True, # Enable stochastic behavior to demonstrate seed effect - "render_mode": None, - } - # Override with any values from the request (like seed) - game_config.update(request_data) - - game = FrozenLakeGame(**game_config) - observation = game.reset() - episodes[episode_id] = game - - return StartEpisodeResponse(episode_id=episode_id, observation=observation) - - -@app.post("/step", response_model=StepResponse) -async def step(req: StepRequest) -> StepResponse: - """Take a step in the specified episode.""" - if req.episode_id not in episodes: - raise HTTPException(status_code=404, detail="Episode not found") - - game = episodes[req.episode_id] - observation, is_done = game.step(req.action) - - return StepResponse(observation=observation, is_done=is_done) - - -@app.post("/end_episode", response_model=EndEpisodeResponse) -async def end_episode(req: EndEpisodeRequest) -> EndEpisodeResponse: - """End the specified episode.""" - if req.episode_id not in episodes: - raise HTTPException(status_code=404, detail="Episode not found") - - del episodes[req.episode_id] - return EndEpisodeResponse(message=f"Episode {req.episode_id} ended successfully") - - -@app.get("/health", response_model=HealthResponse) -async def health_check() -> HealthResponse: - """Health check endpoint.""" - return HealthResponse(status="healthy", game="frozen_lake_gymnasium") - - -if __name__ == "__main__": - import argparse - - import uvicorn - - parser = argparse.ArgumentParser(description="Frozen Lake HTTP Rollout Server") - parser.add_argument("--port", type=int, default=8080, help="Port to run the server on") - args = parser.parse_args() - - uvicorn.run(app, host="0.0.0.0", port=args.port) diff --git a/tests/test_frozen_lake_http_server.py b/tests/test_frozen_lake_http_server.py deleted file mode 100644 index ebe7104f..00000000 --- a/tests/test_frozen_lake_http_server.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Tests for FrozenLake HTTP rollout server seed handling. - -This module tests the HTTP server's ability to accept and use seed parameters -to create reproducible game environments. -""" - -import json -from unittest.mock import MagicMock, patch - -import pytest -from fastapi.testclient import TestClient - -from examples.frozen_lake.gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame - -# Import the server components -from examples.frozen_lake.server.http_rollout_server import app - - -class TestFrozenLakeHttpServer: - """Tests for the FrozenLake HTTP rollout server.""" - - def setup_method(self): - """Set up test client.""" - self.client = TestClient(app) - - def test_start_episode_without_seed(self): - """Test starting episode without seed uses default behavior.""" - response = self.client.post("/start_episode") - - assert response.status_code == 200 - data = response.json() - - assert "episode_id" in data - assert "observation" in data - - # Should have standard game state - observation = data["observation"] - assert "position" in observation - assert "current_cell" in observation - assert "visual" in observation - assert observation["position"] == [0, 0] # Start position - assert observation["current_cell"] == "S" # Start cell - - def test_start_episode_with_seed(self): - """Test starting episode with seed parameter.""" - seed_value = 42 - request_data = {"seed": seed_value} - - response = self.client.post("/start_episode", json=request_data) - - assert response.status_code == 200 - data = response.json() - - assert "episode_id" in data - assert "observation" in data - - # Should still start at position (0,0) - observation = data["observation"] - assert observation["position"] == [0, 0] - assert observation["current_cell"] == "S" - - def test_different_seeds_create_different_episodes(self): - """Test that different seeds create episodes with different board layouts.""" - # Start episode with seed 42 - response1 = self.client.post("/start_episode", json={"seed": 42}) - assert response1.status_code == 200 - data1 = response1.json() - episode_id1 = data1["episode_id"] - visual1 = data1["observation"]["visual"] - - # Start episode with seed 123 - response2 = self.client.post("/start_episode", json={"seed": 123}) - assert response2.status_code == 200 - data2 = response2.json() - episode_id2 = data2["episode_id"] - visual2 = data2["observation"]["visual"] - - # Episodes should have different IDs - assert episode_id1 != episode_id2 - - # Board layouts should be different (high probability with different seeds) - assert visual1 != visual2, "Different seeds should create different board layouts" - - def test_same_seed_creates_identical_episodes(self): - """Test that same seed creates episodes with identical board layouts.""" - seed_value = 999 - - # Start two episodes with same seed - response1 = self.client.post("/start_episode", json={"seed": seed_value}) - assert response1.status_code == 200 - data1 = response1.json() - visual1 = data1["observation"]["visual"] - - response2 = self.client.post("/start_episode", json={"seed": seed_value}) - assert response2.status_code == 200 - data2 = response2.json() - visual2 = data2["observation"]["visual"] - - # Board layouts should be identical - assert visual1 == visual2, "Same seed should create identical board layouts" - - def test_start_episode_with_additional_parameters(self): - """Test that server accepts additional parameters beyond seed.""" - request_data = {"seed": 42, "custom_param": "test_value", "id": "test_run_001"} - - response = self.client.post("/start_episode", json=request_data) - - assert response.status_code == 200 - data = response.json() - - # Should work normally despite extra parameters - assert "episode_id" in data - assert "observation" in data - - def test_step_action_in_seeded_episode(self): - """Test taking actions in a seeded episode.""" - # Start episode with seed - response = self.client.post("/start_episode", json={"seed": 42}) - assert response.status_code == 200 - episode_id = response.json()["episode_id"] - - # Take a step action - step_response = self.client.post("/step", json={"episode_id": episode_id, "action": "right"}) - - assert step_response.status_code == 200 - step_data = step_response.json() - - assert "observation" in step_data - assert "is_done" in step_data - - # Position should have changed (unless blocked) - observation = step_data["observation"] - assert "position" in observation - assert "current_cell" in observation - - def test_health_endpoint(self): - """Test the health check endpoint.""" - response = self.client.get("/health") - - assert response.status_code == 200 - data = response.json() - assert data["status"] == "healthy" - - def test_episode_cleanup_on_completion(self): - """Test that episodes are properly tracked and can be cleaned up.""" - # Start an episode - response = self.client.post("/start_episode", json={"seed": 42}) - assert response.status_code == 200 - episode_id = response.json()["episode_id"] - - # Episode should be trackable via step endpoint - step_response = self.client.post("/step", json={"episode_id": episode_id, "action": "right"}) - assert step_response.status_code == 200 - - def test_invalid_episode_id_handling(self): - """Test handling of invalid episode IDs.""" - # Try to step with non-existent episode ID - response = self.client.post("/step", json={"episode_id": "non_existent_episode", "action": "right"}) - - # Should return an error (400 or 404) - assert response.status_code in [400, 404] - - def test_server_configuration_with_slippery_environment(self): - """Test that server is configured with slippery environment for seed demonstration.""" - # Mock the game creation to verify configuration - with patch("examples.frozen_lake.server.http_rollout_server.FrozenLakeGame") as mock_game_class: - mock_game_instance = MagicMock() - mock_game_instance.reset.return_value = { - "position": [0, 0], - "current_cell": "S", - "visual": "test_visual", - "done": False, - } - mock_game_class.return_value = mock_game_instance - - response = self.client.post("/start_episode", json={"seed": 42}) - - # Verify game was created with correct configuration - mock_game_class.assert_called_once() - call_kwargs = mock_game_class.call_args[1] - - # Should include slippery=True and the seed - assert call_kwargs.get("is_slippery") is True - assert call_kwargs.get("seed") == 42 - - -class TestGymnasiumFrozenLakeIntegration: - """Integration tests between HTTP server and GymnasiumFrozenLakeGame.""" - - def test_gymnasium_integration_with_seeds(self): - """Test that the HTTP server correctly integrates with GymnasiumFrozenLakeGame seeds.""" - client = TestClient(app) - - # Test multiple seeds to ensure they work through the HTTP interface - seeds = [42, 123, 999] - board_layouts = [] - - for seed in seeds: - response = client.post("/start_episode", json={"seed": seed}) - assert response.status_code == 200 - - observation = response.json()["observation"] - board_layouts.append(observation["visual"]) - - # All board layouts should be different - unique_layouts = set(board_layouts) - assert len(unique_layouts) == len(seeds), "Each seed should produce a unique board layout" - - def test_episode_state_consistency(self): - """Test that episode state remains consistent within a single game.""" - client = TestClient(app) - - # Start episode with specific seed - response = client.post("/start_episode", json={"seed": 42}) - assert response.status_code == 200 - - episode_id = response.json()["episode_id"] - initial_visual = response.json()["observation"]["visual"] - - # Take a few actions and verify board layout doesn't change - for action in ["right", "down", "left"]: - step_response = client.post("/step", json={"episode_id": episode_id, "action": action}) - assert step_response.status_code == 200 - - observation = step_response.json()["observation"] - # Visual board should remain the same (only position marker changes) - current_visual = observation["visual"] - - # Board structure should be preserved (same letters, different position marker) - # Extract just the board cells without position markers - initial_cells = initial_visual.replace("[", "").replace("]", "") - current_cells = current_visual.replace("[", "").replace("]", "") - - # The underlying board structure should be identical - assert len(initial_cells) == len(current_cells), "Board size should remain constant" - - def test_seed_parameter_propagation(self): - """Test that seed parameter correctly propagates to the game engine.""" - # This test verifies the complete data flow from HTTP request to game creation - - with patch("examples.frozen_lake.server.http_rollout_server.FrozenLakeGame") as mock_game_class: - mock_game_instance = MagicMock() - mock_game_instance.reset.return_value = { - "position": [0, 0], - "current_cell": "S", - "visual": "mocked_visual", - "done": False, - "won": False, - "message": "test_message", - } - mock_game_class.return_value = mock_game_instance - - client = TestClient(app) - - # Send request with seed - seed_value = 1337 - response = client.post("/start_episode", json={"seed": seed_value}) - - assert response.status_code == 200 - - # Verify the game was created with the correct seed - mock_game_class.assert_called_once() - call_kwargs = mock_game_class.call_args[1] - assert call_kwargs["seed"] == seed_value - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/test_frozen_lake_seed_evaluation.py b/tests/test_frozen_lake_seed_evaluation.py deleted file mode 100644 index 54bcf1c6..00000000 --- a/tests/test_frozen_lake_seed_evaluation.py +++ /dev/null @@ -1,541 +0,0 @@ -""" -Tests for seed-based reproducible evaluation in FrozenLake example. - -This module tests the complete data-driven evaluation pipeline including: -- Seed-based map generation for reproducible game boards -- Data-driven evaluation infrastructure in TaskManager -- Protocol enhancements for sample data passing -- End-to-end integration tests -""" - -import json -import tempfile -from pathlib import Path -from typing import Any, Dict -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest -import pytest_asyncio - -from eval_protocol.agent.orchestrator import Orchestrator -from eval_protocol.agent.resources.http_rollout_protocol import StartEpisodeRequest -from eval_protocol.agent.resources.http_rollout_resource import HttpRolloutResource -from eval_protocol.agent.task_manager import TaskManager -from eval_protocol.models import TaskDefinitionModel - -# Import components under test -from examples.frozen_lake.gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame - - -class TestSeedBasedMapGeneration: - """Tests for seed-based reproducible map generation in GymnasiumFrozenLakeGame.""" - - def test_seed_generates_different_maps(self): - """Test that different seeds generate different map layouts.""" - seed1 = 42 - seed2 = 123 - - game1 = GymnasiumFrozenLakeGame(seed=seed1) - game2 = GymnasiumFrozenLakeGame(seed=seed2) - - # Get map descriptions - map1 = game1.desc.tolist() - map2 = game2.desc.tolist() - - # Maps should be different - assert map1 != map2, "Different seeds should generate different maps" - - # Both should be 4x4 by default - assert len(map1) == 4 and len(map1[0]) == 4 - assert len(map2) == 4 and len(map2[0]) == 4 - - game1.close() - game2.close() - - def test_same_seed_generates_identical_maps(self): - """Test that the same seed generates identical map layouts.""" - seed = 42 - - game1 = GymnasiumFrozenLakeGame(seed=seed) - game2 = GymnasiumFrozenLakeGame(seed=seed) - - # Get map descriptions - map1 = game1.desc.tolist() - map2 = game2.desc.tolist() - - # Maps should be identical - assert map1 == map2, "Same seed should generate identical maps" - - game1.close() - game2.close() - - def test_no_seed_uses_fixed_map(self): - """Test that no seed uses the fixed predefined map.""" - game1 = GymnasiumFrozenLakeGame() # No seed - game2 = GymnasiumFrozenLakeGame() # No seed - - # Get map descriptions - map1 = game1.desc.tolist() - map2 = game2.desc.tolist() - - # Maps should be identical (both using fixed 4x4 map) - assert map1 == map2, "No seed should use identical fixed maps" - - # Should be the standard 4x4 FrozenLake map - expected_map = [ - [b"S", b"F", b"F", b"F"], - [b"F", b"H", b"F", b"H"], - [b"F", b"F", b"F", b"H"], - [b"H", b"F", b"F", b"G"], - ] - assert map1 == expected_map, "Should use standard 4x4 map when no seed" - - game1.close() - game2.close() - - def test_seed_with_8x8_map(self): - """Test seed-based generation with 8x8 map size.""" - seed = 999 - - game = GymnasiumFrozenLakeGame(map_name="8x8", seed=seed) - - # Should be 8x8 - assert game.desc.shape == (8, 8) - - # Should have start and goal - flat_map = game.desc.flatten() - assert b"S" in flat_map - assert b"G" in flat_map - - game.close() - - def test_seed_affects_reset_behavior(self): - """Test that seed affects the reset behavior for stochastic environments.""" - seed = 42 - - # Test with slippery environment - game = GymnasiumFrozenLakeGame(seed=seed, is_slippery=True) - - # Reset multiple times - should get same initial state - state1 = game.reset() - state2 = game.reset() - - # Initial position should be consistent - assert state1["position"] == state2["position"] == (0, 0) - assert state1["current_cell"] == state2["current_cell"] == "S" - - game.close() - - def test_map_has_valid_path(self): - """Test that generated maps always have a valid path from start to goal.""" - # Test multiple seeds to ensure path validity - for seed in [42, 123, 999, 1337]: - game = GymnasiumFrozenLakeGame(seed=seed) - - # Should have exactly one start and one goal - flat_map = game.desc.flatten() - start_count = sum(1 for cell in flat_map if cell == b"S") - goal_count = sum(1 for cell in flat_map if cell == b"G") - - assert start_count == 1, f"Should have exactly one start for seed {seed}" - assert goal_count == 1, f"Should have exactly one goal for seed {seed}" - - # Start should be at (0,0) and goal should exist - assert game.desc[0, 0] == b"S", f"Start should be at (0,0) for seed {seed}" - assert game.start_pos == ( - 0, - 0, - ), f"Start position should be (0,0) for seed {seed}" - assert game.goal_pos is not None, f"Goal position should exist for seed {seed}" - - game.close() - - -class TestStartEpisodeRequest: - """Tests for the enhanced StartEpisodeRequest protocol.""" - - def test_start_episode_request_accepts_arbitrary_fields(self): - """Test that StartEpisodeRequest accepts arbitrary fields like seed.""" - # Should accept seed and other fields - request = StartEpisodeRequest(seed=42, custom_field="test_value") - - # Access via model_dump or dict - if hasattr(request, "model_dump"): - data = request.model_dump() - else: - data = request.dict() - - assert data["seed"] == 42 - assert data["custom_field"] == "test_value" - - def test_start_episode_request_empty(self): - """Test that StartEpisodeRequest works with no extra fields.""" - request = StartEpisodeRequest() - - # Should work without errors - if hasattr(request, "model_dump"): - data = request.model_dump() - else: - data = request.dict() - - # Should be empty dict or have no extra fields - assert isinstance(data, dict) - - -class TestDataDrivenTaskDefinition: - """Tests for data-driven evaluation fields in TaskDefinitionModel.""" - - def test_task_definition_with_dataset_path(self): - """Test TaskDefinitionModel with dataset_path field.""" - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "dataset_path": "test_dataset.jsonl", - "num_rollouts_per_sample": 3, - } - - task_def = TaskDefinitionModel(**task_def_dict) - - assert task_def.dataset_path == "test_dataset.jsonl" - assert task_def.num_rollouts_per_sample == 3 - - def test_task_definition_without_dataset_path(self): - """Test TaskDefinitionModel without dataset_path (traditional evaluation).""" - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "num_rollouts": 5, - } - - task_def = TaskDefinitionModel(**task_def_dict) - - assert task_def.dataset_path is None - assert task_def.num_rollouts_per_sample == 1 # Default value - assert task_def.num_rollouts == 5 - - def test_num_rollouts_per_sample_validation(self): - """Test that num_rollouts_per_sample must be >= 1.""" - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "dataset_path": "test.jsonl", - "num_rollouts_per_sample": 0, # Invalid - } - - with pytest.raises(ValueError): - TaskDefinitionModel(**task_def_dict) - - -@pytest.mark.asyncio -class TestTaskManagerDataDrivenEvaluation: - """Tests for data-driven evaluation functionality in TaskManager.""" - - def test_load_dataset_samples_valid_jsonl(self): - """Test loading valid JSONL dataset.""" - # Create temporary JSONL file - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - f.write('{"id": "sample1", "seed": 42}\n') - f.write('{"id": "sample2", "seed": 123}\n') - f.write('{"id": "sample3", "seed": 999}\n') - temp_file = f.name - - try: - task_manager = TaskManager() - samples = task_manager._load_dataset_samples(temp_file) - - assert len(samples) == 3 - assert samples[0] == {"id": "sample1", "seed": 42} - assert samples[1] == {"id": "sample2", "seed": 123} - assert samples[2] == {"id": "sample3", "seed": 999} - finally: - Path(temp_file).unlink() - - def test_load_dataset_samples_invalid_json(self): - """Test loading JSONL with invalid JSON lines.""" - # Create temporary JSONL file with some invalid lines - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - f.write('{"id": "sample1", "seed": 42}\n') - f.write("invalid json line\n") # Invalid JSON - f.write('{"id": "sample2", "seed": 123}\n') - temp_file = f.name - - try: - task_manager = TaskManager() - samples = task_manager._load_dataset_samples(temp_file) - - # Should skip invalid line and load valid ones - assert len(samples) == 2 - assert samples[0] == {"id": "sample1", "seed": 42} - assert samples[1] == {"id": "sample2", "seed": 123} - finally: - Path(temp_file).unlink() - - def test_load_dataset_samples_nonexistent_file(self): - """Test loading from nonexistent file.""" - task_manager = TaskManager() - samples = task_manager._load_dataset_samples("nonexistent_file.jsonl") - - assert samples == [] - - def test_load_dataset_samples_empty_file(self): - """Test loading from empty file.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - # Empty file - temp_file = f.name - - try: - task_manager = TaskManager() - samples = task_manager._load_dataset_samples(temp_file) - - assert samples == [] - finally: - Path(temp_file).unlink() - - def test_load_dataset_samples_relative_path(self): - """Test loading dataset with relative path.""" - # Create a temporary directory and file - with tempfile.TemporaryDirectory() as temp_dir: - dataset_file = Path(temp_dir) / "test_dataset.jsonl" - with open(dataset_file, "w") as f: - f.write('{"id": "test", "seed": 42}\n') - - task_manager = TaskManager() - - # Test with absolute path since temp dir is not relative to cwd - absolute_path = str(dataset_file) - samples = task_manager._load_dataset_samples(absolute_path) - - assert len(samples) == 1 - assert samples[0] == {"id": "test", "seed": 42} - - -@pytest.mark.asyncio -class TestHttpRolloutResourceInitialization: - """Tests for HttpRolloutResource initialization with sample data.""" - - async def test_initialize_with_kwargs(self): - """Test that initialize method sends kwargs in POST request.""" - # Mock the HTTP client - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_response.json.return_value = { - "episode_id": "test_episode", - "observation": {}, - } - mock_client.post.return_value = mock_response - - # Create resource with mock client - config = { - "base_url": "http://localhost:8080", - "start_episode_endpoint": "/start_episode", - } - - resource = HttpRolloutResource() - await resource.setup(config) - resource.client = mock_client # Replace with mock - - # Initialize with sample data - sample_data = {"seed": 42, "custom_param": "test_value"} - await resource.initialize(**sample_data) - - # Verify POST was called with correct parameters - mock_client.post.assert_called_once_with("http://localhost:8080/start_episode", json=sample_data) - - async def test_initialize_without_kwargs(self): - """Test that initialize method works without kwargs.""" - # Mock the HTTP client - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_response.json.return_value = { - "episode_id": "test_episode", - "observation": {}, - } - mock_client.post.return_value = mock_response - - # Create resource with mock client - config = { - "base_url": "http://localhost:8080", - "start_episode_endpoint": "/start_episode", - } - - resource = HttpRolloutResource() - await resource.setup(config) - resource.client = mock_client # Replace with mock - - # Initialize without sample data - await resource.initialize() - - # Verify POST was called without json parameter - mock_client.post.assert_called_once_with("http://localhost:8080/start_episode") - - -@pytest.mark.asyncio -class TestOrchestratorSampleDataPassing: - """Tests for sample data passing in Orchestrator.""" - - async def test_execute_task_poc_with_sample_data(self): - """Test that execute_task_poc passes sample data to resource initialization.""" - # Create a minimal task definition - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "test_resource", - "base_resource_config": {}, - "reward_function_path": "test.reward", - "messages": [{"role": "user", "content": "test"}], - } - task_def = TaskDefinitionModel(**task_def_dict) - - # Mock the base resource - mock_resource = AsyncMock() - mock_resource.fork.return_value = AsyncMock() - mock_episode_resource = mock_resource.fork.return_value - mock_episode_resource.initialize = AsyncMock() - - # Create orchestrator - orchestrator = Orchestrator(task_definition=task_def) - orchestrator.base_resource = mock_resource - - # Mock execute_task_poc to just test the sample data passing logic - async def mock_execute_task_poc(sample_data=None): - if sample_data: - # Simulate the resource initialization that would happen - episode_resource = await orchestrator.base_resource.fork() - await episode_resource.initialize(**sample_data) - return {"score": 1.0} - - with patch.object(orchestrator, "execute_task_poc", side_effect=mock_execute_task_poc): - sample_data = {"seed": 42, "test_param": "value"} - await orchestrator.execute_task_poc(sample_data=sample_data) - - # Verify that episode resource was initialized with sample data - mock_episode_resource.initialize.assert_called_once_with(**sample_data) - - async def test_execute_task_poc_without_sample_data(self): - """Test that execute_task_poc works without sample data.""" - # Create a minimal task definition - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "test_resource", - "base_resource_config": {}, - "reward_function_path": "test.reward", - "messages": [{"role": "user", "content": "test"}], - } - task_def = TaskDefinitionModel(**task_def_dict) - - # Mock the base resource - mock_resource = AsyncMock() - mock_resource.fork.return_value = AsyncMock() - mock_episode_resource = mock_resource.fork.return_value - mock_episode_resource.initialize = AsyncMock() - - # Create orchestrator - orchestrator = Orchestrator(task_definition=task_def) - orchestrator.base_resource = mock_resource - - # Mock execute_task_poc to just test the sample data passing logic - async def mock_execute_task_poc(sample_data=None): - if sample_data: - # Simulate the resource initialization that would happen - episode_resource = await orchestrator.base_resource.fork() - await episode_resource.initialize(**sample_data) - return {"score": 1.0} - - with patch.object(orchestrator, "execute_task_poc", side_effect=mock_execute_task_poc): - await orchestrator.execute_task_poc(sample_data=None) - - # Verify that episode resource was not initialized (no sample_data) - mock_episode_resource.initialize.assert_not_called() - - -@pytest.mark.asyncio -class TestEndToEndDataDrivenEvaluation: - """Integration tests for end-to-end data-driven evaluation.""" - - async def test_data_driven_task_execution_flow(self): - """Test the complete flow of data-driven task execution.""" - # Create temporary dataset file - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - f.write('{"id": "run_001", "seed": 42}\n') - f.write('{"id": "run_002", "seed": 123}\n') - temp_dataset = f.name - - try: - # Create task definition with dataset - task_def_dict = { - "name": "frozen_lake_test", - "description": "Test frozen lake with seeds", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "dataset_path": temp_dataset, - "num_rollouts_per_sample": 1, - "messages": [{"role": "user", "content": "test"}], - } - - task_manager = TaskManager() - task_manager.register_task("test_task", TaskDefinitionModel(**task_def_dict)) - - # Mock the orchestrator execution - with patch.object(task_manager, "_execute_data_driven_rollouts") as mock_execute: - mock_execute.return_value = [ - {"score": 1.0, "sample_data": {"id": "run_001", "seed": 42}}, - {"score": 0.0, "sample_data": {"id": "run_002", "seed": 123}}, - ] - - # Execute tasks - results = await task_manager.execute_tasks(["test_task"], max_concurrency=1) - - # Verify data-driven execution was called - mock_execute.assert_called_once() - call_args = mock_execute.call_args - samples = call_args[0][1] # Second argument is samples - - assert len(samples) == 2 - assert samples[0] == {"id": "run_001", "seed": 42} - assert samples[1] == {"id": "run_002", "seed": 123} - - finally: - Path(temp_dataset).unlink() - - def test_frozen_lake_dataset_format_validation(self): - """Test that the actual frozen lake dataset has correct format.""" - dataset_path = Path("examples/frozen_lake/client/dataset.jsonl") - - if dataset_path.exists(): - with open(dataset_path, "r") as f: - lines = f.readlines() - - # Should have at least one sample - assert len(lines) > 0, "Dataset should not be empty" - - for i, line in enumerate(lines): - line = line.strip() - if not line: - continue - - try: - sample = json.loads(line) - except json.JSONDecodeError: - pytest.fail(f"Invalid JSON on line {i+1}: {line}") - - # Each sample should have id and seed - assert "id" in sample, f"Sample {i+1} missing 'id' field" - assert "seed" in sample, f"Sample {i+1} missing 'seed' field" - assert isinstance(sample["seed"], int), f"Sample {i+1} seed should be integer" - - -if __name__ == "__main__": - pytest.main([__file__, "-v"])