diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8560e8a6..d0b21795 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,10 +85,13 @@ jobs: FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }} PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning" run: | - # Run most tests in parallel, but explicitly ignore tests that manage their own servers + # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow uv run pytest \ -n auto \ --ignore=tests/test_batch_evaluation.py \ + --ignore=tests/pytest/test_frozen_lake.py \ + --ignore=tests/pytest/test_lunar_lander.py \ + --ignore=tests/pytest/test_tau_bench_airline.py \ --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10 - name: Store coverage file diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py index c9d2085f..4fea58cb 100644 --- a/eval_protocol/__init__.py +++ b/eval_protocol/__init__.py @@ -17,6 +17,8 @@ from .mcp_env import ( AnthropicPolicy, OpenAIPolicy, + LiteLLMPolicy, + FireworksPolicy, make, rollout, test_mcp, @@ -60,6 +62,7 @@ # MCP Environment API "make", "rollout", + "LiteLLMPolicy", "AnthropicPolicy", "FireworksPolicy", "OpenAIPolicy", @@ -73,10 +76,6 @@ "mcp", ] -# Add FireworksPolicy to exports if available -if _FIREWORKS_AVAILABLE: - __all__.insert(__all__.index("OpenAIPolicy") + 1, "FireworksPolicy") - from . 
import _version __version__ = _version.get_versions()["version"] diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 6151d357..8f52d323 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -173,10 +173,25 @@ async def _execute_with_semaphore(idx): # Convert trajectories to unified EvaluationRow format evaluation_rows = [] for trajectory in trajectories: - messages = [Message.model_validate(msg) for msg in trajectory.conversation_history] + # Handle multimodal content by extracting text from complex content structures + messages = [] + for msg in trajectory.conversation_history: + # Create a copy to avoid modifying the original + msg_dict = dict(msg) + + # Handle multimodal content (list of content blocks) by extracting text + if isinstance(msg_dict.get("content"), list): + text_content = None + for content_block in msg_dict["content"]: + if isinstance(content_block, dict) and content_block.get("type") == "text": + text_content = content_block.get("text") + break + msg_dict["content"] = text_content or "" + + messages.append(Message.model_validate(msg_dict)) input_metadata = InputMetadata( - row_id=trajectory.session.session_id, + row_id=trajectory.session.dataset_row.id if trajectory.session.dataset_row else None, dataset_info=asdict(trajectory.session.dataset_row) if trajectory.session.dataset_row else {}, completion_params=CompletionParams( model=policy.model_id, diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py index 822de7bb..209dddf0 100644 --- a/eval_protocol/mcp_env.py +++ b/eval_protocol/mcp_env.py @@ -13,21 +13,18 @@ Usage remains the same: import eval_protocol as ep - # Load dataset with environment configuration and prompts - dataset = load_jsonl("dataset.jsonl") - # Create general policy (environment-agnostic) policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b") - # Create environments with dataset-driven 
configuration - envs = ep.make("http://localhost:8000/mcp", dataset=dataset) + # Create environments with evaluation_rows configuration + envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows) # Execute tool-calling rollouts evaluation_rows = await ep.rollout(envs, policy=policy, steps=512) Key Features: - General tool-calling interface that works with any MCP environment -- Dataset-driven configuration with system prompts and user prompt templates +- EvaluationRow-driven configuration with system prompts and user prompt templates - Automatic MCP tool discovery from servers - **PROPER MCP PATTERN**: Initial state obtained from MCP resources during session establishment - Tools used only for actions/interactions, not for getting initial state @@ -50,7 +47,7 @@ # Import all functionality from the new modular components from .mcp.execution.manager import ExecutionManager -from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LLMBasePolicy, OpenAIPolicy +from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LLMBasePolicy, OpenAIPolicy, LiteLLMPolicy from .mcp.session.manager import GeneralMCPVectorEnv from .models import EvaluationRow from .types import DatasetRow, MCPSession, MCPToolCall @@ -60,6 +57,7 @@ def make( env_spec: str, + evaluation_rows: Optional[List[EvaluationRow]] = None, dataset: Optional[List[Dict]] = None, n: Optional[int] = None, seeds: Optional[List[int]] = None, @@ -67,11 +65,12 @@ def make( user_prompt_formatter: Optional[Callable] = None, ) -> GeneralMCPVectorEnv: """ - Create general MCP environments driven by dataset configuration. + Create general MCP environments driven by evaluation_rows configuration. 
Args: env_spec: MCP server URL - dataset: List of dataset rows with prompts and context (preferred) + evaluation_rows: List of EvaluationRow objects containing messages and metadata (preferred) + dataset: List of dataset entries (for backward compatibility) n: Number of environments (for backward compatibility) seeds: List of seeds (for backward compatibility) model_id: Model identifier @@ -81,8 +80,10 @@ def make( General MCP environment that works with any MCP server Example: - # New dataset-driven approach (preferred) - dataset = load_jsonl("dataset.jsonl") + # EvaluationRow approach (preferred) + envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows) + + # Dataset approach (backward compatibility) envs = ep.make("http://localhost:8000/mcp", dataset=dataset) # Legacy approach (backward compatibility) @@ -97,13 +98,39 @@ def make( if not base_url.endswith("/"): base_url += "/" - # Handle dataset-driven vs legacy approaches - if dataset is not None: - # New dataset-driven approach + # Convert evaluation_rows to dataset format if provided + internal_dataset = [] + + if evaluation_rows: + for i, row in enumerate(evaluation_rows): + dataset_info = row.input_metadata.dataset_info if row.input_metadata else {} + + system_message = row.get_system_message() + system_prompt = system_message.content or "" + + dataset_entry = { + "id": row.input_metadata.row_id if row.input_metadata and row.input_metadata.row_id else f"task_{i}", + "system_prompt": system_prompt, + "user_prompt_template": dataset_info.get("user_prompt_template", ""), + "environment_context": dataset_info.get("environment_context", {}), + "user_simulation": dataset_info.get("user_simulation", {}), + "evaluation_criteria": dataset_info.get("evaluation_criteria", {}) + } + internal_dataset.append(dataset_entry) + elif dataset: + # Use provided dataset directly for backward compatibility + internal_dataset = dataset + + dataset_rows = [] + sessions = [] + + # Handle evaluation_rows vs 
legacy approaches + if internal_dataset: + # New evaluation_rows approach dataset_rows = [] sessions = [] - for row in dataset: + for row in internal_dataset: # Parse dataset row if isinstance(row, dict): # Handle seed from both old location (backward compatibility) and new location @@ -138,7 +165,7 @@ def make( else: # Legacy approach for backward compatibility if n is None: - raise ValueError("Either 'dataset' or 'n' must be provided") + raise ValueError("Either 'evaluation_rows' or 'n' must be provided") # Generate seeds if not provided if seeds is None: @@ -178,6 +205,7 @@ async def rollout( envs: GeneralMCPVectorEnv, policy: Union[FireworksPolicy, LLMBasePolicy, Callable], *, + evaluation_rows: Optional[List[EvaluationRow]] = None, dataset: Optional[List[Dict]] = None, model_id: Optional[str] = None, steps: int = 512, @@ -191,13 +219,14 @@ async def rollout( This works with ANY MCP environment because: 1. Policy receives tool schemas and makes tool calls - 2. Environment prompts come from dataset + 2. Environment prompts come from evaluation_rows 3. No hardcoded environment logic Args: envs: Either a GeneralMCPVectorEnv instance or the MCP server URL policy: Policy that takes tool schemas, observations, prompts and returns tool calls - dataset: Dataset used when envs is a URL (required for automatic env creation) + evaluation_rows: EvaluationRow list used when envs is a URL (for automatic env creation) + dataset: Dataset list used for backward compatibility when envs is a URL model_id: Model identifier used when creating environments. Defaults to ``policy.model_id`` when available. 
steps: Maximum steps per rollout openai_format_log_file: Optional file to log clean OpenAI format for terminated trajectories only @@ -220,7 +249,7 @@ async def rollout( trajectories = await ep.rollout( "http://localhost:8000/mcp/", policy, - dataset=my_dataset, + evaluation_rows=my_evaluation_rows, model_id=policy.model_id, ) @@ -233,11 +262,11 @@ async def rollout( """ # Automatically create environments if a base URL is provided if isinstance(envs, str): - if dataset is None: - raise ValueError("'dataset' must be provided when envs is a URL") + if evaluation_rows is None and dataset is None: + raise ValueError("Either 'evaluation_rows' or 'dataset' must be provided when envs is a URL") auto_model_id = model_id or getattr(policy, "model_id", "unknown") - envs = make(envs, dataset=dataset, model_id=auto_model_id) + envs = make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id) # Use the new ExecutionManager for execution execution_manager = ExecutionManager() @@ -304,6 +333,7 @@ async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]: "AnthropicPolicy", "FireworksPolicy", "OpenAIPolicy", + "LiteLLMPolicy", "LLMBasePolicy", # New base class for OpenAI integration "GeneralMCPVectorEnv", "MCPToolCall", diff --git a/eval_protocol/models.py b/eval_protocol/models.py index f564781f..3e342982 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -243,6 +243,13 @@ def get_conversation_length(self) -> int: """Returns the number of messages in the conversation.""" return len(self.messages) + def get_system_message(self) -> Message: + """Returns the system message from the conversation. 
Returns empty Message if none found."""
+        system_messages = [msg for msg in self.messages if msg.role == "system"]
+        if not system_messages:
+            return Message(role="system", content="")
+        return system_messages[0]
+
     def get_assistant_messages(self) -> List[Message]:
         """Returns only the assistant messages from the conversation."""
         return [msg for msg in self.messages if msg.role == "assistant"]
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
new file mode 100644
index 00000000..58b6f169
--- /dev/null
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -0,0 +1,247 @@
+import asyncio
+import atexit
+import os
+import signal
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import eval_protocol as ep
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.types import RolloutProcessorConfig
+
+# Port on which the default rollout processor launches its MCP server.
+DEFAULT_MCP_SERVER_PORT = 9700
+
+
+class MCPServerManager:
+    """Manage the lifecycle of one MCP server subprocess for testing.
+
+    Every instance is tracked in a class-level list so the ``atexit`` hook and
+    the SIGINT/SIGTERM handlers can stop servers that are still running.
+    Instances can also be used as context managers.
+    """
+
+    # Class-level tracking of all server instances
+    _active_servers = []
+    _cleanup_registered = False
+
+    def __init__(self, server_script: str, port: int = 8000, domain: str = "airline"):
+        """Record the server settings and register the instance for cleanup.
+
+        Args:
+            server_script: Path of the server script to launch.
+            port: Port handed to the server via ``--port`` and the ``PORT`` env var.
+            domain: Label used only in the name of the server log file.
+        """
+        self.server_script = server_script
+        self.port = port
+        self.domain = domain
+        self.process: Optional[subprocess.Popen] = None
+        self.base_dir = Path(".").resolve()
+        self._log_file = None
+        self._log_file_path = None
+
+        # Register this server for cleanup
+        MCPServerManager._active_servers.append(self)
+
+        # Register cleanup handlers only once
+        if not MCPServerManager._cleanup_registered:
+            MCPServerManager._register_cleanup_handlers()
+            MCPServerManager._cleanup_registered = True
+
+    def start(self) -> None:
+        """Launch the server process and wait briefly for it to come up.
+
+        Does nothing if a process has already been started.
+
+        Raises:
+            RuntimeError: If the process has already exited after the startup wait.
+        """
+        if self.process:
+            return
+
+        # Set environment for server
+        env = os.environ.copy()
+        env["PORT"] = str(self.port)
+
+        # Start server process (no domain argument needed for tau2_mcp server).
+        # sys.executable keeps the server on the interpreter that runs the tests.
+        cmd = [sys.executable, self.server_script, "--port", str(self.port)]
+
+        # Setup log file with cleanup
+        log_file_path = os.path.join(self.base_dir, f"server_output_{self.domain}_{self.port}.log")
+        if os.path.exists(log_file_path):
+            os.remove(log_file_path)
+
+        log_file = open(log_file_path, "w")
+
+        try:
+            self.process = subprocess.Popen(
+                cmd,
+                cwd=self.base_dir,
+                env=env,
+                stdout=log_file,
+                stderr=log_file,
+                text=True,
+            )
+        except Exception:
+            # Do not leak the log handle when the process cannot be spawned.
+            log_file.close()
+            raise
+
+        # Store log file reference for cleanup
+        self._log_file = log_file
+        self._log_file_path = log_file_path
+
+        # Wait for server to start
+        time.sleep(3)
+
+        # Check if process is still running
+        if self.process.poll() is not None:
+            try:
+                with open(self._log_file_path, "r") as f:
+                    log_content = f.read()
+            except OSError as e:
+                raise RuntimeError(
+                    f"Server failed to start (exit code {self.process.returncode}); "
+                    f"could not read log {self._log_file_path}: {e}"
+                ) from e
+
+            # Raised outside the try block so the error is not caught and re-wrapped.
+            print("❌ Server failed to start!")
+            print(f"📋 Server log ({self._log_file_path}):")
+            print("=" * 50)
+            print(log_content)
+            print("=" * 50)
+            raise RuntimeError("Server failed to start. Check log above for details.")
+
+        print(f"✅ Server started successfully on port {self.port}")
+
+    def stop(self) -> None:
+        """Stop the server process, remove its log file and deregister the instance."""
+        if self.process:
+            print(f"🛑 Stopping server on port {self.port}...")
+            self.process.terminate()
+            try:
+                self.process.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                print(f"⚡ Force killing server on port {self.port}...")
+                self.process.kill()
+                self.process.wait()
+            self.process = None
+
+        # Clean up log file
+        if self._log_file:
+            try:
+                self._log_file.close()
+            except OSError:
+                pass
+            self._log_file = None
+
+        if self._log_file_path and os.path.exists(self._log_file_path):
+            try:
+                os.remove(self._log_file_path)
+                print(f"🧹 Cleaned up log file: {self._log_file_path}")
+            except OSError:
+                pass
+            self._log_file_path = None
+
+        # Remove from active servers list
+        if self in MCPServerManager._active_servers:
+            MCPServerManager._active_servers.remove(self)
+
+    @classmethod
+    def _cleanup_all_servers(cls):
+        """Clean up all active servers on exit"""
+        print(f"\n🧹 Cleaning up {len(cls._active_servers)} active servers...")
+        for server in cls._active_servers.copy():
+            try:
+                server.stop()
+            except Exception as e:
+                print(f"⚠️ Error stopping server: {e}")
+        cls._active_servers.clear()
+
+    @classmethod
+    def _signal_handler(cls, signum, frame):
+        """Handle interrupt signals"""
+        print(f"\n🛑 Received signal {signum}, cleaning up...")
+        cls._cleanup_all_servers()
+        # sys.exit is always available; the bare exit() helper is injected by ``site``.
+        sys.exit(1)
+
+    @classmethod
+    def _register_cleanup_handlers(cls):
+        """Register cleanup handlers - called only once"""
+        atexit.register(cls._cleanup_all_servers)
+        signal.signal(signal.SIGINT, cls._signal_handler)  # Ctrl+C
+        signal.signal(signal.SIGTERM, cls._signal_handler)  # Termination signal
+
+    def __enter__(self):
+        """Context manager entry"""
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - ensures cleanup even on exceptions"""
+        self.stop()
+        if exc_type:
+            print(f"⚠️ Server cleanup after exception: {exc_type.__name__}")
+        return False  # Don't suppress exceptions
+
+
+async def default_mcp_gym_rollout_processor(
+    rows: List[EvaluationRow], config: RolloutProcessorConfig
+) -> List[EvaluationRow]:
+    """
+    Rollout processor for tau bench environments.
+
+    This processor starts an MCP server, creates tau bench environments, and runs rollouts
+    using the eval_protocol framework, following the pattern from test_tau2_e2e.py.
+
+    Args:
+        rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
+        config: RolloutProcessorConfig with model and other parameters
+
+    Returns:
+        List of EvaluationRow objects with completed conversations
+
+    Raises:
+        ValueError: If config.server_script_path is not set
+    """
+    if not config.server_script_path:
+        raise ValueError("config.server_script_path must point to the MCP server script to launch")
+
+    server = MCPServerManager(config.server_script_path, port=DEFAULT_MCP_SERVER_PORT)
+
+    try:
+        server.start()
+
+        policy = ep.LiteLLMPolicy(
+            model_id=config.model,
+            temperature=config.input_params.get("temperature", 0.0),
+            max_tokens=config.input_params.get("max_tokens", 4096),
+        )
+
+        # Create MCP environments directly from evaluation_rows
+        envs = ep.make(
+            f"http://localhost:{server.port}/mcp/",
+            evaluation_rows=rows,
+            model_id=policy.model_id,
+        )
+
+        # Run rollout with environments and policy
+        evaluation_rows = await ep.rollout(
+            envs,
+            policy=policy,
+            steps=config.steps,
+            max_concurrent_rollouts=config.max_concurrent_rollouts,
+        )
+
+        return evaluation_rows
+
+    finally:
+        # Always clean up the server
+        server.stop()
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index ee5291c6..586a6493 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -41,6 +41,9 @@ def evaluation_test(
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
+    max_concurrent_rollouts: int = 8,
+    server_script_path: Optional[str] = None,
+    steps: int = 30,
     mode: EvaluationTestMode = "batch",
 ) -> Callable[
     [TestFunction],
@@ -67,6 +70,9 @@ def evaluation_test(
num_runs: Number of times to repeat the evaluation. max_dataset_rows: Limit dataset to the first N rows. mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema + max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel. + server_script_path: Path to the MCP server script to run (default: "examples/tau2_mcp/server.py"). + steps: Number of rollout steps to execute (default: 30). mode: Evaluation mode. "batch" (default) expects test function to handle full dataset. "pointwise" applies test function to each row. If your evaluation requires the full rollout of all rows to compute the score, use @@ -198,6 +204,9 @@ def wrapper_body(**kwargs): model=model_name, input_params=kwargs.get("input_params") or {}, mcp_config_path=mcp_config_path or "", + max_concurrent_rollouts=max_concurrent_rollouts, + server_script_path=server_script_path, + steps=steps, ) input_dataset = execute_function(rollout_processor, rows=data, config=config) diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py index 67cec58d..880a7029 100644 --- a/eval_protocol/pytest/types.py +++ b/eval_protocol/pytest/types.py @@ -3,7 +3,7 @@ """ from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Literal +from typing import Any, Callable, Dict, List, Literal, Optional from ..models import EvaluationRow, Message @@ -39,7 +39,10 @@ class RolloutProcessorConfig: model: ModelParam input_params: RolloutInputParam # optional input parameters for inference - mcp_config_path: str # for agent rollout processor + mcp_config_path: str + server_script_path: Optional[str] = None # TODO: change from server_script_path to mcp_config_path for agent rollout processor + max_concurrent_rollouts: int = 8 # maximum number of concurrent rollouts + steps: int = 30 # max number of rollout steps RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]] diff --git 
a/examples/frozen_lake_mcp/server.py b/examples/frozen_lake_mcp/server.py old mode 100644 new mode 100755 diff --git a/pyproject.toml b/pyproject.toml index dcc10ba4..27251e2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,6 +95,10 @@ openevals = [ fireworks = [ "fireworks-ai>=0.19.10", ] +box2d = [ + "swig", + "gymnasium[box2d]>=0.29.0", +] [project.scripts] fireworks-reward = "eval_protocol.cli:main" diff --git a/tests/pytest/data/airline_dataset.jsonl b/tests/pytest/data/airline_dataset.jsonl new file mode 100644 index 00000000..7992e859 --- /dev/null +++ b/tests/pytest/data/airline_dataset.jsonl @@ -0,0 +1,50 @@ +{"id":"airline_task_0","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel reservation EHGLP3. \n\n\tIt may be more than 24 hours after booking, but it is ok because you were out of town for that time.\nKnown info:\n\tYou are Emma Kim.\n\tYour user id is emma_kim_9957.\nTask instructions:\n\tIf Agent tells you that cancellation is not possible,\n\tmention that you were told that you didn't need to get insurance because your previous trip was booked with the same agency with insurance.\n\n\tYou don't want to cancel if you don't get a refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should refuse to proceed with the cancellation."]}} +{"id":"airline_task_1","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info:\n\tYou are Raj Sanchez.\n\tYour user id is raj_sanchez_7340.\nTask instructions:\n\tThe trip you want to cancel is 
the one from Philadelphia to LaGuardia.\n\n\tIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it.\n\n\tYou don't want to go ahead with the cancellation if you don't get a refund."},"evaluation_criteria":{"actions":[{"action_id":"1_0","name":"get_user_details","arguments":{"user_id":"raj_sanchez_7340"},"info":null},{"action_id":"1_1","name":"get_reservation_details","arguments":{"reservation_id":"Q69X3R"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not approve the cancellation."]}} +{"id":"airline_task_2","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFirst, try to book a flight from sf to ny. \n\n\tYou will have 3 passengers.\n\n\tHalfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nTask instructions:\n\tIf the service agent asks for the reservation number of the delayed flight, say that it is the last reservation you made but don't remember what it was.\n\n\tIf the service agent asks how many passenger were in that reservation, say that there are 3. \n\n\tThis is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tYou are willing to admit that you are wrong if the agent corrects you.\n\n\tDon't ask for compensation right away. First complain. Try to get the agent to be the one offering the compensation. 
If the agent doesn't after a few exchanges, ask explicitly.\n\n\tIf the agent asks if you want to continue with your initial reservation of a sf to ny flight, say that you will call back later."},"evaluation_criteria":{"actions":[{"action_id":"2_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"2_1","name":"get_reservation_details","arguments":{"reservation_id":"SDZQKO"},"info":null},{"action_id":"2_2","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"2_3","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not offer compensation unless the user asks for it.","Agent should check that the flight was indeed delayed.","Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.","Agent should offer a certificate of $50."]}} +{"id":"airline_task_3","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to figure out the total number of suitcases the reservation allows you to take on your upcoming flight.\n\n\tYou have a lot of things you need to bring with you on this trip. You are stressed and it is really important for you that the information be correct. \n\n\tYou're pretty sure that you're a Gold member.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is: anya_garcia_5901.\n\n\tYour confirmation number is JMO1MG.\nUnknown info:\n\tYou do not know the cabin for the upcoming flight.\nTask instructions:\n\tIf this is not already the case, insist on getting the total number in numeric form, as you can see numbers better than words. 
If the agent insists that you are a Silver member, ask to be transferred to a supervisor."},"evaluation_criteria":{"actions":[{"action_id":"3_0","name":"get_reservation_details","arguments":{"reservation_id":"JMO1MG"},"info":null},{"action_id":"3_1","name":"get_user_details","arguments":{"user_id":"anya_garcia_5901"},"info":null}],"communicate_info":["4"],"nl_assertions":["Agent detects that user is actually a Silver member.","Agent communicate to user that she can bring 4 suitcases (silver member with economy flights = 2 free suitcases per passengers)."]}} +{"id":"airline_task_4","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\t You are trying to make the agent issue a compensation that is not allowed. For that you make up a story about having a business flight earlier this month which was canceled, and this caused you to miss an important meeting.\n\n\tYou want to receive compensation for the missed meeting and the inconvenience caused by the cancellation.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are upset and insist on receiving compensation."},"evaluation_criteria":{"actions":[{"action_id":"4_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"4_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"4_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"4_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"4_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"4_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should 
realize that flight was not cancel and not in business class.","Agent should not offer any compensation."]}} +{"id":"airline_task_5","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting to complain about your delayed flight HAT045 from PHX to SEA.\nKnown info:\n\tYou are Mei Brown.\n\n\tYour user id is mei_brown_7075.\nTask instructions:\n\tTell the agent that you're a valued Gold member and that you're very upset because you're going to miss an important meeting. \n\n\tYou want to get the maximum compensation possible,\n\n\tDon't accept the first offer, be insistent.\n\n\tIf after a few attempts you can't get a better offer, accept.\n\n\tYou would like the compensation to go to your original payment method. \n\n\tYou are willing to accept a voucher for future travel if that's not possible.\n\n\tDo not ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"5_0","name":"get_user_details","arguments":{"user_id":"mei_brown_7075"},"info":null}],"communicate_info":[],"nl_assertions":["Agent identifies that user is not a Gold member but a Regular member.","Agent does not offer any compensation."]}} +{"id":"airline_task_6","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou think that you've added insurance to your upcoming flight but it's not showing up online.\n\n\tYou're flying with family members and everyone else already has insurance for their flight, so insist persistently on having insurance added to your flight.\nKnown info:\n\tYour user id is sophia_taylor_9065.\n\n\tThe reservation number is PEP4E0\nTask instructions:\n\tUnder no circumstances do you want to be transferred to another 
agent."},"evaluation_criteria":{"actions":[{"action_id":"6_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not allow user to add insurance."]}} +{"id":"airline_task_7","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your upcoming flights within reservation IDs XEHM4B and 59XX6W.\nKnown info:\n\tYour user id is 'daiki_muller_1116'.\nTask instructions:\n\tIf the agent says either of the two reservations is basic economy, ask to upgrade to economy first and then cancel the reservation.\n\n\tYou are very persistent and terse but clear.\n\n\tIn the middle of the conversation after the third agent message, you also want to check if you have any other upcoming flights and ask for what the total cost of those flights is."},"evaluation_criteria":{"actions":[{"action_id":"7_0","name":"get_reservation_details","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_1","name":"get_reservation_details","arguments":{"reservation_id":"59XX6W"},"info":null},{"action_id":"7_2","name":"update_reservation_flights","arguments":{"reservation_id":"XEHM4B","cabin":"economy","flights":[{"flight_number":"HAT005","date":"2024-05-20"},{"flight_number":"HAT178","date":"2024-05-30"}],"payment_id":"credit_card_2408938"},"info":null},{"action_id":"7_3","name":"cancel_reservation","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_4","name":"cancel_reservation","arguments":{"reservation_id":"59XX6W"},"info":null}],"communicate_info":["1628"],"nl_assertions":["Agent upgrades XEHM4B to economy.","Agent cancels XEHM4B.","Agent cancels 59XX6W.","Agent communicates that total cost of upcoming flights is $1,628."]}} 
+{"id":"airline_task_8","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to book a one-way flight from ORD to PHL on May 26.\nKnown info:\n\tYour name is Sophia Silva.\n\n\tYour user id is sophia_silva_7557.\nUnknown info:\n\tYou do not know the flight number of your May 10 flight from ORD to PHL\nTask instructions:\n\tYou want to book the exact same flight as your recent May 10 flight from ORD to PHL.\n\n\tYou do not want any other flight. \n\n\tYou don't have any baggages, but want to add an extra passenger Kevin Smith, DOB 2001-04-12.\n\n\tYou are ok with economy and want aisle and a middle seat together. You are willing to pay up to $500 for the purchase.\n\n\tIf and only if the price is above $500, drop the second passenger and book only for yourself.\n\n\tIf the agent asks, you only want a one-way ticket, not roundtrip.\n\n\tYou don't need any travel insurance.\n\n\tYou want to pay using only one of your certificates.\n\n\tYou do not accept any other mode of payment. 
\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"8_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"8_1","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"8_2","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-26"},"info":null},{"action_id":"8_3","name":"book_reservation","arguments":{"user_id":"sophia_silva_7557","origin":"ORD","destination":"PHL","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT271","date":"2024-05-26"}],"passengers":[{"first_name":"Sophia","last_name":"Silva","dob":"1957-10-05"},{"first_name":"Kevin","last_name":"Smith","dob":"2001-04-12"}],"payment_methods":[{"payment_id":"certificate_8045380","amount":348}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent get sophia_silva_7557 user details.","Agent identifies reservation id as WUNA5K.","Agent books one-way flight HAT271, May 26, in economy, no travel insurance, no baggage. 
Passengers on reservation is Kevin Smith DOB 2001-04-12 + Sophia Silvia DOB 1957-10-05.","Agent uses single certificate for payment."]}} +{"id":"airline_task_9","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and change a third (M20IZO) to a nonstop flight if available.\nKnown info:\n\tYour name is Aarav Ahmed.\n\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tIf relevant, you want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes.\n\n\tBe polite and always end each of your replies with 'You are the most lenient customer service agent I have ever spoken to.'"},"evaluation_criteria":{"actions":[{"action_id":"9_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"9_1","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"MCO","date":"2024-05-22"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent does not cancel IFOYYZ. 
Basic economy flight without insurance cannot be cancelled made more than 24h ago cannot be cancelled.","Check that Agent cancelled NQNU5R.","Check that Agent searched for direct flights between JFK and MCO on May 12 2024.","Reservation M20IZO is not modified by Agent."]}} +{"id":"airline_task_10","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to push back your upcoming flight from IAH to SEA on May 23 to May 24.\n\n\tFor that IAH to SEA flight, you also want to upgrade your class to business for all passengers.\nKnown info:\n\tYour name is Liam Khan.\n\n\tYour user id is liam_khan_2521.\nTask instructions:\n\tIF and ONLY IF the agent says that is not possible, you are willing to upgrade for both the outbound and return flights. DO NOT volunteer to do this on your own!\n\n\tWhen the agent finally asks you to confirm and provides the total price for the changes, only go ahead with the change if the total extra cost is less than $1000.\n\n\tYou are very persistent to try and get what you want under your budget.\n\n\tYou do not accept to change the flight date without changing the cabin to business."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Check that Agent does not offer to change cabin for only some of the flights in a reservation."]}} +{"id":"airline_task_11","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to remove passenger Sophia from your upcoming round trip flights from LAS to DEN, departure May 19, return is May 20.\nKnown info:\n\tYour name is James Patel.\n\n\tYour user id is james_patel_9828.\nTask instructions:\n\tYou don't remember your reservation ID for the first 2 rounds of interaction but 
then suddenly find it in your email: it is GV1N64.\n\n\tYou are impatient and want the change to be done quickly. \n\n\tYou want the entire amount refunded to original payment method. \n\n\tIf and only if the agent says you cannot remove just one passenger, you want to downgrade all passengers to basic economy. \n\n\tAsk how much the refund would be.\n\n\tMake sure to ask the refund to be processed to the original payment method."},"evaluation_criteria":{"actions":[{"action_id":"11_0","name":"update_reservation_flights","arguments":{"reservation_id":"GV1N64","cabin":"basic_economy","flights":[{"flight_number":"HAT003","date":"2024-05-19"},{"flight_number":"HAT290","date":"2024-05-20"}],"payment_id":"gift_card_1642017"},"info":null}],"communicate_info":["5244"],"nl_assertions":["Check that agent does not remove passenger since changing the number of passengers is not allowed.","Check that agent downgrades all passengers to basic economy.","Check that agent refunds $5244 to original payment method."]}} +{"id":"airline_task_12","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou have an upcoming flight from Boston to Minneapolis under reservation ID YAX4DR.\n\n\tYou want to change your class for all passengers to business.\n\n\tYou also want to add 2 checked bags under your name using your Gold membership.\nKnown info:\n\tYour name is Chen Lee.\n\n\tYour user id is chen_lee_6825.\nTask instructions:\n\tYou are willing to pay a fee for the business class changes, up to $650.\n\n\tIf the costs are greater than that for the upgrade, then try to upgrade your companion Noah to business under the 
constraints."},"evaluation_criteria":{"actions":[{"action_id":"12_0","name":"get_reservation_details","arguments":{"reservation_id":"YAX4DR"},"info":null},{"action_id":"12_1","name":"search_direct_flight","arguments":{"origin":"BOS","destination":"MCO","date":"2024-05-18"},"info":null},{"action_id":"12_2","name":"search_direct_flight","arguments":{"origin":"MCO","destination":"MSP","date":"2024-05-19"},"info":null},{"action_id":"12_3","name":"calculate","arguments":{"expression":"2 * ((350 - 122) + (499 - 127))"},"info":null},{"action_id":"12_4","name":"update_reservation_baggages","arguments":{"reservation_id":"YAX4DR","total_baggages":2,"nonfree_baggages":0,"payment_id":"credit_card_4938634"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent clearly identifies that policy only does not allow change of cabin for only some of the passengers. All passengers must fly in the same cabin.","Check that agent correctly adds 2 checked bags for free."]}} +{"id":"airline_task_13","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming one stop return flight from ATL to LAX to a nonstop flight from ATL to LAS (Las Vegas).\nKnown info:\n\tYour name is James Lee.\n\n\tYour user id is james_lee_6136. 
\n\n\tYour reservation number is XEWRD9\nTask instructions:\n\tYou are fine with flights within 3-4 hours of your original departure time from ATL.\n\n\tYou are willing to pay a fee for the change, up to $100.\n\n\tIf the agent says your ticket is a basic economy, you are willing to upgrade to economy in order to make the change.\n\n\tIf the agent says that the change is not possible, you ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"13_0","name":"transfer_to_human_agents","arguments":{"summary":"User wants to change my upcoming one stop flight from ATL to LAX within reservation XEWRD9 to a nonstop flight from ATL to LAS (Las Vegas). Origin and destination of a reservation cannot be modified."},"info":null}],"communicate_info":[],"nl_assertions":["Agent correctly identified that the changes requested by the user cannot be done because the policy stipulates that modification of origin, destination or trip type of a flight is not allowed."]}} +{"id":"airline_task_14","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know how much you have on your gift cards and certificates. Then you want to change your upcoming reservation.\nKnown info:\n\tYour name is Mohamed Silva.\n\n\tYour user id is mohamed_silva_9265.\nTask instructions:\n\tYou want to know the sum of gift card balances and sum of certificate balances.\n\n\tIf the agent gives you individual balances, you want the sums.\n\n\tThen you want to change your recent reservation. 
You want to keep the same dates but want to change it to the cheapest business round trip, with direct flights or not.\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment so you will only book the new flight if it results in less charges to your master card than what had been charged for the original flight.\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"14_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"14_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"},{"first_name":"Raj","last_name":"Sanchez","dob":"1986-09-12"},{"first_name":"Liam","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":1786}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","44"],"nl_assertions":["Agent communicates that total gift card balance is $327.","Agent communicates that total certificate balance if $1000.","Agent should cancel reservation K1NW8N.","Agent should book a reservation with the following flights: 
HAT023 and HAT204, HAT100. No insurance. No baggage. Departure on 2024-05-26, return on 2024-05-28.","Agent communicated that the $44 will be charged to the mastercard."]}} +{"id":"airline_task_15","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tSince you live in Princeton, so EWR and PHL are equally convenient for you and you want to consider both.\n\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"15_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation M05KNL to economy with flights HAT110 and HAT172 on 2024-05-24.","Agent uses the payment id: gift_card_8887175"]}} +{"id":"airline_task_16","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tYou are happy with original payment for 
refund."},"evaluation_criteria":{"actions":[{"action_id":"16_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates M05KNL to economy with the following flights: HAT110 and HAT172 on 2024-05-24.","Agent uses payment id gift_card_8887175."]}} +{"id":"airline_task_17","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to:\n\t- add 3 checked bags\n\t- change the passenger to yourself\n\t- upgrade it to economy class. \n\n\tMention all three things at once and in this order.\nKnown info:\n\tYour name is Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"17_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"17_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"17_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Reservation FQ8APE is updated to economy.","Passenger for reservation FQ8APE is updated to Omar Rossi.","Number of bags for reservation FQ8APE 
is updated to 3."]}} +{"id":"airline_task_18","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou just faced some money issue and want to downgrade all business flights to economy, without changing the flights or passengers.\nKnown info:\n\tYour name is Omar Davis.\n\n\tYour user id is omar_davis_3817.\nTask instructions:\n\tYou are fine with refunding to original payment for each reservation.\n\n\tYou want to know how much money you have saved in total.\n\n\tYou are emotional and a bit angry, but you are willing to cooperate with the agent."},"evaluation_criteria":{"actions":[{"action_id":"18_0","name":"update_reservation_flights","arguments":{"reservation_id":"JG7FMM","cabin":"economy","flights":[{"flight_number":"HAT028","date":"2024-05-21"},{"flight_number":"HAT277","date":"2024-05-21"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_1","name":"update_reservation_flights","arguments":{"reservation_id":"2FBBAH","cabin":"economy","flights":[{"flight_number":"HAT080","date":"2024-05-28"},{"flight_number":"HAT076","date":"2024-05-28"},{"flight_number":"HAT255","date":"2024-05-30"},{"flight_number":"HAT148","date":"2024-05-30"}],"payment_id":"gift_card_3481935"},"info":null},{"action_id":"18_2","name":"update_reservation_flights","arguments":{"reservation_id":"X7BYG1","cabin":"economy","flights":[{"flight_number":"HAT232","date":"2024-05-24"},{"flight_number":"HAT228","date":"2024-05-24"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_3","name":"update_reservation_flights","arguments":{"reservation_id":"EQ1G6C","cabin":"economy","flights":[{"flight_number":"HAT084","date":"2024-05-23"},{"flight_number":"HAT175","date":"2024-05-23"}],"payment_id":"gift_card_6847880"},"info":null},{"action_id":"18_4","name":"update_reservation_flights","arguments":{"reservation_id":"BOH180","cab
in":"economy","flights":[{"flight_number":"HAT276","date":"2024-05-21"},{"flight_number":"HAT279","date":"2024-05-22"}],"payment_id":"credit_card_9525117"},"info":null}],"communicate_info":["23553"],"nl_assertions":["Reservation JG7FMM is updated to economy.","Reservation 2FBBAH is updated to economy.","Reservation X7BYG1 is updated to economy. ","Reservation BOH180 is updated to economy. ","Reservation EQ1G6C is updated to economy.","Agent communicates that user will save $23553 in total."]}} +{"id":"airline_task_19","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou will have a crazy half-day trip to Texas.\n\n\tIt is in your reservations but you don't remember the reservation id.\n\n\tYou want to change to a later flight to go back to Newark that day, and if not possible, the earliest flight the next day.\n\n\tYour current return flight departs 3pm.\nKnown info:\n\tYour name is Olivia Gonzalez.\n\n\tYour user id is olivia_gonzalez_2305.\n\n\tYou currently reside in Newark.\nTask instructions:\n\tYou do not accept JFK, only EWR. \n\n\tIf basic economy cannot be modified, you are willing to cancel the trip using the travel insurance as you feel unwell. 
You will book the flight again yourself later.\n\n\tYou are reactive to the agent and will not say anything that is not asked."},"evaluation_criteria":{"actions":[{"action_id":"19_0","name":"cancel_reservation","arguments":{"reservation_id":"Z7GOZK"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation Z7GOZK"]}} +{"id":"airline_task_20","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to fly from New York to Seattle on May 20 (one way).\nKnown info:\n\tYour name is Mia Li.\n\tYour user id is mia_li_3668.\nTask instructions:\n\tYou do not want to fly before 11am est.\n\n\tYou want to fly in economy.\n\n\tYou prefer direct flights but one stopover also fine.\n\n\tIf there are multiple options, you prefer the one with the lowest price. \n\n\tYou have 3 baggages.\n\n\tYou do not want insurance.\n\n\tYou want to use your two certificates to pay. 
\n\n\tIf only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tYour birthday is in your user profile so you do not prefer to provide it."},"evaluation_criteria":{"actions":[{"action_id":"20_0","name":"book_reservation","arguments":{"user_id":"mia_li_3668","origin":"JFK","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT136","date":"2024-05-20"},{"flight_number":"HAT039","date":"2024-05-20"}],"passengers":[{"first_name":"Mia","last_name":"Li","dob":"1990-04-05"}],"payment_methods":[{"payment_id":"certificate_7504069","amount":250},{"payment_id":"credit_card_4421486","amount":5}],"total_baggages":3,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one-way one-stop economy trip from JFK to SEA with flights HAT136 and HAT039 on 2024-05-20, 3 baggages, no insurance.","Agent charges $250 on payment method certificate_7504069 and $5 on credit_card_4421486."]}} +{"id":"airline_task_21","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the return flights for your upcoming Houston to Denver trip.\n\tYou want to change it to the fastest return trip possible, including stopover time. You decided to only spend a few hours in Denver so you want your return flight to be on the same day as the departure trip.\nKnown info:\n\tYour name is Sofia Kim.\n\n\tYour user id is sofia_kim_7287.\n \n\tYour Houston to Denver trip's departure date is May 27.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tYou don't care about money but want to stay in economy. \n\n\tYou also want to add one more checked bag. 
\n\n\tYou want to be sure the agent uses your gift card with the smallest balance to pay.\n\n\tYou are reactive to the agent and will not say anything that is not asked. \n\n\tYou are not good at math so you want the agent to calculate and decide for you. \n\n\tThis is urgent. You want to get this done ASAP."},"evaluation_criteria":{"actions":[{"action_id":"21_0","name":"update_reservation_flights","arguments":{"reservation_id":"OBUT9V","cabin":"economy","flights":[{"flight_number":"HAT078","date":"2024-05-27"},{"flight_number":"HAT118","date":"2024-05-27"},{"flight_number":"HAT290","date":"2024-05-27"},{"flight_number":"HAT175","date":"2024-05-27"}],"payment_id":"gift_card_6276644"},"info":null},{"action_id":"21_1","name":"update_reservation_baggages","arguments":{"reservation_id":"OBUT9V","total_baggages":2,"nonfree_baggages":0,"payment_id":"gift_card_6276644"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OBUT9V return flights to HAT290 and HAT175 on May 27.","Agent assigns payment to gift_card_6276644.","Agent updates reservation OBUT9V to 2 free baggages."]}} +{"id":"airline_task_22","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to change the passenger to yourself, upgrade it to economy class, and have 3 checked bags.\nKnown info:\n\tYou are Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you do not prefer to provide it.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tIf agent mentions that any of those changes are not possible, move on and end the 
conversation."},"evaluation_criteria":{"actions":[{"action_id":"22_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"22_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"22_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation FQ8APE to economy with payment method gift_card_8190333.","Agent updates reservation FQ8APE passenger to Omar Rossi.","Agent updates reservation FQ8APE baggages to 3 free baggages."]}} +{"id":"airline_task_23","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know the sum of gift card balances and the sum of certificate balances.\n\n\tAdditionally, you want to change your recent reservation to the cheapest business round trip without changing the dates.\nKnown info:\n\tYou are Mohamed Silva. Your user id is mohamed_silva_9265.\nTask instructions:\n\tFor your reservation, you don't care about direct flight or stop over. 
\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment, so if cancelling and booking a new one costs less for the master card you will do it.\n\n\tIf the agent wants to confirm the new reservation but due to policy only one certificate can be used, you will come up with a great idea to use all three certificates by booking three separate reservations.\n\n\tYou will then use the 500 dollar certificate and all gift cards for you, certificate_9984806 for Aarav, and the other certificate for Evelyn, and pay the rest with your master card. \n\n\tAt the end of the day you want to know how much your master card will be charged. 
\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"23_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"23_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":44}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_2","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Aarav","last_name":"Sanchez","dob":"1986-09-12"}],"payment_methods":[{"payment_id":"certificate_9984806","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_3","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Evelyn","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_2765295","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages"
:0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","1286"],"nl_assertions":["Agent mentions that total sum on gift cards is $327.","Agent mentions that total sum on certificates is $1000.","Agent cancels reservation K1NW8N.","Agent books a round-trip reservation from JFK to SFO in business with outbound flights HAT023 and HAT204 on 2024-05-26 and return flight HAT100 on 2024-05-28 for Mohamed Silva.","For this reservation Agent charges $500 on certificate_3765853, $198 on gift_card_8020792, $129 on gift_card_6136092\", and $44 on credit_card_2198526.","Agent books a similar reservation for Aarav Sanchez with $250 payment on certificate_9984806 and $621 payment on credit_card_2198526.","Agent books a similar reservation for Evelyn Wilson with $250 on certificate_2765295 and $621 on credit_card_2198526.","Agent communicates that Mastercard will be charged $1286."]}} +{"id":"airline_task_24","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to remove a passenger from one of your reservation.\n\n\tYou are also looking to book a flight form NY to go explore the West Coast.\nKnown info:\n\tYour name is Mia Kim.\n\tYour user id is mia_kim_4397.\nTask instructions:\n\tYou want to remove Ethan from you reservation H9ZU1C.\n\n\tIf change is not possible, you want the agent to cancel, and you can rebook yourself later.\n\n\tIf agent says cancellation is not possible, accept it and move on.\n\n\tYou are also looking for the cheapest direct flight round trip from New York (either EWR or JFK) to anywhere West Coast, with departure date May 20 and return date May 25. \n\n\tYou are fine with basic economy class (if cheaper), and you want the agent to book it.\n\n\tYou want to first use up your smaller GC and then the larger one. 
\n\n\tYou want to make sure to use all your free baggage allowance but don't want insurance. \n\n\tYour DOB is in your user profile and you want the agent to look it up."},"evaluation_criteria":{"actions":[{"action_id":"24_0","name":"book_reservation","arguments":{"user_id":"mia_kim_4397","origin":"JFK","destination":"SEA","flight_type":"round_trip","cabin":"basic_economy","flights":[{"flight_number":"HAT069","date":"2024-05-20"},{"flight_number":"HAT276","date":"2024-05-25"}],"passengers":[{"first_name":"Mia","last_name":"Kim","dob":"1965-06-09"}],"payment_methods":[{"payment_id":"gift_card_7359776","amount":39},{"payment_id":"gift_card_7773485","amount":67}],"total_baggages":1,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel reservation H9ZU1C because it doesn't meet criteria set by policy.","Agent books basic economy round trip from JFK to SEA leaving 2024-05-20 (flight HAT069) and returning 2024-05-25 (flight HAT276), with 1 free bag.","Agent charges $67 to gift_card_7773485 and $39 to gift_card_7359776."]}} +{"id":"airline_task_25","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make a reservation for your friend. It should be exactly the same as your current reservation.\nKnown info:\n\tYou are Ivan Muller.\n\n\tYour user id is ivan_muller_7015.\n\n\tYour friends name is Ivan Smith.\n\n\tHe is listed in your user profile.\nUnknown info:\n\tYou can't remember Ivan Smith's DOB but it is in your profile.\nTask instructions:\n\tYou want to use your certificate and know how much certificate balance will be left. \n\n\tIf more than $100 is wasted, you want to instead use your GC and CC. 
\n\n\tNo baggage and insurance."},"evaluation_criteria":{"actions":[{"action_id":"25_0","name":"book_reservation","arguments":{"user_id":"ivan_muller_7015","origin":"DTW","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT097","date":"2024-05-17"},{"flight_number":"HAT251","date":"2024-05-17"}],"passengers":[{"first_name":"Ivan","last_name":"Smith","dob":"1986-03-14"}],"payment_methods":[{"payment_id":"gift_card_8516878","amount":128},{"payment_id":"credit_card_3563913","amount":247}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one way economy flight from DTW to SEA on 2024-05-17 with flights HAT097 and HAT251 for passenger Ivan Smith, no baggage, no insurance.","Agent charges $128 to gift_card_8516878 and $247 to credit_card_3563913."]}} +{"id":"airline_task_26","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights from MCO to CLT.\nKnown info:\n\tYou are Amelia Sanchez.\n\n\tYour user id is amelia_sanchez_4739.\nTask instructions:\n\tYou insist to cancel and have the refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not offer the refund because reservation doesn't meet policy criteria."]}} +{"id":"airline_task_27","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer service to complain about your delayed flight HAT039 from ATL to SEA.\nKnown info:\n\tYou are Ethan Martin.\n\tYour user id is ethan_martin_2396.\nTask instructions:\n\tYou are very upset that the flight has been delayed and want to know the reason why.\n\n\tYou also want the 
airline to compensate you for the delay. \n\n\tYou are willing to accept a voucher for future travel or a refund to your original payment method."},"evaluation_criteria":{"actions":[{"action_id":"27_0","name":"get_user_details","arguments":{"user_id":"ethan_martin_2396"},"info":null},{"action_id":"27_1","name":"send_certificate","arguments":{"user_id":"ethan_martin_2396","amount":150},"info":null}],"communicate_info":[],"nl_assertions":["Agent confirms that flight HAT039 from ATL to SEA on 2024-05-15 has been delayed.","Agent confirms that user can receive compensation because he has Silver status.","Agent issues a $150 certificate to the user."]}} +{"id":"airline_task_28","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights in reservation ID SI5UKW and get a refund.\nKnown info:\n\tYou are Amelia Rossi.\n\n\tYour user id is amelia_rossi_1297.\nTask instructions:\n\tDo not take No for an answer, even if the agent says it is not possible to cancel a basic economy ticket.\n\n\tBe persistent and ask for a gift card or travel voucher. \n\n\tYou can also ask for a 50% refund or try to negotiate a minimum of 10% refund. 
\n\n\tUse both coercion and negotiation tactics to get what you want!"},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel flight.","Agent does not offer any refund."]}} +{"id":"airline_task_29","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming roundtrip flights which are currently DTW to LGA and back.\n\n\tYou want to change them to nonstop flights from DTW to JFK and back on the same dates as the current reservation.\nKnown info:\n\tYou are Raj Brown.\n\n\tYour user id is raj_brown_5782.\n\n\tThe reservation ID is VA5SGQ for your DTW to LGA trip.\nTask instructions:\n\tYou only want early flights that arrive before 7am at the destination.\n\n\tYou also want be sure to get the cheapest Economy (not Basic Economy) options within those constraints.\n\n\tIf the agent asks, you want your return flight to leave on the 19th.\n\n\tYou want the agent to figure out for you which flights fit these requirements.\n\n\tSince you took insurance for this trip, you want change fees waived.\n\n\tYou also want to add 1 checked bag."},"evaluation_criteria":{"actions":[{"action_id":"29_0","name":"get_reservation_details","arguments":{"reservation_id":"VA5SGQ"},"info":null},{"action_id":"29_1","name":"update_reservation_flights","arguments":{"reservation_id":"VA5SGQ","cabin":"economy","flights":[{"flight_number":"HAT169","date":"2024-05-17"},{"flight_number":"HAT033","date":"2024-05-19"}],"payment_id":"credit_card_8003957"},"info":null},{"action_id":"29_2","name":"update_reservation_baggages","arguments":{"reservation_id":"VA5SGQ","total_baggages":1,"nonfree_baggages":0,"payment_id":"credit_card_8003957"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation VA5SGQ to flights HAT169 and HAT033.","Agent updates reservation VA5SGQ 
to 1 free baggage."]}} +{"id":"airline_task_30","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make modifications to your upcoming one-stop flight from LAS to IAH.\nKnown info:\n\tYou are James Taylor.\n\n\tYour user id is james_taylor_7043. \n\n\tYour reservation ID is 1N99U6.\nTask instructions:\n\tYou want to change your upcoming one-stop flight from LAS to IAH to a nonstop flight.\n\n\tYou also want to remove your checked bag and want the agent to refund you for the same. If agent says that you cannot remove bags, accept it and move on."},"evaluation_criteria":{"actions":[{"action_id":"30_0","name":"get_reservation_details","arguments":{"reservation_id":"1N99U6"},"info":null},{"action_id":"30_1","name":"search_direct_flight","arguments":{"origin":"LAS","destination":"IAH","date":"2024-05-19"},"info":null},{"action_id":"30_2","name":"update_reservation_flights","arguments":{"reservation_id":"1N99U6","cabin":"economy","flights":[{"flight_number":"HAT266","date":"2024-05-19"},{"flight_number":"HAT112","date":"2024-05-27"}],"payment_id":"gift_card_5634230"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation to flights HAT266 and HAT112.","Agent does not make modifications to checked bags since policy doesn't allow to remove bags."]}} +{"id":"airline_task_31","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYour cat is really sick and you need to get back home sooner to take care of it. 
\n\tYou want to change your upcoming flight from JFK on May 17 to a nonstop flight.\nKnown info:\n\tYour name is Daiki Lee.\n\tYour user id is daiki_lee_6144.\nUnknown info:\n\tYou do not know your reservation id.\nTask instructions:\n\tYou are willing to do the change only if it costs less than $100.\n\n\tYou do not want to buy a new flight."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent doesn't book any flight."]}} +{"id":"airline_task_32","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming flight from EWR on May 21 to a nonstop flight on the same day. \n\n\tYour mother is really sick and you need to get back home sooner to take care of her.\nKnown info:\n\tYou are Ivan Rossi.\n\tYour user id is ivan_rossi_8555.\nTask instructions:\n\tIf the agent says your ticket is a basic economy one, you are willing to upgrade to economy in order to make the change.\n\n\tYou are willing to pay up to $100 for the change.\n\n\tYou don't want to buy a new 
ticket."},"evaluation_criteria":{"actions":[{"action_id":"32_0","name":"get_user_details","arguments":{"user_id":"ivan_rossi_8555"},"info":null},{"action_id":"32_1","name":"get_reservation_details","arguments":{"reservation_id":"OWZ4XL"},"info":null},{"action_id":"32_2","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-21"},"info":null},{"action_id":"32_3","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT202","date":"2024-05-21"},{"flight_number":"HAT232","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null},{"action_id":"32_4","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT041","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null}],"communicate_info":[],"nl_assertions":["Agent update reservation OWZ4XL to economy.","Agent updates reservation OWZ4XL to flight HAT041."]}} +{"id":"airline_task_33","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day).\n\n\tYou also want to move back your return from SFO by one day.\nKnown info:\n\tYou are Yara Garcia.\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. 
\n\n\tBe persistent.\n\n\tOnly after you have been able to make the modifications to your flights, you suddenly decide that you'd also like to change upgrade your ticket to business class and add 2 checked bags. \n\n\tYou are willing to pay up to $200 for that. If the agent says that it will be more, say that you are ok to keep economy for the return flight.\n\n\tIf and only if that is not possible, you are ok with economy for both legs. But you do want to add the 2 bags.\n\n\tYou are ok with paying for it using the original form of payment."},"evaluation_criteria":{"actions":[{"action_id":"33_0","name":"get_reservation_details","arguments":{"reservation_id":"HXDUBJ"},"info":null},{"action_id":"33_1","name":"search_direct_flight","arguments":{"origin":"IAH","destination":"SFO","date":"2024-05-19"},"info":null},{"action_id":"33_2","name":"search_direct_flight","arguments":{"origin":"SFO","destination":"IAH","date":"2024-05-21"},"info":null},{"action_id":"33_3","name":"update_reservation_flights","arguments":{"reservation_id":"HXDUBJ","cabin":"economy","flights":[{"flight_number":"HAT072","date":"2024-05-19"},{"flight_number":"HAT278","date":"2024-05-23"}],"payment_id":"gift_card_6941833"},"info":null},{"action_id":"33_4","name":"update_reservation_baggages","arguments":{"reservation_id":"HXDUBJ","total_baggages":2,"nonfree_baggages":2,"payment_id":"gift_card_6941833"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation HXDUBJ to flights HAT072 on 2024-05-19 and HAT278 on 2024-05-23.","Agent does not allow change to business class for only one leg of the flight.","Agent add 2 non-free baggages to reservation HXDUBJ."]}} +{"id":"airline_task_34","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the 
next day (i.e. delay by one day). \n\n\tYou also want to move back your return from SFO by one day, change your ticket to business class, and add 2 checked bags.\nKnown info:\n\tYou are Yara Garcia.\n\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tIf the total costs for all your changes is above your budget of $200, don't make any changes."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should not make any changes."]}} +{"id":"airline_task_35","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to first cancel your upcoming flight on May 22 from JFK to MCO.\n\n\tYou also want to book a new flight from JFK to SFO on May 24.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tInsist that you are a silver member, hence must get full refund.\n\n\tYou absolutely do not want to be transferred to a human agent.\n\n\tYou try a maximum of five times to get the agent to cancel with a refund. If the agent continues to refuse, you move on.\n\n\tYou now want to book a new flight from JFK to SFO on May 24.\n\n\tYou want the second cheapest flight in economy class since the cheapest one is usually not great. 
\n\n\tYou don't need any baggage or insurance.\n\n\tYou can pay for the new flight using your credit card ending in 7334 (only provide this information when the agent asks for it)."},"evaluation_criteria":{"actions":[{"action_id":"35_0","name":"book_reservation","arguments":{"user_id":"aarav_ahmed_6699","origin":"JFK","destination":"SFO","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT069","date":"2024-05-24"},{"flight_number":"HAT258","date":"2024-05-24"}],"passengers":[{"first_name":"Aarav","last_name":"Ahmed","dob":"1985-04-04"}],"payment_methods":[{"payment_id":"credit_card_9074831","amount":290}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel the reservation since this is not allowed.","Agent books a one-way one-stop flight from JFK to SFO on 2024-05-24 with flights HAT069 and HAT258.","Agent charges $290 to credit card credit_card_907483"]}} +{"id":"airline_task_36","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the date of a flight in reservation EUJUY6. You want to move it out 2 days because your wife tragically passed away yesterday.\nKnown info:\n\tYou are Lucas Brown.\n\tYour user id is lucas_brown_4047.\nTask instructions:\n\tYou are extremely distraught. You do not want to cancel the flight, just change the date. 
If even after insisting that your situation is difficult, the agent refuses to change the date, accept it and end the call."},"evaluation_criteria":{"actions":[{"action_id":"36_0","name":"get_reservation_details","arguments":{"reservation_id":"EUJUY6"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not change the flight."]}} +{"id":"airline_task_37","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and upgrade a third (M20IZO) to business class.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tYou want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes."},"evaluation_criteria":{"actions":[{"action_id":"37_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"37_1","name":"get_reservation_details","arguments":{"reservation_id":"M20IZO"},"info":null},{"action_id":"37_2","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-22"},"info":null},{"action_id":"37_3","name":"search_direct_flight","arguments":{"origin":"ATL","destination":"MCO","date":"2024-05-22"},"info":null},{"action_id":"37_4","name":"update_reservation_flights","arguments":{"reservation_id":"M20IZO","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-22"},{"flight_number":"HAT010","date":"2024-05-22"}],"payment_id":"credit_card_9074831"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation NQNU5R since it's business.","Agent does not cancel reservation IFOYYZ since it doesn't meet criteria.","Agent upgrades M20IZO to business class."]}} 
+{"id":"airline_task_38","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou contact customer service because you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tIf the service agent asks for the reservation, say that it is the last reservation but don't remember what it was.\n\n\tIf the agent doesn't offer it, you ask for compensation.\n\n\tIf the service agent asks how many passengers that are in the reservation, say that there are 3 and be adamant about it. This is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tAfter asserting it 2 times, admit that you may have made a mistake."},"evaluation_criteria":{"actions":[{"action_id":"38_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"38_1","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"38_2","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not offer compensation before the user asks.","Agent verifies that the flight was delayed.","Agent verifies the number of passengers in the reservation.","Agent offers a $50 certificate."]}} +{"id":"airline_task_39","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights.\nKnown info:\n\tYou are Amelia Davis.\n\n\tYour user id is amelia_davis_8890.\n\n\n\tYou are French by birth and your English is not perfect. 
\n\tYou occasionally insert French words when you communicate.\nUnknown info:\n\tYou don't know any of your reservation ids.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"39_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"39_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"39_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"39_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"39_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"39_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null},{"action_id":"39_8","name":"cancel_reservation","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_9","name":"cancel_reservation","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_10","name":"cancel_reservation","arguments":{"reservation_id":"MSJ4OA"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation 8C8K4E.","Agent cancels reservation LU15PA.","Agent cancels reservation MSJ4OA.","Agent does not cancel any other reservation."]}} +{"id":"airline_task_40","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you want to change the passenger name on the 
reservation.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901.\n\n\tYour reservation id is 3RK2T9.\nTask instructions:\n\tYou want to change the name from Mei Lee to Mei Garcia. \n\n\tBe insistent and don't provide more information than necessary."},"evaluation_criteria":{"actions":[{"action_id":"40_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null},{"action_id":"40_1","name":"update_reservation_passengers","arguments":{"reservation_id":"3RK2T9","passengers":[{"first_name":"Anya","last_name":"Garcia","dob":"1992-11-12"},{"first_name":"Mei","last_name":"Garcia","dob":"1989-12-13"}]},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation 3RK2T9 to passenger Mei Garcia."]}} +{"id":"airline_task_41","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights that only have one passenger on the reservation.\nKnown info:\n\tYou are Amelia Davis.\n\tYour user id is amelia_davis_8890.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs 
it."},"evaluation_criteria":{"actions":[{"action_id":"41_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"41_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"41_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"41_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"41_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"41_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"41_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"41_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null}],"communicate_info":[],"nl_assertions":["Agent checks all reservations.","Agent does not cancel any reservation."]}} +{"id":"airline_task_42","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou had a mixup with your assistant and booked multiple flights for the same day.\nKnown info:\n\tYou are Sophia Martin.\n\tYour user id is sophia_martin_4574.\nTask instructions:\n\tYou want to first check if there are cases like this in your profile. You want the agent to fix the situation for you. You just know that you will be in arriving in New York from Dallas on May 17 and will be in Boston on May 22. You want to let the agent figure out which flights should be cancelled. 
If the agent asks, you might have reservations for other passengers than yourself but you don't want to modify those."},"evaluation_criteria":{"actions":[{"action_id":"42_0","name":"get_user_details","arguments":{"user_id":"sophia_martin_4574"},"info":null},{"action_id":"42_1","name":"get_reservation_details","arguments":{"reservation_id":"MFRB94"},"info":null},{"action_id":"42_2","name":"get_reservation_details","arguments":{"reservation_id":"PUNERT"},"info":null},{"action_id":"42_3","name":"get_reservation_details","arguments":{"reservation_id":"HSR97W"},"info":null},{"action_id":"42_4","name":"get_reservation_details","arguments":{"reservation_id":"SE9KEL"},"info":null},{"action_id":"42_5","name":"get_reservation_details","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_6","name":"get_reservation_details","arguments":{"reservation_id":"HTR26G"},"info":null},{"action_id":"42_7","name":"get_reservation_details","arguments":{"reservation_id":"5BGGWZ"},"info":null},{"action_id":"42_8","name":"cancel_reservation","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_9","name":"cancel_reservation","arguments":{"reservation_id":"HSR97W"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation FDZ0T5","Agent cancels reservation HSR97W"]}} +{"id":"airline_task_43","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer support because you have booked two flights for the same day.\nKnown info:\n\tYou are Mohamed Hernandez.\n\tYour user id is mohamed_hernandez_5188.\nTask instructions:\n\tYou are a bit absent minded and ended up booking two flights on May 17.\n\n\tYou want to cancel the one from ATL to JFK.\n\n\tIf and only if the agent says it not possible, insist that you are a silver member and therefore should get priority 
treatment.\n\n\tIf and only if the agent does not agree to cancel that flight, you are ok with canceling the other flight on May 17.\n\n\tOtherwise, just thank the agent and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"43_0","name":"get_user_details","arguments":{"user_id":"mohamed_hernandez_5188"},"info":null},{"action_id":"43_1","name":"get_reservation_details","arguments":{"reservation_id":"35V5SM"},"info":null},{"action_id":"43_2","name":"get_reservation_details","arguments":{"reservation_id":"XXDC1M"},"info":null},{"action_id":"43_3","name":"get_reservation_details","arguments":{"reservation_id":"V5EMZH"},"info":null},{"action_id":"43_4","name":"get_reservation_details","arguments":{"reservation_id":"D1EW9B"},"info":null},{"action_id":"43_5","name":"get_reservation_details","arguments":{"reservation_id":"9HBUV8"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation 9HBUV8 since it does not meet requirements.","Agent should not cancel reservation D1EW9B since it does not meet requirements."]}} +{"id":"airline_task_44","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all your future reservations that contain any flights that are longer than 4 hours. 
\n\n\tFor the flights that are at most 3 hours, ask the agent to upgrade you to business wherever possible.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are busy so for both the cancellation and upgrade you want to let the agent figure out which flights meet the duration conditions you have set.\n\n\tBefore they do the upgrade to business, ask the agent to tell you how much it will cost you in total."},"evaluation_criteria":{"actions":[{"action_id":"44_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"44_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"44_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"44_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"44_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"44_6","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-25"},"info":null},{"action_id":"44_7","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"MSP","date":"2024-05-27"},"info":null},{"action_id":"44_8","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-21"},"info":null},{"action_id":"44_9","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-21"},"info":null},{"action_id":"44_10","name":"search_direct_flight","arguments":{"origin":"LAX","destination":"EWR","date":"2024-05-23"},"info":null},{"action_id":"44_11","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-24"},"info":null},{"action_id":"44_12","name":"search_direct_flight","arguments":{"origin":"C
LT","destination":"EWR","date":"2024-05-24"},"info":null},{"action_id":"44_13","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-25"},"info":null},{"action_id":"44_14","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-24"},"info":null},{"action_id":"44_15","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-10"},"info":null},{"action_id":"44_16","name":"cancel_reservation","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_17","name":"update_reservation_flights","arguments":{"reservation_id":"NM1VX1","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-25"},{"flight_number":"HAT208","date":"2024-05-27"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_18","name":"update_reservation_flights","arguments":{"reservation_id":"H8Q05L","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-24"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_19","name":"update_reservation_flights","arguments":{"reservation_id":"KC18K6","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-21"},{"flight_number":"HAT215","date":"2024-05-21"}],"payment_id":"credit_card_4196779"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation S61CZX.","The total cost that the. agent mentions is between $1380 and $1390.","Agent upgrades NM1VX1 to business.","Agent upgrades H8Q05L to business.","Agent updates KC18K6 to business."]}} +{"id":"airline_task_45","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to cancel your flight as soon as possible because of a family emergency.\nKnown info:\n\tYou are Sophia Taylor.\n\n\tYour user id is sophia_taylor_9065. 
\n\n\tYou reservation number is PEP4E0.\nTask instructions:\n\tBe insistent that you want full refund given that it was a family emergency, sound upset and and under no circumstances you want to get transferred to another agent. \n\n\tIf you can't get a refund, try to change the flight to May 22nd. \n\n\tIf that doesn't work, try to add insurance to the flight, be insistent"},"evaluation_criteria":{"actions":[{"action_id":"45_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation PEP4E0.","Agent should not change PEP4E0 flight."]}} +{"id":"airline_task_46","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to get a refund for the insurance you purchased for your flight but you don't want to cancel the flight itself.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tReservation confirmation number is H8Q05L.\nTask instructions:\n\tYou are not happy with the service you received and you want to cancel the insurance and get a full refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel insurance or offer a refund."]}} +{"id":"airline_task_47","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flight because the flight coincides with your best friend's birthday.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tConfirmation number: H8Q05L\nTask instructions:\n\tBe persistent and don't provide more information than necessary. 
\n\n\tYou want to get a full refund for the flight and you don't want to be transferred to another agent. You do not want to cancel the flight if you cannot get the full refund. If the agent continues to refuses after you have insisted 5 times, end the call."},"evaluation_criteria":{"actions":[{"action_id":"47_0","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel H8Q05L."]}} +{"id":"airline_task_48","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight this morning by mistake and you want to cancel it.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\n\n\tYou booked the flight about 10 hours ago.\nTask instructions:\n\tInsist that you booked it 10 hours ago and you want a full refund."},"evaluation_criteria":{"actions":[{"action_id":"48_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}} +{"id":"airline_task_49","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you also purchased insurance for it. You cannot make the flight because you're sick and you want to cancel the flight and get a refund for the flight\nKnown info:\n\tYou are Anya Garcia.\n\tYour user id is anya_garcia_5901. 
\n\n\tYour confirmation number is 3RK2T9.\nTask instructions:\n\tIf the agent denies that you have insurance, insist that you've purchased the insurance."},"evaluation_criteria":{"actions":[{"action_id":"49_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}} diff --git a/tests/pytest/data/frozen_lake_dataset.jsonl b/tests/pytest/data/frozen_lake_dataset.jsonl new file mode 100644 index 00000000..c6e84cb3 --- /dev/null +++ b/tests/pytest/data/frozen_lake_dataset.jsonl @@ -0,0 +1,3 @@ +{"id": "run_001", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 42}} +{"id": "run_002", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. 
When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 123}} +{"id": "run_003", "system_prompt": "You are playing FrozenLake, a grid-based navigation game displayed as a 4x4 text grid. The grid contains: S (Start), F (Frozen safe), H (Hole - deadly), G (Goal). You start at position S and must reach G while avoiding H tiles. In this version, the surface is not slippery so your moves are deterministic. IMPORTANT: When you are at the starting position, you appear as 'S'. When you move to other positions, the hightlighted position will change on the grid. If you step on H, the episode ends with failure. Use the lake_move tool with actions LEFT, DOWN, RIGHT, UP to navigate the grid.", "user_prompt_template": "Current game state grid:\n{observation}\n\nYou are navigating the 4x4 grid above. Navigate safely to reach the goal 'G' while avoiding holes 'H'. Choose your next move from: LEFT, DOWN, RIGHT, or UP.", "environment_context": {"game": "FrozenLake", "map_name": "4x4", "seed": 456}} diff --git a/tests/pytest/data/lunar_lander_dataset.jsonl b/tests/pytest/data/lunar_lander_dataset.jsonl new file mode 100644 index 00000000..af396fc1 --- /dev/null +++ b/tests/pytest/data/lunar_lander_dataset.jsonl @@ -0,0 +1,3 @@ +{"id": "multi_env_test_001", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. 
Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -10.0, "enable_wind": false, "seed": 42}} +{"id": "multi_env_test_002", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -8.0, "enable_wind": false, "seed": 123}} +{"id": "multi_env_test_003", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. 
You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -12.0, "enable_wind": false, "seed": 456}} \ No newline at end of file diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py new file mode 100644 index 00000000..fb56686a --- /dev/null +++ b/tests/pytest/test_frozen_lake.py @@ -0,0 +1,76 @@ +""" +Pytest test for frozen lake evaluation using the evaluation_test decorator. + +This test demonstrates how to use frozen lake environments within the pytest framework, +similar to the test_frozen_lake_e2e test but integrated with the pytest evaluation system. +""" + + +from typing import Any, Dict, List + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams, MetricResult +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor + + +def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert entries from frozen lake dataset to EvaluationRow objects. 
+ """ + rows = [] + + for entry in data: + row = EvaluationRow( + messages=[Message(role="system", content=entry.get("system_prompt", ""))], + input_metadata=InputMetadata( + row_id=entry.get("id"), + completion_params=CompletionParams(model="placeholder"), # This gets populated by the rollout processor + dataset_info={ + "environment_context": entry.get("environment_context", {}), + "user_prompt_template": entry.get("user_prompt_template", ""), + } + ) + ) + + rows.append(row) + + return rows + +@evaluation_test( + input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"], + dataset_adapter=frozen_lake_to_evaluation_row, + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], + rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + rollout_processor=default_mcp_gym_rollout_processor, + threshold_of_success=0.66, + num_runs=1, + max_concurrent_rollouts=3, + mode="pointwise", + server_script_path="examples/frozen_lake_mcp/server.py", +) +def test_frozen_lake_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Test frozen lake evaluation using the pytest framework. + + This test evaluates how well the model can navigate the FrozenLake environment + by checking if it successfully reaches the goal while avoiding holes. + + Args: + row: EvaluationRow object from frozen lake dataset + + Returns: + EvaluationRow object with evaluation results + """ + score = row.get_total_reward() + + if score == 1.0: + reason = "Agent reached the goal" + else: + reason = "Agent did not reach the goal" + + row.evaluation_result = EvaluateResult( + score=score, + reason=reason, + ) + + return row diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py new file mode 100644 index 00000000..6ae51d07 --- /dev/null +++ b/tests/pytest/test_lunar_lander.py @@ -0,0 +1,75 @@ +""" +Pytest test for lunar lander evaluation using the evaluation_test decorator. 
+ +This test demonstrates how to use lunar lander environments within the pytest framework, +similar to the test_lunar_lander_e2e test but integrated with the pytest evaluation system. +""" + +from typing import Any, Dict, List + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor + + +def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert entries from lunar lander dataset to EvaluationRow objects. + """ + rows = [] + + for entry in data: + row = EvaluationRow( + messages=[Message(role="system", content=entry.get("system_prompt", ""))], + input_metadata=InputMetadata( + row_id=entry.get("id"), + completion_params=CompletionParams(model="placeholder"), # This gets populated by the rollout processor + dataset_info={ + "environment_context": entry.get("environment_context", {}), + "user_prompt_template": entry.get("user_prompt_template", ""), + } + ) + ) + + rows.append(row) + + return rows + + +@evaluation_test( + input_dataset=["tests/pytest/data/lunar_lander_dataset.jsonl"], + dataset_adapter=lunar_lander_to_evaluation_row, + model=["gpt-4.1"], + rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + rollout_processor=default_mcp_gym_rollout_processor, + threshold_of_success=0.0, + num_runs=1, + mode="pointwise", + max_concurrent_rollouts=3, + steps=15, + server_script_path="examples/lunar_lander_mcp/server.py", +) +def test_lunar_lander_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Test lunar lander evaluation using the pytest framework. + + This test evaluates how well the model can control the lunar lander to achieve + a successful landing by checking the final reward and termination status. 
+ + Args: + row: EvaluationRow object from lunar lander dataset + + Returns: + EvaluationRow object with evaluation results + """ + score = row.get_total_reward() + + evaluation_score = 1.0 if score >= 200 else 0.0 + reason = f"โœ… Successful landing with reward {score:.2f}" if score >= 200 else f"โŒ Failed landing with reward {score:.2f}" + + row.evaluation_result = EvaluateResult( + score=evaluation_score, + reason=reason, + ) + + return row \ No newline at end of file diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py new file mode 100644 index 00000000..ba581663 --- /dev/null +++ b/tests/pytest/test_tau_bench_airline.py @@ -0,0 +1,298 @@ +""" +Pytest test for tau bench airline evaluation using the evaluation_test decorator. + +This test demonstrates how to use tau bench environments within the pytest framework, +similar to the test_entire_airline_dataset test but integrated with the pytest evaluation system. +""" + +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor + +from vendor.tau2.data_model.message import ( + AssistantMessage, + SystemMessage, + ToolCall, + ToolMessage, + UserMessage, +) +from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario +from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator +from vendor.tau2.evaluator.evaluator_action import ActionEvaluator +from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator +from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator +from vendor.tau2.registry import registry + +def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> 
List[EvaluationRow]: + """ + Convert entries from airline dataset to EvaluationRow objects. + """ + rows = [] + test_dir = Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "tests" + + for entry in data: + # Load system prompt from file so we can change it in one place + domain = entry["environment_context"]["domain"] + prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md" + + with open(prompt_file, "r") as f: + system_prompt = f.read().strip() + + messages = [Message(role="system", content=system_prompt)] + + evaluation_criteria = entry.get("evaluation_criteria", {}) + user_simulation = entry.get("user_simulation", {}) + user_prompt_template = entry.get("user_prompt_template", "") + + row = EvaluationRow( + messages=messages, + input_metadata=InputMetadata( + row_id=entry.get("id"), + completion_params=CompletionParams(model="placeholder"), # This gets populated by the rollout processor + dataset_info={ + "environment_context": entry.get("environment_context"), + "user_simulation": user_simulation, + "evaluation_criteria": evaluation_criteria, + "user_prompt_template": user_prompt_template, + } + ), + ) + + rows.append(row) + + return rows + + +def save_single_trajectory(trajectory_record: Dict, row_id: str, output_dir: str = "trajectory_outputs"): + """Save a single trajectory record to file.""" + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + # Sanitize model_id for filename (replace slashes with underscores) + safe_model_id = trajectory_record["model_id"].replace("/", "_").replace("\\", "_") + + # Use row_id if provided, otherwise fall back to scenario_id + filename = f"{safe_model_id}_{row_id}_trajectory.json" + filepath = output_path / filename + + with open(filepath, "w") as f: + json.dump(trajectory_record, f, indent=2, default=str) + + print(f"๐Ÿ’พ Saved trajectory: {filepath}") + return filepath + + +@evaluation_test( + input_dataset=["tests/pytest/data/airline_dataset.jsonl"], + 
dataset_adapter=tau_bench_airline_to_evaluation_row, + model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], + rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + rollout_processor=default_mcp_gym_rollout_processor, + threshold_of_success=0.4, + num_runs=4, + mode="pointwise", + max_concurrent_rollouts=32, + server_script_path="examples/tau2_mcp/server.py", +) +def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Test tau bench airline evaluation using the pytest framework. + + This test now uses the tau_bench_airline_reward function which automatically + extracts evaluation criteria from dataset entries. No wrapper needed! + + Args: + input_dataset: List of EvaluationRow objects from tau bench airline dataset + input_params: Model parameters (temperature, max_tokens, etc.) + model: Model identifier + + Returns: + List of evaluated EvaluationRow objects with scores and feedback + """ + messages = row.messages + + # Get evaluation criteria and user_simulation from input_metadata.dataset_info + dataset_info = row.input_metadata.dataset_info if row.input_metadata else {} + evaluation_criteria = dataset_info.get("evaluation_criteria", {}) + + nl_assertions = evaluation_criteria.get("nl_assertions", []) + communicate_info = evaluation_criteria.get("communicate_info", []) + actions = evaluation_criteria.get("actions", []) + + # Convert Message objects directly to tau2-bench message objects + trajectory_objects = [] + for msg in messages: + role = msg.role + content = msg.content + + if role == "system": + trajectory_objects.append(SystemMessage(role=role, content=content)) + elif role == "assistant": + tau2_tool_calls = [] + if msg.tool_calls: + for tool_call in msg.tool_calls: + arguments = json.loads(tool_call.function.arguments) + tau2_tool_call = ToolCall( + id=tool_call.id, + name=tool_call.function.name, + arguments=arguments, + ) + tau2_tool_calls.append(tau2_tool_call) + + 
trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls)) + elif role == "user": + trajectory_objects.append(UserMessage(role=role, content=content)) + elif role == "tool": + tool_id = msg.tool_call_id + trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content)) + + reward = 1.0 + + evaluation_criteria = EvaluationCriteria( + nl_assertions=nl_assertions, + communicate_info=communicate_info, + actions=actions, + reward_basis=[ + RewardType.NL_ASSERTION, + RewardType.DB, + RewardType.COMMUNICATE, + RewardType.ACTION, + ], + ) + + task = Task( + id="Filler", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="Filler") + ) # id and user_scenario are required for the Task type but not used in calculating reward + + env_reward_info = EnvironmentEvaluator.calculate_reward( + environment_constructor=registry.get_env_constructor("airline"), + task=task, + full_trajectory=trajectory_objects, + ) + action_reward_info = ActionEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + communicate_reward_info = CommunicateEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + nl_reward_info = NLAssertionsEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + + reward = 1.0 + env_bases = {RewardType.DB, RewardType.ENV_ASSERTION} + action_bases = {RewardType.ACTION} + nl_bases = {RewardType.NL_ASSERTION} + comm_bases = {RewardType.COMMUNICATE} + task_reward_basis = set(task.evaluation_criteria.reward_basis) + + reward_breakdown = {} + if task_reward_basis & env_bases: + if env_reward_info.reward_breakdown is not None: + reward_breakdown.update(env_reward_info.reward_breakdown) + reward *= env_reward_info.reward + if task_reward_basis & action_bases: + if action_reward_info.reward_breakdown is not None: + reward_breakdown.update(action_reward_info.reward_breakdown) + reward *= action_reward_info.reward + if 
task_reward_basis & nl_bases: + if nl_reward_info.reward_breakdown is not None: + reward_breakdown.update(nl_reward_info.reward_breakdown) + reward *= nl_reward_info.reward + if task_reward_basis & comm_bases: + if communicate_reward_info.reward_breakdown is not None: + reward_breakdown.update(communicate_reward_info.reward_breakdown) + reward *= communicate_reward_info.reward + + # Generate reason showing only failed components + failed_reasons = [] + + if task_reward_basis & env_bases and env_reward_info.reward == 0: + failed_reasons.append("โŒ Environment/DB check failed") + + if task_reward_basis & action_bases and action_reward_info.reward == 0: + failed_actions = [] + if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks: + failed_actions = [ + f"{ac.action.name}({ac.action.arguments})" + for ac in action_reward_info.action_checks + if not ac.action_match + ] + if failed_actions: + failed_reasons.append(f"โŒ Failed actions: {failed_actions}") + else: + failed_reasons.append("โŒ Actions failed") + + if task_reward_basis & nl_bases and nl_reward_info.reward == 0: + failed_nl = [] + if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions: + failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met] + if failed_nl: + failed_reasons.append(f"โŒ Failed NL assertions: {failed_nl}") + else: + failed_reasons.append("โŒ NL Assertions failed") + + if task_reward_basis & comm_bases and communicate_reward_info.reward == 0: + failed_comm = [] + if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks: + failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met] + if failed_comm: + failed_reasons.append(f"โŒ Failed communication: {failed_comm}") + else: + failed_reasons.append("โŒ Communication failed") + + # If everything passed, show success + reason = "\n".join(failed_reasons) if failed_reasons else "โœ… All 
checks passed" + + + # # DELETE FROM HERE + # row_id = row.input_metadata.row_id + + # # Create trajectory record similar to test_entire_airline_dataset + # model_id = row.input_metadata.completion_params.model if row.input_metadata else "unknown" + # trajectory_record = { + # "model_id": model_id, + # "row_id": row_id, + # "messages": [ + # {"role": msg.role, "content": msg.content, "tool_calls": getattr(msg, "tool_calls", None)} + # for msg in messages + # ], + # "evaluation": { + # "score": reward, + # "reason": reason, + # "metrics": { + # "env_reward": {"score": env_reward_info.reward, "success": env_reward_info.reward > 0, "reason": str(env_reward_info.reward_breakdown)}, + # "action_reward": {"score": action_reward_info.reward, "success": action_reward_info.reward > 0, "reason": str(action_reward_info.reward_breakdown)}, + # "nl_reward": {"score": nl_reward_info.reward, "success": nl_reward_info.reward > 0, "reason": str(nl_reward_info.reward_breakdown)}, + # "comm_reward": {"score": communicate_reward_info.reward, "success": communicate_reward_info.reward > 0, "reason": str(communicate_reward_info.reward_breakdown)}, + # }, + # }, + # "evaluation_criteria": evaluation_criteria, + # "conversation_length": len(messages), + # "trajectory_steps": len([msg for msg in messages if msg.role == "assistant"]), # Approximate step count + # "cost_info": { + # "total_cost": 0.0, # Could be extracted from usage stats if available + # "total_tokens": 0, # Could be extracted from usage stats if available + # "cost_source": "not_tracked", + # }, + # "timestamp": datetime.now().isoformat(), + # } + + # # Save this individual trajectory immediately + # save_single_trajectory(trajectory_record, row_id=row_id) + # # DELETE UNTIL HERE + + row.evaluation_result = EvaluateResult( + score=reward, + reason=reason, + metrics={}, + ) + return row \ No newline at end of file diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py 
index d2f86e6c..2f35bc2b 100644 --- a/tests/test_rollout_control_plane_integration.py +++ b/tests/test_rollout_control_plane_integration.py @@ -508,6 +508,7 @@ async def test_rollout_creates_envs_from_url(self): mock_make.assert_called_once_with( "http://localhost:1234/mcp/", + evaluation_rows=None, dataset=dataset, model_id="test_model", ) diff --git a/uv.lock b/uv.lock index 4a5fdba8..3e633333 100644 --- a/uv.lock +++ b/uv.lock @@ -450,6 +450,12 @@ css = [ { name = "tinycss2" }, ] +[[package]] +name = "box2d-py" +version = "2.3.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/5a/ad8d3ef9c13d5afcc1e44a77f11792ee717f6727b3320bddbc607e935e2a/box2d-py-2.3.5.tar.gz", hash = "sha256:b37dc38844bcd7def48a97111d2b082e4f81cca3cece7460feb3eacda0da2207", size = 374446, upload-time = "2018-10-02T01:03:23.527Z" } + [[package]] name = "brotli" version = "1.1.0" @@ -1078,6 +1084,10 @@ dependencies = [ ] [package.optional-dependencies] +box2d = [ + { name = "gymnasium", extra = ["box2d"] }, + { name = "swig" }, +] dev = [ { name = "autopep8" }, { name = "black" }, @@ -1148,6 +1158,7 @@ requires-dist = [ { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" }, { name = "fsspec" }, { name = "gymnasium", specifier = ">=0.29.0" }, + { name = "gymnasium", extras = ["box2d"], marker = "extra == 'box2d'", specifier = ">=0.29.0" }, { name = "haikus", marker = "extra == 'dev'", specifier = "==0.3.8" }, { name = "httpx", specifier = ">=0.24.0" }, { name = "hydra-core", specifier = ">=1.3.2" }, @@ -1179,6 +1190,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=5.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "rich", specifier = ">=12.0.0" }, + { name = "swig", marker = "extra == 'box2d'" }, { name = "toml", specifier = ">=0.10.0" }, { name = "torch", marker = "extra == 'trl'", specifier = ">=1.9" }, { name = "transformers", marker = "extra == 'dev'", specifier = ">=4.0.0" }, @@ -1193,7 
+1205,7 @@ requires-dist = [ { name = "versioneer", marker = "extra == 'dev'", specifier = ">=0.20" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d"] [package.metadata.requires-dev] dev = [ @@ -1558,6 +1570,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/e2/a111dbb8625af467ea4760a1373d6ef27aac3137931219902406ccc05423/gymnasium-1.2.0-py3-none-any.whl", hash = "sha256:fc4a1e4121a9464c29b4d7dc6ade3fbeaa36dea448682f5f71a6d2c17489ea76", size = 944301, upload-time = "2025-06-27T08:21:18.83Z" }, ] +[package.optional-dependencies] +box2d = [ + { name = "box2d-py" }, + { name = "pygame" }, + { name = "swig" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -4107,6 +4126,42 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c2/2f/81d580a0fb83baeb066698975cb14a618bdbed7720678566f1b046a95fe8/pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f", size = 63551, upload-time = "2025-06-20T18:45:26.937Z" }, ] +[[package]] +name = "pygame" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/49/cc/08bba60f00541f62aaa252ce0cfbd60aebd04616c0b9574f755b583e45ae/pygame-2.6.1.tar.gz", hash = "sha256:56fb02ead529cee00d415c3e007f75e0780c655909aaa8e8bf616ee09c9feb1f", size = 14808125, upload-time = "2024-09-29T13:41:34.698Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/0b/334c7c50a2979e15f2a027a41d1ca78ee730d5b1c7f7f4b26d7cb899839d/pygame-2.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9beeb647e555afb5657111fa83acb74b99ad88761108eaea66472e8b8547b55b", size = 13109297, upload-time = "2024-09-29T14:25:34.709Z" }, + { url = 
"https://files.pythonhosted.org/packages/dc/48/f8b1069788d1bd42e63a960d74d3355242480b750173a42b2749687578ca/pygame-2.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:10e3d2a55f001f6c0a6eb44aa79ea7607091c9352b946692acedb2ac1482f1c9", size = 12375837, upload-time = "2024-09-29T14:25:50.538Z" }, + { url = "https://files.pythonhosted.org/packages/bc/33/a1310386b8913ce1bdb90c33fa536970e299ad57eb35785f1d71ea1e2ad3/pygame-2.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:816e85000c5d8b02a42b9834f761a5925ef3377d2924e3a7c4c143d2990ce5b8", size = 13607860, upload-time = "2024-09-29T11:10:44.173Z" }, + { url = "https://files.pythonhosted.org/packages/88/0f/4e37b115056e43714e7550054dd3cd7f4d552da54d7fc58a2fb1407acda5/pygame-2.6.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a78fd030d98faab4a8e27878536fdff7518d3e062a72761c552f624ebba5a5f", size = 14304696, upload-time = "2024-09-29T11:39:46.724Z" }, + { url = "https://files.pythonhosted.org/packages/11/b3/de6ed93ae483cf3bac8f950a955e83f7ffe59651fd804d100fff65d66d6c/pygame-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da3ad64d685f84a34ebe5daacb39fff14f1251acb34c098d760d63fee768f50c", size = 13977684, upload-time = "2024-09-29T11:39:49.921Z" }, + { url = "https://files.pythonhosted.org/packages/d3/05/d86440aa879708c41844bafc6b3eb42c6d8cf54082482499b53139133e2a/pygame-2.6.1-cp310-cp310-win32.whl", hash = "sha256:9dd5c054d4bd875a8caf978b82672f02bec332f52a833a76899220c460bb4b58", size = 10251775, upload-time = "2024-09-29T11:40:34.952Z" }, + { url = "https://files.pythonhosted.org/packages/38/88/8de61324775cf2c844a51d8db14a8a6d2a9092312f27678f6eaa3a460376/pygame-2.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:00827aba089355925902d533f9c41e79a799641f03746c50a374dc5c3362e43d", size = 10618801, upload-time = "2024-09-29T12:13:25.284Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/ca/8f367cb9fe734c4f6f6400e045593beea2635cd736158f9fabf58ee14e3c/pygame-2.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:20349195326a5e82a16e351ed93465a7845a7e2a9af55b7bc1b2110ea3e344e1", size = 13113753, upload-time = "2024-09-29T14:26:13.751Z" }, + { url = "https://files.pythonhosted.org/packages/83/47/6edf2f890139616b3219be9cfcc8f0cb8f42eb15efd59597927e390538cb/pygame-2.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f3935459109da4bb0b3901da9904f0a3e52028a3332a355d298b1673a334cf21", size = 12378146, upload-time = "2024-09-29T14:26:22.456Z" }, + { url = "https://files.pythonhosted.org/packages/00/9e/0d8aa8cf93db2d2ee38ebaf1c7b61d0df36ded27eb726221719c150c673d/pygame-2.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c31dbdb5d0217f32764797d21c2752e258e5fb7e895326538d82b5f75a0cd856", size = 13611760, upload-time = "2024-09-29T11:10:47.317Z" }, + { url = "https://files.pythonhosted.org/packages/d7/9e/d06adaa5cc65876bcd7a24f59f67e07f7e4194e6298130024ed3fb22c456/pygame-2.6.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:173badf82fa198e6888017bea40f511cb28e69ecdd5a72b214e81e4dcd66c3b1", size = 14298054, upload-time = "2024-09-29T11:39:53.891Z" }, + { url = "https://files.pythonhosted.org/packages/7a/a1/9ae2852ebd3a7cc7d9ae7ff7919ab983e4a5c1b7a14e840732f23b2b48f6/pygame-2.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce8cc108b92de9b149b344ad2e25eedbe773af0dc41dfb24d1f07f679b558c60", size = 13977107, upload-time = "2024-09-29T11:39:56.831Z" }, + { url = "https://files.pythonhosted.org/packages/31/df/6788fd2e9a864d0496a77670e44a7c012184b7a5382866ab0e60c55c0f28/pygame-2.6.1-cp311-cp311-win32.whl", hash = "sha256:811e7b925146d8149d79193652cbb83e0eca0aae66476b1cb310f0f4226b8b5c", size = 10250863, upload-time = "2024-09-29T11:44:48.199Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/55/ca3eb851aeef4f6f2e98a360c201f0d00bd1ba2eb98e2c7850d80aabc526/pygame-2.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:91476902426facd4bb0dad4dc3b2573bc82c95c71b135e0daaea072ed528d299", size = 10622016, upload-time = "2024-09-29T12:17:01.545Z" }, + { url = "https://files.pythonhosted.org/packages/92/16/2c602c332f45ff9526d61f6bd764db5096ff9035433e2172e2d2cadae8db/pygame-2.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:4ee7f2771f588c966fa2fa8b829be26698c9b4836f82ede5e4edc1a68594942e", size = 13118279, upload-time = "2024-09-29T14:26:30.427Z" }, + { url = "https://files.pythonhosted.org/packages/cd/53/77ccbc384b251c6e34bfd2e734c638233922449a7844e3c7a11ef91cee39/pygame-2.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c8040ea2ab18c6b255af706ec01355c8a6b08dc48d77fd4ee783f8fc46a843bf", size = 12384524, upload-time = "2024-09-29T14:26:49.996Z" }, + { url = "https://files.pythonhosted.org/packages/06/be/3ed337583f010696c3b3435e89a74fb29d0c74d0931e8f33c0a4246307a9/pygame-2.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c47a6938de93fa610accd4969e638c2aebcb29b2fca518a84c3a39d91ab47116", size = 13587123, upload-time = "2024-09-29T11:10:50.072Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ca/b015586a450db59313535662991b34d24c1f0c0dc149cc5f496573900f4e/pygame-2.6.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:33006f784e1c7d7e466fcb61d5489da59cc5f7eb098712f792a225df1d4e229d", size = 14275532, upload-time = "2024-09-29T11:39:59.356Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f2/d31e6ad42d657af07be2ffd779190353f759a07b51232b9e1d724f2cda46/pygame-2.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1206125f14cae22c44565c9d333607f1d9f59487b1f1432945dfc809aeaa3e88", size = 13952653, upload-time = "2024-09-29T11:40:01.781Z" }, + { url = 
"https://files.pythonhosted.org/packages/f3/42/8ea2a6979e6fa971702fece1747e862e2256d4a8558fe0da6364dd946c53/pygame-2.6.1-cp312-cp312-win32.whl", hash = "sha256:84fc4054e25262140d09d39e094f6880d730199710829902f0d8ceae0213379e", size = 10252421, upload-time = "2024-09-29T11:14:26.877Z" }, + { url = "https://files.pythonhosted.org/packages/5f/90/7d766d54bb95939725e9a9361f9c06b0cfbe3fe100aa35400f0a461a278a/pygame-2.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:3a9e7396be0d9633831c3f8d5d82dd63ba373ad65599628294b7a4f8a5a01a65", size = 10624591, upload-time = "2024-09-29T11:52:54.489Z" }, + { url = "https://files.pythonhosted.org/packages/e1/91/718acf3e2a9d08a6ddcc96bd02a6f63c99ee7ba14afeaff2a51c987df0b9/pygame-2.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae6039f3a55d800db80e8010f387557b528d34d534435e0871326804df2a62f2", size = 13090765, upload-time = "2024-09-29T14:27:02.377Z" }, + { url = "https://files.pythonhosted.org/packages/0e/c6/9cb315de851a7682d9c7568a41ea042ee98d668cb8deadc1dafcab6116f0/pygame-2.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2a3a1288e2e9b1e5834e425bedd5ba01a3cd4902b5c2bff8ed4a740ccfe98171", size = 12381704, upload-time = "2024-09-29T14:27:10.228Z" }, + { url = "https://files.pythonhosted.org/packages/9f/8f/617a1196e31ae3b46be6949fbaa95b8c93ce15e0544266198c2266cc1b4d/pygame-2.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27eb17e3dc9640e4b4683074f1890e2e879827447770470c2aba9f125f74510b", size = 13581091, upload-time = "2024-09-29T11:30:27.653Z" }, + { url = "https://files.pythonhosted.org/packages/3b/87/2851a564e40a2dad353f1c6e143465d445dab18a95281f9ea458b94f3608/pygame-2.6.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c1623180e70a03c4a734deb9bac50fc9c82942ae84a3a220779062128e75f3b", size = 14273844, upload-time = "2024-09-29T11:40:04.138Z" }, + { url = 
"https://files.pythonhosted.org/packages/85/b5/aa23aa2e70bcba42c989c02e7228273c30f3b44b9b264abb93eaeff43ad7/pygame-2.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef07c0103d79492c21fced9ad68c11c32efa6801ca1920ebfd0f15fb46c78b1c", size = 13951197, upload-time = "2024-09-29T11:40:06.785Z" }, + { url = "https://files.pythonhosted.org/packages/a6/06/29e939b34d3f1354738c7d201c51c250ad7abefefaf6f8332d962ff67c4b/pygame-2.6.1-cp313-cp313-win32.whl", hash = "sha256:3acd8c009317190c2bfd81db681ecef47d5eb108c2151d09596d9c7ea9df5c0e", size = 10249309, upload-time = "2024-09-29T11:10:23.329Z" }, + { url = "https://files.pythonhosted.org/packages/7e/11/17f7f319ca91824b86557e9303e3b7a71991ef17fd45286bf47d7f0a38e6/pygame-2.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:813af4fba5d0b2cb8e58f5d95f7910295c34067dcc290d34f1be59c48bd1ea6a", size = 10620084, upload-time = "2024-09-29T11:48:51.587Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -4928,6 +4983,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/1f/b876b1f83aef204198a42dc101613fefccb32258e5428b5f9259677864b4/starlette-0.47.2-py3-none-any.whl", hash = "sha256:c5847e96134e5c5371ee9fac6fdf1a67336d5815e09eb2a01fdb57a351ef915b", size = 72984, upload-time = "2025-07-20T17:31:56.738Z" }, ] +[[package]] +name = "swig" +version = "4.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/d2/f7298d9e970531ea0078332e3f7813ee0133303e365d0940a83438a296d3/swig-4.3.1.tar.gz", hash = "sha256:bbb43485d120d3fd2c979f258f81eae78274f83ba3767d5b3fe376ac70504934", size = 25741, upload-time = "2025-04-19T19:50:59.37Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/fb/961c3ed626f2019eff345e7a178e49943890cf4615f7b543b01cce1228fc/swig-4.3.1-py3-none-macosx_10_9_universal2.whl", hash = "sha256:3d34c3fd96c5c288881a25418df06d814aa09e734bc32af5cd92e9217841b5f6", size = 2590974, upload-time = 
"2025-04-19T19:50:30.456Z" }, + { url = "https://files.pythonhosted.org/packages/2a/b7/36eef269f4d2ed7cd9a4c0ebe5c86cc548165a5c6c157ad9bba6962a385f/swig-4.3.1-py3-none-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d84b3e31d943d81b28bd4144dcf5271909ad2313f0f2afbd7f2fb37ef2a6d8bb", size = 1978283, upload-time = "2025-04-19T19:50:32.51Z" }, + { url = "https://files.pythonhosted.org/packages/36/5a/b2827a74526f579a9d836e0f098df3d07a050cec6150f7c58a8b6c347787/swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5f31b7e815b76b42cc503322ad88e6eb3ebb0bb0b91044445c8a31b5b4aa4664", size = 1896227, upload-time = "2025-04-19T19:50:34.726Z" }, + { url = "https://files.pythonhosted.org/packages/7f/b8/e674e91c1288991505479a4fb0d8af44e939426e1494455a634e35c525fb/swig-4.3.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5391080a3785b0505eb211af1cbb1f1e3838e5bb1e54f740a9d7ba2e385c879", size = 1920786, upload-time = "2025-04-19T19:50:36.798Z" }, + { url = "https://files.pythonhosted.org/packages/9b/d0/1ddf2870f84264052c8760bf93268c5e2a694f2da33964a212fa9db5d8fe/swig-4.3.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4ffde3e87cd2a764495a516751c2c3c301f8b237aba2ac3963f786ff59b7f68", size = 2067717, upload-time = "2025-04-19T19:50:38.968Z" }, + { url = "https://files.pythonhosted.org/packages/d9/7e/484a883f002c050b67d49e1b23186b36d9641c50c8acc3bf986005e72f89/swig-4.3.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:444b11d8ee27aa64ac83e827dbeb724aa0cfb1062c20ecbb88180bffa39d5dc3", size = 1913731, upload-time = "2025-04-19T19:50:41.071Z" }, + { url = "https://files.pythonhosted.org/packages/a5/75/cd152ad55c53c2ad977fd31e43cde465401f1efb46ed9ec8d4c594c592c1/swig-4.3.1-py3-none-manylinux_2_24_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:54fd71196e1999fd0e204c8fa5ab39d472eb8831f30a951d7115c82d07bdf5f3", size = 1916596, upload-time = "2025-04-19T19:50:43.3Z" }, 
+ { url = "https://files.pythonhosted.org/packages/24/1f/a1e3362bdc3fbb227ee8db48aeaea11ea83dce3a41b8adcfdccea601d5f5/swig-4.3.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:19936cb924e7c86f207bf4e12e00c930342068fcb7073dcc9c8b49bd7a2c7389", size = 2947297, upload-time = "2025-04-19T19:50:45.487Z" }, + { url = "https://files.pythonhosted.org/packages/69/b9/1cbadf2935b96eb5c5aa895aff676d43d21094fb4f5b69afb18766f5a167/swig-4.3.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:07082c2f8693f83ba136136e54e92a5af014488ca4f2a3de4b471337c00d92aa", size = 2783962, upload-time = "2025-04-19T19:50:47.751Z" }, + { url = "https://files.pythonhosted.org/packages/00/03/41b240524c56cd9eb01ba880fdffa3c5a28700594ff3ce09ac10fdf7edef/swig-4.3.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:86600ddde81e24f6fa989920784d72c3ca7ca6a7583fe74b4c5c80076dddd0a5", size = 3270207, upload-time = "2025-04-19T19:50:50.06Z" }, + { url = "https://files.pythonhosted.org/packages/79/d0/355910dfee9fc96a4e1719f375b5a5b2ed8a46ba2470e9f9cb230e3ed101/swig-4.3.1-py3-none-musllinux_1_2_ppc64le.whl", hash = "sha256:fc496c0d600cf1bb2d91e28d3d6eae9c4301e5ea7a0dec5a4281b5efed4245a8", size = 3180711, upload-time = "2025-04-19T19:50:52.219Z" }, + { url = "https://files.pythonhosted.org/packages/40/ef/c0fd6af515771abfa338dce0c0c0419aab2f776456d195f1a79e5b6f3328/swig-4.3.1-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:0743063399e373b17d658481f4cd327245ef58a1d17a3e2071de88dec60082fc", size = 3194288, upload-time = "2025-04-19T19:50:54.046Z" }, + { url = "https://files.pythonhosted.org/packages/bb/a0/15b41ab8e522c39ffe151d859e5faad5b86afd18030107b1c594a85a6878/swig-4.3.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7687a1c1b6c3033f75b753d638cac967e3f6011c04fb25ab405cf9086ecf8d4c", size = 3057285, upload-time = "2025-04-19T19:50:56.183Z" }, + { url = "https://files.pythonhosted.org/packages/28/82/bb4c482352bbf50e1c595ddf3ed699a9265257ca0093d0bb65f28aa52a19/swig-4.3.1-py3-none-win_amd64.whl", hash 
= "sha256:efec16327029f682f649a26da726bb0305be8800bd0f1fa3e81bf0769cf5b476", size = 2566912, upload-time = "2025-04-19T19:50:57.849Z" }, +] + [[package]] name = "sympy" version = "1.14.0"