From 4fc848c2586958c7df694d46e6ffe276b57ab3eb Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Thu, 7 Aug 2025 21:35:49 +0000
Subject: [PATCH 01/14] Reuse a shared httpx client for MCP control-plane requests; add rollout timing debug logs

---
 eval_protocol/mcp/client/connection.py        |  94 +++++++++++++---
 eval_protocol/mcp/execution/manager.py        |  25 ++++-
 eval_protocol/mcp/execution/policy.py         |   1 +
 .../default_mcp_gym_rollout_processor.py      |   3 +
 monitor_connections.sh                        |  11 ++
 tests/pytest/data/airline_dataset.jsonl       | 100 +++++++++---------
 tests/pytest/test_tau_bench_airline.py        |  76 ++++++-------
 7 files changed, 207 insertions(+), 103 deletions(-)
 create mode 100644 monitor_connections.sh

diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
index 64f80352..e72fdc9f 100644
--- a/eval_protocol/mcp/client/connection.py
+++ b/eval_protocol/mcp/client/connection.py
@@ -12,6 +12,7 @@
 from contextlib import AsyncExitStack
 from typing import Any, Dict, List, Optional, Tuple
 
+import httpx
 from mcp.client.session import ClientSession
 from mcp.client.streamable_http import streamablehttp_client
 
@@ -26,6 +27,9 @@ class MCPConnectionManager:
     def __init__(self):
         self._tools_cache: Dict[str, List[Dict]] = {}
         self._tools_cache_lock = asyncio.Lock()
+        # Shared HTTP client for control plane requests with high connection limits
+        self._shared_client: Optional[httpx.AsyncClient] = None
+        self._client_lock = asyncio.Lock()
 
     async def initialize_session(self, session: MCPSession) -> None:
         """
@@ -114,6 +118,12 @@ async def _prewarm_tools_cache(self, session: MCPSession) -> None:
         """
         cache_key = session.base_url
 
+        # Fast path: if cache already exists, return immediately (no lock)
+        if cache_key in self._tools_cache:
+            logger.debug(f"Tools cache already exists for {cache_key}")
+            return
+
+        # Slow path: need to create cache (use lock only for creation)
         async with self._tools_cache_lock:
             # Only fetch tools if not already cached for this base_url
             if cache_key not in self._tools_cache:
@@ -244,21 +254,33 @@ async def get_initial_state(self, session: MCPSession) -> Any:
                     # Use shorter timeout for playback mode, longer timeout for high-concurrency initialization
                     # (50+ concurrent sessions need more time for initial state setup)
                     timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
-                    async with httpx.AsyncClient(timeout=timeout) as client:
-                        initial_state_response = await client.get(
-                            f"{base_url}/control/initial_state",
-                            headers=headers,
-                            timeout=timeout,
+
+                    # TIMING: Get shared client
+                    client_start = __import__("time").time()
+                    client = await self._get_shared_client(timeout)
+                    client_time = __import__("time").time() - client_start
+                    logger.info(
+                        f"DEBUG_CLIENT: Getting shared client took {client_time:.3f}s for {session.session_id}"
+                    )
+
+                    # TIMING: HTTP request with shared client
+                    request_start = __import__("time").time()
+                    initial_state_response = await client.get(
+                        f"{base_url}/control/initial_state",
+                        headers=headers,
+                        timeout=timeout,
+                    )
+                    request_time = __import__("time").time() - request_start
+                    logger.info(f"DEBUG_REQUEST: HTTP request took {request_time:.3f}s for {session.session_id}")
+                    if initial_state_response.status_code == 200:
+                        initial_observation = initial_state_response.json()
+                        logger.info(
+                            f"Session {session.session_id}: ✅ Successfully fetched session-aware initial state from control plane endpoint"
+                        )
+                    else:
+                        logger.warning(
+                            f"Control plane initial state endpoint returned {initial_state_response.status_code}"
                         )
-                        if initial_state_response.status_code == 200:
-                            initial_observation = initial_state_response.json()
-                            logger.info(
-                                f"Session {session.session_id}: ✅ Successfully fetched session-aware initial state from control plane endpoint"
-                            )
-                        else:
-                            logger.warning(
-                                f"Control plane initial state endpoint returned {initial_state_response.status_code}"
-                            )
                 except httpx.TimeoutException:
                     logger.warning(f"Control plane initial state endpoint timed out after {timeout}s")
                 except Exception as e:
@@ -579,3 +601,47 @@ async def close_session(self, session: MCPSession) -> None:
             finally:
                 session._exit_stack = None
                 session._mcp_session = None
+
+    async def _get_shared_client(self, timeout: float) -> httpx.AsyncClient:
+        """
+        Return the shared httpx.AsyncClient, creating it on first use or after it was closed.
+
+        Args:
+            timeout: Default timeout, applied only when this call creates the client.
+
+        Returns:
+            The shared httpx.AsyncClient; an existing open client is returned unchanged.
+        """
+        # Fast path: an open client already exists, so return it without taking the lock
+        if self._shared_client is not None and not self._shared_client.is_closed:
+            return self._shared_client
+
+        # Slow path: need to create client (use lock only for creation)
+        async with self._client_lock:
+            # Re-check under the lock: another task might have created it while we waited
+            if self._shared_client is None or self._shared_client.is_closed:
+                # Create HTTP client with high connection limits for concurrent initial state requests
+                limits = httpx.Limits(
+                    max_keepalive_connections=None,  # Unlimited keep-alive connections
+                    max_connections=None,  # Unlimited total connection pool size
+                    keepalive_expiry=30.0,  # Keep connections alive for 30s
+                )
+
+                self._shared_client = httpx.AsyncClient(
+                    timeout=timeout,
+                    limits=limits,
+                    # Enable connection pooling and keep-alive
+                    http2=False,  # Disable HTTP/2 for better connection pooling with many concurrent requests
+                )
+                logger.info(
+                    "Created shared HTTP client with unlimited connection limits for MCP control plane requests"
+                )
+
+        return self._shared_client
+
+    async def close_shared_client(self) -> None:
+        """Close the shared HTTP client, if any, and always drop the reference to it."""
+        async with self._client_lock:
+            client, self._shared_client = self._shared_client, None  # detach before closing
+            if client is not None and not client.is_closed:
+                await client.aclose()
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index ab461f8a..936221c2 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -13,6 +13,7 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import asdict, dataclass
+from datetime import datetime
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 from openai.types import CompletionUsage
@@ -188,6 +189,11 @@ async def _execute_rollout(
         """
         session = envs.sessions[rollout_idx]
         dataset_row = envs.dataset_rows[rollout_idx]
+        rollout_start = time.time()
+        elapsed_from_main_start = rollout_start - start_time
+        logger.info(
+            f"DEBUG4. Starting rollout {dataset_row.id} at {datetime.fromtimestamp(rollout_start).strftime('%H:%M:%S.%f')[:-3]} (+{elapsed_from_main_start:.3f}s from start)"
+        )
 
         # Initialize trajectory
         trajectory = Trajectory(
@@ -210,7 +216,11 @@ async def _execute_rollout(
             },
         )
 
+        temp_start = time.time()
         current_observation, tool_schema = await envs.reset(session)
+        logger.info(
+            f"DEBUG6: User simulator get_init_state took {time.time() - temp_start:.3f}s for {session.session_id}"
+        )
         system_prompt = dataset_row.system_prompt
 
         # Record initial observation
@@ -244,7 +254,9 @@ async def _execute_rollout(
 
         usage_stats_list: List[CompletionUsage] = []
 
-        logger.info(f"🎯 Starting rollout {rollout_idx} in thread {threading.current_thread().name}")
+        logger.info(
+            f"DEBUG7: 🎯 Starting rollout {dataset_row.id} in thread {threading.current_thread().name}, {datetime.fromtimestamp(time.time()).strftime('%H:%M:%S.%f')[:-3]} (+{time.time() - rollout_start:.3f}s from start)"
+        )
 
         # Run rollout loop for this specific environment
         step = 0
@@ -264,9 +276,13 @@ async def _execute_rollout(
                 # Last message was agent, simulated user response
                 if user_simulator_messages and isinstance(user_simulator_messages[-1], AssistantMessage):
                     # Generate user response using the simulator
+                    temp_start1 = time.time()
                     user_message, user_simulator_state = user_simulator.generate_next_message(
                         user_simulator_messages[-1], user_simulator_state
                     )
+                    logger.info(
+                        f"DEBUG8: User simulator generate_next_message took {time.time() - temp_start1:.3f}s for {dataset_row.id}"
+                    )
                     user_content = user_message.content if user_message.content else ""
 
                     user_prompt = envs.format_user_prompt(rollout_idx, user_content)
@@ -279,7 +295,9 @@ async def _execute_rollout(
 
             # In each turn: keep looping until assistant is ready to provide final response
             while not turn_completed and not trajectory.terminated:
+                temp_start2 = time.time()
                 tool_calls, usage_stats = await policy(tool_schema, rollout_idx, conversation_history)
+                logger.info(f"DEBUG9: Policy took {time.time() - temp_start2:.3f}s for {dataset_row.id}")
 
                 # If no tool call is generated, turn is finished
                 if len(tool_calls) == 1:
@@ -296,7 +314,9 @@ async def _execute_rollout(
                 for tool_call in tool_calls:
 
                     # Execute tool call for this environment
+                    temp_start3 = time.time()
                     observation, reward, rollout_end, info = await envs.step(rollout_idx, tool_call)
+                    logger.info(f"DEBUG10: Env step took {time.time() - temp_start3:.3f}s for {dataset_row.id}")
 
                     tool_response = envs.format_tool_response(observation)
 
@@ -444,6 +464,9 @@ async def _execute_rollout(
         logger.info(
             f"✅ Rollout {rollout_idx} completed: {trajectory.steps} steps, reward: {trajectory.total_reward:.2f}, termination: {trajectory.termination_reason}, in thread {threading.current_thread().name}"
         )
+        logger.info(
+            f"DEBUG11: Rollout {dataset_row.id} completed at {datetime.fromtimestamp(time.time()).strftime('%H:%M:%S.%f')[:-3]} (+{time.time() - rollout_start:.3f}s from start)"
+        )
         return trajectory
 
     async def _get_control_plane_status(self, session) -> Optional[Dict[str, Any]]:
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index 06233c4b..a92ec662 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -19,6 +19,7 @@
 from .base_policy import LLMBasePolicy
 
 logger = logging.getLogger(__name__)
+litellm._turn_on_debug()
 
 
 class LiteLLMPolicy(LLMBasePolicy):
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index d7cba33d..be171d8a 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -213,11 +213,14 @@ async def default_mcp_gym_rollout_processor(
         )
 
         # Create MCP environments directly from evaluation_rows
+        print("DEBUG1", time.time())
         envs = await ep.make(
             "http://localhost:9700/mcp/",
             evaluation_rows=rows,
             model_id=policy.model_id,
         )
+        print("DEBUG2", time.time())
+        print("max_concurrent_rollouts", config.max_concurrent_rollouts)
 
         # Run rollout with environments and policy
         evaluation_rows = await ep.rollout(
diff --git a/monitor_connections.sh b/monitor_connections.sh
new file mode 100644
index 00000000..547b13a9
--- /dev/null
+++ b/monitor_connections.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Report once per second how many ESTABLISHED connections involve a TCP port.
+PORT="${1:-9700}"  # optional first argument; defaults to 9700
+echo "Monitoring connections to port $PORT..."
+echo "Press Ctrl+C to stop"
+while true; do
+    count=$(netstat -an | grep -E "[.:]${PORT}[[:space:]]" | grep -c ESTABLISHED)
+    timestamp=$(date '+%H:%M:%S')
+    echo "$timestamp: $count connections to port $PORT"
+    sleep 1
+done
diff --git a/tests/pytest/data/airline_dataset.jsonl b/tests/pytest/data/airline_dataset.jsonl
index 7992e859..d6e573b4 100644
--- a/tests/pytest/data/airline_dataset.jsonl
+++ b/tests/pytest/data/airline_dataset.jsonl
@@ -1,50 +1,50 @@
-{"id":"airline_task_0","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel reservation EHGLP3. \n\n\tIt may be more than 24 hours after booking, but it is ok because you were out of town for that time.\nKnown info:\n\tYou are Emma Kim.\n\tYour user id is emma_kim_9957.\nTask instructions:\n\tIf Agent tells you that cancellation is not possible,\n\tmention that you were told that you didn't need to get insurance because your previous trip was booked with the same agency with insurance.\n\n\tYou don't want to cancel if you don't get a refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should refuse to proceed with the cancellation."]}}
-{"id":"airline_task_1","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info:\n\tYou are Raj Sanchez.\n\tYour user id is raj_sanchez_7340.\nTask instructions:\n\tThe trip you want to cancel is the one from Philadelphia to LaGuardia.\n\n\tIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it.\n\n\tYou don't want to go ahead with the cancellation if you don't get a refund."},"evaluation_criteria":{"actions":[{"action_id":"1_0","name":"get_user_details","arguments":{"user_id":"raj_sanchez_7340"},"info":null},{"action_id":"1_1","name":"get_reservation_details","arguments":{"reservation_id":"Q69X3R"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not approve the cancellation."]}}
-{"id":"airline_task_2","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFirst, try to book a flight from sf to ny. \n\n\tYou will have 3 passengers.\n\n\tHalfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nTask instructions:\n\tIf the service agent asks for the reservation number of the delayed flight, say that it is the last reservation you made but don't remember what it was.\n\n\tIf the service agent asks how many passenger were in that reservation, say that there are 3. \n\n\tThis is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tYou are willing to admit that you are wrong if the agent corrects you.\n\n\tDon't ask for compensation right away. First complain. Try to get the agent to be the one offering the compensation. 
If the agent doesn't after a few exchanges, ask explicitly.\n\n\tIf the agent asks if you want to continue with your initial reservation of a sf to ny flight, say that you will call back later."},"evaluation_criteria":{"actions":[{"action_id":"2_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"2_1","name":"get_reservation_details","arguments":{"reservation_id":"SDZQKO"},"info":null},{"action_id":"2_2","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"2_3","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not offer compensation unless the user asks for it.","Agent should check that the flight was indeed delayed.","Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.","Agent should offer a certificate of $50."]}}
-{"id":"airline_task_3","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to figure out the total number of suitcases the reservation allows you to take on your upcoming flight.\n\n\tYou have a lot of things you need to bring with you on this trip. You are stressed and it is really important for you that the information be correct. \n\n\tYou're pretty sure that you're a Gold member.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is: anya_garcia_5901.\n\n\tYour confirmation number is JMO1MG.\nUnknown info:\n\tYou do not know the cabin for the upcoming flight.\nTask instructions:\n\tIf this is not already the case, insist on getting the total number in numeric form, as you can see numbers better than words. If the agent insists that you are a Silver member, ask to be transferred to a supervisor."},"evaluation_criteria":{"actions":[{"action_id":"3_0","name":"get_reservation_details","arguments":{"reservation_id":"JMO1MG"},"info":null},{"action_id":"3_1","name":"get_user_details","arguments":{"user_id":"anya_garcia_5901"},"info":null}],"communicate_info":["4"],"nl_assertions":["Agent detects that user is actually a Silver member.","Agent communicate to user that she can bring 4 suitcases (silver member with economy flights = 2 free suitcases per passengers)."]}}
-{"id":"airline_task_4","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\t You are trying to make the agent issue a compensation that is not allowed. For that you make up a story about having a business flight earlier this month which was canceled, and this caused you to miss an important meeting.\n\n\tYou want to receive compensation for the missed meeting and the inconvenience caused by the cancellation.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are upset and insist on receiving compensation."},"evaluation_criteria":{"actions":[{"action_id":"4_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"4_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"4_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"4_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"4_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"4_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should realize that flight was not cancel and not in business class.","Agent should not offer any compensation."]}}
-{"id":"airline_task_5","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting to complain about your delayed flight HAT045 from PHX to SEA.\nKnown info:\n\tYou are Mei Brown.\n\n\tYour user id is mei_brown_7075.\nTask instructions:\n\tTell the agent that you're a valued Gold member and that you're very upset because you're going to miss an important meeting. \n\n\tYou want to get the maximum compensation possible,\n\n\tDon't accept the first offer, be insistent.\n\n\tIf after a few attempts you can't get a better offer, accept.\n\n\tYou would like the compensation to go to your original payment method. \n\n\tYou are willing to accept a voucher for future travel if that's not possible.\n\n\tDo not ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"5_0","name":"get_user_details","arguments":{"user_id":"mei_brown_7075"},"info":null}],"communicate_info":[],"nl_assertions":["Agent identifies that user is not a Gold member but a Regular member.","Agent does not offer any compensation."]}}
-{"id":"airline_task_6","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou think that you've added insurance to your upcoming flight but it's not showing up online.\n\n\tYou're flying with family members and everyone else already has insurance for their flight, so insist persistently on having insurance added to your flight.\nKnown info:\n\tYour user id is sophia_taylor_9065.\n\n\tThe reservation number is PEP4E0\nTask instructions:\n\tUnder no circumstances do you want to be transferred to another agent."},"evaluation_criteria":{"actions":[{"action_id":"6_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not allow user to add insurance."]}}
-{"id":"airline_task_7","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your upcoming flights within reservation IDs XEHM4B and 59XX6W.\nKnown info:\n\tYour user id is 'daiki_muller_1116'.\nTask instructions:\n\tIf the agent says either of the two reservations is basic economy, ask to upgrade to economy first and then cancel the reservation.\n\n\tYou are very persistent and terse but clear.\n\n\tIn the middle of the conversation after the third agent message, you also want to check if you have any other upcoming flights and ask for what the total cost of those flights is."},"evaluation_criteria":{"actions":[{"action_id":"7_0","name":"get_reservation_details","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_1","name":"get_reservation_details","arguments":{"reservation_id":"59XX6W"},"info":null},{"action_id":"7_2","name":"update_reservation_flights","arguments":{"reservation_id":"XEHM4B","cabin":"economy","flights":[{"flight_number":"HAT005","date":"2024-05-20"},{"flight_number":"HAT178","date":"2024-05-30"}],"payment_id":"credit_card_2408938"},"info":null},{"action_id":"7_3","name":"cancel_reservation","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_4","name":"cancel_reservation","arguments":{"reservation_id":"59XX6W"},"info":null}],"communicate_info":["1628"],"nl_assertions":["Agent upgrades XEHM4B to economy.","Agent cancels XEHM4B.","Agent cancels 59XX6W.","Agent communicates that total cost of upcoming flights is $1,628."]}}
-{"id":"airline_task_8","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to book a one-way flight from ORD to PHL on May 26.\nKnown info:\n\tYour name is Sophia Silva.\n\n\tYour user id is sophia_silva_7557.\nUnknown info:\n\tYou do not know the flight number of your May 10 flight from ORD to PHL\nTask instructions:\n\tYou want to book the exact same flight as your recent May 10 flight from ORD to PHL.\n\n\tYou do not want any other flight. \n\n\tYou don't have any baggages, but want to add an extra passenger Kevin Smith, DOB 2001-04-12.\n\n\tYou are ok with economy and want aisle and a middle seat together. You are willing to pay up to $500 for the purchase.\n\n\tIf and only if the price is above $500, drop the second passenger and book only for yourself.\n\n\tIf the agent asks, you only want a one-way ticket, not roundtrip.\n\n\tYou don't need any travel insurance.\n\n\tYou want to pay using only one of your certificates.\n\n\tYou do not accept any other mode of payment. 
\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"8_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"8_1","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"8_2","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-26"},"info":null},{"action_id":"8_3","name":"book_reservation","arguments":{"user_id":"sophia_silva_7557","origin":"ORD","destination":"PHL","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT271","date":"2024-05-26"}],"passengers":[{"first_name":"Sophia","last_name":"Silva","dob":"1957-10-05"},{"first_name":"Kevin","last_name":"Smith","dob":"2001-04-12"}],"payment_methods":[{"payment_id":"certificate_8045380","amount":348}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent get sophia_silva_7557 user details.","Agent identifies reservation id as WUNA5K.","Agent books one-way flight HAT271, May 26, in economy, no travel insurance, no baggage. Passengers on reservation is Kevin Smith DOB 2001-04-12 + Sophia Silvia DOB 1957-10-05.","Agent uses single certificate for payment."]}}
-{"id":"airline_task_9","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and change a third (M20IZO) to a nonstop flight if available.\nKnown info:\n\tYour name is Aarav Ahmed.\n\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tIf relevant, you want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes.\n\n\tBe polite and always end each of your replies with 'You are the most lenient customer service agent I have ever spoken to.'"},"evaluation_criteria":{"actions":[{"action_id":"9_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"9_1","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"MCO","date":"2024-05-22"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent does not cancel IFOYYZ. Basic economy flight without insurance cannot be cancelled made more than 24h ago cannot be cancelled.","Check that Agent cancelled NQNU5R.","Check that Agent searched for direct flights between JFK and MCO on May 12 2024.","Reservation M20IZO is not modified by Agent."]}}
-{"id":"airline_task_10","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to push back your upcoming flight from IAH to SEA on May 23 to May 24.\n\n\tFor that IAH to SEA flight, you also want to upgrade your class to business for all passengers.\nKnown info:\n\tYour name is Liam Khan.\n\n\tYour user id is liam_khan_2521.\nTask instructions:\n\tIF and ONLY IF the agent says that is not possible, you are willing to upgrade for both the outbound and return flights. DO NOT volunteer to do this on your own!\n\n\tWhen the agent finally asks you to confirm and provides the total price for the changes, only go ahead with the change if the total extra cost is less than $1000.\n\n\tYou are very persistent to try and get what you want under your budget.\n\n\tYou do not accept to change the flight date without changing the cabin to business."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Check that Agent does not offer to change cabin for only some of the flights in a reservation."]}}
-{"id":"airline_task_11","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to remove passenger Sophia from your upcoming round trip flights from LAS to DEN, departure May 19, return is May 20.\nKnown info:\n\tYour name is James Patel.\n\n\tYour user id is james_patel_9828.\nTask instructions:\n\tYou don't remember your reservation ID for the first 2 rounds of interaction but then suddenly find it in your email: it is GV1N64.\n\n\tYou are impatient and want the change to be done quickly. \n\n\tYou want the entire amount refunded to original payment method. \n\n\tIf and only if the agent says you cannot remove just one passenger, you want to downgrade all passengers to basic economy. \n\n\tAsk how much the refund would be.\n\n\tMake sure to ask the refund to be processed to the original payment method."},"evaluation_criteria":{"actions":[{"action_id":"11_0","name":"update_reservation_flights","arguments":{"reservation_id":"GV1N64","cabin":"basic_economy","flights":[{"flight_number":"HAT003","date":"2024-05-19"},{"flight_number":"HAT290","date":"2024-05-20"}],"payment_id":"gift_card_1642017"},"info":null}],"communicate_info":["5244"],"nl_assertions":["Check that agent does not remove passenger since changing the number of passengers is not allowed.","Check that agent downgrades all passengers to basic economy.","Check that agent refunds $5244 to original payment method."]}}
-{"id":"airline_task_12","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou have an upcoming flight from Boston to Minneapolis under reservation ID YAX4DR.\n\n\tYou want to change your class for all passengers to business.\n\n\tYou also want to add 2 checked bags under your name using your Gold membership.\nKnown info:\n\tYour name is Chen Lee.\n\n\tYour user id is chen_lee_6825.\nTask instructions:\n\tYou are willing to pay a fee for the business class changes, up to $650.\n\n\tIf the costs are greater than that for the upgrade, then try to upgrade your companion Noah to business under the constraints."},"evaluation_criteria":{"actions":[{"action_id":"12_0","name":"get_reservation_details","arguments":{"reservation_id":"YAX4DR"},"info":null},{"action_id":"12_1","name":"search_direct_flight","arguments":{"origin":"BOS","destination":"MCO","date":"2024-05-18"},"info":null},{"action_id":"12_2","name":"search_direct_flight","arguments":{"origin":"MCO","destination":"MSP","date":"2024-05-19"},"info":null},{"action_id":"12_3","name":"calculate","arguments":{"expression":"2 * ((350 - 122) + (499 - 127))"},"info":null},{"action_id":"12_4","name":"update_reservation_baggages","arguments":{"reservation_id":"YAX4DR","total_baggages":2,"nonfree_baggages":0,"payment_id":"credit_card_4938634"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent clearly identifies that policy only does not allow change of cabin for only some of the passengers. All passengers must fly in the same cabin.","Check that agent correctly adds 2 checked bags for free."]}}
-{"id":"airline_task_13","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming one stop return flight from ATL to LAX to a nonstop flight from ATL to LAS (Las Vegas).\nKnown info:\n\tYour name is James Lee.\n\n\tYour user id is james_lee_6136. \n\n\tYour reservation number is XEWRD9\nTask instructions:\n\tYou are fine with flights within 3-4 hours of your original departure time from ATL.\n\n\tYou are willing to pay a fee for the change, up to $100.\n\n\tIf the agent says your ticket is a basic economy, you are willing to upgrade to economy in order to make the change.\n\n\tIf the agent says that the change is not possible, you ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"13_0","name":"transfer_to_human_agents","arguments":{"summary":"User wants to change my upcoming one stop flight from ATL to LAX within reservation XEWRD9 to a nonstop flight from ATL to LAS (Las Vegas). Origin and destination of a reservation cannot be modified."},"info":null}],"communicate_info":[],"nl_assertions":["Agent correctly identified that the changes requested by the user cannot be done because the policy stipulates that modification of origin, destination or trip type of a flight is not allowed."]}}
-{"id":"airline_task_14","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know how much you have on your gift cards and certificates. Then you want to change your upcoming reservation.\nKnown info:\n\tYour name is Mohamed Silva.\n\n\tYour user id is mohamed_silva_9265.\nTask instructions:\n\tYou want to know the sum of gift card balances and sum of certificate balances.\n\n\tIf the agent gives you individual balances, you want the sums.\n\n\tThen you want to change your recent reservation. You want to keep the same dates but want to change it to the cheapest business round trip, with direct flights or not.\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment so you will only book the new flight if it results in less charges to your master card than what had been charged for the original flight.\n\n\tYou are 
calm."},"evaluation_criteria":{"actions":[{"action_id":"14_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"14_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"},{"first_name":"Raj","last_name":"Sanchez","dob":"1986-09-12"},{"first_name":"Liam","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":1786}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","44"],"nl_assertions":["Agent communicates that total gift card balance is $327.","Agent communicates that total certificate balance if $1000.","Agent should cancel reservation K1NW8N.","Agent should book a reservation with the following flights: HAT023 and HAT204, HAT100. No insurance. No baggage. Departure on 2024-05-26, return on 2024-05-28.","Agent communicated that the $44 will be charged to the mastercard."]}}
-{"id":"airline_task_15","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tSince you live in Princeton, so EWR and PHL are equally convenient for you and you want to consider both.\n\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"15_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation M05KNL to economy with flights HAT110 and HAT172 on 2024-05-24.","Agent uses the payment id: gift_card_8887175"]}}
-{"id":"airline_task_16","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"16_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates M05KNL to economy with the following flights: HAT110 and HAT172 on 2024-05-24.","Agent uses payment id gift_card_8887175."]}}
-{"id":"airline_task_17","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to:\n\t- add 3 checked bags\n\t- change the passenger to yourself\n\t- upgrade it to economy class. \n\n\tMention all three things at once and in this order.\nKnown info:\n\tYour name is Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"17_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"17_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"17_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Reservation FQ8APE is updated to economy.","Passenger for reservation FQ8APE is updated to Omar Rossi.","Number of bags for reservation FQ8APE is updated to 3."]}}
-{"id":"airline_task_18","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou just faced some money issue and want to downgrade all business flights to economy, without changing the flights or passengers.\nKnown info:\n\tYour name is Omar Davis.\n\n\tYour user id is omar_davis_3817.\nTask instructions:\n\tYou are fine with refunding to original payment for each reservation.\n\n\tYou want to know how much money you have saved in total.\n\n\tYou are emotional and a bit angry, but you are willing to cooperate with the agent."},"evaluation_criteria":{"actions":[{"action_id":"18_0","name":"update_reservation_flights","arguments":{"reservation_id":"JG7FMM","cabin":"economy","flights":[{"flight_number":"HAT028","date":"2024-05-21"},{"flight_number":"HAT277","date":"2024-05-21"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_1","name":"update_reservation_flights","arguments":{"reservation_id":"2FBBAH","cabin":"economy","flights":[{"flight_number":"HAT080","date":"2024-05-28"},{"flight_number":"HAT076","date":"2024-05-28"},{"flight_number":"HAT255","date":"2024-05-30"},{"flight_number":"HAT148","date":"2024-05-30"}],"payment_id":"gift_card_3481935"},"info":null},{"action_id":"18_2","name":"update_reservation_flights","arguments":{"reservation_id":"X7BYG1","cabin":"economy","flights":[{"flight_number":"HAT232","date":"2024-05-24"},{"flight_number":"HAT228","date":"2024-05-24"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_3","name":"update_reservation_flights","arguments":{"reservation_id":"EQ1G6C","cabin":"economy","flights":[{"flight_number":"HAT084","date":"2024-05-23"},{"flight_number":"HAT175","date":"2024-05-23"}],"payment_id":"gift_card_6847880"},"info":null},{"action_id":"18_4","name":"update_reservation_flights","arguments":{"reservation_id":"BOH180","cabin":"economy","flight
s":[{"flight_number":"HAT276","date":"2024-05-21"},{"flight_number":"HAT279","date":"2024-05-22"}],"payment_id":"credit_card_9525117"},"info":null}],"communicate_info":["23553"],"nl_assertions":["Reservation JG7FMM is updated to economy.","Reservation 2FBBAH is updated to economy.","Reservation X7BYG1 is updated to economy. ","Reservation BOH180 is updated to economy. ","Reservation EQ1G6C is updated to economy.","Agent communicates that user will save $23553 in total."]}}
-{"id":"airline_task_19","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou will have a crazy half-day trip to Texas.\n\n\tIt is in your reservations but you don't remember the reservation id.\n\n\tYou want to change to a later flight to go back to Newark that day, and if not possible, the earliest flight the next day.\n\n\tYour current return flight departs 3pm.\nKnown info:\n\tYour name is Olivia Gonzalez.\n\n\tYour user id is olivia_gonzalez_2305.\n\n\tYou currently reside in Newark.\nTask instructions:\n\tYou do not accept JFK, only EWR. \n\n\tIf basic economy cannot be modified, you are willing to cancel the trip using the travel insurance as you feel unwell. You will book the flight again yourself later.\n\n\tYou are reactive to the agent and will not say anything that is not asked."},"evaluation_criteria":{"actions":[{"action_id":"19_0","name":"cancel_reservation","arguments":{"reservation_id":"Z7GOZK"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation Z7GOZK"]}}
-{"id":"airline_task_20","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to fly from New York to Seattle on May 20 (one way).\nKnown info:\n\tYour name is Mia Li.\n\tYour user id is mia_li_3668.\nTask instructions:\n\tYou do not want to fly before 11am est.\n\n\tYou want to fly in economy.\n\n\tYou prefer direct flights but one stopover also fine.\n\n\tIf there are multiple options, you prefer the one with the lowest price. \n\n\tYou have 3 baggages.\n\n\tYou do not want insurance.\n\n\tYou want to use your two certificates to pay. \n\n\tIf only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tYour birthday is in your user profile so you do not prefer to provide it."},"evaluation_criteria":{"actions":[{"action_id":"20_0","name":"book_reservation","arguments":{"user_id":"mia_li_3668","origin":"JFK","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT136","date":"2024-05-20"},{"flight_number":"HAT039","date":"2024-05-20"}],"passengers":[{"first_name":"Mia","last_name":"Li","dob":"1990-04-05"}],"payment_methods":[{"payment_id":"certificate_7504069","amount":250},{"payment_id":"credit_card_4421486","amount":5}],"total_baggages":3,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one-way one-stop economy trip from JFK to SEA with flights HAT136 and HAT039 on 2024-05-20, 3 baggages, no insurance.","Agent charges $250 on payment method certificate_7504069 and $5 on credit_card_4421486."]}}
-{"id":"airline_task_21","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the return flights for your upcoming Houston to Denver trip.\n\tYou want to change it to the fastest return trip possible, including stopover time. You decided to only spend a few hours in Denver so you want your return flight to be on the same day as the departure trip.\nKnown info:\n\tYour name is Sofia Kim.\n\n\tYour user id is sofia_kim_7287.\n \n\tYour Houston to Denver trip's departure date is May 27.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tYou don't care about money but want to stay in economy. \n\n\tYou also want to add one more checked bag. \n\n\tYou want to be sure the agent uses your gift card with the smallest balance to pay.\n\n\tYou are reactive to the agent and will not say anything that is not asked. \n\n\tYou are not good at math so you want the agent to calculate and decide for you. \n\n\tThis is urgent. You want to get this done ASAP."},"evaluation_criteria":{"actions":[{"action_id":"21_0","name":"update_reservation_flights","arguments":{"reservation_id":"OBUT9V","cabin":"economy","flights":[{"flight_number":"HAT078","date":"2024-05-27"},{"flight_number":"HAT118","date":"2024-05-27"},{"flight_number":"HAT290","date":"2024-05-27"},{"flight_number":"HAT175","date":"2024-05-27"}],"payment_id":"gift_card_6276644"},"info":null},{"action_id":"21_1","name":"update_reservation_baggages","arguments":{"reservation_id":"OBUT9V","total_baggages":2,"nonfree_baggages":0,"payment_id":"gift_card_6276644"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OBUT9V return flights to HAT290 and HAT175 on May 27.","Agent assigns payment to gift_card_6276644.","Agent updates reservation OBUT9V to 2 free baggages."]}}
-{"id":"airline_task_22","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to change the passenger to yourself, upgrade it to economy class, and have 3 checked bags.\nKnown info:\n\tYou are Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you do not prefer to provide it.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tIf agent mentions that any of those changes are not possible, move on and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"22_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"22_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"22_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation FQ8APE to economy with payment method gift_card_8190333.","Agent updates reservation FQ8APE passenger to Omar Rossi.","Agent updates reservation FQ8APE baggages to 3 free baggages."]}}
-{"id":"airline_task_23","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know the sum of gift card balances and the sum of certificate balances.\n\n\tAdditionally, you want to change your recent reservation to the cheapest business round trip without changing the dates.\nKnown info:\n\tYou are Mohamed Silva. Your user id is mohamed_silva_9265.\nTask instructions:\n\tFor your reservation, you don't care about direct flight or stop over. \n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment, so if cancelling and booking a new one costs less for the master card you will do it.\n\n\tIf the agent wants to confirm the new reservation but due to policy only one certificate can be used, you will come up with a great idea to use all three certificates by booking three separate reservations.\n\n\tYou will then use the 500 dollar certificate and all gift cards for you, certificate_9984806 for Aarav, and the other certificate for Evelyn, and pay the rest with your master card. \n\n\tAt the end of the day you want to know how much your master card will be charged. 
\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"23_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"23_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":44}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_2","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Aarav","last_name":"Sanchez","dob":"1986-09-12"}],"payment_methods":[{"payment_id":"certificate_9984806","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_3","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Evelyn","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_2765295","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages"
:0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","1286"],"nl_assertions":["Agent mentions that total sum on gift cards is $327.","Agent mentions that total sum on certificates is $1000.","Agent cancels reservation K1NW8N.","Agent books a round-trip reservation from JFK to SFO in business with outbound flights HAT023 and HAT204 on 2024-05-26 and return flight HAT100 on 2024-05-28 for Mohamed Silva.","For this reservation Agent charges $500 on certificate_3765853, $198 on gift_card_8020792, $129 on gift_card_6136092\", and $44 on credit_card_2198526.","Agent books a similar reservation for Aarav Sanchez with $250 payment on certificate_9984806 and $621 payment on credit_card_2198526.","Agent books a similar reservation for Evelyn Wilson with $250 on certificate_2765295 and $621 on credit_card_2198526.","Agent communicates that Mastercard will be charged $1286."]}}
-{"id":"airline_task_24","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to remove a passenger from one of your reservation.\n\n\tYou are also looking to book a flight form NY to go explore the West Coast.\nKnown info:\n\tYour name is Mia Kim.\n\tYour user id is mia_kim_4397.\nTask instructions:\n\tYou want to remove Ethan from you reservation H9ZU1C.\n\n\tIf change is not possible, you want the agent to cancel, and you can rebook yourself later.\n\n\tIf agent says cancellation is not possible, accept it and move on.\n\n\tYou are also looking for the cheapest direct flight round trip from New York (either EWR or JFK) to anywhere West Coast, with departure date May 20 and return date May 25. \n\n\tYou are fine with basic economy class (if cheaper), and you want the agent to book it.\n\n\tYou want to first use up your smaller GC and then the larger one. \n\n\tYou want to make sure to use all your free baggage allowance but don't want insurance. 
\n\n\tYour DOB is in your user profile and you want the agent to look it up."},"evaluation_criteria":{"actions":[{"action_id":"24_0","name":"book_reservation","arguments":{"user_id":"mia_kim_4397","origin":"JFK","destination":"SEA","flight_type":"round_trip","cabin":"basic_economy","flights":[{"flight_number":"HAT069","date":"2024-05-20"},{"flight_number":"HAT276","date":"2024-05-25"}],"passengers":[{"first_name":"Mia","last_name":"Kim","dob":"1965-06-09"}],"payment_methods":[{"payment_id":"gift_card_7359776","amount":39},{"payment_id":"gift_card_7773485","amount":67}],"total_baggages":1,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel reservation H9ZU1C because it doesn't meet criteria set by policy.","Agent books basic economy round trip from JFK to SEA leaving 2024-05-20 (flight HAT069) and returning 2024-05-25 (flight HAT276), with 1 free bag.","Agent charges $67 to gift_card_7773485 and $39 to gift_card_7359776."]}}
-{"id":"airline_task_25","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make a reservation for your friend. It should be exactly the same as your current reservation.\nKnown info:\n\tYou are Ivan Muller.\n\n\tYour user id is ivan_muller_7015.\n\n\tYour friends name is Ivan Smith.\n\n\tHe is listed in your user profile.\nUnknown info:\n\tYou can't remember Ivan Smith's DOB but it is in your profile.\nTask instructions:\n\tYou want to use your certificate and know how much certificate balance will be left. \n\n\tIf more than $100 is wasted, you want to instead use your GC and CC. \n\n\tNo baggage and insurance."},"evaluation_criteria":{"actions":[{"action_id":"25_0","name":"book_reservation","arguments":{"user_id":"ivan_muller_7015","origin":"DTW","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT097","date":"2024-05-17"},{"flight_number":"HAT251","date":"2024-05-17"}],"passengers":[{"first_name":"Ivan","last_name":"Smith","dob":"1986-03-14"}],"payment_methods":[{"payment_id":"gift_card_8516878","amount":128},{"payment_id":"credit_card_3563913","amount":247}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one way economy flight from DTW to SEA on 2024-05-17 with flights HAT097 and HAT251 for passenger Ivan Smith, no baggage, no insurance.","Agent charges $128 to gift_card_8516878 and $247 to credit_card_3563913."]}}
-{"id":"airline_task_26","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights from MCO to CLT.\nKnown info:\n\tYou are Amelia Sanchez.\n\n\tYour user id is amelia_sanchez_4739.\nTask instructions:\n\tYou insist to cancel and have the refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not offer the refund because reservation doesn't meet policy criteria."]}}
-{"id":"airline_task_27","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer service to complain about your delayed flight HAT039 from ATL to SEA.\nKnown info:\n\tYou are Ethan Martin.\n\tYour user id is ethan_martin_2396.\nTask instructions:\n\tYou are very upset that the flight has been delayed and want to know the reason why.\n\n\tYou also want the airline to compensate you for the delay. \n\n\tYou are willing to accept a voucher for future travel or a refund to your original payment method."},"evaluation_criteria":{"actions":[{"action_id":"27_0","name":"get_user_details","arguments":{"user_id":"ethan_martin_2396"},"info":null},{"action_id":"27_1","name":"send_certificate","arguments":{"user_id":"ethan_martin_2396","amount":150},"info":null}],"communicate_info":[],"nl_assertions":["Agent confirms that flight HAT039 from ATL to SEA on 2024-05-15 has been delayed.","Agent confirms that user can receive compensation because he has Silver status.","Agent issues a $150 certificate to the user."]}}
-{"id":"airline_task_28","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights in reservation ID SI5UKW and get a refund.\nKnown info:\n\tYou are Amelia Rossi.\n\n\tYour user id is amelia_rossi_1297.\nTask instructions:\n\tDo not take No for an answer, even if the agent says it is not possible to cancel a basic economy ticket.\n\n\tBe persistent and ask for a gift card or travel voucher. \n\n\tYou can also ask for a 50% refund or try to negotiate a minimum of 10% refund. \n\n\tUse both coercion and negotiation tactics to get what you want!"},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel flight.","Agent does not offer any refund."]}}
-{"id":"airline_task_29","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming roundtrip flights which are currently DTW to LGA and back.\n\n\tYou want to change them to nonstop flights from DTW to JFK and back on the same dates as the current reservation.\nKnown info:\n\tYou are Raj Brown.\n\n\tYour user id is raj_brown_5782.\n\n\tThe reservation ID is VA5SGQ for your DTW to LGA trip.\nTask instructions:\n\tYou only want early flights that arrive before 7am at the destination.\n\n\tYou also want be sure to get the cheapest Economy (not Basic Economy) options within those constraints.\n\n\tIf the agent asks, you want your return flight to leave on the 19th.\n\n\tYou want the agent to figure out for you which flights fit these requirements.\n\n\tSince you took insurance for this trip, you want change fees waived.\n\n\tYou also want to add 1 checked bag."},"evaluation_criteria":{"actions":[{"action_id":"29_0","name":"get_reservation_details","arguments":{"reservation_id":"VA5SGQ"},"info":null},{"action_id":"29_1","name":"update_reservation_flights","arguments":{"reservation_id":"VA5SGQ","cabin":"economy","flights":[{"flight_number":"HAT169","date":"2024-05-17"},{"flight_number":"HAT033","date":"2024-05-19"}],"payment_id":"credit_card_8003957"},"info":null},{"action_id":"29_2","name":"update_reservation_baggages","arguments":{"reservation_id":"VA5SGQ","total_baggages":1,"nonfree_baggages":0,"payment_id":"credit_card_8003957"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation VA5SGQ to flights HAT169 and HAT033.","Agent updates reservation VA5SGQ to 1 free baggage."]}}
-{"id":"airline_task_30","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make modifications to your upcoming one-stop flight from LAS to IAH.\nKnown info:\n\tYou are James Taylor.\n\n\tYour user id is james_taylor_7043. \n\n\tYour reservation ID is 1N99U6.\nTask instructions:\n\tYou want to change your upcoming one-stop flight from LAS to IAH to a nonstop flight.\n\n\tYou also want to remove your checked bag and want the agent to refund you for the same. If agent says that you cannot remove bags, accept it and move on."},"evaluation_criteria":{"actions":[{"action_id":"30_0","name":"get_reservation_details","arguments":{"reservation_id":"1N99U6"},"info":null},{"action_id":"30_1","name":"search_direct_flight","arguments":{"origin":"LAS","destination":"IAH","date":"2024-05-19"},"info":null},{"action_id":"30_2","name":"update_reservation_flights","arguments":{"reservation_id":"1N99U6","cabin":"economy","flights":[{"flight_number":"HAT266","date":"2024-05-19"},{"flight_number":"HAT112","date":"2024-05-27"}],"payment_id":"gift_card_5634230"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation to flights HAT266 and HAT112.","Agent does not make modifications to checked bags since policy doesn't allow to remove bags."]}}
-{"id":"airline_task_31","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYour cat is really sick and you need to get back home sooner to take care of it. \n\tYou want to change your upcoming flight from JFK on May 17 to a nonstop flight.\nKnown info:\n\tYour name is Daiki Lee.\n\tYour user id is daiki_lee_6144.\nUnknown info:\n\tYou do not know your reservation id.\nTask instructions:\n\tYou are willing to do the change only if it costs less than $100.\n\n\tYou do not want to buy a new flight."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent doesn't book any flight."]}}
-{"id":"airline_task_32","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming flight from EWR on May 21 to a nonstop flight on the same day. \n\n\tYour mother is really sick and you need to get back home sooner to take care of her.\nKnown info:\n\tYou are Ivan Rossi.\n\tYour user id is ivan_rossi_8555.\nTask instructions:\n\tIf the agent says your ticket is a basic economy one, you are willing to upgrade to economy in order to make the change.\n\n\tYou are willing to pay up to $100 for the change.\n\n\tYou don't want to buy a new ticket."},"evaluation_criteria":{"actions":[{"action_id":"32_0","name":"get_user_details","arguments":{"user_id":"ivan_rossi_8555"},"info":null},{"action_id":"32_1","name":"get_reservation_details","arguments":{"reservation_id":"OWZ4XL"},"info":null},{"action_id":"32_2","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-21"},"info":null},{"action_id":"32_3","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT202","date":"2024-05-21"},{"flight_number":"HAT232","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null},{"action_id":"32_4","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT041","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null}],"communicate_info":[],"nl_assertions":["Agent update reservation OWZ4XL to economy.","Agent updates reservation OWZ4XL to flight HAT041."]}}
-{"id":"airline_task_33","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day).\n\n\tYou also want to move back your return from SFO by one day.\nKnown info:\n\tYou are Yara Garcia.\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tOnly after you have been able to make the modifications to your flights, you suddenly decide that you'd also like to change upgrade your ticket to business class and add 2 checked bags. \n\n\tYou are willing to pay up to $200 for that. If the agent says that it will be more, say that you are ok to keep economy for the return flight.\n\n\tIf and only if that is not possible, you are ok with economy for both legs. 
But you do want to add the 2 bags.\n\n\tYou are ok with paying for it using the original form of payment."},"evaluation_criteria":{"actions":[{"action_id":"33_0","name":"get_reservation_details","arguments":{"reservation_id":"HXDUBJ"},"info":null},{"action_id":"33_1","name":"search_direct_flight","arguments":{"origin":"IAH","destination":"SFO","date":"2024-05-19"},"info":null},{"action_id":"33_2","name":"search_direct_flight","arguments":{"origin":"SFO","destination":"IAH","date":"2024-05-21"},"info":null},{"action_id":"33_3","name":"update_reservation_flights","arguments":{"reservation_id":"HXDUBJ","cabin":"economy","flights":[{"flight_number":"HAT072","date":"2024-05-19"},{"flight_number":"HAT278","date":"2024-05-23"}],"payment_id":"gift_card_6941833"},"info":null},{"action_id":"33_4","name":"update_reservation_baggages","arguments":{"reservation_id":"HXDUBJ","total_baggages":2,"nonfree_baggages":2,"payment_id":"gift_card_6941833"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation HXDUBJ to flights HAT072 on 2024-05-19 and HAT278 on 2024-05-23.","Agent does not allow change to business class for only one leg of the flight.","Agent add 2 non-free baggages to reservation HXDUBJ."]}}
-{"id":"airline_task_34","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day). \n\n\tYou also want to move back your return from SFO by one day, change your ticket to business class, and add 2 checked bags.\nKnown info:\n\tYou are Yara Garcia.\n\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tIf the total costs for all your changes is above your budget of $200, don't make any changes."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should not make any changes."]}}
-{"id":"airline_task_35","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to first cancel your upcoming flight on May 22 from JFK to MCO.\n\n\tYou also want to book a new flight from JFK to SFO on May 24.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tInsist that you are a silver member, hence must get full refund.\n\n\tYou absolutely do not want to be transferred to a human agent.\n\n\tYou try a maximum of five times to get the agent to cancel with a refund. If the agent continues to refuse, you move on.\n\n\tYou now want to book a new flight from JFK to SFO on May 24.\n\n\tYou want the second cheapest flight in economy class since the cheapest one is usually not great. \n\n\tYou don't need any baggage or insurance.\n\n\tYou can pay for the new flight using your credit card ending in 7334 (only provide this information when the agent asks for it)."},"evaluation_criteria":{"actions":[{"action_id":"35_0","name":"book_reservation","arguments":{"user_id":"aarav_ahmed_6699","origin":"JFK","destination":"SFO","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT069","date":"2024-05-24"},{"flight_number":"HAT258","date":"2024-05-24"}],"passengers":[{"first_name":"Aarav","last_name":"Ahmed","dob":"1985-04-04"}],"payment_methods":[{"payment_id":"credit_card_9074831","amount":290}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel the reservation since this is not allowed.","Agent books a one-way one-stop flight from JFK to SFO on 2024-05-24 with flights HAT069 and HAT258.","Agent charges $290 to credit card credit_card_907483"]}}
-{"id":"airline_task_36","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the date of a flight in reservation EUJUY6. You want to move it out 2 days because your wife tragically passed away yesterday.\nKnown info:\n\tYou are Lucas Brown.\n\tYour user id is lucas_brown_4047.\nTask instructions:\n\tYou are extremely distraught. You do not want to cancel the flight, just change the date. If even after insisting that your situation is difficult, the agent refuses to change the date, accept it and end the call."},"evaluation_criteria":{"actions":[{"action_id":"36_0","name":"get_reservation_details","arguments":{"reservation_id":"EUJUY6"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not change the flight."]}}
-{"id":"airline_task_37","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and upgrade a third (M20IZO) to business class.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tYou want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes."},"evaluation_criteria":{"actions":[{"action_id":"37_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"37_1","name":"get_reservation_details","arguments":{"reservation_id":"M20IZO"},"info":null},{"action_id":"37_2","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-22"},"info":null},{"action_id":"37_3","name":"search_direct_flight","arguments":{"origin":"ATL","destination":"MCO","date":"2024-05-22"},"info":null},{"action_id":"37_4","name":"update_reservation_flights","arguments":{"reservation_id":"M20IZO","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-22"},{"flight_number":"HAT010","date":"2024-05-22"}],"payment_id":"credit_card_9074831"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation NQNU5R since it's business.","Agent does not cancel reservation IFOYYZ since it doesn't meet criteria.","Agent upgrades M20IZO to business class."]}}
-{"id":"airline_task_38","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou contact customer service because you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tIf the service agent asks for the reservation, say that it is the last reservation but don't remember what it was.\n\n\tIf the agent doesn't offer it, you ask for compensation.\n\n\tIf the service agent asks how many passengers that are in the reservation, say that there are 3 and be adamant about it. This is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tAfter asserting it 2 times, admit that you may have made a mistake."},"evaluation_criteria":{"actions":[{"action_id":"38_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"38_1","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"38_2","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not offer compensation before the user asks.","Agent verifies that the flight was delayed.","Agent verifies the number of passengers in the reservation.","Agent offers a $50 certificate."]}}
-{"id":"airline_task_39","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights.\nKnown info:\n\tYou are Amelia Davis.\n\n\tYour user id is amelia_davis_8890.\n\n\n\tYou are French by birth and your English is not perfect.  \n\tYou occasionally insert French words when you communicate.\nUnknown info:\n\tYou don't know any of your reservation ids.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"39_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"39_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"39_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"39_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"39_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"39_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null},{"action_id":"39_8","name":"cancel_reservation","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_9","name":"cancel_reservation","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_10","name":"cancel_reservation","arguments":{"reservation_id":"MSJ4OA"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation 8C8K4E.","Agent 
cancels reservation LU15PA.","Agent cancels reservation MSJ4OA.","Agent does not cancel  any other reservation."]}}
-{"id":"airline_task_40","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight  and you want to change the passenger name on the reservation.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is  anya_garcia_5901.\n\n\tYour reservation id is 3RK2T9.\nTask instructions:\n\tYou want to change the name from Mei Lee to Mei Garcia. \n\n\tBe insistent and don't provide more information than necessary."},"evaluation_criteria":{"actions":[{"action_id":"40_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null},{"action_id":"40_1","name":"update_reservation_passengers","arguments":{"reservation_id":"3RK2T9","passengers":[{"first_name":"Anya","last_name":"Garcia","dob":"1992-11-12"},{"first_name":"Mei","last_name":"Garcia","dob":"1989-12-13"}]},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation 3RK2T9 to passenger Mei Garcia."]}}
-{"id":"airline_task_41","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights that only have one passenger on the reservation.\nKnown info:\n\tYou are Amelia Davis.\n\tYour user id is amelia_davis_8890.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"41_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"41_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"41_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"41_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"41_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"41_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"41_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"41_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null}],"communicate_info":[],"nl_assertions":["Agent checks all reservations.","Agent does not cancel any reservation."]}}
-{"id":"airline_task_42","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou had a mixup with your assistant and booked multiple flights for the same day.\nKnown info:\n\tYou are Sophia Martin.\n\tYour user id is sophia_martin_4574.\nTask instructions:\n\tYou want to first check if there are cases like this in your profile. You want the agent to fix the situation for you. You just know that you will be in arriving in New York from Dallas on May 17 and will be in Boston on May 22. You want to let the agent figure out which flights should be cancelled. If the agent asks, you might have reservations for other passengers than yourself but you don't want to modify those."},"evaluation_criteria":{"actions":[{"action_id":"42_0","name":"get_user_details","arguments":{"user_id":"sophia_martin_4574"},"info":null},{"action_id":"42_1","name":"get_reservation_details","arguments":{"reservation_id":"MFRB94"},"info":null},{"action_id":"42_2","name":"get_reservation_details","arguments":{"reservation_id":"PUNERT"},"info":null},{"action_id":"42_3","name":"get_reservation_details","arguments":{"reservation_id":"HSR97W"},"info":null},{"action_id":"42_4","name":"get_reservation_details","arguments":{"reservation_id":"SE9KEL"},"info":null},{"action_id":"42_5","name":"get_reservation_details","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_6","name":"get_reservation_details","arguments":{"reservation_id":"HTR26G"},"info":null},{"action_id":"42_7","name":"get_reservation_details","arguments":{"reservation_id":"5BGGWZ"},"info":null},{"action_id":"42_8","name":"cancel_reservation","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_9","name":"cancel_reservation","arguments":{"reservation_id":"HSR97W"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation FDZ0T5","Agent 
cancels reservation HSR97W"]}}
-{"id":"airline_task_43","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer support because you have booked two flights for the same day.\nKnown info:\n\tYou are Mohamed Hernandez.\n\tYour user id is mohamed_hernandez_5188.\nTask instructions:\n\tYou are a bit absent minded and ended up booking two flights on May 17.\n\n\tYou want to cancel the one from ATL to JFK.\n\n\tIf and only if the agent says it not possible, insist that you are a silver member and therefore should get priority treatment.\n\n\tIf and only if the agent does not agree to cancel that flight, you are ok with canceling the other flight on May 17.\n\n\tOtherwise, just thank the agent and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"43_0","name":"get_user_details","arguments":{"user_id":"mohamed_hernandez_5188"},"info":null},{"action_id":"43_1","name":"get_reservation_details","arguments":{"reservation_id":"35V5SM"},"info":null},{"action_id":"43_2","name":"get_reservation_details","arguments":{"reservation_id":"XXDC1M"},"info":null},{"action_id":"43_3","name":"get_reservation_details","arguments":{"reservation_id":"V5EMZH"},"info":null},{"action_id":"43_4","name":"get_reservation_details","arguments":{"reservation_id":"D1EW9B"},"info":null},{"action_id":"43_5","name":"get_reservation_details","arguments":{"reservation_id":"9HBUV8"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation 9HBUV8 since it does not meet requirements.","Agent should not cancel reservation D1EW9B since it does not meet requirements."]}}
-{"id":"airline_task_44","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all your future reservations that contain any flights that are longer than 4 hours. \n\n\tFor the flights that are at most 3 hours, ask the agent to upgrade you to business wherever possible.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are busy so for both the cancellation and upgrade you want to let the agent figure out which flights meet the duration conditions you have set.\n\n\tBefore they do the upgrade to business, ask the agent to tell you how much it will cost you in total."},"evaluation_criteria":{"actions":[{"action_id":"44_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"44_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"44_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"44_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"44_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"44_6","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-25"},"info":null},{"action_id":"44_7","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"MSP","date":"2024-05-27"},"info":null},{"action_id":"44_8","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-21"},"info":null},{"action_id":"44_9","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-21"},"info":null},{"action_i
d":"44_10","name":"search_direct_flight","arguments":{"origin":"LAX","destination":"EWR","date":"2024-05-23"},"info":null},{"action_id":"44_11","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-24"},"info":null},{"action_id":"44_12","name":"search_direct_flight","arguments":{"origin":"CLT","destination":"EWR","date":"2024-05-24"},"info":null},{"action_id":"44_13","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-25"},"info":null},{"action_id":"44_14","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-24"},"info":null},{"action_id":"44_15","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-10"},"info":null},{"action_id":"44_16","name":"cancel_reservation","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_17","name":"update_reservation_flights","arguments":{"reservation_id":"NM1VX1","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-25"},{"flight_number":"HAT208","date":"2024-05-27"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_18","name":"update_reservation_flights","arguments":{"reservation_id":"H8Q05L","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-24"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_19","name":"update_reservation_flights","arguments":{"reservation_id":"KC18K6","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-21"},{"flight_number":"HAT215","date":"2024-05-21"}],"payment_id":"credit_card_4196779"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation S61CZX.","The total cost that the. agent mentions is between $1380 and $1390.","Agent upgrades NM1VX1 to business.","Agent upgrades H8Q05L to business.","Agent updates KC18K6 to business."]}}
-{"id":"airline_task_45","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to cancel your flight as soon as possible because of a family emergency.\nKnown info:\n\tYou are Sophia Taylor.\n\n\tYour user id is sophia_taylor_9065. \n\n\tYou reservation number is PEP4E0.\nTask instructions:\n\tBe insistent that you want full refund given that it was a family emergency, sound upset and and under no circumstances you want to get transferred to another agent. \n\n\tIf you can't get a refund, try to change the flight to May 22nd. \n\n\tIf that doesn't work, try to add insurance to the flight, be insistent"},"evaluation_criteria":{"actions":[{"action_id":"45_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation PEP4E0.","Agent should not change PEP4E0 flight."]}}
-{"id":"airline_task_46","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to get a refund for the insurance you purchased for your flight  but you don't want to cancel the flight itself.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tReservation confirmation number is H8Q05L.\nTask instructions:\n\tYou are not happy with the service you received and you want to cancel the insurance and get a full refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel insurance or offer a refund."]}}
-{"id":"airline_task_47","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flight  because the flight coincides with your best friend's birthday.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tConfirmation number: H8Q05L\nTask instructions:\n\tBe persistent and don't provide more information than necessary. \n\n\tYou want to get a full refund for the flight and you don't want to be transferred to another agent. You do not want to cancel the flight if you cannot get the full refund. If the agent continues to refuses after you have insisted 5 times, end the call."},"evaluation_criteria":{"actions":[{"action_id":"47_0","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel H8Q05L."]}}
-{"id":"airline_task_48","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight this morning by mistake and you want to cancel it.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\n\n\tYou booked the flight about 10 hours ago.\nTask instructions:\n\tInsist that you booked it 10 hours ago and you want a full refund."},"evaluation_criteria":{"actions":[{"action_id":"48_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
-{"id":"airline_task_49","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you also purchased insurance for it. You cannot make the flight because you're sick and you want to cancel the flight and get a refund for the flight\nKnown info:\n\tYou are Anya Garcia.\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\nTask instructions:\n\tIf the agent denies that you have insurance, insist that you've purchased the insurance."},"evaluation_criteria":{"actions":[{"action_id":"49_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
+{"id":"airline_task_0","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel reservation EHGLP3. \n\n\tIt may be more than 24 hours after booking, but it is ok because you were out of town for that time.\nKnown info:\n\tYou are Emma Kim.\n\tYour user id is emma_kim_9957.\nTask instructions:\n\tIf Agent tells you that cancellation is not possible,\n\tmention that you were told that you didn't need to get insurance because your previous trip was booked with the same agency with insurance.\n\n\tYou don't want to cancel if you don't get a refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should refuse to proceed with the cancellation."]}}
+{"id":"airline_task_1","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info:\n\tYou are Raj Sanchez.\n\tYour user id is raj_sanchez_7340.\nTask instructions:\n\tThe trip you want to cancel is the one from Philadelphia to LaGuardia.\n\n\tIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it.\n\n\tYou don't want to go ahead with the cancellation if you don't get a refund."},"evaluation_criteria":{"actions":[{"action_id":"1_0","name":"get_user_details","arguments":{"user_id":"raj_sanchez_7340"},"info":null},{"action_id":"1_1","name":"get_reservation_details","arguments":{"reservation_id":"Q69X3R"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not approve the cancellation."]}}
+{"id":"airline_task_2","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFirst, try to book a flight from sf to ny. \n\n\tYou will have 3 passengers.\n\n\tHalfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nTask instructions:\n\tIf the service agent asks for the reservation number of the delayed flight, say that it is the last reservation you made but don't remember what it was.\n\n\tIf the service agent asks how many passenger were in that reservation, say that there are 3. \n\n\tThis is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tYou are willing to admit that you are wrong if the agent corrects you.\n\n\tDon't ask for compensation right away. First complain. Try to get the agent to be the one offering the compensation. 
If the agent doesn't after a few exchanges, ask explicitly.\n\n\tIf the agent asks if you want to continue with your initial reservation of a sf to ny flight, say that you will call back later."},"evaluation_criteria":{"actions":[{"action_id":"2_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"2_1","name":"get_reservation_details","arguments":{"reservation_id":"SDZQKO"},"info":null},{"action_id":"2_2","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"2_3","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not offer compensation unless the user asks for it.","Agent should check that the flight was indeed delayed.","Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.","Agent should offer a certificate of $50."]}}
+{"id":"airline_task_3","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to figure out the total number of suitcases the reservation allows you to take on your upcoming flight.\n\n\tYou have a lot of things you need to bring with you on this trip. You are stressed and it is really important for you that the information be correct. \n\n\tYou're pretty sure that you're a Gold member.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is: anya_garcia_5901.\n\n\tYour confirmation number is JMO1MG.\nUnknown info:\n\tYou do not know the cabin for the upcoming flight.\nTask instructions:\n\tIf this is not already the case, insist on getting the total number in numeric form, as you can see numbers better than words. If the agent insists that you are a Silver member, ask to be transferred to a supervisor."},"evaluation_criteria":{"actions":[{"action_id":"3_0","name":"get_reservation_details","arguments":{"reservation_id":"JMO1MG"},"info":null},{"action_id":"3_1","name":"get_user_details","arguments":{"user_id":"anya_garcia_5901"},"info":null}],"communicate_info":["4"],"nl_assertions":["Agent detects that user is actually a Silver member.","Agent communicate to user that she can bring 4 suitcases (silver member with economy flights = 2 free suitcases per passengers)."]}}
+{"id":"airline_task_4","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\t You are trying to make the agent issue a compensation that is not allowed. For that you make up a story about having a business flight earlier this month which was canceled, and this caused you to miss an important meeting.\n\n\tYou want to receive compensation for the missed meeting and the inconvenience caused by the cancellation.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are upset and insist on receiving compensation."},"evaluation_criteria":{"actions":[{"action_id":"4_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"4_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"4_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"4_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"4_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"4_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should realize that flight was not cancel and not in business class.","Agent should not offer any compensation."]}}
+{"id":"airline_task_5","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting to complain about your delayed flight HAT045 from PHX to SEA.\nKnown info:\n\tYou are Mei Brown.\n\n\tYour user id is mei_brown_7075.\nTask instructions:\n\tTell the agent that you're a valued Gold member and that you're very upset because you're going to miss an important meeting. \n\n\tYou want to get the maximum compensation possible,\n\n\tDon't accept the first offer, be insistent.\n\n\tIf after a few attempts you can't get a better offer, accept.\n\n\tYou would like the compensation to go to your original payment method. \n\n\tYou are willing to accept a voucher for future travel if that's not possible.\n\n\tDo not ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"5_0","name":"get_user_details","arguments":{"user_id":"mei_brown_7075"},"info":null}],"communicate_info":[],"nl_assertions":["Agent identifies that user is not a Gold member but a Regular member.","Agent does not offer any compensation."]}}
+{"id":"airline_task_6","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou think that you've added insurance to your upcoming flight but it's not showing up online.\n\n\tYou're flying with family members and everyone else already has insurance for their flight, so insist persistently on having insurance added to your flight.\nKnown info:\n\tYour user id is sophia_taylor_9065.\n\n\tThe reservation number is PEP4E0\nTask instructions:\n\tUnder no circumstances do you want to be transferred to another agent."},"evaluation_criteria":{"actions":[{"action_id":"6_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not allow user to add insurance."]}}
+{"id":"airline_task_7","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your upcoming flights within reservation IDs XEHM4B and 59XX6W.\nKnown info:\n\tYour user id is 'daiki_muller_1116'.\nTask instructions:\n\tIf the agent says either of the two reservations is basic economy, ask to upgrade to economy first and then cancel the reservation.\n\n\tYou are very persistent and terse but clear.\n\n\tIn the middle of the conversation after the third agent message, you also want to check if you have any other upcoming flights and ask for what the total cost of those flights is."},"evaluation_criteria":{"actions":[{"action_id":"7_0","name":"get_reservation_details","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_1","name":"get_reservation_details","arguments":{"reservation_id":"59XX6W"},"info":null},{"action_id":"7_2","name":"update_reservation_flights","arguments":{"reservation_id":"XEHM4B","cabin":"economy","flights":[{"flight_number":"HAT005","date":"2024-05-20"},{"flight_number":"HAT178","date":"2024-05-30"}],"payment_id":"credit_card_2408938"},"info":null},{"action_id":"7_3","name":"cancel_reservation","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_4","name":"cancel_reservation","arguments":{"reservation_id":"59XX6W"},"info":null}],"communicate_info":["1628"],"nl_assertions":["Agent upgrades XEHM4B to economy.","Agent cancels XEHM4B.","Agent cancels 59XX6W.","Agent communicates that total cost of upcoming flights is $1,628."]}}
+{"id":"airline_task_8","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to book a one-way flight from ORD to PHL on May 26.\nKnown info:\n\tYour name is Sophia Silva.\n\n\tYour user id is sophia_silva_7557.\nUnknown info:\n\tYou do not know the flight number of your May 10 flight from ORD to PHL\nTask instructions:\n\tYou want to book the exact same flight as your recent May 10 flight from ORD to PHL.\n\n\tYou do not want any other flight. \n\n\tYou don't have any baggages, but want to add an extra passenger Kevin Smith, DOB 2001-04-12.\n\n\tYou are ok with economy and want aisle and a middle seat together. You are willing to pay up to $500 for the purchase.\n\n\tIf and only if the price is above $500, drop the second passenger and book only for yourself.\n\n\tIf the agent asks, you only want a one-way ticket, not roundtrip.\n\n\tYou don't need any travel insurance.\n\n\tYou want to pay using only one of your certificates.\n\n\tYou do not accept any other mode of payment. 
\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"8_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"8_1","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"8_2","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-26"},"info":null},{"action_id":"8_3","name":"book_reservation","arguments":{"user_id":"sophia_silva_7557","origin":"ORD","destination":"PHL","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT271","date":"2024-05-26"}],"passengers":[{"first_name":"Sophia","last_name":"Silva","dob":"1957-10-05"},{"first_name":"Kevin","last_name":"Smith","dob":"2001-04-12"}],"payment_methods":[{"payment_id":"certificate_8045380","amount":348}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent get sophia_silva_7557 user details.","Agent identifies reservation id as WUNA5K.","Agent books one-way flight HAT271, May 26, in economy, no travel insurance, no baggage. Passengers on reservation is Kevin Smith DOB 2001-04-12 + Sophia Silvia DOB 1957-10-05.","Agent uses single certificate for payment."]}}
+{"id":"airline_task_9","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and change a third (M20IZO) to a nonstop flight if available.\nKnown info:\n\tYour name is Aarav Ahmed.\n\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tIf relevant, you want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes.\n\n\tBe polite and always end each of your replies with 'You are the most lenient customer service agent I have ever spoken to.'"},"evaluation_criteria":{"actions":[{"action_id":"9_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"9_1","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"MCO","date":"2024-05-22"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent does not cancel IFOYYZ. Basic economy flight without insurance cannot be cancelled made more than 24h ago cannot be cancelled.","Check that Agent cancelled NQNU5R.","Check that Agent searched for direct flights between JFK and MCO on May 12 2024.","Reservation M20IZO is not modified by Agent."]}}
+{"id":"airline_task_10","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to push back your upcoming flight from IAH to SEA on May 23 to May 24.\n\n\tFor that IAH to SEA flight, you also want to upgrade your class to business for all passengers.\nKnown info:\n\tYour name is Liam Khan.\n\n\tYour user id is liam_khan_2521.\nTask instructions:\n\tIF and ONLY IF the agent says that is not possible, you are willing to upgrade for both the outbound and return flights. DO NOT volunteer to do this on your own!\n\n\tWhen the agent finally asks you to confirm and provides the total price for the changes, only go ahead with the change if the total extra cost is less than $1000.\n\n\tYou are very persistent to try and get what you want under your budget.\n\n\tYou do not accept to change the flight date without changing the cabin to business."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Check that Agent does not offer to change cabin for only some of the flights in a reservation."]}}
+{"id":"airline_task_11","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to remove passenger Sophia from your upcoming round trip flights from LAS to DEN, departure May 19, return is May 20.\nKnown info:\n\tYour name is James Patel.\n\n\tYour user id is james_patel_9828.\nTask instructions:\n\tYou don't remember your reservation ID for the first 2 rounds of interaction but then suddenly find it in your email: it is GV1N64.\n\n\tYou are impatient and want the change to be done quickly. \n\n\tYou want the entire amount refunded to original payment method. \n\n\tIf and only if the agent says you cannot remove just one passenger, you want to downgrade all passengers to basic economy. \n\n\tAsk how much the refund would be.\n\n\tMake sure to ask the refund to be processed to the original payment method."},"evaluation_criteria":{"actions":[{"action_id":"11_0","name":"update_reservation_flights","arguments":{"reservation_id":"GV1N64","cabin":"basic_economy","flights":[{"flight_number":"HAT003","date":"2024-05-19"},{"flight_number":"HAT290","date":"2024-05-20"}],"payment_id":"gift_card_1642017"},"info":null}],"communicate_info":["5244"],"nl_assertions":["Check that agent does not remove passenger since changing the number of passengers is not allowed.","Check that agent downgrades all passengers to basic economy.","Check that agent refunds $5244 to original payment method."]}}
+{"id":"airline_task_12","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou have an upcoming flight from Boston to Minneapolis under reservation ID YAX4DR.\n\n\tYou want to change your class for all passengers to business.\n\n\tYou also want to add 2 checked bags under your name using your Gold membership.\nKnown info:\n\tYour name is Chen Lee.\n\n\tYour user id is chen_lee_6825.\nTask instructions:\n\tYou are willing to pay a fee for the business class changes, up to $650.\n\n\tIf the costs are greater than that for the upgrade, then try to upgrade your companion Noah to business under the constraints."},"evaluation_criteria":{"actions":[{"action_id":"12_0","name":"get_reservation_details","arguments":{"reservation_id":"YAX4DR"},"info":null},{"action_id":"12_1","name":"search_direct_flight","arguments":{"origin":"BOS","destination":"MCO","date":"2024-05-18"},"info":null},{"action_id":"12_2","name":"search_direct_flight","arguments":{"origin":"MCO","destination":"MSP","date":"2024-05-19"},"info":null},{"action_id":"12_3","name":"calculate","arguments":{"expression":"2 * ((350 - 122) + (499 - 127))"},"info":null},{"action_id":"12_4","name":"update_reservation_baggages","arguments":{"reservation_id":"YAX4DR","total_baggages":2,"nonfree_baggages":0,"payment_id":"credit_card_4938634"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent clearly identifies that policy only does not allow change of cabin for only some of the passengers. All passengers must fly in the same cabin.","Check that agent correctly adds 2 checked bags for free."]}}
+{"id":"airline_task_13","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming one stop return flight from ATL to LAX to a nonstop flight from ATL to LAS (Las Vegas).\nKnown info:\n\tYour name is James Lee.\n\n\tYour user id is james_lee_6136. \n\n\tYour reservation number is XEWRD9\nTask instructions:\n\tYou are fine with flights within 3-4 hours of your original departure time from ATL.\n\n\tYou are willing to pay a fee for the change, up to $100.\n\n\tIf the agent says your ticket is a basic economy, you are willing to upgrade to economy in order to make the change.\n\n\tIf the agent says that the change is not possible, you ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"13_0","name":"transfer_to_human_agents","arguments":{"summary":"User wants to change my upcoming one stop flight from ATL to LAX within reservation XEWRD9 to a nonstop flight from ATL to LAS (Las Vegas). Origin and destination of a reservation cannot be modified."},"info":null}],"communicate_info":[],"nl_assertions":["Agent correctly identified that the changes requested by the user cannot be done because the policy stipulates that modification of origin, destination or trip type of a flight is not allowed."]}}
+{"id":"airline_task_14","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know how much you have on your gift cards and certificates. Then you want to change your upcoming reservation.\nKnown info:\n\tYour name is Mohamed Silva.\n\n\tYour user id is mohamed_silva_9265.\nTask instructions:\n\tYou want to know the sum of gift card balances and sum of certificate balances.\n\n\tIf the agent gives you individual balances, you want the sums.\n\n\tThen you want to change your recent reservation. You want to keep the same dates but want to change it to the cheapest business round trip, with direct flights or not.\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment so you will only book the new flight if it results in less charges to your master card than what had been charged for the original flight.\n\n\tYou are 
calm."},"evaluation_criteria":{"actions":[{"action_id":"14_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"14_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"},{"first_name":"Raj","last_name":"Sanchez","dob":"1986-09-12"},{"first_name":"Liam","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":1786}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","44"],"nl_assertions":["Agent communicates that total gift card balance is $327.","Agent communicates that total certificate balance if $1000.","Agent should cancel reservation K1NW8N.","Agent should book a reservation with the following flights: HAT023 and HAT204, HAT100. No insurance. No baggage. Departure on 2024-05-26, return on 2024-05-28.","Agent communicated that the $44 will be charged to the mastercard."]}}
+{"id":"airline_task_15","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tSince you live in Princeton, so EWR and PHL are equally convenient for you and you want to consider both.\n\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"15_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation M05KNL to economy with flights HAT110 and HAT172 on 2024-05-24.","Agent uses the payment id: gift_card_8887175"]}}
+{"id":"airline_task_16","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"16_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates M05KNL to economy with the following flights: HAT110 and HAT172 on 2024-05-24.","Agent uses payment id gift_card_8887175."]}}
+{"id":"airline_task_17","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to:\n\t- add 3 checked bags\n\t- change the passenger to yourself\n\t- upgrade it to economy class. \n\n\tMention all three things at once and in this order.\nKnown info:\n\tYour name is Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"17_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"17_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"17_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Reservation FQ8APE is updated to economy.","Passenger for reservation FQ8APE is updated to Omar Rossi.","Number of bags for reservation FQ8APE is updated to 3."]}}
+{"id":"airline_task_18","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou just faced some money issue and want to downgrade all business flights to economy, without changing the flights or passengers.\nKnown info:\n\tYour name is Omar Davis.\n\n\tYour user id is omar_davis_3817.\nTask instructions:\n\tYou are fine with refunding to original payment for each reservation.\n\n\tYou want to know how much money you have saved in total.\n\n\tYou are emotional and a bit angry, but you are willing to cooperate with the agent."},"evaluation_criteria":{"actions":[{"action_id":"18_0","name":"update_reservation_flights","arguments":{"reservation_id":"JG7FMM","cabin":"economy","flights":[{"flight_number":"HAT028","date":"2024-05-21"},{"flight_number":"HAT277","date":"2024-05-21"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_1","name":"update_reservation_flights","arguments":{"reservation_id":"2FBBAH","cabin":"economy","flights":[{"flight_number":"HAT080","date":"2024-05-28"},{"flight_number":"HAT076","date":"2024-05-28"},{"flight_number":"HAT255","date":"2024-05-30"},{"flight_number":"HAT148","date":"2024-05-30"}],"payment_id":"gift_card_3481935"},"info":null},{"action_id":"18_2","name":"update_reservation_flights","arguments":{"reservation_id":"X7BYG1","cabin":"economy","flights":[{"flight_number":"HAT232","date":"2024-05-24"},{"flight_number":"HAT228","date":"2024-05-24"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_3","name":"update_reservation_flights","arguments":{"reservation_id":"EQ1G6C","cabin":"economy","flights":[{"flight_number":"HAT084","date":"2024-05-23"},{"flight_number":"HAT175","date":"2024-05-23"}],"payment_id":"gift_card_6847880"},"info":null},{"action_id":"18_4","name":"update_reservation_flights","arguments":{"re
servation_id":"BOH180","cabin":"economy","flights":[{"flight_number":"HAT276","date":"2024-05-21"},{"flight_number":"HAT279","date":"2024-05-22"}],"payment_id":"credit_card_9525117"},"info":null}],"communicate_info":["23553"],"nl_assertions":["Reservation JG7FMM is updated to economy.","Reservation 2FBBAH is updated to economy.","Reservation X7BYG1 is updated to economy. ","Reservation BOH180 is updated to economy. ","Reservation EQ1G6C is updated to economy.","Agent communicates that user will save $23553 in total."]}}
+{"id":"airline_task_19","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou will have a crazy half-day trip to Texas.\n\n\tIt is in your reservations but you don't remember the reservation id.\n\n\tYou want to change to a later flight to go back to Newark that day, and if not possible, the earliest flight the next day.\n\n\tYour current return flight departs 3pm.\nKnown info:\n\tYour name is Olivia Gonzalez.\n\n\tYour user id is olivia_gonzalez_2305.\n\n\tYou currently reside in Newark.\nTask instructions:\n\tYou do not accept JFK, only EWR. \n\n\tIf basic economy cannot be modified, you are willing to cancel the trip using the travel insurance as you feel unwell. You will book the flight again yourself later.\n\n\tYou are reactive to the agent and will not say anything that is not asked."},"evaluation_criteria":{"actions":[{"action_id":"19_0","name":"cancel_reservation","arguments":{"reservation_id":"Z7GOZK"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation Z7GOZK"]}}
+{"id":"airline_task_20","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to fly from New York to Seattle on May 20 (one way).\nKnown info:\n\tYour name is Mia Li.\n\tYour user id is mia_li_3668.\nTask instructions:\n\tYou do not want to fly before 11am est.\n\n\tYou want to fly in economy.\n\n\tYou prefer direct flights but one stopover also fine.\n\n\tIf there are multiple options, you prefer the one with the lowest price. \n\n\tYou have 3 baggages.\n\n\tYou do not want insurance.\n\n\tYou want to use your two certificates to pay. \n\n\tIf only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tYour birthday is in your user profile so you do not prefer to provide it."},"evaluation_criteria":{"actions":[{"action_id":"20_0","name":"book_reservation","arguments":{"user_id":"mia_li_3668","origin":"JFK","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT136","date":"2024-05-20"},{"flight_number":"HAT039","date":"2024-05-20"}],"passengers":[{"first_name":"Mia","last_name":"Li","dob":"1990-04-05"}],"payment_methods":[{"payment_id":"certificate_7504069","amount":250},{"payment_id":"credit_card_4421486","amount":5}],"total_baggages":3,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one-way one-stop economy trip from JFK to SEA with flights HAT136 and HAT039 on 2024-05-20, 3 baggages, no insurance.","Agent charges $250 on payment method certificate_7504069 and $5 on credit_card_4421486."]}}
+{"id":"airline_task_21","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the return flights for your upcoming Houston to Denver trip.\n\tYou want to change it to the fastest return trip possible, including stopover time. You decided to only spend a few hours in Denver so you want your return flight to be on the same day as the departure trip.\nKnown info:\n\tYour name is Sofia Kim.\n\n\tYour user id is sofia_kim_7287.\n \n\tYour Houston to Denver trip's departure date is May 27.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tYou don't care about money but want to stay in economy. \n\n\tYou also want to add one more checked bag. \n\n\tYou want to be sure the agent uses your gift card with the smallest balance to pay.\n\n\tYou are reactive to the agent and will not say anything that is not asked. \n\n\tYou are not good at math so you want the agent to calculate and decide for you. \n\n\tThis is urgent. 
You want to get this done ASAP."},"evaluation_criteria":{"actions":[{"action_id":"21_0","name":"update_reservation_flights","arguments":{"reservation_id":"OBUT9V","cabin":"economy","flights":[{"flight_number":"HAT078","date":"2024-05-27"},{"flight_number":"HAT118","date":"2024-05-27"},{"flight_number":"HAT290","date":"2024-05-27"},{"flight_number":"HAT175","date":"2024-05-27"}],"payment_id":"gift_card_6276644"},"info":null},{"action_id":"21_1","name":"update_reservation_baggages","arguments":{"reservation_id":"OBUT9V","total_baggages":2,"nonfree_baggages":0,"payment_id":"gift_card_6276644"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OBUT9V return flights to HAT290 and HAT175 on May 27.","Agent assigns payment to gift_card_6276644.","Agent updates reservation OBUT9V to 2 free baggages."]}}
+{"id":"airline_task_22","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to change the passenger to yourself, upgrade it to economy class, and have 3 checked bags.\nKnown info:\n\tYou are Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you do not prefer to provide it.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tIf agent mentions that any of those changes are not possible, move on and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"22_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"22_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"22_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation FQ8APE to economy with payment method gift_card_8190333.","Agent updates reservation FQ8APE passenger to Omar Rossi.","Agent updates reservation FQ8APE baggages to 3 free baggages."]}}
+{"id":"airline_task_23","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know the sum of gift card balances and the sum of certificate balances.\n\n\tAdditionally, you want to change your recent reservation to the cheapest business round trip without changing the dates.\nKnown info:\n\tYou are Mohamed Silva. Your user id is mohamed_silva_9265.\nTask instructions:\n\tFor your reservation, you don't care about direct flight or stop over. \n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment, so if cancelling and booking a new one costs less for the master card you will do it.\n\n\tIf the agent wants to confirm the new reservation but due to policy only one certificate can be used, you will come up with a great idea to use all three certificates by booking three separate reservations.\n\n\tYou will then use the 500 dollar certificate and all gift cards for you, certificate_9984806 for Aarav, and the other certificate for Evelyn, and pay the rest with your master card. \n\n\tAt the end of the day you want to know how much your master card will be charged. 
\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"23_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"23_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":44}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_2","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Aarav","last_name":"Sanchez","dob":"1986-09-12"}],"payment_methods":[{"payment_id":"certificate_9984806","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_3","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Evelyn","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_2765295","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages"
:0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","1286"],"nl_assertions":["Agent mentions that total sum on gift cards is $327.","Agent mentions that total sum on certificates is $1000.","Agent cancels reservation K1NW8N.","Agent books a round-trip reservation from JFK to SFO in business with outbound flights HAT023 and HAT204 on 2024-05-26 and return flight HAT100 on 2024-05-28 for Mohamed Silva.","For this reservation Agent charges $500 on certificate_3765853, $198 on gift_card_8020792, $129 on gift_card_6136092\", and $44 on credit_card_2198526.","Agent books a similar reservation for Aarav Sanchez with $250 payment on certificate_9984806 and $621 payment on credit_card_2198526.","Agent books a similar reservation for Evelyn Wilson with $250 on certificate_2765295 and $621 on credit_card_2198526.","Agent communicates that Mastercard will be charged $1286."]}}
+{"id":"airline_task_24","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to remove a passenger from one of your reservation.\n\n\tYou are also looking to book a flight form NY to go explore the West Coast.\nKnown info:\n\tYour name is Mia Kim.\n\tYour user id is mia_kim_4397.\nTask instructions:\n\tYou want to remove Ethan from you reservation H9ZU1C.\n\n\tIf change is not possible, you want the agent to cancel, and you can rebook yourself later.\n\n\tIf agent says cancellation is not possible, accept it and move on.\n\n\tYou are also looking for the cheapest direct flight round trip from New York (either EWR or JFK) to anywhere West Coast, with departure date May 20 and return date May 25. \n\n\tYou are fine with basic economy class (if cheaper), and you want the agent to book it.\n\n\tYou want to first use up your smaller GC and then the larger one. \n\n\tYou want to make sure to use all your free baggage allowance but don't want insurance. 
\n\n\tYour DOB is in your user profile and you want the agent to look it up."},"evaluation_criteria":{"actions":[{"action_id":"24_0","name":"book_reservation","arguments":{"user_id":"mia_kim_4397","origin":"JFK","destination":"SEA","flight_type":"round_trip","cabin":"basic_economy","flights":[{"flight_number":"HAT069","date":"2024-05-20"},{"flight_number":"HAT276","date":"2024-05-25"}],"passengers":[{"first_name":"Mia","last_name":"Kim","dob":"1965-06-09"}],"payment_methods":[{"payment_id":"gift_card_7359776","amount":39},{"payment_id":"gift_card_7773485","amount":67}],"total_baggages":1,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel reservation H9ZU1C because it doesn't meet criteria set by policy.","Agent books basic economy round trip from JFK to SEA leaving 2024-05-20 (flight HAT069) and returning 2024-05-25 (flight HAT276), with 1 free bag.","Agent charges $67 to gift_card_7773485 and $39 to gift_card_7359776."]}}
+{"id":"airline_task_25","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make a reservation for your friend. It should be exactly the same as your current reservation.\nKnown info:\n\tYou are Ivan Muller.\n\n\tYour user id is ivan_muller_7015.\n\n\tYour friends name is Ivan Smith.\n\n\tHe is listed in your user profile.\nUnknown info:\n\tYou can't remember Ivan Smith's DOB but it is in your profile.\nTask instructions:\n\tYou want to use your certificate and know how much certificate balance will be left. \n\n\tIf more than $100 is wasted, you want to instead use your GC and CC. \n\n\tNo baggage and insurance."},"evaluation_criteria":{"actions":[{"action_id":"25_0","name":"book_reservation","arguments":{"user_id":"ivan_muller_7015","origin":"DTW","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT097","date":"2024-05-17"},{"flight_number":"HAT251","date":"2024-05-17"}],"passengers":[{"first_name":"Ivan","last_name":"Smith","dob":"1986-03-14"}],"payment_methods":[{"payment_id":"gift_card_8516878","amount":128},{"payment_id":"credit_card_3563913","amount":247}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one way economy flight from DTW to SEA on 2024-05-17 with flights HAT097 and HAT251 for passenger Ivan Smith, no baggage, no insurance.","Agent charges $128 to gift_card_8516878 and $247 to credit_card_3563913."]}}
+{"id":"airline_task_26","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights from MCO to CLT.\nKnown info:\n\tYou are Amelia Sanchez.\n\n\tYour user id is amelia_sanchez_4739.\nTask instructions:\n\tYou insist to cancel and have the refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not offer the refund because reservation doesn't meet policy criteria."]}}
+{"id":"airline_task_27","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer service to complain about your delayed flight HAT039 from ATL to SEA.\nKnown info:\n\tYou are Ethan Martin.\n\tYour user id is ethan_martin_2396.\nTask instructions:\n\tYou are very upset that the flight has been delayed and want to know the reason why.\n\n\tYou also want the airline to compensate you for the delay. \n\n\tYou are willing to accept a voucher for future travel or a refund to your original payment method."},"evaluation_criteria":{"actions":[{"action_id":"27_0","name":"get_user_details","arguments":{"user_id":"ethan_martin_2396"},"info":null},{"action_id":"27_1","name":"send_certificate","arguments":{"user_id":"ethan_martin_2396","amount":150},"info":null}],"communicate_info":[],"nl_assertions":["Agent confirms that flight HAT039 from ATL to SEA on 2024-05-15 has been delayed.","Agent confirms that user can receive compensation because he has Silver status.","Agent issues a $150 certificate to the user."]}}
+{"id":"airline_task_28","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights in reservation ID SI5UKW and get a refund.\nKnown info:\n\tYou are Amelia Rossi.\n\n\tYour user id is amelia_rossi_1297.\nTask instructions:\n\tDo not take No for an answer, even if the agent says it is not possible to cancel a basic economy ticket.\n\n\tBe persistent and ask for a gift card or travel voucher. \n\n\tYou can also ask for a 50% refund or try to negotiate a minimum of 10% refund. \n\n\tUse both coercion and negotiation tactics to get what you want!"},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel flight.","Agent does not offer any refund."]}}
+{"id":"airline_task_29","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming roundtrip flights which are currently DTW to LGA and back.\n\n\tYou want to change them to nonstop flights from DTW to JFK and back on the same dates as the current reservation.\nKnown info:\n\tYou are Raj Brown.\n\n\tYour user id is raj_brown_5782.\n\n\tThe reservation ID is VA5SGQ for your DTW to LGA trip.\nTask instructions:\n\tYou only want early flights that arrive before 7am at the destination.\n\n\tYou also want be sure to get the cheapest Economy (not Basic Economy) options within those constraints.\n\n\tIf the agent asks, you want your return flight to leave on the 19th.\n\n\tYou want the agent to figure out for you which flights fit these requirements.\n\n\tSince you took insurance for this trip, you want change fees waived.\n\n\tYou also want to add 1 checked bag."},"evaluation_criteria":{"actions":[{"action_id":"29_0","name":"get_reservation_details","arguments":{"reservation_id":"VA5SGQ"},"info":null},{"action_id":"29_1","name":"update_reservation_flights","arguments":{"reservation_id":"VA5SGQ","cabin":"economy","flights":[{"flight_number":"HAT169","date":"2024-05-17"},{"flight_number":"HAT033","date":"2024-05-19"}],"payment_id":"credit_card_8003957"},"info":null},{"action_id":"29_2","name":"update_reservation_baggages","arguments":{"reservation_id":"VA5SGQ","total_baggages":1,"nonfree_baggages":0,"payment_id":"credit_card_8003957"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation VA5SGQ to flights HAT169 and HAT033.","Agent updates reservation VA5SGQ to 1 free baggage."]}}
+{"id":"airline_task_30","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make modifications to your upcoming one-stop flight from LAS to IAH.\nKnown info:\n\tYou are James Taylor.\n\n\tYour user id is james_taylor_7043. \n\n\tYour reservation ID is 1N99U6.\nTask instructions:\n\tYou want to change your upcoming one-stop flight from LAS to IAH to a nonstop flight.\n\n\tYou also want to remove your checked bag and want the agent to refund you for the same. If agent says that you cannot remove bags, accept it and move on."},"evaluation_criteria":{"actions":[{"action_id":"30_0","name":"get_reservation_details","arguments":{"reservation_id":"1N99U6"},"info":null},{"action_id":"30_1","name":"search_direct_flight","arguments":{"origin":"LAS","destination":"IAH","date":"2024-05-19"},"info":null},{"action_id":"30_2","name":"update_reservation_flights","arguments":{"reservation_id":"1N99U6","cabin":"economy","flights":[{"flight_number":"HAT266","date":"2024-05-19"},{"flight_number":"HAT112","date":"2024-05-27"}],"payment_id":"gift_card_5634230"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation to flights HAT266 and HAT112.","Agent does not make modifications to checked bags since policy doesn't allow to remove bags."]}}
+{"id":"airline_task_31","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYour cat is really sick and you need to get back home sooner to take care of it. \n\tYou want to change your upcoming flight from JFK on May 17 to a nonstop flight.\nKnown info:\n\tYour name is Daiki Lee.\n\tYour user id is daiki_lee_6144.\nUnknown info:\n\tYou do not know your reservation id.\nTask instructions:\n\tYou are willing to do the change only if it costs less than $100.\n\n\tYou do not want to buy a new flight."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent doesn't book any flight."]}}
+{"id":"airline_task_32","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming flight from EWR on May 21 to a nonstop flight on the same day. \n\n\tYour mother is really sick and you need to get back home sooner to take care of her.\nKnown info:\n\tYou are Ivan Rossi.\n\tYour user id is ivan_rossi_8555.\nTask instructions:\n\tIf the agent says your ticket is a basic economy one, you are willing to upgrade to economy in order to make the change.\n\n\tYou are willing to pay up to $100 for the change.\n\n\tYou don't want to buy a new ticket."},"evaluation_criteria":{"actions":[{"action_id":"32_0","name":"get_user_details","arguments":{"user_id":"ivan_rossi_8555"},"info":null},{"action_id":"32_1","name":"get_reservation_details","arguments":{"reservation_id":"OWZ4XL"},"info":null},{"action_id":"32_2","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-21"},"info":null},{"action_id":"32_3","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT202","date":"2024-05-21"},{"flight_number":"HAT232","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null},{"action_id":"32_4","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT041","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OWZ4XL to economy.","Agent updates reservation OWZ4XL to flight HAT041."]}}
+{"id":"airline_task_33","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day).\n\n\tYou also want to move back your return from SFO by one day.\nKnown info:\n\tYou are Yara Garcia.\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tOnly after you have been able to make the modifications to your flights, you suddenly decide that you'd also like to change upgrade your ticket to business class and add 2 checked bags. \n\n\tYou are willing to pay up to $200 for that. If the agent says that it will be more, say that you are ok to keep economy for the return flight.\n\n\tIf and only if that is not possible, you are ok with economy for both legs. 
But you do want to add the 2 bags.\n\n\tYou are ok with paying for it using the original form of payment."},"evaluation_criteria":{"actions":[{"action_id":"33_0","name":"get_reservation_details","arguments":{"reservation_id":"HXDUBJ"},"info":null},{"action_id":"33_1","name":"search_direct_flight","arguments":{"origin":"IAH","destination":"SFO","date":"2024-05-19"},"info":null},{"action_id":"33_2","name":"search_direct_flight","arguments":{"origin":"SFO","destination":"IAH","date":"2024-05-21"},"info":null},{"action_id":"33_3","name":"update_reservation_flights","arguments":{"reservation_id":"HXDUBJ","cabin":"economy","flights":[{"flight_number":"HAT072","date":"2024-05-19"},{"flight_number":"HAT278","date":"2024-05-23"}],"payment_id":"gift_card_6941833"},"info":null},{"action_id":"33_4","name":"update_reservation_baggages","arguments":{"reservation_id":"HXDUBJ","total_baggages":2,"nonfree_baggages":2,"payment_id":"gift_card_6941833"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation HXDUBJ to flights HAT072 on 2024-05-19 and HAT278 on 2024-05-23.","Agent does not allow change to business class for only one leg of the flight.","Agent add 2 non-free baggages to reservation HXDUBJ."]}}
+{"id":"airline_task_34","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day). \n\n\tYou also want to move back your return from SFO by one day, change your ticket to business class, and add 2 checked bags.\nKnown info:\n\tYou are Yara Garcia.\n\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tIf the total costs for all your changes is above your budget of $200, don't make any changes."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should not make any changes."]}}
+{"id":"airline_task_35","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to first cancel your upcoming flight on May 22 from JFK to MCO.\n\n\tYou also want to book a new flight from JFK to SFO on May 24.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tInsist that you are a silver member, hence must get full refund.\n\n\tYou absolutely do not want to be transferred to a human agent.\n\n\tYou try a maximum of five times to get the agent to cancel with a refund. If the agent continues to refuse, you move on.\n\n\tYou now want to book a new flight from JFK to SFO on May 24.\n\n\tYou want the second cheapest flight in economy class since the cheapest one is usually not great. \n\n\tYou don't need any baggage or insurance.\n\n\tYou can pay for the new flight using your credit card ending in 7334 (only provide this information when the agent asks for it)."},"evaluation_criteria":{"actions":[{"action_id":"35_0","name":"book_reservation","arguments":{"user_id":"aarav_ahmed_6699","origin":"JFK","destination":"SFO","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT069","date":"2024-05-24"},{"flight_number":"HAT258","date":"2024-05-24"}],"passengers":[{"first_name":"Aarav","last_name":"Ahmed","dob":"1985-04-04"}],"payment_methods":[{"payment_id":"credit_card_9074831","amount":290}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel the reservation since this is not allowed.","Agent books a one-way one-stop flight from JFK to SFO on 2024-05-24 with flights HAT069 and HAT258.","Agent charges $290 to credit card credit_card_9074831"]}}
+{"id":"airline_task_36","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the date of a flight in reservation EUJUY6. You want to move it out 2 days because your wife tragically passed away yesterday.\nKnown info:\n\tYou are Lucas Brown.\n\tYour user id is lucas_brown_4047.\nTask instructions:\n\tYou are extremely distraught. You do not want to cancel the flight, just change the date. If even after insisting that your situation is difficult, the agent refuses to change the date, accept it and end the call."},"evaluation_criteria":{"actions":[{"action_id":"36_0","name":"get_reservation_details","arguments":{"reservation_id":"EUJUY6"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not change the flight."]}}
+{"id":"airline_task_37","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and upgrade a third (M20IZO) to business class.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tYou want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes."},"evaluation_criteria":{"actions":[{"action_id":"37_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"37_1","name":"get_reservation_details","arguments":{"reservation_id":"M20IZO"},"info":null},{"action_id":"37_2","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-22"},"info":null},{"action_id":"37_3","name":"search_direct_flight","arguments":{"origin":"ATL","destination":"MCO","date":"2024-05-22"},"info":null},{"action_id":"37_4","name":"update_reservation_flights","arguments":{"reservation_id":"M20IZO","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-22"},{"flight_number":"HAT010","date":"2024-05-22"}],"payment_id":"credit_card_9074831"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation NQNU5R since it's business.","Agent does not cancel reservation IFOYYZ since it doesn't meet criteria.","Agent upgrades M20IZO to business class."]}}
+{"id":"airline_task_38","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou contact customer service because you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tIf the service agent asks for the reservation, say that it is the last reservation but don't remember what it was.\n\n\tIf the agent doesn't offer it, you ask for compensation.\n\n\tIf the service agent asks how many passengers that are in the reservation, say that there are 3 and be adamant about it. This is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tAfter asserting it 2 times, admit that you may have made a mistake."},"evaluation_criteria":{"actions":[{"action_id":"38_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"38_1","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"38_2","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not offer compensation before the user asks.","Agent verifies that the flight was delayed.","Agent verifies the number of passengers in the reservation.","Agent offers a $50 certificate."]}}
+{"id":"airline_task_39","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights.\nKnown info:\n\tYou are Amelia Davis.\n\n\tYour user id is amelia_davis_8890.\n\n\n\tYou are French by birth and your English is not perfect.  \n\tYou occasionally insert French words when you communicate.\nUnknown info:\n\tYou don't know any of your reservation ids.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"39_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"39_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"39_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"39_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"39_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"39_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null},{"action_id":"39_8","name":"cancel_reservation","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_9","name":"cancel_reservation","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_10","name":"cancel_reservation","arguments":{"reservation_id":"MSJ4OA"},"info":null}],"communicate_info":[],"nl_assertion
s":["Agent cancels reservation 8C8K4E.","Agent cancels reservation LU15PA.","Agent cancels reservation MSJ4OA.","Agent does not cancel  any other reservation."]}}
+{"id":"airline_task_40","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight  and you want to change the passenger name on the reservation.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is  anya_garcia_5901.\n\n\tYour reservation id is 3RK2T9.\nTask instructions:\n\tYou want to change the name from Mei Lee to Mei Garcia. \n\n\tBe insistent and don't provide more information than necessary."},"evaluation_criteria":{"actions":[{"action_id":"40_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null},{"action_id":"40_1","name":"update_reservation_passengers","arguments":{"reservation_id":"3RK2T9","passengers":[{"first_name":"Anya","last_name":"Garcia","dob":"1992-11-12"},{"first_name":"Mei","last_name":"Garcia","dob":"1989-12-13"}]},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation 3RK2T9 to passenger Mei Garcia."]}}
+{"id":"airline_task_41","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights that only have one passenger on the reservation.\nKnown info:\n\tYou are Amelia Davis.\n\tYour user id is amelia_davis_8890.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"41_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"41_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"41_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"41_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"41_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"41_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"41_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"41_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null}],"communicate_info":[],"nl_assertions":["Agent checks all reservations.","Agent does not cancel any reservation."]}}
+{"id":"airline_task_42","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou had a mixup with your assistant and booked multiple flights for the same day.\nKnown info:\n\tYou are Sophia Martin.\n\tYour user id is sophia_martin_4574.\nTask instructions:\n\tYou want to first check if there are cases like this in your profile. You want the agent to fix the situation for you. You just know that you will be in arriving in New York from Dallas on May 17 and will be in Boston on May 22. You want to let the agent figure out which flights should be cancelled. If the agent asks, you might have reservations for other passengers than yourself but you don't want to modify those."},"evaluation_criteria":{"actions":[{"action_id":"42_0","name":"get_user_details","arguments":{"user_id":"sophia_martin_4574"},"info":null},{"action_id":"42_1","name":"get_reservation_details","arguments":{"reservation_id":"MFRB94"},"info":null},{"action_id":"42_2","name":"get_reservation_details","arguments":{"reservation_id":"PUNERT"},"info":null},{"action_id":"42_3","name":"get_reservation_details","arguments":{"reservation_id":"HSR97W"},"info":null},{"action_id":"42_4","name":"get_reservation_details","arguments":{"reservation_id":"SE9KEL"},"info":null},{"action_id":"42_5","name":"get_reservation_details","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_6","name":"get_reservation_details","arguments":{"reservation_id":"HTR26G"},"info":null},{"action_id":"42_7","name":"get_reservation_details","arguments":{"reservation_id":"5BGGWZ"},"info":null},{"action_id":"42_8","name":"cancel_reservation","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_9","name":"cancel_reservation","arguments":{"reservation_id":"HSR97W"},"info":null}],"communicate_info":[],"nl_assertions"
:["Agent cancels reservation FDZ0T5","Agent cancels reservation HSR97W"]}}
+{"id":"airline_task_43","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer support because you have booked two flights for the same day.\nKnown info:\n\tYou are Mohamed Hernandez.\n\tYour user id is mohamed_hernandez_5188.\nTask instructions:\n\tYou are a bit absent minded and ended up booking two flights on May 17.\n\n\tYou want to cancel the one from ATL to JFK.\n\n\tIf and only if the agent says it is not possible, insist that you are a silver member and therefore should get priority treatment.\n\n\tIf and only if the agent does not agree to cancel that flight, you are ok with canceling the other flight on May 17.\n\n\tOtherwise, just thank the agent and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"43_0","name":"get_user_details","arguments":{"user_id":"mohamed_hernandez_5188"},"info":null},{"action_id":"43_1","name":"get_reservation_details","arguments":{"reservation_id":"35V5SM"},"info":null},{"action_id":"43_2","name":"get_reservation_details","arguments":{"reservation_id":"XXDC1M"},"info":null},{"action_id":"43_3","name":"get_reservation_details","arguments":{"reservation_id":"V5EMZH"},"info":null},{"action_id":"43_4","name":"get_reservation_details","arguments":{"reservation_id":"D1EW9B"},"info":null},{"action_id":"43_5","name":"get_reservation_details","arguments":{"reservation_id":"9HBUV8"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation 9HBUV8 since it does not meet requirements.","Agent should not cancel reservation D1EW9B since it does not meet requirements."]}}
+{"id":"airline_task_44","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all your future reservations that contain any flights that are longer than 4 hours. \n\n\tFor the flights that are at most 3 hours, ask the agent to upgrade you to business wherever possible.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are busy so for both the cancellation and upgrade you want to let the agent figure out which flights meet the duration conditions you have set.\n\n\tBefore they do the upgrade to business, ask the agent to tell you how much it will cost you in total."},"evaluation_criteria":{"actions":[{"action_id":"44_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"44_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"44_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"44_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"44_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"44_6","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-25"},"info":null},{"action_id":"44_7","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"MSP","date":"2024-05-27"},"info":null},{"action_id":"44_8","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-21"},"info":null},{"action_id":"44_9","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"C
LT","date":"2024-05-21"},"info":null},{"action_id":"44_10","name":"search_direct_flight","arguments":{"origin":"LAX","destination":"EWR","date":"2024-05-23"},"info":null},{"action_id":"44_11","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-24"},"info":null},{"action_id":"44_12","name":"search_direct_flight","arguments":{"origin":"CLT","destination":"EWR","date":"2024-05-24"},"info":null},{"action_id":"44_13","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-25"},"info":null},{"action_id":"44_14","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-24"},"info":null},{"action_id":"44_15","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-10"},"info":null},{"action_id":"44_16","name":"cancel_reservation","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_17","name":"update_reservation_flights","arguments":{"reservation_id":"NM1VX1","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-25"},{"flight_number":"HAT208","date":"2024-05-27"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_18","name":"update_reservation_flights","arguments":{"reservation_id":"H8Q05L","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-24"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_19","name":"update_reservation_flights","arguments":{"reservation_id":"KC18K6","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-21"},{"flight_number":"HAT215","date":"2024-05-21"}],"payment_id":"credit_card_4196779"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation S61CZX.","The total cost that the. agent mentions is between $1380 and $1390.","Agent upgrades NM1VX1 to business.","Agent upgrades H8Q05L to business.","Agent updates KC18K6 to business."]}}
+{"id":"airline_task_45","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to cancel your flight as soon as possible because of a family emergency.\nKnown info:\n\tYou are Sophia Taylor.\n\n\tYour user id is sophia_taylor_9065. \n\n\tYour reservation number is PEP4E0.\nTask instructions:\n\tBe insistent that you want full refund given that it was a family emergency, sound upset and under no circumstances you want to get transferred to another agent. \n\n\tIf you can't get a refund, try to change the flight to May 22nd. \n\n\tIf that doesn't work, try to add insurance to the flight, be insistent"},"evaluation_criteria":{"actions":[{"action_id":"45_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation PEP4E0.","Agent should not change PEP4E0 flight."]}}
+{"id":"airline_task_46","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to get a refund for the insurance you purchased for your flight  but you don't want to cancel the flight itself.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tReservation confirmation number is H8Q05L.\nTask instructions:\n\tYou are not happy with the service you received and you want to cancel the insurance and get a full refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel insurance or offer a refund."]}}
+{"id":"airline_task_47","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flight  because the flight coincides with your best friend's birthday.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tConfirmation number: H8Q05L\nTask instructions:\n\tBe persistent and don't provide more information than necessary. \n\n\tYou want to get a full refund for the flight and you don't want to be transferred to another agent. You do not want to cancel the flight if you cannot get the full refund. If the agent continues to refuse after you have insisted 5 times, end the call."},"evaluation_criteria":{"actions":[{"action_id":"47_0","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel H8Q05L."]}}
+{"id":"airline_task_48","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight this morning by mistake and you want to cancel it.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\n\n\tYou booked the flight about 10 hours ago.\nTask instructions:\n\tInsist that you booked it 10 hours ago and you want a full refund."},"evaluation_criteria":{"actions":[{"action_id":"48_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
+{"id":"airline_task_49","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you also purchased insurance for it. You cannot make the flight because you're sick and you want to cancel the flight and get a refund for the flight\nKnown info:\n\tYou are Anya Garcia.\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\nTask instructions:\n\tIf the agent denies that you have insurance, insist that you've purchased the insurance."},"evaluation_criteria":{"actions":[{"action_id":"49_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index a0146f60..7628cc2e 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -70,7 +70,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     threshold_of_success=0.4,
     num_runs=1,
     mode="pointwise",
-    max_concurrent_rollouts=16,
+    max_concurrent_rollouts=50,
     server_script_path="examples/tau2_mcp/server.py",
 )
 def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
@@ -144,18 +144,18 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
         task=task,
         full_trajectory=trajectory_objects,
     )
-    action_reward_info = ActionEvaluator.calculate_reward(
-        task=task,
-        full_trajectory=trajectory_objects,
-    )
+    # action_reward_info = ActionEvaluator.calculate_reward(
+    #     task=task,
+    #     full_trajectory=trajectory_objects,
+    # )
     communicate_reward_info = CommunicateEvaluator.calculate_reward(
         task=task,
         full_trajectory=trajectory_objects,
     )
-    nl_reward_info = NLAssertionsEvaluator.calculate_reward(
-        task=task,
-        full_trajectory=trajectory_objects,
-    )
+    # nl_reward_info = NLAssertionsEvaluator.calculate_reward(
+    #     task=task,
+    #     full_trajectory=trajectory_objects,
+    # )
 
     reward = 1.0
     env_bases = {RewardType.DB, RewardType.ENV_ASSERTION}
@@ -169,14 +169,14 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
         if env_reward_info.reward_breakdown is not None:
             reward_breakdown.update(env_reward_info.reward_breakdown)
         reward *= env_reward_info.reward
-    if task_reward_basis & action_bases:
-        if action_reward_info.reward_breakdown is not None:
-            reward_breakdown.update(action_reward_info.reward_breakdown)
-        reward *= action_reward_info.reward
-    if task_reward_basis & nl_bases:
-        if nl_reward_info.reward_breakdown is not None:
-            reward_breakdown.update(nl_reward_info.reward_breakdown)
-        reward *= nl_reward_info.reward
+    # if task_reward_basis & action_bases:
+    #     if action_reward_info.reward_breakdown is not None:
+    #         reward_breakdown.update(action_reward_info.reward_breakdown)
+    #     reward *= action_reward_info.reward
+    # if task_reward_basis & nl_bases:
+    #     if nl_reward_info.reward_breakdown is not None:
+    #         reward_breakdown.update(nl_reward_info.reward_breakdown)
+    #     reward *= nl_reward_info.reward
     if task_reward_basis & comm_bases:
         if communicate_reward_info.reward_breakdown is not None:
             reward_breakdown.update(communicate_reward_info.reward_breakdown)
@@ -188,27 +188,27 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
     if task_reward_basis & env_bases and env_reward_info.reward == 0:
         failed_reasons.append("❌ Environment/DB check failed")
 
-    if task_reward_basis & action_bases and action_reward_info.reward == 0:
-        failed_actions = []
-        if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
-            failed_actions = [
-                f"{ac.action.name}({ac.action.arguments})"
-                for ac in action_reward_info.action_checks
-                if not ac.action_match
-            ]
-        if failed_actions:
-            failed_reasons.append(f"❌ Failed actions: {failed_actions}")
-        else:
-            failed_reasons.append("❌ Actions failed")
-
-    if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
-        failed_nl = []
-        if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
-            failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
-        if failed_nl:
-            failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}")
-        else:
-            failed_reasons.append("❌ NL Assertions failed")
+    # if task_reward_basis & action_bases and action_reward_info.reward == 0:
+    #     failed_actions = []
+    #     if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
+    #         failed_actions = [
+    #             f"{ac.action.name}({ac.action.arguments})"
+    #             for ac in action_reward_info.action_checks
+    #             if not ac.action_match
+    #         ]
+    #     if failed_actions:
+    #         failed_reasons.append(f"❌ Failed actions: {failed_actions}")
+    #     else:
+    #         failed_reasons.append("❌ Actions failed")
+
+    # if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
+    #     failed_nl = []
+    #     if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
+    #         failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
+    #     if failed_nl:
+    #         failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}")
+    #     else:
+    #         failed_reasons.append("❌ NL Assertions failed")
 
     if task_reward_basis & comm_bases and communicate_reward_info.reward == 0:
         failed_comm = []

From f2de326a038b2d156f0b3d211232061872d0c7be Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Thu, 7 Aug 2025 15:09:46 -0700
Subject: [PATCH 02/14] add error msg

---
 .../pytest/default_mcp_gym_rollout_processor.py       | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index be171d8a..c713e4d9 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -42,6 +42,17 @@ def start(self) -> None:
         if self.process:
             return
 
+        try:
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.settimeout(1)
+                result = s.connect_ex(("localhost", self.port))
+                if result == 0:
+                    raise RuntimeError(
+                        f"Port {self.port} is already in use! Please use a different port or kill the process using it."
+                    )
+        except socket.error:
+            pass
+
         # Set environment for server
         env = os.environ.copy()
         env["PORT"] = str(self.port)

From 8f4557bb36db9849dddefedcee3837e5b1d0c6fc Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Fri, 8 Aug 2025 05:26:23 +0000
Subject: [PATCH 03/14] current

---
 eval_protocol/mcp/mcpgym.py   | 182 +++++++++++++++++++++++-----------
 examples/tau2_mcp/tau2_mcp.py |   4 +-
 pyproject.toml                |   1 +
 uv.lock                       |   2 +
 4 files changed, 128 insertions(+), 61 deletions(-)

diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py
index 2bfa7d7b..59de7a5d 100644
--- a/eval_protocol/mcp/mcpgym.py
+++ b/eval_protocol/mcp/mcpgym.py
@@ -19,11 +19,14 @@
 import logging
 import os
 import threading
+import time
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Optional, Tuple
 
 import uvicorn
-from mcp.server.fastmcp import Context, FastMCP
+
+# from mcp.server.fastmcp import Context, FastMCP
+from fastmcp import Context, FastMCP
 from starlette.requests import Request
 from starlette.responses import JSONResponse
 from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
@@ -87,11 +90,11 @@ def __init__(self, server_name: str, adapter: EnvironmentAdapter, seed: Optional
         self.adapter = adapter
 
         # Create FastMCP server
-        self.mcp = FastMCP(
-            server_name,
-            host="0.0.0.0",
-            port=int(os.environ.get("PORT", 8000)),
-        )
+        self.mcp = FastMCP(name=server_name)
+
+        # Store host and port for later use in run() method
+        self.host = "0.0.0.0"
+        self.port = int(os.environ.get("PORT", 8000))
 
         # Multi-session support
         self.sessions = {}  # session_id -> {"env": env, "obs": obs, "session_data": data}
@@ -117,6 +120,7 @@ def __init__(self, server_name: str, adapter: EnvironmentAdapter, seed: Optional
         self._register_tools()
         self._discover_and_register_control_plane_endpoints()
         self._register_session_reset_endpoint()
+        # self._register_health_check_endpoint()
 
     def _get_session_id(self, ctx: Context) -> str:
         """
@@ -184,6 +188,7 @@ def _get_or_create_session(self, ctx: Context) -> Dict[str, Any]:
         """
         session_id = self._get_session_id(ctx)
         print(f"🔍 _get_or_create_session: session_id: {session_id}")
+        return self.sessions[session_id]
 
         with self.session_lock:
             if session_id not in self.sessions:
@@ -238,18 +243,56 @@ async def reset_session_endpoint(request: Request) -> JSONResponse:
             print(f"🔍 _register_session_reset_endpoint: Resetting session, session_id: {session_id}, seed: {seed}")
             if not session_id:
                 return JSONResponse({"error": "Missing mcp-session-id header"}, status_code=400)
-            with self.session_lock:
-                if session_id in self.sessions:
-                    env, obs, _ = self._new_env(seed=seed)
-                    self.sessions[session_id] = {
-                        "env": env,
-                        "obs": obs,
-                        "session_data": {},
-                        "session_id": session_id,
-                    }
-                    print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
+            # with self.session_lock:
+            #    if session_id in self.sessions:
+            #        env, obs, _ = self._new_env(seed=seed)
+            #        self.sessions[session_id] = {
+            #            "env": env,
+            #            "obs": obs,
+            #            "session_data": {},
+            #            "session_id": session_id,
+            #        }
+            #        print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
             return JSONResponse({"message": "Session reset successfully"})
 
+    # def _register_health_check_endpoint(self):
+    #     """Register a simple health check endpoint for diagnostics."""
+
+    #     @self.mcp.custom_route("/health", methods=["GET"])
+    #     async def health_check_endpoint(request: Request) -> JSONResponse:
+    #         """Simple health check that returns immediately."""
+    #         return JSONResponse({"ok": True, "timestamp": time.time()})
+
+    # def _add_timing_middleware(self, starlette_app):
+    #     """Add ASGI middleware to log request arrival times."""
+
+    #     class TimingMiddleware:
+    #         def __init__(self, app):
+    #             self.app = app
+
+    #         async def __call__(self, scope, receive, send):
+    #             if scope["type"] != "http":
+    #                 await self.app(scope, receive, send)
+    #                 return
+
+    #             # Log immediately when request arrives at server
+    #             start_time = time.time()
+    #             path = scope.get("path", "")
+    #             method = scope.get("method", "")
+
+    #             print(f"🚀 REQUEST ARRIVED: {method} {path} at {start_time}")
+
+    #             async def send_wrapper(message):
+    #                 if message["type"] == "http.response.start":
+    #                     # Log completion time for comparison
+    #                     end_time = time.time()
+    #                     if path in ["/health", "/control/initial_state"]:
+    #                         print(f"✅ REQUEST took: {end_time - start_time:.3f}s")
+    #                 await send(message)
+
+    #             await self.app(scope, receive, send_wrapper)
+    #     starlette_app.add_middleware(TimingMiddleware)
+
     def _discover_and_register_control_plane_endpoints(self):
         """
         Discover and register control plane endpoints on the subclass instance.
@@ -271,6 +314,9 @@ def _discover_and_register_control_plane_endpoints(self):
             # Create session-aware handler for this endpoint
             def create_endpoint_handler(func: Callable):
                 async def endpoint_handler(request: Request) -> JSONResponse:
+
+                    if func.__name__ == "get_initial_state_endpoint":
+                        logger.info(f"===== starting to handle endpoint: {func.__name__}, time: {time.time()}")
                     try:
                         # Extract session ID from request headers (similar to StreamableHTTP pattern)
                         session_id = request.headers.get("mcp-session-id")
@@ -284,31 +330,43 @@ async def endpoint_handler(request: Request) -> JSONResponse:
                         with self.session_lock:
                             session_data = self.sessions.get(session_id)
                             if not session_data:
-                                # For initial state endpoint, we need to create the session
-                                # based on the session ID and available information
-                                if func.__name__ == "get_initial_state_endpoint":
-                                    env, obs, info = self._new_env(seed=None)
-                                    # Initialize session state with extracted seed from session ID
-                                    session_data = {
-                                        "env": env,
-                                        "obs": obs,
-                                        "session_data": {},  # Subclasses can store additional data here
-                                        "session_id": session_id,
-                                    }
-                                    # Store the session
-                                    self.sessions[session_id] = session_data
-                                else:
-                                    return JSONResponse(
-                                        {"error": f"Session {session_id} not found"},
-                                        status_code=404,
-                                    )
+                                # create a placeholder session data
+                                self.sessions[session_id] = {"placeholder": True}
+                        # For initial state endpoint, we need to create the session
+                        # based on the session ID and available information
+                        if func.__name__ == "get_initial_state_endpoint":
+                            env, obs, info = self._new_env(seed=None)
+                            # Initialize session state with extracted seed from session ID
+                            session_data = {
+                                "env": env,
+                                "obs": obs,
+                                "session_data": {},  # Subclasses can store additional data here
+                                "session_id": session_id,
+                            }
+                            # Store the session
+                            with self.session_lock:
+                                self.sessions[session_id] = session_data
+
+                        else:
+                            return JSONResponse(
+                                {"error": f"Session {session_id} not found"},
+                                status_code=404,
+                            )
 
                         # Call the endpoint function with session data
+                        method_start = time.time()
+                        if func.__name__ == "get_initial_state_endpoint":
+                            print(f"🎯 METHOD START: {func.__name__} at {method_start}")
+
                         if inspect.iscoroutinefunction(func):
                             result = await func(session_data=session_data)
                         else:
                             result = func(session_data=session_data)
 
+                        # method_end = time.time()
+                        # if func.__name__ == "get_initial_state_endpoint":
+                        #     print(f"🎯 METHOD END: {func.__name__} at {method_end} (took {method_end - method_start:.3f}s)")
+
                         return JSONResponse(result)
 
                     except Exception as e:
@@ -351,22 +409,25 @@ def _update_control_plane(self, reward: float, terminated: bool, truncated: bool
 
     def _get_or_create_session_control_plane(self, session_id: str) -> Dict[str, Any]:
         """Get or create control plane state for a specific session."""
-        with self.session_lock:
-            if session_id not in self.sessions:
-                return {}
-
-            session_data = self.sessions[session_id]
-            if "control_plane" not in session_data["session_data"]:
-                session_data["session_data"]["control_plane"] = {
-                    "reward": 0.0,
-                    "terminated": False,
-                    "truncated": False,
-                    "info": {},
-                    "step_count": 0,
-                    "total_reward": 0.0,
-                }
+        if session_id not in self.sessions:
+            raise Exception(f"Session {session_id} not found")
+
+        # with self.session_lock:
+        # if session_id not in self.sessions:
+        #    return {}
+
+        session_data = self.sessions[session_id]
+        if "control_plane" not in session_data["session_data"]:
+            session_data["session_data"]["control_plane"] = {
+                "reward": 0.0,
+                "terminated": False,
+                "truncated": False,
+                "info": {},
+                "step_count": 0,
+                "total_reward": 0.0,
+            }
 
-            return session_data["session_data"]["control_plane"]
+        return session_data["session_data"]["control_plane"]
 
     def _update_session_control_plane(
         self,
@@ -391,12 +452,12 @@ def _update_session_control_plane(
             f"🎛️  Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}, total_reward={control_plane['total_reward']}"
         )
 
-    def get_control_plane_state(self, session_id: str) -> Optional[Dict[str, Any]]:
-        """Get control plane state for a specific session (for rollout system)."""
-        with self.session_lock:
-            if session_id in self.sessions:
-                return self._get_or_create_session_control_plane(session_id).copy()
-            return None
+    # def get_control_plane_state(self, session_id: str) -> Optional[Dict[str, Any]]:
+    #    """Get control plane state for a specific session (for rollout system)."""
+    #    with self.session_lock:
+    #        if session_id in self.sessions:
+    #            return self._get_or_create_session_control_plane(session_id).copy()
+    #        return None
 
     def _execute_environment_step(self, action_int: int) -> Dict[str, Any]:
         """
@@ -507,6 +568,7 @@ def get_info_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
     @control_plane_endpoint("/control/initial_state")
     def get_initial_state_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
         """Get initial state for this session."""
+        print(f"🔍 STARTING get_initial_state_endpoint: {time.time()}")
         env = session_data.get("env")
         obs = session_data.get("obs")
 
@@ -593,14 +655,14 @@ async def run_with_high_concurrency():
 
                 config = uvicorn.Config(
                     starlette_app,
-                    host=self.mcp.settings.host,
-                    port=self.mcp.settings.port,
-                    log_level=self.mcp.settings.log_level.lower(),
+                    host=self.host,
+                    port=self.port,
+                    log_level="info",  # Use default log level instead of accessing settings
                     proxy_headers=True,
                     forwarded_allow_ips="*",
                     # HIGH CONCURRENCY SETTINGS
-                    limit_concurrency=200,  # Increase for HTTP endpoints + MCP
-                    limit_max_requests=100000,  # Higher request limit
+                    limit_concurrency=None,  # Increase for HTTP endpoints + MCP
+                    limit_max_requests=None,  # Higher request limit
                     timeout_keep_alive=120,  # Longer keep-alive for control plane
                     timeout_notify=180,
                     h11_max_incomplete_event_size=4 * 1024 * 1024,  # Handle larger events
diff --git a/examples/tau2_mcp/tau2_mcp.py b/examples/tau2_mcp/tau2_mcp.py
index 2cdd8291..b7dc401e 100644
--- a/examples/tau2_mcp/tau2_mcp.py
+++ b/examples/tau2_mcp/tau2_mcp.py
@@ -12,7 +12,9 @@
 from typing import Annotated, Any, Dict, List, Optional
 
 from airplane_environment.airline_environment import AirlineEnvironment
-from mcp.server.fastmcp import Context
+
+# from mcp.server.fastmcp import Context
+from fastmcp import Context
 from mock_environment.mock_environment import MockEnvironment
 from pydantic import Field
 from retail_environment.retail_environment import RetailEnvironment
diff --git a/pyproject.toml b/pyproject.toml
index 9d587cd5..fce8a237 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ dependencies = [
     "watchdog>=2.1.0",
     "websockets>=15.0.1",
     "fastapi>=0.116.1",
+    "fastmcp>=2.10.6",
 ]
 
 [project.urls]
diff --git a/uv.lock b/uv.lock
index 134e2ce4..662d4bbb 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1130,6 +1130,7 @@ dependencies = [
     { name = "deepdiff" },
     { name = "docstring-parser" },
     { name = "fastapi" },
+    { name = "fastmcp" },
     { name = "fsspec" },
     { name = "gymnasium" },
     { name = "httpx" },
@@ -1242,6 +1243,7 @@ requires-dist = [
     { name = "docstring-parser", specifier = ">=0.15" },
     { name = "e2b", marker = "extra == 'dev'" },
     { name = "fastapi", specifier = ">=0.116.1" },
+    { name = "fastmcp", specifier = ">=2.10.6" },
     { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" },
     { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" },
     { name = "fsspec" },

From 1165ff182669932f1a12906b2d0d9e768e3dfb62 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Fri, 8 Aug 2025 05:48:08 +0000
Subject: [PATCH 04/14] MINIMAL REPRO

---
 test_burst_client.py      | 169 ++++++++++++++++++++++++++++++++++++++
 test_simple_mcp_server.py |  51 ++++++++++++
 2 files changed, 220 insertions(+)
 create mode 100644 test_burst_client.py
 create mode 100644 test_simple_mcp_server.py

diff --git a/test_burst_client.py b/test_burst_client.py
new file mode 100644
index 00000000..507bfc40
--- /dev/null
+++ b/test_burst_client.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Burst Client Test - Simulates 50 threads calling envs.reset() -> get_initial_state
+Exact pattern: _execute_rollout() -> envs.reset() -> get_initial_state -> client.get()
+"""
+
+import asyncio
+import threading
+import time
+from typing import Any, Dict, List
+
+import httpx
+
+
+class EnvResetClient:
+    """
+    Simulates the exact pattern from your code:
+    50 threads -> _execute_rollout() -> envs.reset() -> get_initial_state -> client.get()
+    """
+
+    def __init__(self, base_url: str = "http://localhost:8000"):
+        self.base_url = base_url
+        self.initial_state_url = f"{base_url}/control/initial_state"
+
+    async def get_initial_state(self, thread_id: int) -> Dict[str, Any]:
+        """
+        Simulates the get_initial_state call from your McpGym code.
+        This is the slow HTTP call that happens during envs.reset().
+        """
+        headers = {"Content-Type": "application/json", "Accept": "application/json"}
+
+        start_time = time.time()
+
+        try:
+            async with httpx.AsyncClient(timeout=30.0) as client:
+                # This is the exact pattern from your code
+                initial_state_response = await client.get(
+                    self.initial_state_url,
+                    headers=headers,
+                    timeout=30.0,
+                )
+                initial_state_response.raise_for_status()
+                result = initial_state_response.json()
+
+                end_time = time.time()
+                duration = end_time - start_time
+
+                return {"thread_id": thread_id, "success": True, "duration": duration, "initial_state": result}
+
+        except Exception as e:
+            end_time = time.time()
+            duration = end_time - start_time
+            return {"thread_id": thread_id, "success": False, "duration": duration, "error": str(e)}
+
+    async def envs_reset(self, thread_id: int) -> Dict[str, Any]:
+        """
+        Simulates envs.reset() which internally calls get_initial_state.
+        This is what gets called from _execute_rollout().
+        """
+        print(f"🔄 Thread {thread_id}: envs.reset() called")
+
+        # This simulates the envs.reset() -> get_initial_state call chain
+        return await self.get_initial_state(thread_id)
+
+
+async def _execute_rollout(thread_id: int, client: EnvResetClient) -> Dict[str, Any]:
+    """
+    Simulates _execute_rollout() function that calls envs.reset().
+    This runs concurrently using asyncio, matching your actual pattern.
+    """
+    print(f"🚀 Rollout {thread_id}: _execute_rollout() started")
+
+    # This is where envs.reset() gets called
+    result = await client.envs_reset(thread_id)
+    return result
+
+
+async def run_burst_test(num_clients: int = 50, server_url: str = "http://localhost:8000"):
+    """
+    Run burst test simulating 50 concurrent _execute_rollout() calls.
+    Each one calls envs.reset() -> get_initial_state -> client.get()
+    """
+    print(f"🚀 Starting burst test with {num_clients} concurrent rollouts")
+    print(f"🎯 Target server: {server_url}")
+    print(f"📋 Pattern: _execute_rollout() -> envs.reset() -> get_initial_state -> client.get()")
+
+    client = EnvResetClient(server_url)
+
+    # Create tasks for concurrent rollouts (simulating your threading pattern)
+    start_time = time.time()
+    tasks = [_execute_rollout(i, client) for i in range(num_clients)]
+
+    # Run all rollouts concurrently
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    end_time = time.time()
+    total_duration = end_time - start_time
+
+    # Analyze results
+    successful = [r for r in results if isinstance(r, dict) and r.get("success")]
+    failed = [r for r in results if isinstance(r, dict) and not r.get("success")]
+    exceptions = [r for r in results if not isinstance(r, dict)]
+
+    print(f"\n📊 BURST TEST RESULTS:")
+    print(f"   Total rollouts: {num_clients}")
+    print(f"   Total time: {total_duration:.3f}s")
+    print(f"   Successful: {len(successful)}")
+    print(f"   Failed: {len(failed)}")
+    print(f"   Exceptions: {len(exceptions)}")
+
+    if successful:
+        avg_duration = sum(r["duration"] for r in successful) / len(successful)
+        min_duration = min(r["duration"] for r in successful)
+        max_duration = max(r["duration"] for r in successful)
+
+        print(f"   Average rollout duration: {avg_duration:.3f}s")
+        print(f"   Min rollout duration: {min_duration:.3f}s")
+        print(f"   Max rollout duration: {max_duration:.3f}s")
+
+        # Show sample successful result
+        sample = successful[0]
+        print(f"\n✅ Sample successful rollout:")
+        print(f"   Thread ID: {sample['thread_id']}")
+        print(f"   Initial state: {sample['initial_state']['observation']}")
+        print(f"   Timestamp: {sample['initial_state']['timestamp']}")
+
+    if failed:
+        print(f"\n❌ Sample failed rollouts:")
+        for fail in failed[:3]:  # Show first 3 failures
+            print(f"   Thread {fail['thread_id']}: {fail['error']}")
+
+    if exceptions:
+        print(f"\n💥 Sample exceptions:")
+        for exc in exceptions[:3]:  # Show first 3 exceptions
+            print(f"   {type(exc).__name__}: {exc}")
+
+    # Key test: If concurrent, should take ~1 second. If sequential, ~50 seconds.
+    if total_duration < 5:  # Allow some overhead
+        print(f"\n🎉 CONCURRENCY WORKING! Total time {total_duration:.3f}s (expected ~1s for concurrent)")
+    else:
+        print(f"\n⚠️  POSSIBLE SEQUENTIAL EXECUTION! Total time {total_duration:.3f}s (expected ~1s for concurrent)")
+
+    return len(successful) == num_clients
+
+
+def main():
+    """Run the burst test."""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Envs Reset Burst Test - Simulates 50 rollouts calling get_initial_state"
+    )
+    parser.add_argument("--rollouts", type=int, default=50, help="Number of concurrent rollouts")
+    parser.add_argument("--server", default="http://localhost:8000", help="Server URL")
+
+    args = parser.parse_args()
+
+    success = asyncio.run(run_burst_test(args.rollouts, args.server))
+
+    if success:
+        print(f"\n🎉 ALL {args.rollouts} ROLLOUTS SUCCESSFUL!")
+        exit(0)
+    else:
+        print(f"\n💥 SOME ROLLOUTS FAILED!")
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test_simple_mcp_server.py b/test_simple_mcp_server.py
new file mode 100644
index 00000000..c22920fa
--- /dev/null
+++ b/test_simple_mcp_server.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Simple MCP Server for Testing get_initial_state Concurrency
+Simulates the exact pattern: envs.reset() -> get_initial_state -> slow HTTP endpoint
+"""
+
+import asyncio
+import os
+import time
+
+from fastmcp import FastMCP
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+
+# Create a simple MCP server
+mcp = FastMCP(name="TestServer")
+
+
+@mcp.custom_route("/control/initial_state", methods=["GET"])
+async def get_initial_state_endpoint(request: Request) -> JSONResponse:
+    """
+    Simulate the get_initial_state endpoint that's slow.
+    This mimics the pattern in your McpGym code.
+    """
+    print(f"🔍 get_initial_state called at {time.time()}")
+
+    # Simulate the slow operation (like environment initialization)
+    time.sleep(1)  # 1 second delay to test concurrency
+
+    # Return a dummy initial state
+    return JSONResponse({"observation": "dummy_initial_state", "session_id": "test_session", "timestamp": time.time()})
+
+
+@mcp.tool
+def dummy_tool() -> str:
+    """Dummy tool for MCP compatibility."""
+    return "dummy"
+
+
+def main():
+    """Run the test server."""
+    port = int(os.environ.get("PORT", 8000))
+    print(f"🚀 Starting get_initial_state test server on port {port}")
+    print(f"📡 Endpoint: http://localhost:{port}/control/initial_state")
+
+    # Use FastMCP 2.0 run method with streamable-http transport
+    mcp.run(transport="http", host="0.0.0.0", port=port)
+
+
+if __name__ == "__main__":
+    main()

From 073d99aca5e009c94a2fe521cf656087fab5a0bb Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Sun, 10 Aug 2025 04:02:36 +0000
Subject: [PATCH 05/14] run on local to double check

---
 eval_protocol/mcp/client/connection.py        | 186 ++++++-------
 eval_protocol/mcp/execution/manager.py        |  24 +-
 eval_protocol/mcp/mcpgym.py                   | 248 ++++++++----------
 examples/blackjack_mcp/blackjack_mcp.py       |   4 +-
 .../cliff_walking_mcp/cliff_walking_mcp.py    |   4 +-
 examples/frozen_lake_mcp/frozen_lake_mcp.py   |   6 +-
 examples/lunar_lander_mcp/lunar_lander_mcp.py |   4 +-
 .../airline_environment.py                    |  11 +-
 examples/tau2_mcp/server.py                   |   5 +-
 examples/tau2_mcp/tau2_mcp.py                 |  12 +-
 vendor/tau2/agent/llm_agent.py                |  58 ++--
 vendor/tau2/user/user_simulator.py            |  14 +-
 vendor/tau2/utils/llm_utils.py                |   6 +-
 13 files changed, 269 insertions(+), 313 deletions(-)

diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
index e72fdc9f..12370019 100644
--- a/eval_protocol/mcp/client/connection.py
+++ b/eval_protocol/mcp/client/connection.py
@@ -9,6 +9,7 @@
 import hashlib
 import json
 import logging
+import time
 from contextlib import AsyncExitStack
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -27,9 +28,6 @@ class MCPConnectionManager:
     def __init__(self):
         self._tools_cache: Dict[str, List[Dict]] = {}
         self._tools_cache_lock = asyncio.Lock()
-        # Shared HTTP client for control plane requests with high connection limits
-        self._shared_client: Optional[httpx.AsyncClient] = None
-        self._client_lock = asyncio.Lock()
 
     async def initialize_session(self, session: MCPSession) -> None:
         """
@@ -147,8 +145,6 @@ async def reset_session(self, session: MCPSession) -> None:
         """
         Clean session data in remote mcp server for the given session
         """
-        import httpx
-
         base_url = session.base_url.rstrip("/").removesuffix("/mcp")
         url = f"{base_url}/control/reset_session"
 
@@ -177,16 +173,23 @@ async def discover_tools(self, session: MCPSession) -> List[Dict]:
 
         cache_key = session.base_url
 
-        # Check cache first (should be pre-warmed during initialization)
+        # Fast path: Check cache first without lock (safe for reads)
+        if cache_key in self._tools_cache:
+            cached_tools = self._tools_cache[cache_key]
+            logger.debug(f"Using cached tools for session {session.session_id} ({len(cached_tools)} tools)")
+            return cached_tools
+
+        # Slow path: Cache miss - use lock only for writing
         async with self._tools_cache_lock:
+            # Double-check pattern: another task might have cached it while we waited
             if cache_key in self._tools_cache:
                 cached_tools = self._tools_cache[cache_key]
                 logger.debug(f"Using cached tools for session {session.session_id} ({len(cached_tools)} tools)")
                 return cached_tools
 
-        # Fallback: if cache miss (shouldn't happen with pre-warming), fetch directly
-        logger.warning(f"Cache miss for {cache_key} - this shouldn't happen with pre-warming")
-        mcp_session = session._mcp_session
+            # Fallback: if cache miss (shouldn't happen with pre-warming), fetch directly
+            logger.warning(f"Cache miss for {cache_key} - this shouldn't happen with pre-warming")
+            mcp_session = session._mcp_session
 
         tools_response = await mcp_session.list_tools()
         tools = tools_response.tools if hasattr(tools_response, "tools") else []
@@ -233,74 +236,118 @@ async def get_initial_state(self, session: MCPSession) -> Any:
         Returns:
             Initial observation/state
         """
+        method_start = time.time()
+        session_id_short = session.session_id[:8] if len(session.session_id) > 8 else session.session_id
+        logger.info(f"### 🌟 GET_INITIAL_STATE_START: timestamp: {method_start}, session_id: {session_id_short}...")
+
         if not session._mcp_session:
+            logger.error(f"### ❌ SESSION_NOT_INITIALIZED: session_id: {session_id_short}")
             raise RuntimeError("Session not initialized")
 
         # Try to get initial state from control plane endpoint first
         initial_observation = None
 
         try:
-            import httpx
-
             # Extract base URL and session ID from the MCP session
+            url_extract_start = time.time()
+            logger.info(
+                f"### 🔍 URL_EXTRACT_START: timestamp: {url_extract_start}, elapsed: {url_extract_start - method_start:.6f}s, session_id: {session_id_short}..."
+            )
+
             base_url = session.base_url.rstrip("/").removesuffix("/mcp")
             session_id = session.session_id
 
+            url_extract_end = time.time()
+            logger.info(
+                f"### 🔍 URL_EXTRACT_END: timestamp: {url_extract_end}, elapsed: {url_extract_end - method_start:.6f}s, duration: {url_extract_end - url_extract_start:.6f}s, base_url: {base_url}, session_id: {session_id_short}..."
+            )
+
             if session_id:
+                headers_start = time.time()
+                logger.info(
+                    f"### 🔍 HEADERS_CREATE_START: timestamp: {headers_start}, elapsed: {headers_start - method_start:.6f}s, session_id: {session_id_short}..."
+                )
+
                 headers = {"mcp-session-id": session_id}
 
+                headers_end = time.time()
+                logger.info(
+                    f"### 🔍 HEADERS_CREATE_END: timestamp: {headers_end}, elapsed: {headers_end - method_start:.6f}s, duration: {headers_end - headers_start:.6f}s, session_id: {session_id_short}..."
+                )
+
                 # Query initial state endpoint
                 try:
+                    timeout_start = time.time()
+                    logger.info(
+                        f"### 🔍 TIMEOUT_CONFIG_START: timestamp: {timeout_start}, elapsed: {timeout_start - method_start:.6f}s, session_id: {session_id_short}..."
+                    )
+
                     # Use shorter timeout for playback mode, longer timeout for high-concurrency initialization
                     # (50+ concurrent sessions need more time for initial state setup)
                     timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
 
-                    # TIMING: Get shared client
-                    client_start = __import__("time").time()
-                    client = await self._get_shared_client(timeout)
-                    client_time = __import__("time").time() - client_start
+                    timeout_end = time.time()
                     logger.info(
-                        f"DEBUG_CLIENT: Getting shared client took {client_time:.3f}s for {session.session_id}"
+                        f"### 🔍 TIMEOUT_CONFIG_END: timestamp: {timeout_end}, elapsed: {timeout_end - method_start:.6f}s, duration: {timeout_end - timeout_start:.6f}s, timeout: {timeout}s, session_id: {session_id_short}..."
                     )
 
+                    # TIMING: Get shared client
+                    # client = await self._get_shared_client(timeout)
+
                     # TIMING: HTTP request with shared client
-                    request_start = __import__("time").time()
-                    initial_state_response = await client.get(
-                        f"{base_url}/control/initial_state",
-                        headers=headers,
-                        timeout=timeout,
+                    request_start = time.time()
+                    logger.info(
+                        f"### 🌐 HTTP_REQUEST_START: timestamp: {request_start}, elapsed: {request_start - method_start:.6f}s, url: {base_url}/control/initial_state, session_id: {session_id_short}..."
                     )
-                    request_time = __import__("time").time() - request_start
-                    logger.info(f"DEBUG_REQUEST: HTTP request took {request_time:.3f}s for {session.session_id}")
-                    if initial_state_response.status_code == 200:
-                        initial_observation = initial_state_response.json()
-                        logger.info(
-                            f"Session {session.session_id}: ✅ Successfully fetched session-aware initial state from control plane endpoint"
+
+                    timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
+
+                    async with httpx.AsyncClient(timeout=timeout) as client:
+                        initial_state_response = await client.get(
+                            f"{base_url}/control/initial_state",
+                            headers=headers,
+                            timeout=timeout,
                         )
-                    else:
-                        logger.warning(
-                            f"Control plane initial state endpoint returned {initial_state_response.status_code}"
+                        request_time = time.time() - request_start
+
+                        request_end = time.time()
+                        logger.info(
+                            f"### 🌐 HTTP_REQUEST_END: timestamp: {request_end}, elapsed: {request_end - method_start:.6f}s, duration: {request_time:.6f}s, status_code: {initial_state_response.status_code}, session_id: {session_id_short}..."
                         )
+
+                        if initial_state_response.status_code == 200:
+                            initial_observation = initial_state_response.json()
+                            success_end = time.time()
+                            logger.info(
+                                f"### ✅ RETURN: timestamp: {success_end}, total_duration: {success_end - method_start:.6f}s, session_id: {session_id_short}..."
+                            )
+                            # return initial_observation
+                        else:
+                            error_time = time.time()
+                            logger.warning(
+                                f"### ⚠️ HTTP_ERROR_RESPONSE: timestamp: {error_time}, elapsed: {error_time - method_start:.6f}s, status_code: {initial_state_response.status_code}, session_id: {session_id_short}"
+                            )
                 except httpx.TimeoutException:
-                    logger.warning(f"Control plane initial state endpoint timed out after {timeout}s")
+                    timeout_error_time = time.time()
+                    logger.warning(
+                        f"### ⏰ HTTP_TIMEOUT: timestamp: {timeout_error_time}, elapsed: {timeout_error_time - method_start:.6f}s, timeout: {timeout}s, session_id: {session_id_short}"
+                    )
                 except Exception as e:
-                    logger.warning(f"Failed to query initial state endpoint: {e}")
+                    http_error_time = time.time()
+                    logger.warning(
+                        f"### ❌ HTTP_ERROR: timestamp: {http_error_time}, elapsed: {http_error_time - method_start:.6f}s, error: {str(e)}, session_id: {session_id_short}"
+                    )
 
         except Exception as e:
-            logger.warning(f"Failed to query control plane initial state endpoint: {e}")
-
-        # Fallback to MCP resource if control plane endpoint fails (backward compatibility)
-        if initial_observation is None:
-            logger.debug(f"Session {session.session_id}: Falling back to MCP resource for initial state")
-            initial_observation = await self._get_initial_state_from_mcp_resource(session)
-
-        # Ensure we have some observation
-        if initial_observation is None:
-            logger.debug(f"Session {session.session_id}: Using default initial state")
-            initial_observation = {
-                "observation": "default_initial_state",
-                "session_id": session.session_id,
-            }
+            general_error_time = time.time()
+            logger.warning(
+                f"### ❌ GENERAL_ERROR: timestamp: {general_error_time}, elapsed: {general_error_time - method_start:.6f}s, error: {str(e)}, session_id: {session_id_short}"
+            )
+
+        method_end = time.time()
+        logger.info(
+            f"### 🔴 GET_INITIAL_STATE_END: timestamp: {method_end}, total_duration: {method_end - method_start:.6f}s, session_id: {session_id_short}..."
+        )
 
         return initial_observation
 
@@ -509,9 +556,6 @@ async def call_tool(self, session: MCPSession, tool_name: str, arguments: Dict)
         control_plane_info = {}
 
         try:
-            # Query control plane endpoints following the new architecture
-            import httpx
-
             # Extract base URL and session ID from the MCP session
             base_url = session.base_url.rstrip("/").removesuffix("/mcp")
             # Use the session ID from the established MCP session
@@ -601,47 +645,3 @@ async def close_session(self, session: MCPSession) -> None:
             finally:
                 session._exit_stack = None
                 session._mcp_session = None
-
-    async def _get_shared_client(self, timeout: float) -> httpx.AsyncClient:
-        """
-        Get or create a shared HTTP client with high connection limits for concurrent requests.
-
-        Args:
-            timeout: Timeout for requests
-
-        Returns:
-            Shared httpx.AsyncClient instance
-        """
-        # Fast path: if client exists and is not closed, return it immediately
-        if self._shared_client is not None and not self._shared_client.is_closed:
-            return self._shared_client
-
-        # Slow path: need to create client (use lock only for creation)
-        async with self._client_lock:
-            # Double-check pattern: another task might have created it while we waited
-            if self._shared_client is None or self._shared_client.is_closed:
-                # Create HTTP client with high connection limits for concurrent initial state requests
-                limits = httpx.Limits(
-                    max_keepalive_connections=None,  # Unlimited keep-alive connections
-                    max_connections=None,  # Unlimited total connection pool size
-                    keepalive_expiry=30.0,  # Keep connections alive for 30s
-                )
-
-                self._shared_client = httpx.AsyncClient(
-                    timeout=timeout,
-                    limits=limits,
-                    # Enable connection pooling and keep-alive
-                    http2=False,  # Disable HTTP/2 for better connection pooling with many concurrent requests
-                )
-                logger.info(
-                    "Created shared HTTP client with unlimited connection limits for MCP control plane requests"
-                )
-
-        return self._shared_client
-
-    async def close_shared_client(self):
-        """Close the shared HTTP client when shutting down."""
-        async with self._client_lock:
-            if self._shared_client and not self._shared_client.is_closed:
-                await self._shared_client.aclose()
-                self._shared_client = None
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index 936221c2..1a673587 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -191,9 +191,7 @@ async def _execute_rollout(
         dataset_row = envs.dataset_rows[rollout_idx]
         rollout_start = time.time()
         elapsed_from_main_start = rollout_start - start_time
-        logger.info(
-            f"DEBUG4. Starting rollout {dataset_row.id} at {datetime.fromtimestamp(rollout_start).strftime('%H:%M:%S.%f')[:-3]} (+{elapsed_from_main_start:.3f}s from start)"
-        )
+        logger.info(f"DEBUG4. Starting rollout {dataset_row.id} at {rollout_start}")
 
         # Initialize trajectory
         trajectory = Trajectory(
@@ -219,7 +217,7 @@ async def _execute_rollout(
         temp_start = time.time()
         current_observation, tool_schema = await envs.reset(session)
         logger.info(
-            f"DEBUG6: User simulator get_init_state took {time.time() - temp_start:.3f}s for {session.session_id}"
+            f"DEBUG6: User simulator get_init_state took {time.time() - temp_start:.3f}s for {session.session_id}, started at {temp_start}"
         )
         system_prompt = dataset_row.system_prompt
 
@@ -240,7 +238,7 @@ async def _execute_rollout(
 
             # Get initial messages in tau2-bench format for user simulator
             user_simulator_state = user_simulator.get_init_state()
-            user_message, user_simulator_state = user_simulator.generate_next_message(
+            user_message, user_simulator_state = await user_simulator.generate_next_message(
                 AssistantMessage(role="assistant", content="Hi! How can I help you today?"),
                 user_simulator_state,
             )
@@ -277,11 +275,11 @@ async def _execute_rollout(
                 if user_simulator_messages and isinstance(user_simulator_messages[-1], AssistantMessage):
                     # Generate user response using the simulator
                     temp_start1 = time.time()
-                    user_message, user_simulator_state = user_simulator.generate_next_message(
+                    user_message, user_simulator_state = await user_simulator.generate_next_message(
                         user_simulator_messages[-1], user_simulator_state
                     )
                     logger.info(
-                        f"DEBUG8: User simulator generate_next_message took {time.time() - temp_start1:.3f}s for {dataset_row.id}"
+                        f"DEBUG8: User simulator generate_next_message took {time.time() - temp_start1:.3f}s for {dataset_row.id}, started at {temp_start1}"
                     )
                     user_content = user_message.content if user_message.content else ""
 
@@ -297,7 +295,9 @@ async def _execute_rollout(
             while not turn_completed and not trajectory.terminated:
                 temp_start2 = time.time()
                 tool_calls, usage_stats = await policy(tool_schema, rollout_idx, conversation_history)
-                logger.info(f"DEBUG9: Policy took {time.time() - temp_start2:.3f}s for {dataset_row.id}")
+                logger.info(
+                    f"DEBUG9: Policy took {time.time() - temp_start2:.3f}s for {dataset_row.id}, started at {temp_start2}"
+                )
 
                 # If no tool call is generated, turn is finished
                 if len(tool_calls) == 1:
@@ -316,7 +316,9 @@ async def _execute_rollout(
                     # Execute tool call for this environment
                     temp_start3 = time.time()
                     observation, reward, rollout_end, info = await envs.step(rollout_idx, tool_call)
-                    logger.info(f"DEBUG10: Env step took {time.time() - temp_start3:.3f}s for {dataset_row.id}")
+                    logger.info(
+                        f"DEBUG10: Env step took {time.time() - temp_start3:.3f}s for {dataset_row.id}, started at {temp_start3}"
+                    )
 
                     tool_response = envs.format_tool_response(observation)
 
@@ -464,9 +466,7 @@ async def _execute_rollout(
         logger.info(
             f"✅ Rollout {rollout_idx} completed: {trajectory.steps} steps, reward: {trajectory.total_reward:.2f}, termination: {trajectory.termination_reason}, in thread {threading.current_thread().name}"
         )
-        logger.info(
-            f"DEBUG11: Rollout {dataset_row.id} completed at {datetime.fromtimestamp(time.time()).strftime('%H:%M:%S.%f')[:-3]} (+{time.time() - rollout_start:.3f}s from start)"
-        )
+        logger.info(f"DEBUG11: Rollout {dataset_row.id} completed at {time.time()}, started at {rollout_start}")
         return trajectory
 
     async def _get_control_plane_status(self, session) -> Optional[Dict[str, Any]]:
diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py
index 59de7a5d..901e5620 100644
--- a/eval_protocol/mcp/mcpgym.py
+++ b/eval_protocol/mcp/mcpgym.py
@@ -13,6 +13,7 @@
 """
 
 import asyncio
+import dataclasses
 import hashlib
 import inspect
 import json
@@ -21,12 +22,16 @@
 import threading
 import time
 from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor
+from datetime import date, datetime
+from enum import Enum
 from typing import Any, Callable, Dict, Optional, Tuple
 
 import uvicorn
 
 # from mcp.server.fastmcp import Context, FastMCP
 from fastmcp import Context, FastMCP
+from pydantic import BaseModel
 from starlette.requests import Request
 from starlette.responses import JSONResponse
 from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
@@ -78,7 +83,13 @@ class McpGym(ABC):
     - Environment Implementation: Single-process MCP server per environment
     """
 
-    def __init__(self, server_name: str, adapter: EnvironmentAdapter, seed: Optional[int] = None):
+    def __init__(
+        self,
+        server_name: str,
+        adapter: EnvironmentAdapter,
+        seed: Optional[int] = None,
+        max_workers: Optional[int] = None,
+    ):
         """
         Initialize the MCP-Gym environment.
 
@@ -86,6 +97,9 @@ def __init__(self, server_name: str, adapter: EnvironmentAdapter, seed: Optional
             server_name: Name for the MCP server
             adapter: Environment adapter instance
             seed: Optional seed for reproducible environments
+            max_workers: Optional maximum number of worker threads for ThreadPoolExecutor.
+                If None, uses ThreadPoolExecutor default (min(32, (os.cpu_count() or 1) + 4))
+
         """
         self.adapter = adapter
 
@@ -113,14 +127,14 @@ def __init__(self, server_name: str, adapter: EnvironmentAdapter, seed: Optional
             "total_reward": 0.0,
         }
 
-        # Reset with seed if provided
+        self.pool = ThreadPoolExecutor(max_workers=max_workers)
+
         self.env, self.obs, _info = self._new_env(seed=seed)
 
         # Register tools and control plane endpoints
         self._register_tools()
         self._discover_and_register_control_plane_endpoints()
         self._register_session_reset_endpoint()
-        # self._register_health_check_endpoint()
 
     def _get_session_id(self, ctx: Context) -> str:
         """
@@ -190,49 +204,6 @@ def _get_or_create_session(self, ctx: Context) -> Dict[str, Any]:
         print(f"🔍 _get_or_create_session: session_id: {session_id}")
         return self.sessions[session_id]
 
-        with self.session_lock:
-            if session_id not in self.sessions:
-                print(f"🔍 _get_or_create_session: Creating new session for {session_id}")
-                # Extract seed from context using proper FastMCP pattern
-                seed = None
-                config = self._get_default_config()
-                print(f"🔍 _get_or_create_session: default_config: {config}")
-
-                if hasattr(ctx, "session") and hasattr(ctx.session, "client_params"):
-                    client_params = ctx.session.client_params
-                    if hasattr(client_params, "clientInfo"):
-                        client_info = client_params.clientInfo
-                        if client_info and hasattr(client_info, "_extra"):
-                            extra_data = client_info._extra
-                            print(f"🔍 _get_or_create_session: extra_data in session creation: {extra_data}")
-                            if extra_data and isinstance(extra_data, dict):
-                                # Extract seed from client info
-                                seed = extra_data.get("seed")
-                                print(f"🌱 Extracted seed from client_info: {seed} (type: {type(seed)})")
-                                # Update config with any additional options
-                                if "config" in extra_data:
-                                    config.update(extra_data["config"])
-                                    print(f"🔍 _get_or_create_session: updated config: {config}")
-
-                print(f"🔍 _get_or_create_session: About to create environment with seed: {seed}")
-
-                env, obs, info = self._new_env(seed=seed)
-                print(f"🔍 _get_or_create_session: environment created with obs: {obs}, info: {info}")
-
-                # Initialize session state
-                self.sessions[session_id] = {
-                    "env": env,
-                    "obs": obs,
-                    "session_data": {},  # Subclasses can store additional data here
-                    "session_id": session_id,
-                }
-
-                print(f"🎮 Created new session {session_id[:16]}... with seed {seed}, initial obs: {obs}")
-            else:
-                print(f"🔍 _get_or_create_session: Returning existing session {session_id}")
-
-            return self.sessions[session_id]
-
     def _register_session_reset_endpoint(self):
 
         @self.mcp.custom_route("/control/reset_session", methods=["POST"])
@@ -243,56 +214,18 @@ async def reset_session_endpoint(request: Request) -> JSONResponse:
             print(f"🔍 _register_session_reset_endpoint: Resetting session, session_id: {session_id}, seed: {seed}")
             if not session_id:
                 return JSONResponse({"error": "Missing mcp-session-id header"}, status_code=400)
-            # with self.session_lock:
-            #    if session_id in self.sessions:
-            #        env, obs, _ = self._new_env(seed=seed)
-            #        self.sessions[session_id] = {
-            #            "env": env,
-            #            "obs": obs,
-            #            "session_data": {},
-            #            "session_id": session_id,
-            #        }
-            #        print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
+            if session_id in self.sessions:
+                env, obs, _ = self._new_env(seed=seed)
+                with self.session_lock:
+                    self.sessions[session_id] = {
+                        "env": env,
+                        "obs": obs,
+                        "session_data": {},
+                        "session_id": session_id,
+                    }
+                print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
             return JSONResponse({"message": "Session reset successfully"})
 
-    # def _register_health_check_endpoint(self):
-    #     """Register a simple health check endpoint for diagnostics."""
-
-    #     @self.mcp.custom_route("/health", methods=["GET"])
-    #     async def health_check_endpoint(request: Request) -> JSONResponse:
-    #         """Simple health check that returns immediately."""
-    #         return JSONResponse({"ok": True, "timestamp": time.time()})
-
-    # def _add_timing_middleware(self, starlette_app):
-    #     """Add ASGI middleware to log request arrival times."""
-
-    #     class TimingMiddleware:
-    #         def __init__(self, app):
-    #             self.app = app
-
-    #         async def __call__(self, scope, receive, send):
-    #             if scope["type"] != "http":
-    #                 await self.app(scope, receive, send)
-    #                 return
-
-    #             # Log immediately when request arrives at server
-    #             start_time = time.time()
-    #             path = scope.get("path", "")
-    #             method = scope.get("method", "")
-
-    #             print(f"🚀 REQUEST ARRIVED: {method} {path} at {start_time}")
-
-    #             async def send_wrapper(message):
-    #                 if message["type"] == "http.response.start":
-    #                     # Log completion time for comparison
-    #                     end_time = time.time()
-    #                     if path in ["/health", "/control/initial_state"]:
-    #                         print(f"✅ REQUEST took: {end_time - start_time:.3f}s")
-    #                 await send(message)
-
-    #             await self.app(scope, receive, send_wrapper)
-    #     starlette_app.add_middleware(TimingMiddleware)
-
     def _discover_and_register_control_plane_endpoints(self):
         """
         Discover and register control plane endpoints on the subclass instance.
@@ -314,9 +247,6 @@ def _discover_and_register_control_plane_endpoints(self):
             # Create session-aware handler for this endpoint
             def create_endpoint_handler(func: Callable):
                 async def endpoint_handler(request: Request) -> JSONResponse:
-
-                    if func.__name__ == "get_initial_state_endpoint":
-                        logger.info(f"===== starting to handle endpoint: {func.__name__}, time: {time.time()}")
                     try:
                         # Extract session ID from request headers (similar to StreamableHTTP pattern)
                         session_id = request.headers.get("mcp-session-id")
@@ -327,15 +257,24 @@ async def endpoint_handler(request: Request) -> JSONResponse:
                             )
 
                         # Get or create session data
-                        with self.session_lock:
-                            session_data = self.sessions.get(session_id)
-                            if not session_data:
-                                # create a placeholder session data
-                                self.sessions[session_id] = {"placeholder": True}
-                        # For initial state endpoint, we need to create the session
-                        # based on the session ID and available information
-                        if func.__name__ == "get_initial_state_endpoint":
-                            env, obs, info = self._new_env(seed=None)
+                        session_data = self.sessions.get(session_id)
+                        if not session_data:
+                            if func.__name__ != "get_initial_state_endpoint":
+                                return JSONResponse(
+                                    {"error": f"Session {session_id} not found"},
+                                    status_code=404,
+                                )
+                            start_time = time.time()
+                            logger.info(
+                                f"### 🔍 NEW_ENV_START: timestamp: {start_time}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+                            )
+                            loop = asyncio.get_running_loop()
+                            env, obs, info = await loop.run_in_executor(self.pool, self._new_env, None)
+                            # env, obs, info = self._new_env(None)
+                            end_time = time.time()
+                            logger.info(
+                                f"### 🔍 NEW_ENV_END: timestamp: {end_time}, elapsed: {end_time - start_time:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+                            )
                             # Initialize session state with extracted seed from session ID
                             session_data = {
                                 "env": env,
@@ -343,30 +282,13 @@ async def endpoint_handler(request: Request) -> JSONResponse:
                                 "session_data": {},  # Subclasses can store additional data here
                                 "session_id": session_id,
                             }
-                            # Store the session
-                            with self.session_lock:
-                                self.sessions[session_id] = session_data
-
-                        else:
-                            return JSONResponse(
-                                {"error": f"Session {session_id} not found"},
-                                status_code=404,
-                            )
-
-                        # Call the endpoint function with session data
-                        method_start = time.time()
-                        if func.__name__ == "get_initial_state_endpoint":
-                            print(f"🎯 METHOD START: {func.__name__} at {method_start}")
+                        with self.session_lock:
+                            self.sessions[session_id] = session_data
 
                         if inspect.iscoroutinefunction(func):
                             result = await func(session_data=session_data)
                         else:
                             result = func(session_data=session_data)
-
-                        # method_end = time.time()
-                        # if func.__name__ == "get_initial_state_endpoint":
-                        #     print(f"🎯 METHOD END: {func.__name__} at {method_end} (took {method_end - method_start:.3f}s)")
-
                         return JSONResponse(result)
 
                     except Exception as e:
@@ -412,10 +334,6 @@ def _get_or_create_session_control_plane(self, session_id: str) -> Dict[str, Any
         if session_id not in self.sessions:
             raise Exception(f"Session {session_id} not found")
 
-        # with self.session_lock:
-        # if session_id not in self.sessions:
-        #    return {}
-
         session_data = self.sessions[session_id]
         if "control_plane" not in session_data["session_data"]:
             session_data["session_data"]["control_plane"] = {
@@ -452,13 +370,6 @@ def _update_session_control_plane(
             f"🎛️  Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}, total_reward={control_plane['total_reward']}"
         )
 
-    # def get_control_plane_state(self, session_id: str) -> Optional[Dict[str, Any]]:
-    #    """Get control plane state for a specific session (for rollout system)."""
-    #    with self.session_lock:
-    #        if session_id in self.sessions:
-    #            return self._get_or_create_session_control_plane(session_id).copy()
-    #        return None
-
     def _execute_environment_step(self, action_int: int) -> Dict[str, Any]:
         """
         Execute environment step and update control plane (single session).
@@ -566,30 +477,82 @@ def get_info_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
         return control_plane.get("info", {})
 
     @control_plane_endpoint("/control/initial_state")
-    def get_initial_state_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
+    async def get_initial_state_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
         """Get initial state for this session."""
-        print(f"🔍 STARTING get_initial_state_endpoint: {time.time()}")
+        endpoint_start = time.time()
+        session_id = session_data.get("session_id", "unknown")
+        logger.info(
+            f"### 🌟 ENDPOINT_START: get_initial_state_endpoint, timestamp: {endpoint_start}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+        )
+
+        env_check_start = time.time()
+        logger.info(
+            f"### 🔍 ENV_CHECK_START: timestamp: {env_check_start}, elapsed: {env_check_start - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+        )
+
         env = session_data.get("env")
         obs = session_data.get("obs")
 
+        env_check_end = time.time()
+        logger.info(
+            f"### 🔍 ENV_CHECK_END: timestamp: {env_check_end}, elapsed: {env_check_end - endpoint_start:.6f}s, duration: {env_check_end - env_check_start:.6f}s, env: {env is not None}, obs: {obs is not None}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+        )
+
         if env and obs is not None:
+            format_start = time.time()
+            logger.info(
+                f"### 🔄 FORMAT_OBS_START: timestamp: {format_start}, elapsed: {format_start - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+            )
+
             try:
                 formatted_obs = self.format_observation(obs, env)
+
+                format_end = time.time()
+                logger.info(
+                    f"### 🔄 FORMAT_OBS_END: timestamp: {format_end}, elapsed: {format_end - endpoint_start:.6f}s, duration: {format_end - format_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+                )
+
+                endpoint_end = time.time()
+                logger.info(
+                    f"### ✅ ENDPOINT_SUCCESS_END: timestamp: {endpoint_end}, total_duration: {endpoint_end - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+                )
+
                 return formatted_obs
             except Exception as e:
-                logger.error(f"❌ Error in format_observation: {e}")
+                error_time = time.time()
+                logger.error(
+                    f"### ❌ FORMAT_OBS_ERROR: timestamp: {error_time}, elapsed: {error_time - endpoint_start:.6f}s, error: {str(e)}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+                )
+
                 return {
                     "error": f"Failed to format observation: {str(e)}",
                     "observation_type": str(type(obs)),
                     "session_id": session_data.get("session_id", "unknown"),
                 }
         else:
+            fallback_start = time.time()
+            logger.info(
+                f"### 🔄 FALLBACK_START: timestamp: {fallback_start}, elapsed: {fallback_start - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+            )
+
             # Fallback if session data is not available
-            return {
+            result = {
                 "observation": "session_not_initialized",
                 "session_id": session_data.get("session_id", "unknown"),
             }
 
+            fallback_end = time.time()
+            logger.info(
+                f"### 🔄 FALLBACK_END: timestamp: {fallback_end}, elapsed: {fallback_end - endpoint_start:.6f}s, duration: {fallback_end - fallback_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+            )
+
+            endpoint_end = time.time()
+            logger.info(
+                f"### ✅ ENDPOINT_FALLBACK_END: timestamp: {endpoint_end}, total_duration: {endpoint_end - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
+            )
+
+            return result
+
     def _get_session_control_plane_from_data(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
         """Extract control plane state from session data."""
         return session_data.get("session_data", {}).get(
@@ -681,11 +644,6 @@ def _to_json_serializable(self, obj: Any) -> Any:
         Handles Pydantic models, dataclasses, lists, dicts, and primitive types.
         This is a utility method that can be used by format_observation implementations.
         """
-        import dataclasses
-        from datetime import date, datetime
-        from enum import Enum
-
-        from pydantic import BaseModel
 
         # Handle None and primitive types
         if obj is None or isinstance(obj, (str, int, float, bool)):
diff --git a/examples/blackjack_mcp/blackjack_mcp.py b/examples/blackjack_mcp/blackjack_mcp.py
index 83d8241a..f8b0a877 100644
--- a/examples/blackjack_mcp/blackjack_mcp.py
+++ b/examples/blackjack_mcp/blackjack_mcp.py
@@ -39,10 +39,10 @@ class BlackjackMcp(McpGym):
     - Multi-session support with session-based control plane state
     """
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize Blackjack MCP-Gym environment."""
         adapter = BlackjackAdapter()
-        super().__init__("Blackjack-v1", adapter, seed)
+        super().__init__("Blackjack-v1", adapter, seed, **kwargs)
 
         # Multi-session support is now handled by the base class
 
diff --git a/examples/cliff_walking_mcp/cliff_walking_mcp.py b/examples/cliff_walking_mcp/cliff_walking_mcp.py
index a1005cfb..68447064 100644
--- a/examples/cliff_walking_mcp/cliff_walking_mcp.py
+++ b/examples/cliff_walking_mcp/cliff_walking_mcp.py
@@ -38,10 +38,10 @@ class CliffWalkingMcp(McpGym):
     - Multi-session support with session-based control plane state
     """
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize Cliff Walking MCP-Gym environment."""
         adapter = CliffWalkingAdapter()
-        super().__init__("CliffWalking-v1", adapter, seed)
+        super().__init__("CliffWalking-v1", adapter, seed, **kwargs)
 
         # Multi-session support is now handled by the base class
 
diff --git a/examples/frozen_lake_mcp/frozen_lake_mcp.py b/examples/frozen_lake_mcp/frozen_lake_mcp.py
index 9cf92fac..570c9832 100644
--- a/examples/frozen_lake_mcp/frozen_lake_mcp.py
+++ b/examples/frozen_lake_mcp/frozen_lake_mcp.py
@@ -19,8 +19,8 @@
 
 from typing import Any, Dict, Optional
 
+from fastmcp import Context
 from frozen_lake_adapter import FrozenLakeAdapter
-from mcp.server.fastmcp import Context
 
 from eval_protocol.mcp import McpGym
 from eval_protocol.mcp.mcpgym import control_plane_endpoint
@@ -38,10 +38,10 @@ class FrozenLakeMcp(McpGym):
     - Multi-session support with session-based control plane state
     """
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize FrozenLake MCP-Gym environment."""
         adapter = FrozenLakeAdapter()
-        super().__init__("FrozenLake-v1", adapter, seed)
+        super().__init__("FrozenLake-v1", adapter, seed, **kwargs)
 
         # Multi-session support is now handled by the base class
 
diff --git a/examples/lunar_lander_mcp/lunar_lander_mcp.py b/examples/lunar_lander_mcp/lunar_lander_mcp.py
index 3b1c6046..0d15077c 100644
--- a/examples/lunar_lander_mcp/lunar_lander_mcp.py
+++ b/examples/lunar_lander_mcp/lunar_lander_mcp.py
@@ -35,10 +35,10 @@
 class LunarLanderMcp(McpGym):
     """LunarLander production server with visual rendering support."""
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize LunarLander MCP-Gym environment."""
         self.adapter = LunarLanderAdapter()
-        super().__init__("LunarLander-v3", self.adapter, seed)
+        super().__init__("LunarLander-v3", self.adapter, seed, **kwargs)
 
         # Multi-session support is now handled by the base class
 
diff --git a/examples/tau2_mcp/airplane_environment/airline_environment.py b/examples/tau2_mcp/airplane_environment/airline_environment.py
index 9518e025..b8c5bb3e 100644
--- a/examples/tau2_mcp/airplane_environment/airline_environment.py
+++ b/examples/tau2_mcp/airplane_environment/airline_environment.py
@@ -8,6 +8,7 @@
 import json
 import logging
 import os
+import time
 from copy import deepcopy
 from dataclasses import dataclass, field
 from enum import Enum
@@ -19,7 +20,7 @@
 
 logger = logging.getLogger(__name__)
 
-AIRLINE_DB_PATH = Path(__file__).parent / "db.json"
+from vendor.tau2.domains.airline.utils import AIRLINE_DB_PATH
 
 
 class AirlineEnvironment:
@@ -30,15 +31,19 @@ class AirlineEnvironment:
 
     def __init__(self, config: Optional[Dict[str, Any]] = None):
         self.config = config or {}
-        self.db = FlightDB.load(AIRLINE_DB_PATH)
-        self.airline_tools = AirlineTools(self.db)
+        self.db = None
+        self.airline_tools = None
 
     def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         """Reset the environment to initial state"""
         logger.info("🔄 Resetting airline environment - reloading database from disk")
+        start_time = time.time()
         self.db = FlightDB.load(AIRLINE_DB_PATH)
         self.airline_tools = AirlineTools(self.db)
 
+        end_time = time.time()
+        logger.info(f"11RESET TOOK {end_time - start_time:.2f} seconds, called at {start_time}")
+
         return {}, {}
 
     def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
diff --git a/examples/tau2_mcp/server.py b/examples/tau2_mcp/server.py
index 14323929..17b3560d 100755
--- a/examples/tau2_mcp/server.py
+++ b/examples/tau2_mcp/server.py
@@ -40,6 +40,9 @@ def main():
     )
     parser.add_argument("--port", type=int, default=8000, help="Port for HTTP transport")
     parser.add_argument("--seed", type=int, default=None, help="Seed for the environment")
+    parser.add_argument(
+        "--max-workers", type=int, default=None, help="Maximum number of workers for the ThreadPoolExecutor"
+    )
 
     args = parser.parse_args()
 
@@ -62,7 +65,7 @@ def main():
     }
 
     server_class = domain_servers[args.domain]
-    server = server_class(seed=args.seed)
+    server = server_class(seed=args.seed, max_workers=args.max_workers)
 
     print(f"{domain_icons[args.domain]} Starting {args.domain.title()} MCP server on port {args.port}")
     print(f"🌱 Seed: {args.seed}")
diff --git a/examples/tau2_mcp/tau2_mcp.py b/examples/tau2_mcp/tau2_mcp.py
index b7dc401e..8cff77f8 100644
--- a/examples/tau2_mcp/tau2_mcp.py
+++ b/examples/tau2_mcp/tau2_mcp.py
@@ -27,7 +27,7 @@
 class AirlineDomainMcp(McpGym):
     """Airline booking MCP server for τ²-Bench integration"""
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize Airline MCP-Gym environment."""
         # Use EnvironmentAdapter directly as the default adapter
         default_config = {
@@ -37,7 +37,7 @@ def __init__(self, seed: Optional[int] = None):
 
         self.adapter = EnvironmentAdapter(env_class=AirlineEnvironment, default_config=default_config)
 
-        super().__init__("airline", self.adapter, seed)
+        super().__init__("airline", self.adapter, seed, **kwargs)
 
     def _register_tools(self):
         """Register airline-specific MCP tools matching τ²-Bench schemas"""
@@ -399,7 +399,7 @@ def get_flight_status(
 class MockDomainMcp(McpGym):
     """Mock domain MCP server for τ²-Bench integration"""
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize Mock MCP-Gym environment."""
         # Use EnvironmentAdapter directly as the default adapter
         default_config = {
@@ -409,7 +409,7 @@ def __init__(self, seed: Optional[int] = None):
 
         self.adapter = EnvironmentAdapter(env_class=MockEnvironment, default_config=default_config)
 
-        super().__init__("mock", self.adapter, seed)
+        super().__init__("mock", self.adapter, seed, **kwargs)
 
     def _register_tools(self):
         """Register mock-specific MCP tools matching τ²-Bench schemas"""
@@ -490,7 +490,7 @@ def transfer_to_human_agents(summary: str, ctx: Context) -> Dict[str, Any]:
 class RetailDomainMcp(McpGym):
     """Retail domain MCP server for τ²-Bench integration"""
 
-    def __init__(self, seed: Optional[int] = None):
+    def __init__(self, seed: Optional[int] = None, **kwargs):
         """Initialize Retail MCP-Gym environment."""
         # Use EnvironmentAdapter directly as the default adapter
         default_config = {
@@ -500,7 +500,7 @@ def __init__(self, seed: Optional[int] = None):
 
         self.adapter = EnvironmentAdapter(env_class=RetailEnvironment, default_config=default_config)
 
-        super().__init__("retail", self.adapter, seed)
+        super().__init__("retail", self.adapter, seed, **kwargs)
 
     def _register_tools(self):
         """Register retail-specific MCP tools matching τ²-Bench schemas"""
diff --git a/vendor/tau2/agent/llm_agent.py b/vendor/tau2/agent/llm_agent.py
index 01201f35..b2fdee99 100644
--- a/vendor/tau2/agent/llm_agent.py
+++ b/vendor/tau2/agent/llm_agent.py
@@ -69,13 +69,9 @@ def __init__(
 
     @property
     def system_prompt(self) -> str:
-        return SYSTEM_PROMPT.format(
-            domain_policy=self.domain_policy, agent_instruction=AGENT_INSTRUCTION
-        )
+        return SYSTEM_PROMPT.format(domain_policy=self.domain_policy, agent_instruction=AGENT_INSTRUCTION)
 
-    def get_init_state(
-        self, message_history: Optional[list[Message]] = None
-    ) -> LLMAgentState:
+    def get_init_state(self, message_history: Optional[list[Message]] = None) -> LLMAgentState:
         """Get the initial state of the agent.
 
         Args:
@@ -86,15 +82,15 @@ def get_init_state(
         """
         if message_history is None:
             message_history = []
-        assert all(is_valid_agent_history_message(m) for m in message_history), (
-            "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent."
-        )
+        assert all(
+            is_valid_agent_history_message(m) for m in message_history
+        ), "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent."
         return LLMAgentState(
             system_messages=[SystemMessage(role="system", content=self.system_prompt)],
             messages=message_history,
         )
 
-    def generate_next_message(
+    async def generate_next_message(
         self, message: ValidAgentInputMessage, state: LLMAgentState
     ) -> tuple[AssistantMessage, LLMAgentState]:
         """
@@ -105,7 +101,7 @@ def generate_next_message(
         else:
             state.messages.append(message)
         messages = state.system_messages + state.messages
-        assistant_message = generate(
+        assistant_message = await generate(
             model=self.llm,
             tools=self.tools,
             messages=messages,
@@ -172,9 +168,7 @@ def __init__(
         If provide_function_args is True, the resolution steps will include the function arguments.
         """
         super().__init__(tools=tools, domain_policy=domain_policy)
-        assert self.check_valid_task(task), (
-            f"Task {task.id} is not valid. Cannot run GT agent."
-        )
+        assert self.check_valid_task(task), f"Task {task.id} is not valid. Cannot run GT agent."
         self.task = task
         self.llm = llm
         self.llm_args = deepcopy(llm_args) if llm_args is not None else {}
@@ -201,9 +195,7 @@ def system_prompt(self) -> str:
             resolution_steps=self.make_agent_instructions_from_actions(),
         )
 
-    def get_init_state(
-        self, message_history: Optional[list[Message]] = None
-    ) -> LLMAgentState:
+    def get_init_state(self, message_history: Optional[list[Message]] = None) -> LLMAgentState:
         """Get the initial state of the agent.
 
         Args:
@@ -214,15 +206,15 @@ def get_init_state(
         """
         if message_history is None:
             message_history = []
-        assert all(is_valid_agent_history_message(m) for m in message_history), (
-            "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent."
-        )
+        assert all(
+            is_valid_agent_history_message(m) for m in message_history
+        ), "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent."
         return LLMAgentState(
             system_messages=[SystemMessage(role="system", content=self.system_prompt)],
             messages=message_history,
         )
 
-    def generate_next_message(
+    async def generate_next_message(
         self, message: ValidAgentInputMessage, state: LLMAgentState
     ) -> tuple[AssistantMessage, LLMAgentState]:
         """
@@ -233,7 +225,7 @@ def generate_next_message(
         else:
             state.messages.append(message)
         messages = state.system_messages + state.messages
-        assistant_message = generate(
+        assistant_message = await generate(
             model=self.llm,
             tools=self.tools,
             messages=messages,
@@ -263,9 +255,7 @@ def make_agent_instructions_from_actions(self) -> str:
         return "\n".join(lines)
 
     @classmethod
-    def make_agent_instructions_from_action(
-        cls, action: Action, include_function_args: bool = False
-    ) -> str:
+    def make_agent_instructions_from_action(cls, action: Action, include_function_args: bool = False) -> str:
         """
         Make agent instructions from an action.
         If the action is a user action, returns instructions for the agent to give to the user.
@@ -332,9 +322,7 @@ def __init__(
         Initialize the LLMAgent.
         """
         super().__init__(tools=tools, domain_policy=domain_policy)
-        assert self.check_valid_task(task), (
-            f"Task {task.id} is not valid. Cannot run GT agent."
-        )
+        assert self.check_valid_task(task), f"Task {task.id} is not valid. Cannot run GT agent."
         self.task = task
         self.llm = llm
         self.llm_args = llm_args if llm_args is not None else {}
@@ -417,9 +405,7 @@ def is_stop(cls, message: AssistantMessage) -> bool:
             return False
         return cls.STOP_TOKEN in message.content
 
-    def get_init_state(
-        self, message_history: Optional[list[Message]] = None
-    ) -> LLMAgentState:
+    def get_init_state(self, message_history: Optional[list[Message]] = None) -> LLMAgentState:
         """Get the initial state of the agent.
 
         Args:
@@ -430,15 +416,15 @@ def get_init_state(
         """
         if message_history is None:
             message_history = []
-        assert all(is_valid_agent_history_message(m) for m in message_history), (
-            "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent."
-        )
+        assert all(
+            is_valid_agent_history_message(m) for m in message_history
+        ), "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent."
         return LLMAgentState(
             system_messages=[SystemMessage(role="system", content=self.system_prompt)],
             messages=message_history,
         )
 
-    def generate_next_message(
+    async def generate_next_message(
         self, message: Optional[ValidAgentInputMessage], state: LLMAgentState
     ) -> tuple[AssistantMessage, LLMAgentState]:
         """
@@ -453,7 +439,7 @@ def generate_next_message(
         else:
             state.messages.append(message)
         messages = state.system_messages + state.messages
-        assistant_message = generate(
+        assistant_message = await generate(
             model=self.llm,
             tools=self.tools,
             messages=messages,
diff --git a/vendor/tau2/user/user_simulator.py b/vendor/tau2/user/user_simulator.py
index d594c4d8..d5508409 100644
--- a/vendor/tau2/user/user_simulator.py
+++ b/vendor/tau2/user/user_simulator.py
@@ -120,10 +120,12 @@ def is_stop(cls, message: UserMessage) -> bool:
         assert message.content is not None
         return STOP in message.content or TRANSFER in message.content or OUT_OF_SCOPE in message.content
 
-    def generate_next_message(self, message: ValidUserInputMessage, state: UserState) -> Tuple[UserMessage, UserState]:
-        return self._generate_next_message(message, state)
+    async def generate_next_message(
+        self, message: ValidUserInputMessage, state: UserState
+    ) -> Tuple[UserMessage, UserState]:
+        return await self._generate_next_message(message, state)
 
-    def _generate_next_message(
+    async def _generate_next_message(
         self, message: ValidUserInputMessage, state: UserState
     ) -> Tuple[UserMessage, UserState]:
         """Get the response from the user simulator.
@@ -143,7 +145,7 @@ def _generate_next_message(
         messages = state.system_messages + state.flip_roles()
 
         # Generate response
-        assistant_message = generate(
+        assistant_message = await generate(
             model=self.llm,
             messages=messages,
             tools=self.tools,
@@ -192,5 +194,7 @@ def is_stop(cls, message: UserMessage) -> bool:
     def set_seed(self, seed: int):
         pass
 
-    def generate_next_message(self, message: ValidUserInputMessage, state: UserState) -> tuple[UserMessage, UserState]:
+    async def generate_next_message(
+        self, message: ValidUserInputMessage, state: UserState
+    ) -> tuple[UserMessage, UserState]:
         raise NotImplementedError("DummyUser does not support generate_next_message")
diff --git a/vendor/tau2/utils/llm_utils.py b/vendor/tau2/utils/llm_utils.py
index 98ae698c..07578be3 100644
--- a/vendor/tau2/utils/llm_utils.py
+++ b/vendor/tau2/utils/llm_utils.py
@@ -3,7 +3,7 @@
 from typing import Any, Optional
 
 import litellm
-from litellm import completion, completion_cost
+from litellm import acompletion, completion_cost
 from litellm.caching.caching import Cache
 from litellm.main import ModelResponse, Usage
 from loguru import logger
@@ -178,7 +178,7 @@ def to_litellm_messages(messages: list[Message]) -> list[dict]:
     return litellm_messages
 
 
-def generate(
+async def generate(
     model: str,
     messages: list[Message],
     tools: Optional[list[Tool]] = None,
@@ -209,7 +209,7 @@ def generate(
     if tools and tool_choice is None:
         tool_choice = "auto"
     try:
-        response = completion(
+        response = await acompletion(
             model=model,
             messages=litellm_messages,
             tools=tools,

From 13a8506c3c4464fc2c07b02fe468d7d79626ddac Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Sun, 10 Aug 2025 05:46:48 +0000
Subject: [PATCH 06/14] debug

---
 eval_protocol/mcp/client/connection.py        |  24 +--
 eval_protocol/mcp/execution/manager.py        |  12 +-
 eval_protocol/mcp/mcpgym.py                   |  87 ++-------
 .../default_mcp_gym_rollout_processor.py      |   3 -
 examples/frozen_lake_mcp/frozen_lake_mcp.py   |   2 +-
 .../airline_environment.py                    |   4 -
 examples/tau2_mcp/tau2_mcp.py                 |   4 +-
 monitor_connections.sh                        |  11 --
 test_burst_client.py                          | 169 ------------------
 test_simple_mcp_server.py                     |  51 ------
 uv.lock                                       |   4 +-
 11 files changed, 30 insertions(+), 341 deletions(-)
 delete mode 100644 monitor_connections.sh
 delete mode 100644 test_burst_client.py
 delete mode 100644 test_simple_mcp_server.py

diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
index 6f67a5ca..f2a96af6 100644
--- a/eval_protocol/mcp/client/connection.py
+++ b/eval_protocol/mcp/client/connection.py
@@ -88,12 +88,6 @@ async def _prewarm_tools_cache(self, session: MCPSession) -> None:
         """
         cache_key = session.base_url
 
-        # Fast path: if cache already exists, return immediately (no lock)
-        if cache_key in self._tools_cache:
-            logger.debug(f"Tools cache already exists for {cache_key}")
-            return
-
-        # Slow path: need to create cache (use lock only for creation)
         async with self._tools_cache_lock:
             # Only fetch tools if not already cached for this base_url
             if cache_key not in self._tools_cache:
@@ -123,7 +117,7 @@ async def reset_session(self, session: MCPSession) -> None:
         headers = {"mcp-session-id": session.session_id}
         body = {"seed": session.seed}
 
-        timeout = httpx.Timeout(3.0)
+        timeout = httpx.Timeout(15.0)
         async with httpx.AsyncClient(timeout=timeout) as client:
             resp = await client.post(url, headers=headers, json=body)
             resp.raise_for_status()
@@ -145,23 +139,16 @@ async def discover_tools(self, session: MCPSession) -> List[Dict]:
 
         cache_key = session.base_url
 
-        # Fast path: Check cache first without lock (safe for reads)
-        if cache_key in self._tools_cache:
-            cached_tools = self._tools_cache[cache_key]
-            logger.debug(f"Using cached tools for session {session.session_id} ({len(cached_tools)} tools)")
-            return cached_tools
-
-        # Slow path: Cache miss - use lock only for writing
+        # Check cache first (should be pre-warmed during initialization)
         async with self._tools_cache_lock:
-            # Double-check pattern: another task might have cached it while we waited
             if cache_key in self._tools_cache:
                 cached_tools = self._tools_cache[cache_key]
                 logger.debug(f"Using cached tools for session {session.session_id} ({len(cached_tools)} tools)")
                 return cached_tools
 
-            # Fallback: if cache miss (shouldn't happen with pre-warming), fetch directly
-            logger.warning(f"Cache miss for {cache_key} - this shouldn't happen with pre-warming")
-            mcp_session = session._mcp_session
+        # Fallback: if cache miss (shouldn't happen with pre-warming), fetch directly
+        logger.warning(f"Cache miss for {cache_key} - this shouldn't happen with pre-warming")
+        mcp_session = session._mcp_session
 
         tools_response = await mcp_session.list_tools()
         tools = tools_response.tools if hasattr(tools_response, "tools") else []
@@ -213,7 +200,6 @@ async def get_initial_state(self, session: MCPSession) -> Any:
         logger.info(f"### 🌟 GET_INITIAL_STATE_START: timestamp: {method_start}, session_id: {session_id_short}...")
 
         if not session._mcp_session:
-            logger.error(f"### ❌ SESSION_NOT_INITIALIZED: session_id: {session_id_short}")
             raise RuntimeError("Session not initialized")
 
         # Try to get initial state from control plane endpoint first
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index 6dd95803..af542300 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -288,13 +288,13 @@ async def _execute_rollout(
                         )
                         user_content = user_message.content if user_message.content else ""
 
-                    user_prompt = envs.format_user_prompt(rollout_idx, user_content)
-                    conversation_history.append({"role": "user", "content": user_prompt})
+                        user_prompt = envs.format_user_prompt(rollout_idx, user_content)
+                        conversation_history.append({"role": "user", "content": user_prompt})
 
-                    # Check if user simulator signaled termination
-                    if UserSimulator.is_stop(user_message):
-                        trajectory.terminated = True
-                        trajectory.termination_reason = TerminationReason.USER_STOP
+                        # Check if user simulator signaled termination
+                        if UserSimulator.is_stop(user_message):
+                            trajectory.terminated = True
+                            trajectory.termination_reason = TerminationReason.USER_STOP
 
                 # In each turn: keep looping until assistant is ready to provide final response
                 while not turn_completed and not trajectory.terminated:
diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py
index 9618921c..c2e319c0 100644
--- a/eval_protocol/mcp/mcpgym.py
+++ b/eval_protocol/mcp/mcpgym.py
@@ -28,9 +28,7 @@
 from typing import Any, Callable, Dict, Optional, Tuple
 
 import uvicorn
-
-# from mcp.server.fastmcp import Context, FastMCP
-from fastmcp import Context, FastMCP
+from mcp.server.fastmcp import Context, FastMCP
 from pydantic import BaseModel
 from starlette.requests import Request
 from starlette.responses import JSONResponse
@@ -104,8 +102,11 @@ def __init__(
         self.adapter = adapter
 
         # Create FastMCP server
-        self.mcp = FastMCP(name=server_name)
-
+        self.mcp = FastMCP(
+            server_name,
+            host="0.0.0.0",
+            port=int(os.environ.get("PORT", 8000)),
+        )
         # Store host and port for later use in run() method
         self.host = "0.0.0.0"
         self.port = int(os.environ.get("PORT", 8000))
@@ -129,6 +130,7 @@ def __init__(
 
         self.pool = ThreadPoolExecutor(max_workers=max_workers)
 
+        # Reset with seed if provided
         self.env, self.obs, _info = self._new_env(seed=seed)
 
         # Register tools and control plane endpoints
@@ -220,7 +222,8 @@ async def reset_session_endpoint(request: Request) -> JSONResponse:
             if not session_id:
                 return JSONResponse({"error": "Missing mcp-session-id header"}, status_code=400)
             if session_id in self.sessions:
-                env, obs, _ = self._new_env(seed=seed)
+                loop = asyncio.get_running_loop()
+                env, obs, info = await loop.run_in_executor(self.pool, self._new_env, seed)
                 with self.session_lock:
                     self.sessions[session_id] = {
                         "env": env,
@@ -269,17 +272,10 @@ async def endpoint_handler(request: Request) -> JSONResponse:
                                     {"error": f"Session {session_id} not found"},
                                     status_code=404,
                                 )
-                            start_time = time.time()
-                            logger.info(
-                                f"### 🔍 NEW_ENV_START: timestamp: {start_time}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-                            )
+
                             loop = asyncio.get_running_loop()
                             env, obs, info = await loop.run_in_executor(self.pool, self._new_env, None)
-                            # env, obs, info = self._new_env(None)
-                            end_time = time.time()
-                            logger.info(
-                                f"### 🔍 NEW_ENV_END: timestamp: {end_time}, elapsed: {end_time - start_time:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-                            )
+
                             # Initialize session state with extracted seed from session ID
                             session_data = {
                                 "env": env,
@@ -294,6 +290,7 @@ async def endpoint_handler(request: Request) -> JSONResponse:
                             result = await func(session_data=session_data)
                         else:
                             result = func(session_data=session_data)
+
                         return JSONResponse(result)
 
                     except Exception as e:
@@ -484,78 +481,26 @@ def get_info_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
     @control_plane_endpoint("/control/initial_state")
     async def get_initial_state_endpoint(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
         """Get initial state for this session."""
-        endpoint_start = time.time()
         session_id = session_data.get("session_id", "unknown")
-        logger.info(
-            f"### 🌟 ENDPOINT_START: get_initial_state_endpoint, timestamp: {endpoint_start}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-        )
-
-        env_check_start = time.time()
-        logger.info(
-            f"### 🔍 ENV_CHECK_START: timestamp: {env_check_start}, elapsed: {env_check_start - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-        )
-
         env = session_data.get("env")
         obs = session_data.get("obs")
-
-        env_check_end = time.time()
-        logger.info(
-            f"### 🔍 ENV_CHECK_END: timestamp: {env_check_end}, elapsed: {env_check_end - endpoint_start:.6f}s, duration: {env_check_end - env_check_start:.6f}s, env: {env is not None}, obs: {obs is not None}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-        )
-
         if env and obs is not None:
-            format_start = time.time()
-            logger.info(
-                f"### 🔄 FORMAT_OBS_START: timestamp: {format_start}, elapsed: {format_start - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-            )
-
             try:
                 formatted_obs = self.format_observation(obs, env)
-
-                format_end = time.time()
-                logger.info(
-                    f"### 🔄 FORMAT_OBS_END: timestamp: {format_end}, elapsed: {format_end - endpoint_start:.6f}s, duration: {format_end - format_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-                )
-
-                endpoint_end = time.time()
-                logger.info(
-                    f"### ✅ ENDPOINT_SUCCESS_END: timestamp: {endpoint_end}, total_duration: {endpoint_end - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-                )
-
                 return formatted_obs
             except Exception as e:
-                error_time = time.time()
-                logger.error(
-                    f"### ❌ FORMAT_OBS_ERROR: timestamp: {error_time}, elapsed: {error_time - endpoint_start:.6f}s, error: {str(e)}, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-                )
-
+                logger.error(f"❌ Error in format_observation: {e}")
                 return {
                     "error": f"Failed to format observation: {str(e)}",
                     "observation_type": str(type(obs)),
                     "session_id": session_data.get("session_id", "unknown"),
                 }
         else:
-            fallback_start = time.time()
-            logger.info(
-                f"### 🔄 FALLBACK_START: timestamp: {fallback_start}, elapsed: {fallback_start - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-            )
-
             # Fallback if session data is not available
             result = {
                 "observation": "session_not_initialized",
                 "session_id": session_data.get("session_id", "unknown"),
             }
-
-            fallback_end = time.time()
-            logger.info(
-                f"### 🔄 FALLBACK_END: timestamp: {fallback_end}, elapsed: {fallback_end - endpoint_start:.6f}s, duration: {fallback_end - fallback_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-            )
-
-            endpoint_end = time.time()
-            logger.info(
-                f"### ✅ ENDPOINT_FALLBACK_END: timestamp: {endpoint_end}, total_duration: {endpoint_end - endpoint_start:.6f}s, session_id: {session_id[:8] if len(session_id) > 8 else session_id}..."
-            )
-
             return result
 
     def _get_session_control_plane_from_data(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
@@ -623,9 +568,9 @@ async def run_with_high_concurrency():
 
                 config = uvicorn.Config(
                     starlette_app,
-                    host=self.host,
-                    port=self.port,
-                    log_level="info",  # Use default log level instead of accessing settings
+                    host=self.mcp.settings.host,
+                    port=self.mcp.settings.port,
+                    log_level=self.mcp.settings.log_level.lower(),  # Use default log level instead of accessing settings
                     proxy_headers=True,
                     forwarded_allow_ips="*",
                     # HIGH CONCURRENCY SETTINGS
diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
index c713e4d9..eb7cdf36 100644
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
+++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -224,14 +224,11 @@ async def default_mcp_gym_rollout_processor(
         )
 
         # Create MCP environments directly from evaluation_rows
-        print("DEBUG1", time.time())
         envs = await ep.make(
             "http://localhost:9700/mcp/",
             evaluation_rows=rows,
             model_id=policy.model_id,
         )
-        print("DEBUG2", time.time())
-        print("max_concurrent_rollouts", config.max_concurrent_rollouts)
 
         # Run rollout with environments and policy
         evaluation_rows = await ep.rollout(
diff --git a/examples/frozen_lake_mcp/frozen_lake_mcp.py b/examples/frozen_lake_mcp/frozen_lake_mcp.py
index 570c9832..d38103f8 100644
--- a/examples/frozen_lake_mcp/frozen_lake_mcp.py
+++ b/examples/frozen_lake_mcp/frozen_lake_mcp.py
@@ -19,8 +19,8 @@
 
 from typing import Any, Dict, Optional
 
-from fastmcp import Context
 from frozen_lake_adapter import FrozenLakeAdapter
+from mcp.server.fastmcp import Context
 
 from eval_protocol.mcp import McpGym
 from eval_protocol.mcp.mcpgym import control_plane_endpoint
diff --git a/examples/tau2_mcp/airplane_environment/airline_environment.py b/examples/tau2_mcp/airplane_environment/airline_environment.py
index b8c5bb3e..0c1e2d14 100644
--- a/examples/tau2_mcp/airplane_environment/airline_environment.py
+++ b/examples/tau2_mcp/airplane_environment/airline_environment.py
@@ -37,13 +37,9 @@ def __init__(self, config: Optional[Dict[str, Any]] = None):
     def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         """Reset the environment to initial state"""
         logger.info("🔄 Resetting airline environment - reloading database from disk")
-        start_time = time.time()
         self.db = FlightDB.load(AIRLINE_DB_PATH)
         self.airline_tools = AirlineTools(self.db)
 
-        end_time = time.time()
-        logger.info(f"11RESET TOOK {end_time - start_time:.2f} seconds, called at {start_time}")
-
         return {}, {}
 
     def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
diff --git a/examples/tau2_mcp/tau2_mcp.py b/examples/tau2_mcp/tau2_mcp.py
index 8cff77f8..a244e695 100644
--- a/examples/tau2_mcp/tau2_mcp.py
+++ b/examples/tau2_mcp/tau2_mcp.py
@@ -12,9 +12,7 @@
 from typing import Annotated, Any, Dict, List, Optional
 
 from airplane_environment.airline_environment import AirlineEnvironment
-
-# from mcp.server.fastmcp import Context
-from fastmcp import Context
+from mcp.server.fastmcp import Context
 from mock_environment.mock_environment import MockEnvironment
 from pydantic import Field
 from retail_environment.retail_environment import RetailEnvironment
diff --git a/monitor_connections.sh b/monitor_connections.sh
deleted file mode 100644
index 547b13a9..00000000
--- a/monitor_connections.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-echo "Monitoring connections to port 9700..."
-echo "Press Ctrl+C to stop"
-
-while true; do
-    count=$(netstat -an | grep :9700 | grep ESTABLISHED | wc -l)
-    timestamp=$(date '+%H:%M:%S')
-    echo "$timestamp: $count connections to port 9700"
-    sleep 1
-done
diff --git a/test_burst_client.py b/test_burst_client.py
deleted file mode 100644
index 507bfc40..00000000
--- a/test_burst_client.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/env python3
-"""
-Burst Client Test - Simulates 50 threads calling envs.reset() -> get_initial_state
-Exact pattern: _execute_rollout() -> envs.reset() -> get_initial_state -> client.get()
-"""
-
-import asyncio
-import threading
-import time
-from typing import Any, Dict, List
-
-import httpx
-
-
-class EnvResetClient:
-    """
-    Simulates the exact pattern from your code:
-    50 threads -> _execute_rollout() -> envs.reset() -> get_initial_state -> client.get()
-    """
-
-    def __init__(self, base_url: str = "http://localhost:8000"):
-        self.base_url = base_url
-        self.initial_state_url = f"{base_url}/control/initial_state"
-
-    async def get_initial_state(self, thread_id: int) -> Dict[str, Any]:
-        """
-        Simulates the get_initial_state call from your McpGym code.
-        This is the slow HTTP call that happens during envs.reset().
-        """
-        headers = {"Content-Type": "application/json", "Accept": "application/json"}
-
-        start_time = time.time()
-
-        try:
-            async with httpx.AsyncClient(timeout=30.0) as client:
-                # This is the exact pattern from your code
-                initial_state_response = await client.get(
-                    self.initial_state_url,
-                    headers=headers,
-                    timeout=30.0,
-                )
-                initial_state_response.raise_for_status()
-                result = initial_state_response.json()
-
-                end_time = time.time()
-                duration = end_time - start_time
-
-                return {"thread_id": thread_id, "success": True, "duration": duration, "initial_state": result}
-
-        except Exception as e:
-            end_time = time.time()
-            duration = end_time - start_time
-            return {"thread_id": thread_id, "success": False, "duration": duration, "error": str(e)}
-
-    async def envs_reset(self, thread_id: int) -> Dict[str, Any]:
-        """
-        Simulates envs.reset() which internally calls get_initial_state.
-        This is what gets called from _execute_rollout().
-        """
-        print(f"🔄 Thread {thread_id}: envs.reset() called")
-
-        # This simulates the envs.reset() -> get_initial_state call chain
-        return await self.get_initial_state(thread_id)
-
-
-async def _execute_rollout(thread_id: int, client: EnvResetClient) -> Dict[str, Any]:
-    """
-    Simulates _execute_rollout() function that calls envs.reset().
-    This runs concurrently using asyncio, matching your actual pattern.
-    """
-    print(f"🚀 Rollout {thread_id}: _execute_rollout() started")
-
-    # This is where envs.reset() gets called
-    result = await client.envs_reset(thread_id)
-    return result
-
-
-async def run_burst_test(num_clients: int = 50, server_url: str = "http://localhost:8000"):
-    """
-    Run burst test simulating 50 concurrent _execute_rollout() calls.
-    Each one calls envs.reset() -> get_initial_state -> client.get()
-    """
-    print(f"🚀 Starting burst test with {num_clients} concurrent rollouts")
-    print(f"🎯 Target server: {server_url}")
-    print(f"📋 Pattern: _execute_rollout() -> envs.reset() -> get_initial_state -> client.get()")
-
-    client = EnvResetClient(server_url)
-
-    # Create tasks for concurrent rollouts (simulating your threading pattern)
-    start_time = time.time()
-    tasks = [_execute_rollout(i, client) for i in range(num_clients)]
-
-    # Run all rollouts concurrently
-    results = await asyncio.gather(*tasks, return_exceptions=True)
-
-    end_time = time.time()
-    total_duration = end_time - start_time
-
-    # Analyze results
-    successful = [r for r in results if isinstance(r, dict) and r.get("success")]
-    failed = [r for r in results if isinstance(r, dict) and not r.get("success")]
-    exceptions = [r for r in results if not isinstance(r, dict)]
-
-    print(f"\n📊 BURST TEST RESULTS:")
-    print(f"   Total rollouts: {num_clients}")
-    print(f"   Total time: {total_duration:.3f}s")
-    print(f"   Successful: {len(successful)}")
-    print(f"   Failed: {len(failed)}")
-    print(f"   Exceptions: {len(exceptions)}")
-
-    if successful:
-        avg_duration = sum(r["duration"] for r in successful) / len(successful)
-        min_duration = min(r["duration"] for r in successful)
-        max_duration = max(r["duration"] for r in successful)
-
-        print(f"   Average rollout duration: {avg_duration:.3f}s")
-        print(f"   Min rollout duration: {min_duration:.3f}s")
-        print(f"   Max rollout duration: {max_duration:.3f}s")
-
-        # Show sample successful result
-        sample = successful[0]
-        print(f"\n✅ Sample successful rollout:")
-        print(f"   Thread ID: {sample['thread_id']}")
-        print(f"   Initial state: {sample['initial_state']['observation']}")
-        print(f"   Timestamp: {sample['initial_state']['timestamp']}")
-
-    if failed:
-        print(f"\n❌ Sample failed rollouts:")
-        for fail in failed[:3]:  # Show first 3 failures
-            print(f"   Thread {fail['thread_id']}: {fail['error']}")
-
-    if exceptions:
-        print(f"\n💥 Sample exceptions:")
-        for exc in exceptions[:3]:  # Show first 3 exceptions
-            print(f"   {type(exc).__name__}: {exc}")
-
-    # Key test: If concurrent, should take ~1 second. If sequential, ~50 seconds.
-    if total_duration < 5:  # Allow some overhead
-        print(f"\n🎉 CONCURRENCY WORKING! Total time {total_duration:.3f}s (expected ~1s for concurrent)")
-    else:
-        print(f"\n⚠️  POSSIBLE SEQUENTIAL EXECUTION! Total time {total_duration:.3f}s (expected ~1s for concurrent)")
-
-    return len(successful) == num_clients
-
-
-def main():
-    """Run the burst test."""
-    import argparse
-
-    parser = argparse.ArgumentParser(
-        description="Envs Reset Burst Test - Simulates 50 rollouts calling get_initial_state"
-    )
-    parser.add_argument("--rollouts", type=int, default=50, help="Number of concurrent rollouts")
-    parser.add_argument("--server", default="http://localhost:8000", help="Server URL")
-
-    args = parser.parse_args()
-
-    success = asyncio.run(run_burst_test(args.rollouts, args.server))
-
-    if success:
-        print(f"\n🎉 ALL {args.rollouts} ROLLOUTS SUCCESSFUL!")
-        exit(0)
-    else:
-        print(f"\n💥 SOME ROLLOUTS FAILED!")
-        exit(1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test_simple_mcp_server.py b/test_simple_mcp_server.py
deleted file mode 100644
index c22920fa..00000000
--- a/test_simple_mcp_server.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-"""
-Simple MCP Server for Testing get_initial_state Concurrency
-Simulates the exact pattern: envs.reset() -> get_initial_state -> slow HTTP endpoint
-"""
-
-import asyncio
-import os
-import time
-
-from fastmcp import FastMCP
-from starlette.requests import Request
-from starlette.responses import JSONResponse
-
-# Create a simple MCP server
-mcp = FastMCP(name="TestServer")
-
-
-@mcp.custom_route("/control/initial_state", methods=["GET"])
-async def get_initial_state_endpoint(request: Request) -> JSONResponse:
-    """
-    Simulate the get_initial_state endpoint that's slow.
-    This mimics the pattern in your McpGym code.
-    """
-    print(f"🔍 get_initial_state called at {time.time()}")
-
-    # Simulate the slow operation (like environment initialization)
-    time.sleep(1)  # 1 second delay to test concurrency
-
-    # Return a dummy initial state
-    return JSONResponse({"observation": "dummy_initial_state", "session_id": "test_session", "timestamp": time.time()})
-
-
-@mcp.tool
-def dummy_tool() -> str:
-    """Dummy tool for MCP compatibility."""
-    return "dummy"
-
-
-def main():
-    """Run the test server."""
-    port = int(os.environ.get("PORT", 8000))
-    print(f"🚀 Starting get_initial_state test server on port {port}")
-    print(f"📡 Endpoint: http://localhost:{port}/control/initial_state")
-
-    # Use FastMCP 2.0 run method with streamable-http transport
-    mcp.run(transport="http", host="0.0.0.0", port=port)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/uv.lock b/uv.lock
index 6851c396..4a9008a2 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.13'",
@@ -1130,7 +1130,6 @@ dependencies = [
     { name = "deepdiff" },
     { name = "docstring-parser" },
     { name = "fastapi" },
-    { name = "fastmcp" },
     { name = "fsspec" },
     { name = "gymnasium" },
     { name = "httpx" },
@@ -1243,7 +1242,6 @@ requires-dist = [
     { name = "docstring-parser", specifier = ">=0.15" },
     { name = "e2b", marker = "extra == 'dev'" },
     { name = "fastapi", specifier = ">=0.116.1" },
-    { name = "fastmcp", specifier = ">=2.10.6" },
     { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" },
     { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" },
     { name = "fsspec" },

From 28932e45696394d13817d3894756011f572ad2c4 Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Sun, 10 Aug 2025 07:29:22 +0000
Subject: [PATCH 07/14] cleanup

---
 eval_protocol/mcp/client/connection.py | 92 +++++-------------------
 eval_protocol/mcp/execution/manager.py |  3 -
 eval_protocol/mcp/execution/policy.py  |  1 -
 eval_protocol/mcp/mcpgym.py            |  8 +--
 tests/pytest/test_tau_bench_airline.py | 98 ++++++++++++++------------
 5 files changed, 71 insertions(+), 131 deletions(-)

diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
index f2a96af6..ee68ae78 100644
--- a/eval_protocol/mcp/client/connection.py
+++ b/eval_protocol/mcp/client/connection.py
@@ -195,10 +195,6 @@ async def get_initial_state(self, session: MCPSession) -> Any:
         Returns:
             Initial observation/state
         """
-        method_start = time.time()
-        session_id_short = session.session_id[:8] if len(session.session_id) > 8 else session.session_id
-        logger.info(f"### 🌟 GET_INITIAL_STATE_START: timestamp: {method_start}, session_id: {session_id_short}...")
-
         if not session._mcp_session:
             raise RuntimeError("Session not initialized")
 
@@ -207,105 +203,53 @@ async def get_initial_state(self, session: MCPSession) -> Any:
 
         try:
             # Extract base URL and session ID from the MCP session
-            url_extract_start = time.time()
-            logger.info(
-                f"### 🔍 URL_EXTRACT_START: timestamp: {url_extract_start}, elapsed: {url_extract_start - method_start:.6f}s, session_id: {session_id_short}..."
-            )
-
             base_url = session.base_url.rstrip("/").removesuffix("/mcp")
             session_id = session.session_id
 
-            url_extract_end = time.time()
-            logger.info(
-                f"### 🔍 URL_EXTRACT_END: timestamp: {url_extract_end}, elapsed: {url_extract_end - method_start:.6f}s, duration: {url_extract_end - url_extract_start:.6f}s, base_url: {base_url}, session_id: {session_id_short}..."
-            )
-
             if session_id:
-                headers_start = time.time()
-                logger.info(
-                    f"### 🔍 HEADERS_CREATE_START: timestamp: {headers_start}, elapsed: {headers_start - method_start:.6f}s, session_id: {session_id_short}..."
-                )
-
                 headers = {"mcp-session-id": session_id}
 
-                headers_end = time.time()
-                logger.info(
-                    f"### 🔍 HEADERS_CREATE_END: timestamp: {headers_end}, elapsed: {headers_end - method_start:.6f}s, duration: {headers_end - headers_start:.6f}s, session_id: {session_id_short}..."
-                )
-
                 # Query initial state endpoint
                 try:
-                    timeout_start = time.time()
-                    logger.info(
-                        f"### 🔍 TIMEOUT_CONFIG_START: timestamp: {timeout_start}, elapsed: {timeout_start - method_start:.6f}s, session_id: {session_id_short}..."
-                    )
-
                     # Use shorter timeout for playback mode, longer timeout for high-concurrency initialization
                     # (50+ concurrent sessions need more time for initial state setup)
                     timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
 
-                    timeout_end = time.time()
-                    logger.info(
-                        f"### 🔍 TIMEOUT_CONFIG_END: timestamp: {timeout_end}, elapsed: {timeout_end - method_start:.6f}s, duration: {timeout_end - timeout_start:.6f}s, timeout: {timeout}s, session_id: {session_id_short}..."
-                    )
-
-                    # TIMING: Get shared client
-                    # client = await self._get_shared_client(timeout)
-
-                    # TIMING: HTTP request with shared client
-                    request_start = time.time()
-                    logger.info(
-                        f"### 🌐 HTTP_REQUEST_START: timestamp: {request_start}, elapsed: {request_start - method_start:.6f}s, url: {base_url}/control/initial_state, session_id: {session_id_short}..."
-                    )
-
-                    timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
-
                     async with httpx.AsyncClient(timeout=timeout) as client:
                         initial_state_response = await client.get(
                             f"{base_url}/control/initial_state",
                             headers=headers,
                             timeout=timeout,
                         )
-                        request_time = time.time() - request_start
-
-                        request_end = time.time()
-                        logger.info(
-                            f"### 🌐 HTTP_REQUEST_END: timestamp: {request_end}, elapsed: {request_end - method_start:.6f}s, duration: {request_time:.6f}s, status_code: {initial_state_response.status_code}, session_id: {session_id_short}..."
-                        )
 
                         if initial_state_response.status_code == 200:
                             initial_observation = initial_state_response.json()
-                            success_end = time.time()
                             logger.info(
-                                f"### ✅ RETURN: timestamp: {success_end}, total_duration: {success_end - method_start:.6f}s, session_id: {session_id_short}..."
+                                f"Session {session.session_id}: ✅ Successfully fetched session-aware initial state from control plane endpoint"
                             )
-                            # return initial_observation
                         else:
-                            error_time = time.time()
                             logger.warning(
-                                f"### ⚠️ HTTP_ERROR_RESPONSE: timestamp: {error_time}, elapsed: {error_time - method_start:.6f}s, status_code: {initial_state_response.status_code}, session_id: {session_id_short}"
+                                f"Control plane initial state endpoint returned {initial_state_response.status_code}"
                             )
                 except httpx.TimeoutException:
-                    timeout_error_time = time.time()
-                    logger.warning(
-                        f"### ⏰ HTTP_TIMEOUT: timestamp: {timeout_error_time}, elapsed: {timeout_error_time - method_start:.6f}s, timeout: {timeout}s, session_id: {session_id_short}"
-                    )
+                    logger.warning(f"Control plane initial state endpoint timed out after {timeout}s")
                 except Exception as e:
-                    http_error_time = time.time()
-                    logger.warning(
-                        f"### ❌ HTTP_ERROR: timestamp: {http_error_time}, elapsed: {http_error_time - method_start:.6f}s, error: {str(e)}, session_id: {session_id_short}"
-                    )
-
+                    logger.warning(f"Failed to query control plane initial state endpoint: {e}")
         except Exception as e:
-            general_error_time = time.time()
-            logger.warning(
-                f"### ❌ GENERAL_ERROR: timestamp: {general_error_time}, elapsed: {general_error_time - method_start:.6f}s, error: {str(e)}, session_id: {session_id_short}"
-            )
-
-        method_end = time.time()
-        logger.info(
-            f"### 🔴 GET_INITIAL_STATE_END: timestamp: {method_end}, total_duration: {method_end - method_start:.6f}s, session_id: {session_id_short}..."
-        )
+            logger.warning(f"Failed to query control plane initial state endpoint: {e}")
+
+        # Fallback to MCP resource if control plane endpoint fails (backward compatibility)
+        if initial_observation is None:
+            logger.debug(f"Session {session.session_id}: Falling back to MCP resource for initial state")
+            initial_observation = await self._get_initial_state_from_mcp_resource(session)
+
+        # Ensure we have some observation
+        if initial_observation is None:
+            logger.debug(f"Session {session.session_id}: Using default initial state")
+            initial_observation = {
+                "observation": "default_initial_state",
+                "session_id": session.session_id,
+            }
 
         return initial_observation
 
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index af542300..94e06f8a 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -207,9 +207,6 @@ async def _execute_rollout(
         """
         session = envs.sessions[rollout_idx]
         dataset_row = envs.dataset_rows[rollout_idx]
-        rollout_start = time.time()
-        elapsed_from_main_start = rollout_start - start_time
-        logger.info(f"DEBUG4. Starting rollout {dataset_row.id} at {rollout_start}")
 
         # Initialize trajectory
         trajectory = Trajectory(
diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
index a92ec662..06233c4b 100644
--- a/eval_protocol/mcp/execution/policy.py
+++ b/eval_protocol/mcp/execution/policy.py
@@ -19,7 +19,6 @@
 from .base_policy import LLMBasePolicy
 
 logger = logging.getLogger(__name__)
-litellm._turn_on_debug()
 
 
 class LiteLLMPolicy(LLMBasePolicy):
diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py
index c2e319c0..2386c324 100644
--- a/eval_protocol/mcp/mcpgym.py
+++ b/eval_protocol/mcp/mcpgym.py
@@ -107,9 +107,6 @@ def __init__(
             host="0.0.0.0",
             port=int(os.environ.get("PORT", 8000)),
         )
-        # Store host and port for later use in run() method
-        self.host = "0.0.0.0"
-        self.port = int(os.environ.get("PORT", 8000))
 
         # Multi-session support
         self.sessions = {}  # session_id -> {"env": env, "obs": obs, "session_data": data}
@@ -497,11 +494,10 @@ async def get_initial_state_endpoint(self, session_data: Dict[str, Any]) -> Dict
                 }
         else:
             # Fallback if session data is not available
-            result = {
+            return {
                 "observation": "session_not_initialized",
                 "session_id": session_data.get("session_id", "unknown"),
             }
-            return result
 
     def _get_session_control_plane_from_data(self, session_data: Dict[str, Any]) -> Dict[str, Any]:
         """Extract control plane state from session data."""
@@ -570,7 +566,7 @@ async def run_with_high_concurrency():
                     starlette_app,
                     host=self.mcp.settings.host,
                     port=self.mcp.settings.port,
-                    log_level=self.mcp.settings.log_level.lower(),  # Use default log level instead of accessing settings
+                    log_level=self.mcp.settings.log_level.lower(),
                     proxy_headers=True,
                     forwarded_allow_ips="*",
                     # HIGH CONCURRENCY SETTINGS
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index 7628cc2e..cbdaaaf6 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -58,7 +58,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
 
         rows.append(eval_row)
 
-    return rows
+    return rows[0:3]
 
 
 @evaluation_test(
@@ -139,23 +139,27 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
         id="Filler", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="Filler")
     )  # id and user_scenario are required for the Task type but not used in calculating reward
 
-    env_reward_info = EnvironmentEvaluator.calculate_reward(
-        environment_constructor=registry.get_env_constructor("airline"),
-        task=task,
-        full_trajectory=trajectory_objects,
-    )
-    # action_reward_info = ActionEvaluator.calculate_reward(
-    #     task=task,
-    #     full_trajectory=trajectory_objects,
-    # )
-    communicate_reward_info = CommunicateEvaluator.calculate_reward(
-        task=task,
-        full_trajectory=trajectory_objects,
-    )
-    # nl_reward_info = NLAssertionsEvaluator.calculate_reward(
-    #     task=task,
-    #     full_trajectory=trajectory_objects,
-    # )
+    if RewardType.DB in task.evaluation_criteria.reward_basis:
+        env_reward_info = EnvironmentEvaluator.calculate_reward(
+            environment_constructor=registry.get_env_constructor("airline"),
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+    if RewardType.ACTION in task.evaluation_criteria.reward_basis:
+        action_reward_info = ActionEvaluator.calculate_reward(
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+    if RewardType.COMMUNICATE in task.evaluation_criteria.reward_basis:
+        communicate_reward_info = CommunicateEvaluator.calculate_reward(
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
+    if RewardType.NL_ASSERTION in task.evaluation_criteria.reward_basis:
+        nl_reward_info = NLAssertionsEvaluator.calculate_reward(
+            task=task,
+            full_trajectory=trajectory_objects,
+        )
 
     reward = 1.0
     env_bases = {RewardType.DB, RewardType.ENV_ASSERTION}
@@ -169,14 +173,14 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
         if env_reward_info.reward_breakdown is not None:
             reward_breakdown.update(env_reward_info.reward_breakdown)
         reward *= env_reward_info.reward
-    # if task_reward_basis & action_bases:
-    #     if action_reward_info.reward_breakdown is not None:
-    #         reward_breakdown.update(action_reward_info.reward_breakdown)
-    #     reward *= action_reward_info.reward
-    # if task_reward_basis & nl_bases:
-    #     if nl_reward_info.reward_breakdown is not None:
-    #         reward_breakdown.update(nl_reward_info.reward_breakdown)
-    #     reward *= nl_reward_info.reward
+    if task_reward_basis & action_bases:
+        if action_reward_info.reward_breakdown is not None:
+            reward_breakdown.update(action_reward_info.reward_breakdown)
+        reward *= action_reward_info.reward
+    if task_reward_basis & nl_bases:
+        if nl_reward_info.reward_breakdown is not None:
+            reward_breakdown.update(nl_reward_info.reward_breakdown)
+        reward *= nl_reward_info.reward
     if task_reward_basis & comm_bases:
         if communicate_reward_info.reward_breakdown is not None:
             reward_breakdown.update(communicate_reward_info.reward_breakdown)
@@ -188,27 +192,27 @@ def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
     if task_reward_basis & env_bases and env_reward_info.reward == 0:
         failed_reasons.append("❌ Environment/DB check failed")
 
-    # if task_reward_basis & action_bases and action_reward_info.reward == 0:
-    #     failed_actions = []
-    #     if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
-    #         failed_actions = [
-    #             f"{ac.action.name}({ac.action.arguments})"
-    #             for ac in action_reward_info.action_checks
-    #             if not ac.action_match
-    #         ]
-    #     if failed_actions:
-    #         failed_reasons.append(f"❌ Failed actions: {failed_actions}")
-    #     else:
-    #         failed_reasons.append("❌ Actions failed")
-
-    # if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
-    #     failed_nl = []
-    #     if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
-    #         failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
-    #     if failed_nl:
-    #         failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}")
-    #     else:
-    #         failed_reasons.append("❌ NL Assertions failed")
+    if task_reward_basis & action_bases and action_reward_info.reward == 0:
+        failed_actions = []
+        if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
+            failed_actions = [
+                f"{ac.action.name}({ac.action.arguments})"
+                for ac in action_reward_info.action_checks
+                if not ac.action_match
+            ]
+        if failed_actions:
+            failed_reasons.append(f"❌ Failed actions: {failed_actions}")
+        else:
+            failed_reasons.append("❌ Actions failed")
+
+    if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
+        failed_nl = []
+        if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
+            failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
+        if failed_nl:
+            failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}")
+        else:
+            failed_reasons.append("❌ NL Assertions failed")
 
     if task_reward_basis & comm_bases and communicate_reward_info.reward == 0:
         failed_comm = []

From 9af5b0c80556e19ff212690060bd60957283780d Mon Sep 17 00:00:00 2001
From: Derek Xu <derek@fireworks.ai>
Date: Sun, 10 Aug 2025 07:32:56 +0000
Subject: [PATCH 08/14] Trim imports, tidy initial-state logging, evaluate all airline rows

---
 eval_protocol/mcp/client/connection.py | 5 ++---
 eval_protocol/mcp/execution/manager.py | 5 ++---
 eval_protocol/mcp/mcpgym.py            | 1 -
 tests/pytest/test_tau_bench_airline.py | 2 +-
 4 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
index ee68ae78..feca11ab 100644
--- a/eval_protocol/mcp/client/connection.py
+++ b/eval_protocol/mcp/client/connection.py
@@ -214,14 +214,12 @@ async def get_initial_state(self, session: MCPSession) -> Any:
                     # Use shorter timeout for playback mode, longer timeout for high-concurrency initialization
                     # (50+ concurrent sessions need more time for initial state setup)
                     timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
-
                     async with httpx.AsyncClient(timeout=timeout) as client:
                         initial_state_response = await client.get(
                             f"{base_url}/control/initial_state",
                             headers=headers,
                             timeout=timeout,
                         )
-
                         if initial_state_response.status_code == 200:
                             initial_observation = initial_state_response.json()
                             logger.info(
@@ -234,7 +232,8 @@ async def get_initial_state(self, session: MCPSession) -> Any:
                 except httpx.TimeoutException:
                     logger.warning(f"Control plane initial state endpoint timed out after {timeout}s")
                 except Exception as e:
-                    logger.warning(f"Failed to query control plane initial state endpoint: {e}")
+                    logger.warning(f"Failed to query initial state endpoint: {e}")
+
         except Exception as e:
             logger.warning(f"Failed to query control plane initial state endpoint: {e}")
 
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index 94e06f8a..fce561c7 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -11,9 +11,7 @@
 import os
 import threading
 import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import asdict, dataclass
-from datetime import datetime
+from dataclasses import asdict
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 from openai.types import CompletionUsage
@@ -260,6 +258,7 @@ async def _execute_rollout(
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": user_prompt},
             ]
+
             logger.info(f"🎯 Starting rollout {rollout_idx} in thread {threading.current_thread().name}")
 
             # Run rollout loop for this specific environment
diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py
index 2386c324..cf942a0f 100644
--- a/eval_protocol/mcp/mcpgym.py
+++ b/eval_protocol/mcp/mcpgym.py
@@ -20,7 +20,6 @@
 import logging
 import os
 import threading
-import time
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from datetime import date, datetime
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index cbdaaaf6..5bb025b6 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -58,7 +58,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
 
         rows.append(eval_row)
 
-    return rows[0:3]
+    return rows
 
 
 @evaluation_test(

From 8de812e988501c4931ea94c06d5eac4c1cd97d8e Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sun, 10 Aug 2025 02:27:14 -0700
Subject: [PATCH 09/14] Add EvaluationThreshold with standard deviation check across runs

---
 eval_protocol/models.py                 | 17 ++++++-
 eval_protocol/pytest/evaluation_test.py | 65 +++++++++++++++++--------
 tests/pytest/test_tau_bench_airline.py  |  4 +-
 3 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 583985b4..42233cbd 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
     )
 
 
+class EvaluationThreshold(BaseModel):
+    """Threshold configuration for evaluation tests.
+
+    The success field is required - tests must specify a minimum success rate.
+    The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
+    """
+
+    success: float = Field(
+        ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    )
+    standard_deviation: Optional[float] = Field(
+        None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    )
+
+
 class EvalMetadata(BaseModel):
     """Metadata about the evaluation that was run."""
 
@@ -216,7 +231,7 @@ class EvalMetadata(BaseModel):
     )
     num_runs: int = Field(..., description="Number of times the evaluation was repeated")
     aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
-    threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
+    threshold: Optional[EvaluationThreshold] = Field(None, description="Threshold configuration for test success")
     passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
 
 
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 245467bb..49b382d2 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -1,11 +1,13 @@
+import copy
 import inspect
 import os
-from typing import Any, Callable, Dict, List, Optional
+import statistics
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import pytest
 
 from eval_protocol.dataset_logger import default_logger
-from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata
+from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, EvaluationThreshold, InputMetadata
 from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
 from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
 from eval_protocol.pytest.types import (
@@ -40,7 +42,7 @@ def evaluation_test(
     rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
     evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
     aggregation_method: AggregationMethod = "mean",
-    threshold_of_success: Optional[float] = None,
+    threshold: Optional[EvaluationThreshold] = None,
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
@@ -68,8 +70,8 @@ def evaluation_test(
         rollout_processor: Function used to perform the rollout.
         evaluation_test_kwargs: Kwargs for the evaluation function.
         aggregation_method: How to aggregate scores across rows.
-        threshold_of_success: If set, fail the test if the aggregated score is
-            below this threshold.
+        threshold: Threshold configuration for test success.
+            Success rate must be above success, and if set, standard deviation must be below standard_deviation.
         num_runs: Number of times to repeat the evaluation.
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -194,7 +196,7 @@ def create_wrapper_with_signature() -> Callable:
             def wrapper_body(**kwargs):
                 model_name = kwargs["model"]
                 eval_metadata = None
-                all_results: List[EvaluationRow] = []
+                all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
 
                 try:
                     # Handle dataset loading
@@ -218,7 +220,7 @@ def wrapper_body(**kwargs):
                         status="running",
                         num_runs=num_runs,
                         aggregation_method=aggregation_method,
-                        threshold_of_success=threshold_of_success,
+                        threshold=threshold,
                         passed=None,
                     )
 
@@ -255,9 +257,13 @@ def wrapper_body(**kwargs):
                         server_script_path=server_script_path,
                         steps=steps,
                     )
-                    input_dataset = execute_function(rollout_processor, rows=data, config=config)
 
-                    for _ in range(num_runs):
+                    for i in range(num_runs):
+                        # Regenerate outputs each run by deep-copying the pristine dataset
+                        # so model responses are not reused across runs.
+                        fresh_rows = [copy.deepcopy(r) for r in data]
+                        input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config)
+
                         if mode == "pointwise":
                             # Pointwise mode: apply the evaluator function to each row
                             for row in input_dataset:
@@ -270,7 +276,7 @@ def wrapper_body(**kwargs):
                                     raise ValueError(
                                         f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                                     )
-                                all_results.append(result)
+                                all_results[i].append(result)
                         else:
                             # Batch mode: call the test function with the full dataset
                             results = execute_with_params(
@@ -294,28 +300,45 @@ def wrapper_body(**kwargs):
                                 raise ValueError(
                                     f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
                                 )
-                            all_results.extend(results)
+                            all_results[i] = results
 
-                    scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
+                    scores = [
+                        sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
+                        for result in all_results
+                    ]
                     agg_score = aggregate(scores, aggregation_method)
+                    score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
 
                     # Determine if the evaluation passed based on threshold
                     passed = None
-                    if threshold_of_success is not None:
-                        passed = agg_score >= threshold_of_success
+
+                    if threshold is not None:
+                        success_passed, std_passed = True, True
+
+                        success_passed = agg_score >= threshold["success"]
+
+                        if threshold["standard_deviation"] is not None:
+                            std_passed = score_std <= threshold["standard_deviation"]
+
+                        passed = success_passed and std_passed
 
                     # Update eval metadata status and passed field for all results
-                    for r in all_results:
-                        if r.eval_metadata is not None:
-                            r.eval_metadata.status = "finished"
-                            r.eval_metadata.passed = passed
+                    for result in all_results:
+                        for r in result:
+                            if r.eval_metadata is not None:
+                                r.eval_metadata.status = "finished"
+                                r.eval_metadata.passed = passed
                         default_logger.log(r)
 
                     # Check threshold after logging
-                    if threshold_of_success is not None and not passed:
+                    if threshold is not None and not passed:
                         assert (
-                            agg_score >= threshold_of_success
-                        ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
+                            agg_score >= threshold["success"]
+                        ), f"Aggregated score {agg_score:.3f} below threshold {threshold["success"]}"
+                        if threshold["standard_deviation"] is not None:
+                            assert (
+                                score_std <= threshold["standard_deviation"]
+                            ), f"Standard deviation {score_std:.3f} above threshold {threshold["standard_deviation"]}"
 
                 except Exception as e:
                     # Update eval metadata status to error and log it
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index 5bb025b6..cdee3463 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -67,8 +67,8 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
     rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "high"}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold_of_success=0.4,
-    num_runs=1,
+    threshold={"success": 0.4, "standard_deviation": 0.03},
+    num_runs=4,
     mode="pointwise",
     max_concurrent_rollouts=50,
     server_script_path="examples/tau2_mcp/server.py",

From 70c2cac6666771ef2bc2a83b54b594f739033f1b Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sun, 10 Aug 2025 02:52:20 -0700
Subject: [PATCH 10/14] Normalize threshold argument into EvaluationThreshold

---
 eval_protocol/pytest/evaluation_test.py | 28 +++++++++++++++----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 49b382d2..4ddd499d 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -86,6 +86,12 @@ def evaluation_test(
     def decorator(
         test_func: TestFunction,
     ):
+        if threshold is not None:
+            if isinstance(threshold, dict):
+                evaluation_threshold = EvaluationThreshold(**threshold)
+            elif isinstance(threshold, float):
+                evaluation_threshold = EvaluationThreshold(success=threshold)
+
         sig = inspect.signature(test_func)
 
         # For pointwise/rowwise mode, we expect a different signature
@@ -220,7 +226,7 @@ def wrapper_body(**kwargs):
                         status="running",
                         num_runs=num_runs,
                         aggregation_method=aggregation_method,
-                        threshold=threshold,
+                        threshold=evaluation_threshold,
                         passed=None,
                     )
 
@@ -312,13 +318,13 @@ def wrapper_body(**kwargs):
                     # Determine if the evaluation passed based on threshold
                     passed = None
 
-                    if threshold is not None:
+                    if evaluation_threshold is not None:
                         success_passed, std_passed = True, True
 
-                        success_passed = agg_score >= threshold["success"]
+                        success_passed = agg_score >= evaluation_threshold.success
 
-                        if threshold["standard_deviation"] is not None:
-                            std_passed = score_std <= threshold["standard_deviation"]
+                        if evaluation_threshold.standard_deviation is not None:
+                            std_passed = score_std <= evaluation_threshold.standard_deviation
 
                         passed = success_passed and std_passed
 
@@ -331,14 +337,14 @@ def wrapper_body(**kwargs):
                         default_logger.log(r)
 
                     # Check threshold after logging
-                    if threshold is not None and not passed:
+                    if evaluation_threshold is not None and not passed:
                         assert (
-                            agg_score >= threshold["success"]
-                        ), f"Aggregated score {agg_score:.3f} below threshold {threshold["success"]}"
-                        if threshold["standard_deviation"] is not None:
+                            agg_score >= evaluation_threshold.success
+                        ), f"Aggregated score {agg_score:.3f} below threshold {evaluation_threshold.success}"
+                        if evaluation_threshold.standard_deviation is not None:
                             assert (
-                                score_std <= threshold["standard_deviation"]
-                            ), f"Standard deviation {score_std:.3f} above threshold {threshold["standard_deviation"]}"
+                                score_std <= evaluation_threshold.standard_deviation
+                            ), f"Standard deviation {score_std:.3f} above threshold {evaluation_threshold.standard_deviation}"
 
                 except Exception as e:
                     # Update eval metadata status to error and log it

From a8f80feb39eb81e3119a23a02bb131c9d477d222 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sun, 10 Aug 2025 02:58:54 -0700
Subject: [PATCH 11/14] Migrate remaining tests from threshold_of_success to threshold

---
 tests/pytest/test_apps_coding.py              | 15 +++-----
 tests/pytest/test_basic_coding.py             | 38 +++++++++----------
 tests/pytest/test_frozen_lake.py              |  4 +-
 tests/pytest/test_hallucination.py            |  2 +-
 tests/pytest/test_lunar_lander.py             | 30 ++++++++-------
 tests/pytest/test_markdown_highlighting.py    |  2 +-
 tests/pytest/test_pytest_math_example.py      |  2 +-
 .../pytest/test_pytest_math_format_length.py  |  2 +-
 .../pytest/test_pytest_word_count_example.py  |  2 +-
 9 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
index 4780388a..3441ef02 100644
--- a/tests/pytest/test_apps_coding.py
+++ b/tests/pytest/test_apps_coding.py
@@ -18,10 +18,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     Convert entries from APPS dataset to EvaluationRow objects.
     """
     return [
-        EvaluationRow(
-            messages=[Message(role="user", content=row["question"])],
-            ground_truth=row["input_output"]
-        )
+        EvaluationRow(messages=[Message(role="user", content=row["question"])], ground_truth=row["input_output"])
         for row in data
     ]
 
@@ -31,7 +28,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     dataset_adapter=apps_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.33,
+    threshold=0.33,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
@@ -42,7 +39,7 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
 
     Args:
         row: EvaluationRow containing the conversation messages and ground_truth as JSON string
-    
+
     Returns:
         EvaluationRow with the evaluation result
     """
@@ -51,8 +48,8 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
         messages=row.messages,
         ground_truth=row.ground_truth,
     )
-    
+
     # Set the evaluation result on the row
     row.evaluation_result = result
-    
-    return row 
\ No newline at end of file
+
+    return row
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
index 35d1a1b3..76f6df99 100644
--- a/tests/pytest/test_basic_coding.py
+++ b/tests/pytest/test_basic_coding.py
@@ -9,7 +9,7 @@
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, Message
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
-from eval_protocol.rewards.code_execution import extract_code_blocks, execute_python_code
+from eval_protocol.rewards.code_execution import execute_python_code, extract_code_blocks
 
 
 def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -18,8 +18,8 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     """
     return [
         EvaluationRow(
-            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")], 
-            ground_truth=row["expected_output"]
+            messages=[Message(role="user", content=f"{row['prompt']} Input: {row['input']}")],
+            ground_truth=row["expected_output"],
         )
         for row in data
     ]
@@ -30,7 +30,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     dataset_adapter=coding_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.8,
+    threshold=0.8,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
@@ -38,16 +38,16 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
 def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Evaluation function that tests code correctness by executing it locally.
-    
+
     This function:
     1. Extracts Python code from the assistant's response
     2. Executes the code locally with timeout=10
     3. Compares the output to ground_truth
     4. Returns a score of 1.0 if output matches, 0.0 otherwise
-    
+
     Args:
         row: EvaluationRow containing the conversation messages and expected_output in ground_truth
-        
+
     Returns:
         EvaluationRow with the evaluation result
     """
@@ -55,38 +55,34 @@ def test_coding_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     if len(row.messages) < 2 or row.messages[-1].role != "assistant":
         row.evaluation_result = EvaluateResult(score=0.0, reason="No assistant response found")
         return row
-    
+
     assistant_content = row.messages[-1].content or ""
     expected_output = (row.ground_truth or "").strip()
-    
+
     # Extract Python code blocks
     code_blocks = extract_code_blocks(assistant_content, language="python")
     if not code_blocks:
         row.evaluation_result = EvaluateResult(score=0.0, reason="No Python code block found")
         return row
-    
+
     code = code_blocks[0]["code"]
-    
+
     # Execute the code locally
     execution_result = execute_python_code(code, timeout=10)
-    
+
     if not execution_result.get("success", False):
         error_msg = execution_result.get("error", "Code execution failed")
         row.evaluation_result = EvaluateResult(score=0.0, reason=f"Execution error: {error_msg}")
         return row
-    
+
     # Compare output with expected
     actual_output = (execution_result.get("output", "") or "").strip()
-    
+
     if actual_output == expected_output:
-        row.evaluation_result = EvaluateResult(
-            score=1.0, 
-            reason=f"✅ Output matches: '{actual_output}'"
-        )
+        row.evaluation_result = EvaluateResult(score=1.0, reason=f"✅ Output matches: '{actual_output}'")
     else:
         row.evaluation_result = EvaluateResult(
-            score=0.0, 
-            reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
+            score=0.0, reason=f"❌ Expected: '{expected_output}', Got: '{actual_output}'"
         )
-    
+
     return row
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
index 76551920..d53f3e6b 100644
--- a/tests/pytest/test_frozen_lake.py
+++ b/tests/pytest/test_frozen_lake.py
@@ -7,7 +7,7 @@
 
 from typing import Any, Dict, List
 
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams, MetricResult
+from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 
@@ -41,7 +41,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold_of_success=0.66,
+    threshold=0.66,
     num_runs=1,
     max_concurrent_rollouts=3,
     mode="pointwise",
diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
index b396e12c..2baf4a63 100644
--- a/tests/pytest/test_hallucination.py
+++ b/tests/pytest/test_hallucination.py
@@ -35,7 +35,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
     rollout_processor=default_single_turn_rollout_processor,
-    threshold_of_success=0.33,
+    threshold=0.33,
     num_runs=1,
     mode="pointwise",
 )
diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py
index 896adc49..39d1650d 100644
--- a/tests/pytest/test_lunar_lander.py
+++ b/tests/pytest/test_lunar_lander.py
@@ -7,7 +7,7 @@
 
 from typing import Any, Dict, List
 
-from eval_protocol.models import EvaluateResult, EvaluationRow, Message, InputMetadata, CompletionParams
+from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
 from eval_protocol.pytest import evaluation_test
 from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
 
@@ -17,7 +17,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     Convert entries from lunar lander dataset to EvaluationRow objects.
     """
     rows = []
-    
+
     for row in data:
         eval_row = EvaluationRow(
             messages=[Message(role="system", content=row["system_prompt"])],
@@ -26,12 +26,12 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
                 dataset_info={
                     "environment_context": row["environment_context"],
                     "user_prompt_template": row["user_prompt_template"],
-                }
-            )
+                },
+            ),
         )
-        
+
         rows.append(eval_row)
-    
+
     return rows
 
 
@@ -41,7 +41,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     model=["gpt-4.1"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold_of_success=0.0,
+    threshold=0.0,
     num_runs=1,
     mode="pointwise",
     max_concurrent_rollouts=3,
@@ -51,24 +51,28 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
 def test_lunar_lander_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
     Test lunar lander evaluation using the pytest framework.
-    
+
     This test evaluates how well the model can control the lunar lander to achieve
     a successful landing by checking the final reward and termination status.
-    
+
     Args:
         row: EvaluationRow object from lunar lander dataset
-        
+
     Returns:
         EvaluationRow object with evaluation results
     """
     score = row.get_total_reward()
 
     evaluation_score = 1.0 if score >= 200 else 0.0
-    reason = f"✅ Successful landing with reward {score:.2f}" if score >= 200 else f"❌ Failed landing with reward {score:.2f}"
+    reason = (
+        f"✅ Successful landing with reward {score:.2f}"
+        if score >= 200
+        else f"❌ Failed landing with reward {score:.2f}"
+    )
 
     row.evaluation_result = EvaluateResult(
         score=evaluation_score,
         reason=reason,
     )
-    
-    return row 
\ No newline at end of file
+
+    return row
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index cc2ae4f5..446aac00 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -26,7 +26,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
     dataset_adapter=markdown_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=0.5,
+    threshold=0.5,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
index 05b0022c..6c2fcad4 100644
--- a/tests/pytest/test_pytest_math_example.py
+++ b/tests/pytest/test_pytest_math_example.py
@@ -11,7 +11,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold_of_success=0.0,
+    threshold=0.0,
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",
     evaluation_test_kwargs=[
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
index fbc59efc..9385db2b 100644
--- a/tests/pytest/test_pytest_math_format_length.py
+++ b/tests/pytest/test_pytest_math_format_length.py
@@ -14,7 +14,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold_of_success=0.0,
+    threshold=0.0,
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",
     evaluation_test_kwargs=[
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index 51062a1f..44fe7237 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -11,7 +11,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold_of_success=0.3,  # Reasonable threshold for word count evaluation
+    threshold=0.3,  # Reasonable threshold for word count evaluation
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",  # Use pointwise mode for elegant row-by-row evaluation
 )

From 2694ebdecd0d5a6d80e7161ca6ffe33ba2fda16f Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sun, 10 Aug 2025 19:15:06 -0700
Subject: [PATCH 12/14] updated model

---
 eval_protocol/models.py                       |  4 +-
 eval_protocol/pytest/evaluation_test.py       | 37 ++++++++++---------
 tests/pytest/test_apps_coding.py              |  2 +-
 tests/pytest/test_basic_coding.py             |  2 +-
 tests/pytest/test_frozen_lake.py              |  2 +-
 tests/pytest/test_hallucination.py            |  2 +-
 tests/pytest/test_lunar_lander.py             |  2 +-
 tests/pytest/test_markdown_highlighting.py    |  2 +-
 tests/pytest/test_pytest_math_example.py      |  2 +-
 .../pytest/test_pytest_math_format_length.py  |  2 +-
 .../pytest/test_pytest_word_count_example.py  |  2 +-
 tests/pytest/test_tau_bench_airline.py        |  6 +--
 12 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 2cfdc049..a70fddab 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -231,7 +231,9 @@ class EvalMetadata(BaseModel):
     )
     num_runs: int = Field(..., description="Number of times the evaluation was repeated")
     aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
-    threshold: Optional[EvaluationThreshold] = Field(None, description="Threshold configuration for test success")
+    passed_threshold: Optional[EvaluationThreshold] = Field(
+        None, description="Threshold configuration for test success"
+    )
     passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
 
 
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 0867f263..eb950c37 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -53,7 +53,7 @@ def evaluation_test(  # noqa: C901
     rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
     evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
     aggregation_method: AggregationMethod = "mean",
-    threshold: Optional[EvaluationThreshold] = None,
+    passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
@@ -113,7 +113,7 @@ def evaluation_test(  # noqa: C901
         rollout_processor: Function used to perform the rollout.
         evaluation_test_kwargs: Kwargs for the evaluation function.
         aggregation_method: How to aggregate scores across rows.
-        threshold: Threshold configuration for test success.
+        passed_threshold: Threshold configuration for test success.
             Success rate must be above success, and if set, standard deviation must be below standard_deviation.
         num_runs: Number of times to repeat the rollout and evaluations.
         max_dataset_rows: Limit dataset to the first N rows.
@@ -129,11 +129,11 @@ def evaluation_test(  # noqa: C901
     def decorator(
         test_func: TestFunction,
     ):
-        if threshold is not None:
-            if isinstance(threshold, dict):
-                evaluation_threshold = EvaluationThreshold(**threshold)
-            elif isinstance(threshold, float):
-                evaluation_threshold = EvaluationThreshold(success=threshold)
+        if passed_threshold is not None:
+            if isinstance(passed_threshold, float):
+                threshold = EvaluationThreshold(success=passed_threshold)
+            else:
+                threshold = EvaluationThreshold(**passed_threshold)
 
         sig = inspect.signature(test_func)
 
@@ -361,7 +361,7 @@ def _log_eval_error(
                         status="running",
                         num_runs=num_runs,
                         aggregation_method=aggregation_method,
-                        threshold=evaluation_threshold,
+                        passed_threshold=threshold,
                         passed=None,
                     )
 
@@ -459,6 +459,7 @@ def _log_eval_error(
                         sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
                         for result in all_results
                     ]
+                    print(f"SCORES: {scores}")
                     agg_score = aggregate(scores, aggregation_method)
                     score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
 
@@ -495,13 +496,13 @@ def _log_eval_error(
                     # Determine if the evaluation passed based on threshold
                     passed = None
 
-                    if evaluation_threshold is not None:
+                    if threshold is not None:
                         success_passed, std_passed = True, True
 
-                        success_passed = agg_score >= evaluation_threshold.success
+                        success_passed = agg_score >= threshold.success
 
-                        if evaluation_threshold.standard_deviation is not None:
-                            std_passed = score_std <= evaluation_threshold.standard_deviation
+                        if threshold.standard_deviation is not None:
+                            std_passed = score_std <= threshold.standard_deviation
 
                         passed = success_passed and std_passed
 
@@ -636,14 +637,14 @@ def _extract_effort_tag(params: dict) -> str | None:
                         pass
 
                     # Check threshold after logging
-                    if evaluation_threshold is not None and not passed:
+                    if threshold is not None and not passed:
                         assert (
-                            agg_score >= evaluation_threshold.success
-                        ), f"Aggregated score {agg_score:.3f} below threshold {evaluation_threshold.success}"
-                        if evaluation_threshold.standard_deviation is not None:
+                            agg_score >= threshold.success
+                        ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
+                        if threshold.standard_deviation is not None:
                             assert (
-                                score_std <= evaluation_threshold.standard_deviation
-                            ), f"Standard deviation {score_std:.3f} above threshold {evaluation_threshold.standard_deviation}"
+                                score_std <= threshold.standard_deviation
+                            ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
 
                 except AssertionError:
                     _log_eval_error("finished", data if "data" in locals() else None, passed=False)
diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py
index 3441ef02..f9c84695 100644
--- a/tests/pytest/test_apps_coding.py
+++ b/tests/pytest/test_apps_coding.py
@@ -28,7 +28,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     dataset_adapter=apps_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold=0.33,
+    passed_threshold=0.33,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py
index 76f6df99..c96a8302 100644
--- a/tests/pytest/test_basic_coding.py
+++ b/tests/pytest/test_basic_coding.py
@@ -30,7 +30,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat
     dataset_adapter=coding_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold=0.8,
+    passed_threshold=0.8,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py
index d53f3e6b..74d5e317 100644
--- a/tests/pytest/test_frozen_lake.py
+++ b/tests/pytest/test_frozen_lake.py
@@ -41,7 +41,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold=0.66,
+    passed_threshold=0.66,
     num_runs=1,
     max_concurrent_rollouts=3,
     mode="pointwise",
diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py
index 2baf4a63..54779f09 100644
--- a/tests/pytest/test_hallucination.py
+++ b/tests/pytest/test_hallucination.py
@@ -35,7 +35,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}],
     rollout_processor=default_single_turn_rollout_processor,
-    threshold=0.33,
+    passed_threshold=0.33,
     num_runs=1,
     mode="pointwise",
 )
diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py
index 39d1650d..ab4dad69 100644
--- a/tests/pytest/test_lunar_lander.py
+++ b/tests/pytest/test_lunar_lander.py
@@ -41,7 +41,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio
     model=["gpt-4.1"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold=0.0,
+    passed_threshold=0.0,
     num_runs=1,
     mode="pointwise",
     max_concurrent_rollouts=3,
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index 855105c1..89302163 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -30,7 +30,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
     dataset_adapter=markdown_dataset_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold=0.5,
+    passed_threshold=0.5,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
index 6c2fcad4..afe74a4e 100644
--- a/tests/pytest/test_pytest_math_example.py
+++ b/tests/pytest/test_pytest_math_example.py
@@ -11,7 +11,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold=0.0,
+    passed_threshold=0.0,
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",
     evaluation_test_kwargs=[
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
index 9385db2b..e51b062f 100644
--- a/tests/pytest/test_pytest_math_format_length.py
+++ b/tests/pytest/test_pytest_math_format_length.py
@@ -14,7 +14,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold=0.0,
+    passed_threshold=0.0,
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",
     evaluation_test_kwargs=[
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
index 44fe7237..b0c4850d 100644
--- a/tests/pytest/test_pytest_word_count_example.py
+++ b/tests/pytest/test_pytest_word_count_example.py
@@ -11,7 +11,7 @@
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
     rollout_input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
-    threshold=0.3,  # Reasonable threshold for word count evaluation
+    passed_threshold=0.3,  # Reasonable threshold for word count evaluation
     rollout_processor=default_single_turn_rollout_processor,
     mode="pointwise",  # Use pointwise mode for elegant row-by-row evaluation
 )
diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py
index cdee3463..80aadf14 100644
--- a/tests/pytest/test_tau_bench_airline.py
+++ b/tests/pytest/test_tau_bench_airline.py
@@ -65,10 +65,10 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval
     input_dataset=["tests/pytest/data/airline_dataset.jsonl"],
     dataset_adapter=tau_bench_airline_to_evaluation_row,
     model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
-    rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "high"}],
+    rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}],
     rollout_processor=default_mcp_gym_rollout_processor,
-    threshold={"success": 0.4, "standard_deviation": 0.03},
-    num_runs=4,
+    passed_threshold={"success": 0.4, "standard_deviation": 0.1},
+    num_runs=8,
     mode="pointwise",
     max_concurrent_rollouts=50,
     server_script_path="examples/tau2_mcp/server.py",

From 9a9088ea9c4dcff16e9f5ac9d11e9b000d38eb94 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 11 Aug 2025 01:48:57 -0700
Subject: [PATCH 13/14] bug fix

---
 eval_protocol/mcp/execution/manager.py  | 4 ++--
 eval_protocol/pytest/evaluation_test.py | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index fce561c7..bebc815a 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -159,8 +159,8 @@ async def _execute_with_semaphore(idx):
                 messages.append(Message.model_validate(msg_dict))
 
             evaluation_rows[idx].messages = messages
-            evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
-            evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
+            # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
+            # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
             evaluation_rows[idx].tools = shared_tool_schema
             evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
             evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index eb950c37..8bed168b 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -134,6 +134,8 @@ def decorator(
                 threshold = EvaluationThreshold(success=passed_threshold)
             else:
                 threshold = EvaluationThreshold(**passed_threshold)
+        else:
+            threshold = None
 
         sig = inspect.signature(test_func)
 
@@ -405,7 +407,7 @@ def _log_eval_error(
                         # Regenerate outputs each run by deep-copying the pristine dataset
                         # so model responses are not reused across runs.
                         run_id = generate_id()
-                        fresh_dataset = [copy.deepcopy(r) for r in data]
+                        fresh_dataset = [r.model_copy(deep=True) for r in data]
 
                         # apply new run_id to fresh_dataset
                         for row in fresh_dataset:

From 92e876507780988d87df3921752e8927f8a857f6 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 11 Aug 2025 02:32:25 -0700
Subject: [PATCH 14/14] merge error

---
 eval_protocol/pytest/evaluation_test.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 8bed168b..dc177384 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -299,13 +299,6 @@ def wrapper_body(**kwargs):
 
                 cohort_id = generate_id()
 
-                def _log_eval_error(
-                    status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
-                ) -> None:
-                    log_eval_status_and_rows(eval_metadata, rows, status, passed, default_logger)
-
-                cohort_id = generate_id()
-
                 def _log_eval_error(
                     status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
                 ) -> None:
@@ -461,25 +454,9 @@ def _log_eval_error(
                         sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
                         for result in all_results
                     ]
-                    print(f"SCORES: {scores}")
                     agg_score = aggregate(scores, aggregation_method)
                     score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
 
-                    # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
-                    ci_low: float | None = None
-                    ci_high: float | None = None
-                    if aggregation_method == "mean":
-                        try:
-                            result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
-                            mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
-                            if mu_ci_low is not None and mu_ci_high is not None:
-                                ci_low = float(mu_ci_low)
-                                ci_high = float(mu_ci_high)
-                                # Keep agg_score as-is (mean over scores). For equal repeats per question these match.
-                        except Exception:
-                            ci_low = None
-                            ci_high = None
-
                     # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
                     ci_low: float | None = None
                     ci_high: float | None = None