eval-protocol · dphuang2 · Aug 4, 2025 · Aug 4, 2025 · Aug 4, 2025 · Aug 4, 2025
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -4,5 +4,6 @@
   "python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"],
   "python.testing.autoTestDiscoverOnSaveEnabled": true,
   "python.defaultInterpreterPath": "./.venv/bin/python",
-  "python.testing.cwd": "${workspaceFolder}"
+  "python.testing.cwd": "${workspaceFolder}",
+  "editor.defaultFormatter": "ms-python.black-formatter"
 }
diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py
@@ -3,11 +3,13 @@
 from .default_single_turn_rollout_process import default_single_turn_rollout_processor
 from .evaluation_test import evaluation_test
 from .types import RolloutProcessor, RolloutProcessorConfig
+from .default_dataset_adapter import default_dataset_adapter
 
 __all__ = [
     "default_agent_rollout_processor",
     "default_no_op_rollout_processor",
     "default_single_turn_rollout_processor",
+    "default_dataset_adapter",
     "RolloutProcessor",
     "RolloutProcessorConfig",
     "evaluation_test",

diff --git a/eval_protocol/pytest/default_dataset_adapter.py b/eval_protocol/pytest/default_dataset_adapter.py
@@ -0,0 +1,10 @@
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluationRow
+
+
+def default_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Default dataset adapter that builds one EvaluationRow per row by passing the row's keys as keyword arguments.
+    """
+    return [EvaluationRow(**row) for row in rows]
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -24,11 +24,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
         messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
 
-        response = await client.chat.completions.create(
-            model=config.model, messages=messages_payload, **config.input_params
-        )
+        create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params)
+        if row.tools is not None:
+            create_kwargs["tools"] = row.tools
+        response = await client.chat.completions.create(**create_kwargs)
         assistant_content = response.choices[0].message.content or ""
-        messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
+        tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
+        messages = list(row.messages) + [
+            Message(
+                role="assistant",
+                content=assistant_content,
+                tool_calls=tool_calls,
+            )
+        ]
 
         return EvaluationRow(
             messages=messages,

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -1,6 +1,7 @@
 import inspect
 from typing import Any, Callable, Dict, List, Optional
 
+from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
 import pytest
 
 from eval_protocol.models import EvaluationRow
@@ -32,7 +33,7 @@ def evaluation_test(
     model: List[ModelParam],
     input_messages: Optional[List[InputMessagesParam]] = None,
     input_dataset: Optional[List[DatasetPathParam]] = None,
-    dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x,
+    dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = default_dataset_adapter,
     rollout_input_params: Optional[List[RolloutInputParam]] = None,
     rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
     evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
@@ -144,17 +145,16 @@ def generate_combinations():
                     for ip in params:
                         for im in messages:
                             for etk in kwargs:
-                                # Skip combinations that don't make sense
-                                # If we have a dataset, we should have params for rollout
-                                if ds is not None and ip is None:
-                                    continue
-                                # If we have messages but no dataset, that's fine
-                                # If we have no dataset and no messages, that's also fine
+                                # if no dataset and no messages, raise an error
+                                if ds is None and im is None:
+                                    raise ValueError("No dataset or messages provided. Please provide at least one of input_dataset or input_messages.")
                                 combinations.append((m, ds, ip, im, etk))
 
             return combinations
 
         combinations = generate_combinations()
+        if len(combinations) == 0:
+            raise ValueError("No combinations of parameters were found. Please provide at least a model and one of input_dataset or input_messages.")
 
         # Create parameter tuples for pytest.mark.parametrize
         param_tuples = []

diff --git a/tests/pytest/data/function_calling.jsonl b/tests/pytest/data/function_calling.jsonl
@@ -0,0 +1,13 @@
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_perfect", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"fahrenheit\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_unit_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "fetch_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_name_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}, {"type": "function", "function": {"name": "extra_call", "arguments": "{}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_extra_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}, {\"type\": \"function\", \"function\": {\"name\": \"expected_extra_call\", \"arguments\": \"{}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_missing_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "joke_unexpected_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "content": "It might be sunny."}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_no_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}], "tools": [], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "joke_no_calls", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in Berlin?"}, {"role": "assistant", "content": "<tool_call>{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}</tool_call>"}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_xml_parsing", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}}
+{"messages": [{"role": "user", "content": "Create a user for John Doe"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "create_user", "arguments": "{\"user\": {\"firstName\": \"John\", \"lastName\": \"Doe\", \"age\": 30}}"}}]}], "tools": [{"type": "function", "function": {"name": "create_user", "description": "Create a new user", "parameters": {"type": "object", "properties": {"user": {"type": "object", "properties": {"firstName": {"type": "string"}, "lastName": {"type": "string"}, "age": {"type": "number"}}, "required": ["firstName", "lastName", "age"]}}, "required": ["user"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"create_user\", \"arguments\": \"{\\\"user\\\": {\\\"firstName\\\": \\\"John\\\", \\\"lastName\\\": \\\"Doe\\\", \\\"age\\\": 30}}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "create_user_nested", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "hard"}}}
+{"messages": [{"role": "user", "content": "Find user"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "find_user", "arguments": "{\"id\": 123, \"name\": \"John Doe\""}}]}], "tools": [{"type": "function", "function": {"name": "find_user", "description": "Find a user by ID and name", "parameters": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}}, "required": ["id", "name"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"find_user\", \"arguments\": \"{\\\"id\\\": 123, \\\"name\\\": \\\"John Doe\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "find_user_invalid_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}}
+{"messages": [{"role": "user", "content": "Query"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "test_func", "arguments": "not a json string"}}]}], "tools": [{"type": "function", "function": {"name": "test_func", "description": "Test function", "parameters": {"type": "object", "properties": {}}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"test_func\", \"arguments\": \"not a json string\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "test_func_non_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
+{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"id": "call_test123", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_with_id", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py
@@ -0,0 +1,33 @@
+import json
+from typing import Any, Dict, List
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
+from eval_protocol.rewards.function_calling import exact_tool_match_reward
+
+
+def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """
+    Convert function calling rows to evaluation rows, keeping only each row's first message, tools, and ground truth.
+    """
+    dataset: List[EvaluationRow] = []
+    for row in rows:
+        dataset.append(
+            EvaluationRow(messages=row["messages"][:1], tools=row["tools"], ground_truth=row["ground_truth"])
+        )
+    return dataset
+
+
+@evaluation_test(
+    input_dataset=["tests/pytest/data/function_calling.jsonl"],
+    model=["accounts/fireworks/models/kimi-k2-instruct"],
+    mode="pointwise",
+    dataset_adapter=function_calling_to_evaluation_row,
+    rollout_processor=default_single_turn_rollout_processor,
+)
+async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
+    """Score the row with exact_tool_match_reward against its JSON-decoded ground truth and store the result on the row."""
+    ground_truth = json.loads(row.ground_truth)
+    result = exact_tool_match_reward(row.messages, ground_truth)
+    row.evaluation_result = result
+    print(result)
+    return row