diff --git a/.vscode/settings.json b/.vscode/settings.json index f7f61b72..6ec04673 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -4,5 +4,6 @@ "python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"], "python.testing.autoTestDiscoverOnSaveEnabled": true, "python.defaultInterpreterPath": "./.venv/bin/python", - "python.testing.cwd": "${workspaceFolder}" + "python.testing.cwd": "${workspaceFolder}", + "editor.defaultFormatter": "ms-python.black-formatter" } diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py index 2daef101..ce881ccc 100644 --- a/eval_protocol/pytest/__init__.py +++ b/eval_protocol/pytest/__init__.py @@ -3,11 +3,13 @@ from .default_single_turn_rollout_process import default_single_turn_rollout_processor from .evaluation_test import evaluation_test from .types import RolloutProcessor, RolloutProcessorConfig +from .default_dataset_adapter import default_dataset_adapter __all__ = [ "default_agent_rollout_processor", "default_no_op_rollout_processor", "default_single_turn_rollout_processor", + "default_dataset_adapter", "RolloutProcessor", "RolloutProcessorConfig", "evaluation_test", diff --git a/eval_protocol/pytest/default_dataset_adapter.py b/eval_protocol/pytest/default_dataset_adapter.py new file mode 100644 index 00000000..87377cff --- /dev/null +++ b/eval_protocol/pytest/default_dataset_adapter.py @@ -0,0 +1,10 @@ +from typing import Any, Dict, List + +from eval_protocol.models import EvaluationRow + + +def default_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Default dataset adapter that converts each raw row dict into an EvaluationRow by passing its keys as keyword arguments.
+ """ + return [EvaluationRow(**row) for row in rows] \ No newline at end of file diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index dbc8fb68..1d6d9f7c 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -24,11 +24,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] - response = await client.chat.completions.create( - model=config.model, messages=messages_payload, **config.input_params - ) + create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params) + if row.tools is not None: + create_kwargs["tools"] = row.tools + response = await client.chat.completions.create(**create_kwargs) assistant_content = response.choices[0].message.content or "" - messages = list(row.messages) + [Message(role="assistant", content=assistant_content)] + tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None + messages = list(row.messages) + [ + Message( + role="assistant", + content=assistant_content, + tool_calls=tool_calls, + ) + ] return EvaluationRow( messages=messages, diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index ec5af367..b8387a7a 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -1,6 +1,7 @@ import inspect from typing import Any, Callable, Dict, List, Optional +from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter import pytest from eval_protocol.models import EvaluationRow @@ -32,7 +33,7 @@ def evaluation_test( model: List[ModelParam], input_messages: Optional[List[InputMessagesParam]] = None, input_dataset: Optional[List[DatasetPathParam]] = None, - dataset_adapter: 
Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x, + dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = default_dataset_adapter, rollout_input_params: Optional[List[RolloutInputParam]] = None, rollout_processor: RolloutProcessor = default_no_op_rollout_processor, evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None, @@ -144,17 +145,16 @@ def generate_combinations(): for ip in params: for im in messages: for etk in kwargs: - # Skip combinations that don't make sense - # If we have a dataset, we should have params for rollout - if ds is not None and ip is None: - continue - # If we have messages but no dataset, that's fine - # If we have no dataset and no messages, that's also fine + # if no dataset and no messages, raise an error + if ds is None and im is None: + raise ValueError("No dataset or messages provided. Please provide at least one of input_dataset or input_messages.") combinations.append((m, ds, ip, im, etk)) return combinations combinations = generate_combinations() + if len(combinations) == 0: + raise ValueError("No combinations of parameters were found. 
Please provide at least a model and one of input_dataset or input_messages.") # Create parameter tuples for pytest.mark.parametrize param_tuples = [] diff --git a/tests/pytest/data/function_calling.jsonl b/tests/pytest/data/function_calling.jsonl new file mode 100644 index 00000000..273d2eb6 --- /dev/null +++ b/tests/pytest/data/function_calling.jsonl @@ -0,0 +1,13 @@ +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_perfect", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"fahrenheit\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city 
name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_unit_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "fetch_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_name_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": 
{"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}, {"type": "function", "function": {"name": "extra_call", "arguments": "{}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_extra_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}, {\"type\": \"function\", \"function\": {\"name\": \"expected_extra_call\", \"arguments\": \"{}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_missing_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": 
"easy"}}} +{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "joke_unexpected_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "content": "It might be sunny."}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_no_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}], "tools": [], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 1.0, 
"is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "joke_no_calls", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in Berlin?"}, {"role": "assistant", "content": "{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}"}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_xml_parsing", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}} +{"messages": [{"role": "user", "content": "Create a user for John Doe"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "create_user", "arguments": "{\"user\": {\"firstName\": \"John\", \"lastName\": \"Doe\", \"age\": 30}}"}}]}], "tools": [{"type": "function", "function": {"name": "create_user", "description": "Create a new user", "parameters": {"type": "object", "properties": {"user": {"type": "object", "properties": {"firstName": {"type": "string"}, "lastName": {"type": "string"}, 
"age": {"type": "number"}}, "required": ["firstName", "lastName", "age"]}}, "required": ["user"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"create_user\", \"arguments\": \"{\\\"user\\\": {\\\"firstName\\\": \\\"John\\\", \\\"lastName\\\": \\\"Doe\\\", \\\"age\\\": 30}}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "create_user_nested", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "hard"}}} +{"messages": [{"role": "user", "content": "Find user"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "find_user", "arguments": "{\"id\": 123, \"name\": \"John Doe\""}}]}], "tools": [{"type": "function", "function": {"name": "find_user", "description": "Find a user by ID and name", "parameters": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}}, "required": ["id", "name"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"find_user\", \"arguments\": \"{\\\"id\\\": 123, \\\"name\\\": \\\"John Doe\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "find_user_invalid_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}} +{"messages": [{"role": "user", "content": "Query"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "test_func", "arguments": "not a json string"}}]}], "tools": [{"type": "function", "function": {"name": "test_func", "description": "Test function", "parameters": {"type": "object", "properties": {}}}}], "ground_truth": 
"{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"test_func\", \"arguments\": \"not a json string\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "test_func_non_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"id": "call_test123", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_with_id", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py new file mode 100644 index 00000000..ed536004 --- /dev/null +++ b/tests/pytest/test_pytest_function_calling.py @@ -0,0 +1,33 @@ +import json +from typing import Any, Dict, List +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest import 
default_single_turn_rollout_processor, evaluation_test +from eval_protocol.rewards.function_calling import exact_tool_match_reward + + +def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert function-calling dataset rows to evaluation rows, keeping only the first message of each row along with its tools and ground truth. + """ + dataset: List[EvaluationRow] = [] + for row in rows: + dataset.append( + EvaluationRow(messages=row["messages"][:1], tools=row["tools"], ground_truth=row["ground_truth"]) + ) + return dataset + + +@evaluation_test( + input_dataset=["tests/pytest/data/function_calling.jsonl"], + model=["accounts/fireworks/models/kimi-k2-instruct"], + mode="pointwise", + dataset_adapter=function_calling_to_evaluation_row, + rollout_processor=default_single_turn_rollout_processor, +) +async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow: + """Run pointwise evaluation on sample dataset using pytest interface.""" + ground_truth = json.loads(row.ground_truth) + result = exact_tool_match_reward(row.messages, ground_truth) + row.evaluation_result = result + print(result) + return row