Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"],
"python.testing.autoTestDiscoverOnSaveEnabled": true,
"python.defaultInterpreterPath": "./.venv/bin/python",
"python.testing.cwd": "${workspaceFolder}"
"python.testing.cwd": "${workspaceFolder}",
"editor.defaultFormatter": "ms-python.black-formatter"
}
2 changes: 2 additions & 0 deletions eval_protocol/pytest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from .default_single_turn_rollout_process import default_single_turn_rollout_processor
from .evaluation_test import evaluation_test
from .types import RolloutProcessor, RolloutProcessorConfig
from .default_dataset_adapter import default_dataset_adapter

__all__ = [
"default_agent_rollout_processor",
"default_no_op_rollout_processor",
"default_single_turn_rollout_processor",
"default_dataset_adapter",
"RolloutProcessor",
"RolloutProcessorConfig",
"evaluation_test",
Expand Down
10 changes: 10 additions & 0 deletions eval_protocol/pytest/default_dataset_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import Any, Dict, List

from eval_protocol.models import EvaluationRow


def default_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Default dataset adapter: build one EvaluationRow per input dict.

    Each dict in ``rows`` is expanded into keyword arguments for the
    EvaluationRow constructor, so its keys must be names that constructor
    accepts. The returned list preserves the order of ``rows``.
    """
    converted: List[EvaluationRow] = []
    for raw in rows:
        converted.append(EvaluationRow(**raw))
    return converted
16 changes: 12 additions & 4 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]

response = await client.chat.completions.create(
model=config.model, messages=messages_payload, **config.input_params
)
create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params)
if row.tools is not None:
create_kwargs["tools"] = row.tools
response = await client.chat.completions.create(**create_kwargs)
assistant_content = response.choices[0].message.content or ""
messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
messages = list(row.messages) + [
Message(
role="assistant",
content=assistant_content,
tool_calls=tool_calls,
)
]

return EvaluationRow(
messages=messages,
Expand Down
14 changes: 7 additions & 7 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import inspect
from typing import Any, Callable, Dict, List, Optional

from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
import pytest

from eval_protocol.models import EvaluationRow
Expand Down Expand Up @@ -32,7 +33,7 @@ def evaluation_test(
model: List[ModelParam],
input_messages: Optional[List[InputMessagesParam]] = None,
input_dataset: Optional[List[DatasetPathParam]] = None,
dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x,
dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = default_dataset_adapter,
rollout_input_params: Optional[List[RolloutInputParam]] = None,
rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
Expand Down Expand Up @@ -144,17 +145,16 @@ def generate_combinations():
for ip in params:
for im in messages:
for etk in kwargs:
# Skip combinations that don't make sense
# If we have a dataset, we should have params for rollout
if ds is not None and ip is None:
continue
# If we have messages but no dataset, that's fine
# If we have no dataset and no messages, that's also fine
# if no dataset and no messages, raise an error
if ds is None and im is None:
raise ValueError("No dataset or messages provided. Please provide at least one of input_dataset or input_messages.")
combinations.append((m, ds, ip, im, etk))

return combinations

combinations = generate_combinations()
if len(combinations) == 0:
raise ValueError("No combinations of parameters were found. Please provide at least a model and one of input_dataset or input_messages.")

# Create parameter tuples for pytest.mark.parametrize
param_tuples = []
Expand Down
13 changes: 13 additions & 0 deletions tests/pytest/data/function_calling.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_perfect", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"fahrenheit\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_unit_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "fetch_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_name_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}, {"type": "function", "function": {"name": "extra_call", "arguments": "{}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_extra_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}, {\"type\": \"function\", \"function\": {\"name\": \"expected_extra_call\", \"arguments\": \"{}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_missing_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "joke_unexpected_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "content": "It might be sunny."}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_no_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}], "tools": [], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "joke_no_calls", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in Berlin?"}, {"role": "assistant", "content": "<tool_call>{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}</tool_call>"}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_xml_parsing", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}}
{"messages": [{"role": "user", "content": "Create a user for John Doe"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "create_user", "arguments": "{\"user\": {\"firstName\": \"John\", \"lastName\": \"Doe\", \"age\": 30}}"}}]}], "tools": [{"type": "function", "function": {"name": "create_user", "description": "Create a new user", "parameters": {"type": "object", "properties": {"user": {"type": "object", "properties": {"firstName": {"type": "string"}, "lastName": {"type": "string"}, "age": {"type": "number"}}, "required": ["firstName", "lastName", "age"]}}, "required": ["user"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"create_user\", \"arguments\": \"{\\\"user\\\": {\\\"firstName\\\": \\\"John\\\", \\\"lastName\\\": \\\"Doe\\\", \\\"age\\\": 30}}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "create_user_nested", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "hard"}}}
{"messages": [{"role": "user", "content": "Find user"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "find_user", "arguments": "{\"id\": 123, \"name\": \"John Doe\""}}]}], "tools": [{"type": "function", "function": {"name": "find_user", "description": "Find a user by ID and name", "parameters": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}}, "required": ["id", "name"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"find_user\", \"arguments\": \"{\\\"id\\\": 123, \\\"name\\\": \\\"John Doe\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "find_user_invalid_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}}
{"messages": [{"role": "user", "content": "Query"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "test_func", "arguments": "not a json string"}}]}], "tools": [{"type": "function", "function": {"name": "test_func", "description": "Test function", "parameters": {"type": "object", "properties": {}}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"test_func\", \"arguments\": \"not a json string\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "test_func_non_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"id": "call_test123", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_with_id", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
33 changes: 33 additions & 0 deletions tests/pytest/test_pytest_function_calling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
from typing import Any, Dict, List
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
from eval_protocol.rewards.function_calling import exact_tool_match_reward


def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Adapt raw function-calling records into EvaluationRow objects.

    For every record, only the first entry of its "messages" list is kept
    (the slice ``[:1]``); the record's "tools" and "ground_truth" values are
    passed through unchanged. Output order matches the order of ``rows``.
    """
    return [
        EvaluationRow(
            messages=record["messages"][:1],
            tools=record["tools"],
            ground_truth=record["ground_truth"],
        )
        for record in rows
    ]


@evaluation_test(
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    input_dataset=["tests/pytest/data/function_calling.jsonl"],
    dataset_adapter=function_calling_to_evaluation_row,
    rollout_processor=default_single_turn_rollout_processor,
    mode="pointwise",
)
async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
    """Score a single row with exact_tool_match_reward and attach the result.

    The row's ``ground_truth`` is a JSON string; it is decoded and handed to
    ``exact_tool_match_reward`` together with the row's messages. The reward
    result is stored on ``row.evaluation_result``, printed, and the row is
    returned.
    """
    expected = json.loads(row.ground_truth)
    reward = exact_tool_match_reward(row.messages, expected)
    row.evaluation_result = reward
    print(reward)
    return row
Loading