Skip to content

Commit 96fcc09

Browse files
author
Dylan Huang
authored
function calling example (#14)
* function calling example * no need to try/except * fix
1 parent 7d194fe commit 96fcc09

File tree

7 files changed

+79
-12
lines changed

7 files changed

+79
-12
lines changed

.vscode/settings.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
"python.testing.pytestArgs": ["tests", "examples", "-s", "--tb=short"],
55
"python.testing.autoTestDiscoverOnSaveEnabled": true,
66
"python.defaultInterpreterPath": "./.venv/bin/python",
7-
"python.testing.cwd": "${workspaceFolder}"
7+
"python.testing.cwd": "${workspaceFolder}",
8+
"editor.defaultFormatter": "ms-python.black-formatter"
89
}

eval_protocol/pytest/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
from .default_single_turn_rollout_process import default_single_turn_rollout_processor
44
from .evaluation_test import evaluation_test
55
from .types import RolloutProcessor, RolloutProcessorConfig
6+
from .default_dataset_adapter import default_dataset_adapter
67

78
__all__ = [
89
"default_agent_rollout_processor",
910
"default_no_op_rollout_processor",
1011
"default_single_turn_rollout_processor",
12+
"default_dataset_adapter",
1113
"RolloutProcessor",
1214
"RolloutProcessorConfig",
1315
"evaluation_test",
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from typing import Any, Dict, List
2+
3+
from eval_protocol.models import EvaluationRow
4+
5+
6+
def default_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Default dataset adapter: build one ``EvaluationRow`` per raw dataset row.

    Each dict in ``rows`` is unpacked as keyword arguments to the
    ``EvaluationRow`` constructor. The dict keys are passed through unchanged
    as keyword names; no renaming or filtering happens here.

    Args:
        rows: Raw dataset rows, one dict per row.

    Returns:
        A list containing one ``EvaluationRow`` per input dict, in input order.
    """
    return [EvaluationRow(**row) for row in rows]

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,19 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
2424

2525
messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
2626

27-
response = await client.chat.completions.create(
28-
model=config.model, messages=messages_payload, **config.input_params
29-
)
27+
create_kwargs = dict(model=config.model, messages=messages_payload, **config.input_params)
28+
if row.tools is not None:
29+
create_kwargs["tools"] = row.tools
30+
response = await client.chat.completions.create(**create_kwargs)
3031
assistant_content = response.choices[0].message.content or ""
31-
messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
32+
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
33+
messages = list(row.messages) + [
34+
Message(
35+
role="assistant",
36+
content=assistant_content,
37+
tool_calls=tool_calls,
38+
)
39+
]
3240

3341
return EvaluationRow(
3442
messages=messages,

eval_protocol/pytest/evaluation_test.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import inspect
22
from typing import Any, Callable, Dict, List, Optional
33

4+
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
45
import pytest
56

67
from eval_protocol.models import EvaluationRow
@@ -32,7 +33,7 @@ def evaluation_test(
3233
model: List[ModelParam],
3334
input_messages: Optional[List[InputMessagesParam]] = None,
3435
input_dataset: Optional[List[DatasetPathParam]] = None,
35-
dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x,
36+
dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = default_dataset_adapter,
3637
rollout_input_params: Optional[List[RolloutInputParam]] = None,
3738
rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
3839
evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
@@ -144,17 +145,16 @@ def generate_combinations():
144145
for ip in params:
145146
for im in messages:
146147
for etk in kwargs:
147-
# Skip combinations that don't make sense
148-
# If we have a dataset, we should have params for rollout
149-
if ds is not None and ip is None:
150-
continue
151-
# If we have messages but no dataset, that's fine
152-
# If we have no dataset and no messages, that's also fine
148+
# if no dataset and no messages, raise an error
149+
if ds is None and im is None:
150+
raise ValueError("No dataset or messages provided. Please provide at least one of input_dataset or input_messages.")
153151
combinations.append((m, ds, ip, im, etk))
154152

155153
return combinations
156154

157155
combinations = generate_combinations()
156+
if len(combinations) == 0:
157+
raise ValueError("No combinations of parameters were found. Please provide at least a model and one of input_dataset or input_messages.")
158158

159159
# Create parameter tuples for pytest.mark.parametrize
160160
param_tuples = []
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_perfect", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
2+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"fahrenheit\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_unit_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
3+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "fetch_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_name_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
4+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}, {"type": "function", "function": {"name": "extra_call", "arguments": "{}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_extra_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
5+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}, {\"type\": \"function\", \"function\": {\"name\": \"expected_extra_call\", \"arguments\": \"{}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_missing_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
6+
{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "joke_unexpected_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
7+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "content": "It might be sunny."}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_no_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
8+
{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}], "tools": [], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "joke_no_calls", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
9+
{"messages": [{"role": "user", "content": "What's the weather in Berlin?"}, {"role": "assistant", "content": "<tool_call>{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}</tool_call>"}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_xml_parsing", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}}
10+
{"messages": [{"role": "user", "content": "Create a user for John Doe"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "create_user", "arguments": "{\"user\": {\"firstName\": \"John\", \"lastName\": \"Doe\", \"age\": 30}}"}}]}], "tools": [{"type": "function", "function": {"name": "create_user", "description": "Create a new user", "parameters": {"type": "object", "properties": {"user": {"type": "object", "properties": {"firstName": {"type": "string"}, "lastName": {"type": "string"}, "age": {"type": "number"}}, "required": ["firstName", "lastName", "age"]}}, "required": ["user"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"create_user\", \"arguments\": \"{\\\"user\\\": {\\\"firstName\\\": \\\"John\\\", \\\"lastName\\\": \\\"Doe\\\", \\\"age\\\": 30}}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "create_user_nested", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "hard"}}}
11+
{"messages": [{"role": "user", "content": "Find user"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "find_user", "arguments": "{\"id\": 123, \"name\": \"John Doe\""}}]}], "tools": [{"type": "function", "function": {"name": "find_user", "description": "Find a user by ID and name", "parameters": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}}, "required": ["id", "name"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"find_user\", \"arguments\": \"{\\\"id\\\": 123, \\\"name\\\": \\\"John Doe\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "find_user_invalid_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}}
12+
{"messages": [{"role": "user", "content": "Query"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "test_func", "arguments": "not a json string"}}]}], "tools": [{"type": "function", "function": {"name": "test_func", "description": "Test function", "parameters": {"type": "object", "properties": {}}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"test_func\", \"arguments\": \"not a json string\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "test_func_non_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
13+
{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"id": "call_test123", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_with_id", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import json
2+
from typing import Any, Dict, List
3+
from eval_protocol.models import EvaluationRow
4+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
5+
from eval_protocol.rewards.function_calling import exact_tool_match_reward
6+
7+
8+
def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Convert raw function-calling dataset rows into evaluation rows.

    For every input dict an ``EvaluationRow`` is created from three keys:
    ``"messages"`` (truncated to its first element via ``[:1]``),
    ``"tools"`` and ``"ground_truth"``. Any other keys on the input row are
    not forwarded.

    Args:
        rows: Raw dataset rows; each must contain the keys ``"messages"``,
            ``"tools"`` and ``"ground_truth"``.

    Returns:
        One ``EvaluationRow`` per input row, in input order.
    """
    return [
        EvaluationRow(
            # Keep only the leading message; the rest of the recorded
            # conversation is dropped before the row is handed on.
            messages=raw["messages"][:1],
            tools=raw["tools"],
            ground_truth=raw["ground_truth"],
        )
        for raw in rows
    ]
18+
19+
20+
@evaluation_test(
    input_dataset=["tests/pytest/data/function_calling.jsonl"],
    model=["accounts/fireworks/models/kimi-k2-instruct"],
    mode="pointwise",
    dataset_adapter=function_calling_to_evaluation_row,
    rollout_processor=default_single_turn_rollout_processor,
)
async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
    """
    Score a single row with the exact tool-match reward.

    The row's ``ground_truth`` is a JSON string; it is decoded and passed,
    together with the row's messages, to ``exact_tool_match_reward``. The
    reward result is stored on ``row.evaluation_result``, printed, and the
    same row object is returned.
    """
    # ground_truth is stored as serialized JSON, so decode it before scoring.
    expected = json.loads(row.ground_truth)
    outcome = exact_tool_match_reward(row.messages, expected)
    row.evaluation_result = outcome
    print(outcome)
    return row

0 commit comments

Comments
 (0)