|
| 1 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_perfect", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 2 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"fahrenheit\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_unit_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 3 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "fetch_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_london_name_mismatch", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 4 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}, {"type": "function", "function": {"name": "extra_call", "arguments": "{}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_extra_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 5 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}, {\"type\": \"function\", \"function\": {\"name\": \"expected_extra_call\", \"arguments\": \"{}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_missing_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 6 | +{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "joke_unexpected_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 7 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "content": "It might be sunny."}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}}, "required": ["location"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "weather_no_call", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 8 | +{"messages": [{"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}], "tools": [], "ground_truth": "{\"tool_calls\": []}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "joke_no_calls", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 9 | +{"messages": [{"role": "user", "content": "What's the weather in Berlin?"}, {"role": "assistant", "content": "<tool_call>{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}</tool_call>"}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"Berlin\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_xml_parsing", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}} |
| 10 | +{"messages": [{"role": "user", "content": "Create a user for John Doe"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "create_user", "arguments": "{\"user\": {\"firstName\": \"John\", \"lastName\": \"Doe\", \"age\": 30}}"}}]}], "tools": [{"type": "function", "function": {"name": "create_user", "description": "Create a new user", "parameters": {"type": "object", "properties": {"user": {"type": "object", "properties": {"firstName": {"type": "string"}, "lastName": {"type": "string"}, "age": {"type": "number"}}, "required": ["firstName", "lastName", "age"]}}, "required": ["user"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"create_user\", \"arguments\": \"{\\\"user\\\": {\\\"firstName\\\": \\\"John\\\", \\\"lastName\\\": \\\"Doe\\\", \\\"age\\\": 30}}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "create_user_nested", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "hard"}}} |
| 11 | +{"messages": [{"role": "user", "content": "Find user"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "find_user", "arguments": "{\"id\": 123, \"name\": \"John Doe\""}}]}], "tools": [{"type": "function", "function": {"name": "find_user", "description": "Find a user by ID and name", "parameters": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}}, "required": ["id", "name"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"find_user\", \"arguments\": \"{\\\"id\\\": 123, \\\"name\\\": \\\"John Doe\\\"}\"}}]}", "evaluation_result": {"score": 0.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 0.0", "metrics": {}}, "input_metadata": {"row_id": "find_user_invalid_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "medium"}}} |
| 12 | +{"messages": [{"role": "user", "content": "Query"}, {"role": "assistant", "tool_calls": [{"type": "function", "function": {"name": "test_func", "arguments": "not a json string"}}]}], "tools": [{"type": "function", "function": {"name": "test_func", "description": "Test function", "parameters": {"type": "object", "properties": {}}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"test_func\", \"arguments\": \"not a json string\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "test_func_non_json", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
| 13 | +{"messages": [{"role": "user", "content": "What's the weather in London?"}, {"role": "assistant", "tool_calls": [{"id": "call_test123", "type": "function", "function": {"name": "get_weather", "arguments": "{\"location\": \"London\", \"unit\": \"celsius\"}"}}]}], "tools": [{"type": "function", "function": {"name": "get_weather", "description": "Get weather information for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}}, "required": ["location", "unit"]}}}], "ground_truth": "{\"tool_calls\": [{\"type\": \"function\", \"function\": {\"name\": \"get_weather\", \"arguments\": \"{\\\"location\\\": \\\"London\\\", \\\"unit\\\": \\\"celsius\\\"}\"}}]}", "evaluation_result": {"score": 1.0, "is_score_valid": true, "reason": "Exact tool match evaluation score: 1.0", "metrics": {}}, "input_metadata": {"row_id": "weather_with_id", "completion_params": {"model": "gpt-4o-mini", "temperature": 0.0, "max_tokens": 1000}, "dataset_info": {"task_type": "function_calling", "difficulty": "easy"}}} |
0 commit comments