From 07fda02490d1a09c7ab92595d6397622cb64230d Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 4 Aug 2025 14:28:02 -0700 Subject: [PATCH] add json schema example --- tests/pytest/data/json_schema.jsonl | 12 ++++++++ tests/pytest/test_pytest_json_schema.py | 37 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/pytest/data/json_schema.jsonl create mode 100644 tests/pytest/test_pytest_json_schema.py diff --git a/tests/pytest/data/json_schema.jsonl b/tests/pytest/data/json_schema.jsonl new file mode 100644 index 00000000..678dc324 --- /dev/null +++ b/tests/pytest/data/json_schema.jsonl @@ -0,0 +1,12 @@ +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"user\": {\n \"name\": \"string\",\n \"age\": \"number\",\n \"isActive\": \"boolean\"\n },\n \"products\": [\n {\n \"id\": \"string\",\n \"price\": \"number\"\n }\n ]\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"user\": {\n \"name\": \"John Doe\",\n \"age\": 30,\n \"isActive\": true\n },\n \"products\": [\n {\n \"id\": \"prod-1\",\n \"price\": 29.99\n }\n ]\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "exact_match", "description": "JSON content that exactly matches the expected schema", "expected_schema": {"type": "object", "properties": {"user": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "isActive": {"type": "boolean"}}}, "products": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "string"}, "price": {"type": "number"}}}}}}}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"user\": {\n \"name\": \"string\",\n \"age\": \"number\",\n \"isActive\": \"boolean\",\n \"address\": {\n \"street\": \"string\",\n \"city\": \"string\"\n }\n }\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"user\": {\n \"name\": \"John Doe\",\n \"age\": 30,\n \"email\": \"john@example.com\"\n }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.5, "reason": "Partial schema match with missing and extra properties", "metrics": {"schema_similarity": {"score": 0.5, "reason": "Schema similarity: 0.50", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "partial_match", "description": "JSON content with missing and extra properties compared to expected schema", "expected_schema": {"type": "object", "properties": {"user": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "isActive": {"type": "boolean"}, "address": {"type": "object", "properties": {"street": {"type": "string"}, "city": {"type": "string"}}}}}}}}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match for simple object", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "extract_from_message", "description": "JSON extracted from message content with code block formatting", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\",\n \"tags\": [\"string\"]\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": \"30\",\n \"tags\": {\n \"tag1\": \"value1\"\n }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.33, "reason": "Type mismatches in schema validation", "metrics": {"schema_similarity": {"score": 0.33, "reason": "Schema similarity: 0.33", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "mismatched_types", "description": "JSON content with type mismatches (string instead of number, object instead of array)", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}, "tags": {"type": "array", "items": {"type": "string"}}}}}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect schema match using JSON string input", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "json_string_input", "description": "JSON schema validation using JSON string instead of object", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}} +{"messages": [{"role": "user", "content": "Give me an empty object in the following JSON format:\n```json\n{}\n```"}, {"role": "assistant", "content": "Here's an empty object:\n```json\n{}\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect match for empty object schema", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "empty_properties", "description": "JSON schema validation with empty object properties", "expected_schema": {"type": "object", "properties": {}}}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```"}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": \n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.0, "reason": "Invalid JSON content", "metrics": {"error": {"score": 0.0, "reason": "Invalid JSON content: Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": \n}\n```", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "invalid_json", "description": "JSON schema validation with invalid JSON syntax", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```\n\nPlease provide a user object with name and age."}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.925, "reason": "Combined schema and LLM evaluation", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00\nMatching properties (2):\n - age: number\n - name: string", "is_score_valid": true}, "llm_judge": {"score": 0.85, "reason": "This is a good JSON structure that matches the expected schema. The data is well-formatted and contains the required fields with appropriate types.", "is_score_valid": false}, "schema_score": {"score": 1.0, "reason": "Schema validation score: 1.00", "is_score_valid": true}, "llm_score": {"score": 0.85, "reason": "LLM judge score: 0.85", "is_score_valid": false}, "weights": {"score": 0.0, "reason": "Weights used - Schema: 0.7, LLM: 0.3", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "llm_judge_integration", "description": "Combined schema validation and LLM judgment evaluation", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}, "expected_behavior": "Provide a user object with name and age"}}} +{"messages": [{"role": "user", "content": "Give me user information in the following JSON format:\n```json\n{\n \"name\": \"string\",\n \"age\": \"number\"\n}\n```\n\nPlease provide a user object with name and age."}, {"role": "assistant", "content": "Here's the user information:\n```json\n{\n \"name\": \"John Doe\",\n \"age\": 30\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.72, "reason": "Combined evaluation with custom weights", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00\nMatching properties (2):\n - age: number\n - name: string", "is_score_valid": true}, "llm_judge": {"score": 0.6, "reason": "This JSON is acceptable but has some issues. The structure is correct but could be more comprehensive.", "is_score_valid": false}, "schema_score": {"score": 1.0, "reason": "Schema validation score: 1.00", "is_score_valid": true}, "llm_score": {"score": 0.6, "reason": "LLM judge score: 0.6", "is_score_valid": false}, "weights": {"score": 0.0, "reason": "Weights used - Schema: 0.3, LLM: 0.7", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "custom_weights", "description": "Combined evaluation with custom weights (schema=0.3, llm=0.7)", "expected_schema": {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "number"}}}, "expected_behavior": "Provide a user object with name and age"}}} +{"messages": [{"role": "user", "content": "Create a complex nested object in the following JSON format:\n```json\n{\n \"company\": {\n \"name\": \"string\",\n \"employees\": [\n {\n \"id\": \"string\",\n \"name\": \"string\",\n \"department\": {\n \"name\": \"string\",\n \"location\": \"string\",\n \"code\": \"string\"\n },\n \"salary\": \"number\",\n \"skills\": [\"string\"]\n }\n ],\n \"metadata\": {\n \"founded\": \"number\",\n \"isPublic\": \"boolean\",\n \"tags\": [\"string\"]\n }\n }\n}\n```"}, {"role": "assistant", "content": "Here's a complex nested object:\n```json\n{\n \"company\": {\n \"name\": \"Tech Corp\",\n \"employees\": [\n {\n \"id\": \"emp-001\",\n \"name\": \"Alice Johnson\",\n \"department\": {\n \"name\": \"Engineering\",\n \"location\": \"Building A\",\n \"extra_field\": \"some value\"\n },\n \"skills\": [\"Python\", \"JavaScript\", \"SQL\"]\n },\n {\n \"id\": \"emp-002\",\n \"name\": \"Bob Smith\",\n \"department\": {\n \"name\": \"Marketing\",\n \"location\": \"Building B\"\n },\n \"skills\": [\"Content Writing\", \"SEO\"]\n }\n ],\n \"metadata\": {\n \"founded\": 2010,\n \"isPublic\": false,\n \"tags\": [\"technology\", \"startup\"]\n }\n }\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.8, "reason": "Complex nested schema with some mismatches", "metrics": {"schema_similarity": {"score": 0.8, "reason": "Schema similarity: 0.80", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "complex_nested", "description": "Complex nested JSON object with arrays and objects", "expected_schema": {"type": "object", "properties": {"company": {"type": "object", "properties": {"name": {"type": "string"}, "employees": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "string"}, "name": {"type": "string"}, "department": {"type": "object", "properties": {"name": {"type": "string"}, "location": {"type": "string"}, "code": {"type": "string"}}}, "salary": {"type": "number"}, "skills": {"type": "array", "items": {"type": "string"}}}}}}}, "metadata": {"type": "object", "properties": {"founded": {"type": "number"}, "isPublic": {"type": "boolean"}, "tags": {"type": "array", "items": {"type": "string"}}}}}}}}} +{"messages": [{"role": "user", "content": "Provide an array of simple objects in the following JSON format:\n```json\n[\n {\n \"id\": \"number\",\n \"name\": \"string\",\n \"active\": \"boolean\"\n }\n]\n```"}, {"role": "assistant", "content": "Here's an array of simple objects:\n```json\n[\n {\n \"id\": 1,\n \"name\": \"Item One\",\n \"active\": true\n },\n {\n \"id\": 2,\n \"name\": \"Item Two\",\n \"active\": false\n },\n {\n \"id\": 3,\n \"name\": \"Item Three\",\n \"active\": true\n }\n]\n```"}], "ground_truth": null, "evaluation_result": {"score": 1.0, "reason": "Perfect array schema match", "metrics": {"schema_similarity": {"score": 1.0, "reason": "Schema similarity: 1.00", "is_score_valid": true}}}, "input_metadata": {"dataset_info": {"test_case": "array_of_objects", "description": "JSON array containing simple objects with consistent structure", "expected_schema": {"type": "array", "items": {"type": "object", "properties": {"id": {"type": "number"}, "name": {"type": "string"}, "active": {"type": "boolean"}}}}}}} +{"messages": [{"role": "user", "content": "Give me a mixed data structure in the following JSON format:\n```json\n{\n \"string_field\": \"string\",\n \"number_field\": \"number\",\n \"boolean_field\": \"boolean\",\n \"null_field\": \"null\",\n \"array_field\": [\"number\"],\n \"object_field\": {\n \"nested_string\": \"string\",\n \"nested_number\": \"number\"\n },\n \"date_field\": \"string\",\n \"enum_field\": \"string\"\n}\n```"}, {"role": "assistant", "content": "Here's a mixed data structure:\n```json\n{\n \"string_field\": \"Hello World\",\n \"number_field\": 42,\n \"boolean_field\": true,\n \"null_field\": null,\n \"array_field\": [1, 2, 3, 4, 5],\n \"object_field\": {\n \"nested_string\": \"Nested Value\",\n \"nested_number\": 123.45\n },\n \"extra_field\": \"unexpected field\"\n}\n```"}], "ground_truth": null, "evaluation_result": {"score": 0.75, "reason": "Mixed data types with some schema mismatches", "metrics": {"schema_similarity": {"score": 0.75, "reason": "Schema similarity: 0.75", "is_score_valid": false}}}, "input_metadata": {"dataset_info": {"test_case": "mixed_data_types", "description": "JSON object with various data types including strings, numbers, booleans, null, arrays, and objects", "expected_schema": {"type": "object", "properties": {"string_field": {"type": "string"}, "number_field": {"type": "number"}, "boolean_field": {"type": "boolean"}, "null_field": {"type": "null"}, "array_field": {"type": "array", "items": {"type": "number"}}, "object_field": {"type": "object", "properties": {"nested_string": {"type": "string"}, "nested_number": {"type": "number"}}}, "date_field": {"type": "string"}, "enum_field": {"type": "string"}}}}}} diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py new file mode 100644 index 00000000..9dfc278a --- /dev/null +++ b/tests/pytest/test_pytest_json_schema.py @@ -0,0 +1,37 @@ +import json +from typing import Any, Dict, List +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.rewards.json_schema import json_schema_reward + + +def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert a json schema row to an evaluation row. + """ + dataset: List[EvaluationRow] = [] + for row in rows: + dataset.append( + EvaluationRow( + messages=row["messages"][:1], + ground_truth=row["ground_truth"], + input_metadata=row["input_metadata"], + ) + ) + return dataset + + +@evaluation_test( + input_dataset=["tests/pytest/data/json_schema.jsonl"], + model=["accounts/fireworks/models/kimi-k2-instruct"], + mode="pointwise", + rollout_processor=default_single_turn_rollout_processor, + dataset_adapter=json_schema_to_evaluation_row, +) +async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow: + """Run pointwise evaluation on sample dataset using pytest interface.""" + expected_schema = row.input_metadata.dataset_info["expected_schema"] + result = json_schema_reward(row.messages, expected_schema=expected_schema) + row.evaluation_result = result + print(row.evaluation_result) + return row