diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py index 8a31438a..2daef101 100644 --- a/eval_protocol/pytest/__init__.py +++ b/eval_protocol/pytest/__init__.py @@ -3,7 +3,6 @@ from .default_single_turn_rollout_process import default_single_turn_rollout_processor from .evaluation_test import evaluation_test from .types import RolloutProcessor, RolloutProcessorConfig -from .utils import evaluate __all__ = [ "default_agent_rollout_processor", @@ -11,6 +10,5 @@ "default_single_turn_rollout_processor", "RolloutProcessor", "RolloutProcessorConfig", - "evaluate", "evaluation_test", ] diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index c4b222d2..ee5291c6 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -8,10 +8,11 @@ from eval_protocol.pytest.types import ( Dataset, DatasetPathParam, + EvaluationInputParam, EvaluationTestMode, InputMessagesParam, - InputParam, ModelParam, + RolloutInputParam, RolloutProcessor, RolloutProcessorConfig, TestFunction, @@ -32,8 +33,9 @@ def evaluation_test( input_messages: Optional[List[InputMessagesParam]] = None, input_dataset: Optional[List[DatasetPathParam]] = None, dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x, - input_params: Optional[List[InputParam]] = None, + rollout_input_params: Optional[List[RolloutInputParam]] = None, rollout_processor: RolloutProcessor = default_no_op_rollout_processor, + evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None, aggregation_method: AggregationMethod = "mean", threshold_of_success: Optional[float] = None, num_runs: int = 1, @@ -56,8 +58,9 @@ def evaluation_test( to a list of EvaluationRows if you have a custom dataset format. dataset_adapter: Function to convert the input dataset to a list of EvaluationRows. This is useful if you have a custom dataset format. - input_params: Generation parameters for the model. 
+ rollout_input_params: Generation parameters for the rollout. rollout_processor: Function used to perform the rollout. + evaluation_test_kwargs: Kwargs for the evaluation function. aggregation_method: How to aggregate scores across rows. threshold_of_success: If set, fail the test if the aggregated score is below this threshold. @@ -104,12 +107,19 @@ def execute_with_params( test_func: TestFunction, row: EvaluationRow | None = None, input_dataset: List[EvaluationRow] | None = None, + evaluation_test_kwargs: Optional[EvaluationInputParam] = None, ): kwargs = {} if input_dataset is not None: kwargs["rows"] = input_dataset if row is not None: kwargs["row"] = row + if evaluation_test_kwargs is not None: + if "row" in evaluation_test_kwargs: + raise ValueError("'row' is a reserved parameter for the evaluation function") + if "rows" in evaluation_test_kwargs: + raise ValueError("'rows' is a reserved parameter for the evaluation function") + kwargs.update(evaluation_test_kwargs) return execute_function(test_func, **kwargs) # Calculate all possible combinations of parameters @@ -118,21 +128,23 @@ def generate_combinations(): # Handle optional parameters with defaults datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None] # type: ignore - params: List[Optional[InputParam]] = input_params if input_params is not None else [None] # type: ignore + params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore messages: List[Optional[InputMessagesParam]] = input_messages if input_messages is not None else [None] # type: ignore + kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore # Generate all combinations for m in model: for ds in datasets: for ip in params: for im in messages: - # Skip combinations that don't make sense - # If we have a dataset, we should have params for rollout - if 
ds is not None and ip is None: - continue - # If we have messages but no dataset, that's fine - # If we have no dataset and no messages, that's also fine - combinations.append((m, ds, ip, im)) + for etk in kwargs: + # Skip combinations that don't make sense + # If we have a dataset, we should have params for rollout + if ds is not None and ip is None: + continue + # If we have messages but no dataset, that's fine + # If we have no dataset and no messages, that's also fine + combinations.append((m, ds, ip, im, etk)) return combinations @@ -141,27 +153,31 @@ def generate_combinations(): # Create parameter tuples for pytest.mark.parametrize param_tuples = [] for combo in combinations: - model_name, dataset, params, messages = combo + model_name, dataset, params, messages, etk = combo param_tuple = [model_name] if input_dataset is not None: param_tuple.append(dataset) - if input_params is not None: + if rollout_input_params is not None: param_tuple.append(params) if input_messages is not None: param_tuple.append(messages) + if evaluation_test_kwargs is not None: + param_tuple.append(etk) param_tuples.append(tuple(param_tuple)) # For batch mode, use the original parameter names test_param_names = ["model"] if input_dataset is not None: test_param_names.append("dataset_path") - if input_params is not None: + if rollout_input_params is not None: test_param_names.append("input_params") if input_messages is not None: test_param_names.append("input_messages") + if evaluation_test_kwargs is not None: + test_param_names.append("evaluation_test_kwargs") # Create wrapper function with exact signature that pytest expects - def create_wrapper_with_signature(): + def create_wrapper_with_signature() -> Callable: # Create the function body that will be used def wrapper_body(**kwargs): model_name = kwargs["model"] @@ -193,6 +209,7 @@ def wrapper_body(**kwargs): result = execute_with_params( test_func, row=row, + evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {}, ) if 
result is None or not isinstance(result, EvaluationRow): raise ValueError( @@ -204,6 +221,7 @@ def wrapper_body(**kwargs): results = execute_with_params( test_func, input_dataset=input_dataset, + evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {}, ) if results is None: raise ValueError( @@ -234,6 +252,7 @@ def wrapper_body(**kwargs): wrapper = create_wrapper_with_signature() wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(wrapper) + wrapper.original_evaluation_test_func = test_func return wrapper diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py index 57bef1cc..67cec58d 100644 --- a/eval_protocol/pytest/types.py +++ b/eval_protocol/pytest/types.py @@ -9,8 +9,9 @@ ModelParam = str # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct DatasetPathParam = str -InputParam = Dict[str, Any] +RolloutInputParam = Dict[str, Any] InputMessagesParam = List[Message] +EvaluationInputParam = Dict[str, Any] Dataset = List[EvaluationRow] @@ -37,7 +38,7 @@ @dataclass class RolloutProcessorConfig: model: ModelParam - input_params: InputParam # optional input parameters for inference + input_params: RolloutInputParam # optional input parameters for inference mcp_config_path: str # for agent rollout processor diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index fccf5f81..a0f02525 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -39,18 +39,6 @@ def execute_function(func: Callable, **kwargs) -> Any: return results -def evaluate( - rows: List[EvaluationRow], reward_fn: Callable[..., EvaluateResult], **kwargs: Any -) -> List[EvaluationRow]: - """Apply a reward function to each row and attach the result.""" - evaluated: List[EvaluationRow] = [] - for row in rows: - result = reward_fn(messages=row.messages, ground_truth=row.ground_truth, **kwargs) - row.evaluation_result = result - evaluated.append(row) - return evaluated - - AggregationMethod = 
Literal["mean", "max", "min"] diff --git a/eval_protocol/rewards/math.py b/eval_protocol/rewards/math.py index 221b3891..e0406314 100644 --- a/eval_protocol/rewards/math.py +++ b/eval_protocol/rewards/math.py @@ -565,6 +565,11 @@ def math_reward( require_units: bool = False, **kwargs: Any, ) -> EvaluateResult: + """ + NOTE: This is the deprecated/old way of creating an eval in Eval Protocol. + What used to be the @reward_function decorator is now the @evaluation_test + decorator with the mode="pointwise" parameter. + """ if ( not messages or not isinstance(messages[-1], Message) diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py index e85f0742..34b17363 100644 --- a/tests/pytest/test_markdown_highlighting.py +++ b/tests/pytest/test_markdown_highlighting.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from eval_protocol.pytest import evaluation_test, default_single_turn_rollout_processor, evaluate +from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -21,17 +21,27 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu ] -def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str] = None, **kwargs) -> EvaluateResult: +@evaluation_test( + input_dataset=["tests/pytest/data/markdown_dataset.jsonl"], + dataset_adapter=markdown_dataset_to_evaluation_row, + model=["accounts/fireworks/models/llama-v3p1-8b-instruct"], + rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + threshold_of_success=1.0, + rollout_processor=default_single_turn_rollout_processor, + num_runs=1, + mode="pointwise", +) +def test_markdown_highlighting_evaluation(row: EvaluationRow) -> EvaluationRow: """ Evaluation function that checks if the model's response contains the 
required number of formatted sections. """ - assistant_response = messages[-1].content + assistant_response = row.messages[-1].content if not assistant_response: return EvaluateResult(score=0.0, reason="❌ No assistant response found") - required_highlights = int(ground_truth) + required_highlights = int(row.ground_truth) # Check if the response contains the required number of formatted sections # e.g. **bold** or *italic* @@ -50,26 +60,11 @@ def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str meets_requirement = actual_count >= required_highlights if meets_requirement: - return EvaluateResult( + row.evaluation_result = EvaluateResult( score=1.0, reason=f"✅ Found {actual_count} highlighted sections (required: {required_highlights})" ) else: - return EvaluateResult( + row.evaluation_result = EvaluateResult( score=0.0, reason=f"❌ Only found {actual_count} highlighted sections (required: {required_highlights})" ) - - -@evaluation_test( - input_dataset=["tests/pytest/data/markdown_dataset.jsonl"], - dataset_adapter=markdown_dataset_to_evaluation_row, - model=["accounts/fireworks/models/llama-v3p1-8b-instruct"], - input_params=[{"temperature": 0.0, "max_tokens": 4096}], - threshold_of_success=1.0, - rollout_processor=default_single_turn_rollout_processor, - num_runs=1, -) -def test_markdown_highlighting_evaluation(rows: List[EvaluationRow]) -> List[EvaluationRow]: - """ - Test markdown highlighting validation using batch mode with evaluate(). 
- """ - return evaluate(rows, markdown_format_evaluate) + return row diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py index 367794a0..01228baf 100644 --- a/tests/pytest/test_pytest_math_example.py +++ b/tests/pytest/test_pytest_math_example.py @@ -1,7 +1,7 @@ -from typing import List -from eval_protocol.models import EvaluationRow -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluate, evaluation_test -from examples.math_example.main import evaluate as math_evaluate +from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult +from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.rewards.math import math_reward +from examples.math_example.main import check_think_answer_format from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row @@ -9,11 +9,68 @@ input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, model=["accounts/fireworks/models/kimi-k2-instruct"], - input_params=[{"temperature": 0.0}], + rollout_input_params=[{"temperature": 0.0}], max_dataset_rows=5, threshold_of_success=0.0, rollout_processor=default_single_turn_rollout_processor, + mode="pointwise", + evaluation_test_kwargs=[ + {"math_reward_kwargs": {"tolerance": 0.001, "absolute_tolerance": 1e-8, "require_units": False}} + ], ) -def test_math_dataset(rows: List[EvaluationRow]) -> List[EvaluationRow]: - """Run math evaluation on sample dataset using pytest interface.""" - return evaluate(rows, math_evaluate) +def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow: + """ + Evaluate math problem solving considering both accuracy and format. + + This function demonstrates how to combine multiple evaluation criteria: + - Numerical accuracy using built-in math evaluation + - Format compliance checking for ...... 
structure + + Args: + row: EvaluationRow containing the conversation messages and ground truth + **kwargs: Additional parameters (like math_reward_kwargs) + + Returns: + EvaluationRow with the evaluation result + """ + # Get the assistant's response + assistant_message = row.messages[-1] + if isinstance(assistant_message, dict): + assistant_response = assistant_message.get("content", "") + else: + assistant_response = assistant_message.content or "" + + # Evaluate numerical accuracy using built-in function + accuracy_result = math_reward(messages=row.messages, ground_truth=row.ground_truth, **kwargs["math_reward_kwargs"]) + + # Evaluate format compliance (looking for ...... format) + format_correct = check_think_answer_format(assistant_response) + format_score = 1.0 if format_correct else 0.0 + + # For math_example, accuracy takes priority - if accuracy is 0, overall score is 0 + # If accuracy is 1, then format can contribute to the score + if accuracy_result.score == 0.0: + combined_score = 0.0 + else: + combined_score = accuracy_result.score # Only accuracy matters for math_example + + # Create metrics structure expected by tests + metrics = { + "accuracy_reward": MetricResult( + score=accuracy_result.score, + reason=f"Numerical accuracy: {accuracy_result.reason}", + is_score_valid=True, + ), + "format_reward": MetricResult( + score=format_score, + reason=f"Format compliance: {'correct' if format_correct else 'incorrect'} ...... 
structure", + is_score_valid=True, + ), + } + + row.evaluation_result = EvaluateResult( + score=combined_score, + reason=f"Combined score: {combined_score:.2f} (accuracy: {accuracy_result.score:.2f}, format: {format_score:.2f})", + metrics=metrics, + ) + return row diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py index ba5dd60b..2ffe899c 100644 --- a/tests/pytest/test_pytest_math_format_length.py +++ b/tests/pytest/test_pytest_math_format_length.py @@ -1,7 +1,10 @@ -from typing import List -from eval_protocol.models import EvaluationRow -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluate, evaluation_test -from examples.math_with_format_and_length.main import evaluate as math_fl_evaluate +import math + +from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult +from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.rewards.length import count_tokens +from eval_protocol.rewards.math import math_reward +from examples.math_with_format_and_length.main import check_think_answer_format from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row @@ -9,11 +12,74 @@ input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, model=["accounts/fireworks/models/kimi-k2-instruct"], - input_params=[{"temperature": 0.0}], + rollout_input_params=[{"temperature": 0.0}], max_dataset_rows=5, threshold_of_success=0.0, rollout_processor=default_single_turn_rollout_processor, + mode="pointwise", + evaluation_test_kwargs=[ + { + "config": { + "max_length": 1000, + "min_value_wrong": 0.0, + "max_value_wrong": 0.3, + "min_value_correct": 0.5, + "max_value_correct": 1.0, + "token_method": "whitespace", + }, + "math_reward_kwargs": {"tolerance": 0.001, "absolute_tolerance": 1e-8, "require_units": False}, + } + ], ) -def test_math_format_length_dataset(rows: List[EvaluationRow]) -> 
List[EvaluationRow]: - """Run math with format and length evaluation on sample dataset.""" - return evaluate(rows, math_fl_evaluate) +def test_math_format_length_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow: + """Evaluate math reasoning with format and length considerations.""" + config = kwargs["config"] + assistant_message = row.messages[-1] + text = assistant_message["content"] if isinstance(assistant_message, dict) else assistant_message.content or "" + + # Accuracy using built-in math reward + accuracy_result = math_reward(messages=row.messages, ground_truth=row.ground_truth, **kwargs["math_reward_kwargs"]) + accuracy_score = accuracy_result.score + + # Format compliance + format_correct = check_think_answer_format(text) + format_score = 1.0 if format_correct else 0.0 + + # Length score (cosine scaled) + token_count = count_tokens(text, method=config["token_method"]) + progress = min(1.0, token_count / config["max_length"]) + cosine_factor = math.cos(progress * math.pi) + if accuracy_score == 1.0: + min_v = config["min_value_correct"] + max_v = config["max_value_correct"] + else: + min_v = config["max_value_wrong"] + max_v = config["min_value_wrong"] + length_score = min_v + 0.5 * (max_v - min_v) * (1.0 + cosine_factor) + + combined_score = (accuracy_score + format_score + length_score) / 3.0 + + metrics = { + "accuracy_reward": MetricResult(score=accuracy_score, reason=accuracy_result.reason, is_score_valid=True), + "format_reward": MetricResult( + score=format_score, + reason="correct format" if format_correct else "incorrect format", + is_score_valid=True, + ), + "length_reward": MetricResult( + score=length_score, + reason=f"{token_count} tokens", + is_score_valid=token_count <= config["max_length"], + ), + } + + result = EvaluateResult( + score=combined_score, + reason=( + f"Combined score {combined_score:.2f} (acc: {accuracy_score:.2f}, " + f"format: {format_score:.2f}, length: {length_score:.2f})" + ), + metrics=metrics, + ) + 
row.evaluation_result = result + return row diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py index a0cb908c..daa256f7 100644 --- a/tests/pytest/test_pytest_word_count_example.py +++ b/tests/pytest/test_pytest_word_count_example.py @@ -9,7 +9,7 @@ input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=word_count_to_evaluation_row, model=["accounts/fireworks/models/kimi-k2-instruct"], - input_params=[{"temperature": 0.0}], + rollout_input_params=[{"temperature": 0.0}], max_dataset_rows=5, threshold_of_success=0.3, # Reasonable threshold for word count evaluation rollout_processor=default_single_turn_rollout_processor,