diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
index 34b17363..5c003931 100644
--- a/tests/pytest/test_markdown_highlighting.py
+++ b/tests/pytest/test_markdown_highlighting.py
@@ -26,7 +26,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
     dataset_adapter=markdown_dataset_to_evaluation_row,
     model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
     rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
-    threshold_of_success=1.0,
+    threshold_of_success=0.5,
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
     mode="pointwise",
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
index 01228baf..24139dcc 100644
--- a/tests/pytest/test_pytest_math_example.py
+++ b/tests/pytest/test_pytest_math_example.py
@@ -23,8 +23,8 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
     Evaluate math problem solving considering both accuracy and format.
 
     This function demonstrates how to combine multiple evaluation criteria:
-    - Numerical accuracy using built-in math evaluation
-    - Format compliance checking for ...... structure
+    - Numerical accuracy using built-in math evaluation (80% weight)
+    - Format compliance checking for ...... structure (20% weight)
 
     Args:
         row: EvaluationRow containing the conversation messages and ground truth
@@ -47,12 +47,8 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
     format_correct = check_think_answer_format(assistant_response)
     format_score = 1.0 if format_correct else 0.0
 
-    # For math_example, accuracy takes priority - if accuracy is 0, overall score is 0
-    # If accuracy is 1, then format can contribute to the score
-    if accuracy_result.score == 0.0:
-        combined_score = 0.0
-    else:
-        combined_score = accuracy_result.score  # Only accuracy matters for math_example
+    # Calculate combined score with 80% accuracy and 20% formatting weight
+    combined_score = (0.8 * accuracy_result.score) + (0.2 * format_score)
 
     # Create metrics structure expected by tests
     metrics = {