Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/pytest/test_markdown_highlighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
dataset_adapter=markdown_dataset_to_evaluation_row,
model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}],
threshold_of_success=1.0,
threshold_of_success=0.5,
rollout_processor=default_single_turn_rollout_processor,
num_runs=1,
mode="pointwise",
Expand Down
12 changes: 4 additions & 8 deletions tests/pytest/test_pytest_math_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
Evaluate math problem solving considering both accuracy and format.

This function demonstrates how to combine multiple evaluation criteria:
- Numerical accuracy using built-in math evaluation
- Format compliance checking for <think>...</think><answer>...</answer> structure
- Numerical accuracy using built-in math evaluation (80% weight)
- Format compliance checking for <think>...</think><answer>...</answer> structure (20% weight)

Args:
row: EvaluationRow containing the conversation messages and ground truth
Expand All @@ -47,12 +47,8 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
format_correct = check_think_answer_format(assistant_response)
format_score = 1.0 if format_correct else 0.0

# For math_example, accuracy takes priority - if accuracy is 0, overall score is 0
# If accuracy is 1, then format can contribute to the score
if accuracy_result.score == 0.0:
combined_score = 0.0
else:
combined_score = accuracy_result.score # Only accuracy matters for math_example
# Calculate combined score with 80% accuracy and 20% formatting weight
combined_score = (0.8 * accuracy_result.score) + (0.2 * format_score)

# Create metrics structure expected by tests
metrics = {
Expand Down
Loading