Skip to content

Commit 6d7f57c

Browse files
committed
Update eval_imo25_benchmark.py
1 parent fd0326a commit 6d7f57c

File tree

1 file changed

+22 -10 lines changed

1 file changed

+22
-10
lines changed

scripts/eval_imo25_benchmark.py

Lines changed: 22 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -409,11 +409,24 @@ def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/ge
409409
confidence = "low"
410410

411411
return {
412+
# Primary binary result - this is what matters
413+
"is_correct": imo25_verification["is_correct"],
414+
"verdict": "Correct" if imo25_verification["is_correct"] else "Incorrect",
415+
416+
# For compatibility with existing analysis code
412417
"correctness_score": correctness_score,
413418
"is_likely_correct": imo25_verification["is_correct"],
414419
"confidence": confidence,
415420

416-
# Detailed breakdown - simplified for IMO25 style
421+
# Verification details for transparency
422+
"verification_details": {
423+
"stage1_analysis": imo25_verification["judge_response"],
424+
"stage2_check": imo25_verification["correctness_check"],
425+
"errors_found": imo25_verification["errors_found"],
426+
"bug_report": imo25_verification["bug_report"] if imo25_verification["bug_report"] else None
427+
},
428+
429+
# Legacy compatibility for existing analysis code
417430
"layer_scores": {
418431
"structural_quality": quality_analysis["completeness_score"],
419432
"insights_verification": 1.0 if imo25_verification["is_correct"] else 0.0,
@@ -425,7 +438,7 @@ def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/ge
425438
},
426439
"score_variance": 0.0, # No variance in binary assessment
427440

428-
# Detailed component results
441+
# Simplified component results
429442
"quality_analysis": quality_analysis,
430443
"insights_check": {
431444
"required_insights_found": 1 if imo25_verification["is_correct"] else 0,
@@ -436,7 +449,7 @@ def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/ge
436449
"answer_extraction": answer_extraction,
437450

438451
# Method identifier
439-
"evaluation_method": "imo25_two_stage"
452+
"evaluation_method": "imo25_two_stage_binary"
440453
}
441454

442455
def save_result(filename: str, result: Dict):
@@ -469,7 +482,7 @@ def analyze_results(results: List[Dict], approach_name: str = None):
469482
return
470483

471484
total_problems = len(results)
472-
likely_correct = sum(1 for r in results if r['evaluation']['is_likely_correct'])
485+
likely_correct = sum(1 for r in results if r['evaluation']['is_correct'])
473486
high_confidence = sum(1 for r in results if r['evaluation']['confidence'] == 'high')
474487

475488
avg_correctness = sum(r['evaluation']['correctness_score'] for r in results) / total_problems
@@ -497,7 +510,7 @@ def analyze_results(results: List[Dict], approach_name: str = None):
497510
if prob_type not in type_stats:
498511
type_stats[prob_type] = {'total': 0, 'correct': 0, 'scores': []}
499512
type_stats[prob_type]['total'] += 1
500-
if result['evaluation']['is_likely_correct']:
513+
if result['evaluation']['is_correct']:
501514
type_stats[prob_type]['correct'] += 1
502515
type_stats[prob_type]['scores'].append(result['evaluation']['correctness_score'])
503516

@@ -512,12 +525,11 @@ def analyze_results(results: List[Dict], approach_name: str = None):
512525
for result in results:
513526
prob_id = result['problem_data']['id']
514527
prob_type = result['problem_data']['type']
515-
score = result['evaluation']['correctness_score']
516-
confidence = result['evaluation']['confidence']
517528
tokens = result['response']['reasoning_tokens']
518-
519-
status = "✓" if result['evaluation']['is_likely_correct'] else "✗"
520-
print(f"Problem {prob_id} ({prob_type}): {status} Score: {score:.3f} ({confidence}) - {tokens:,} tokens")
529+
is_correct = result['evaluation']['is_correct']
530+
verdict = result['evaluation']['verdict']
531+
status = "✓" if is_correct else "✗"
532+
print(f"Problem {prob_id} ({prob_type}): {status} {verdict} - {tokens:,} tokens")
521533

522534
# Quality analysis summary
523535
print(f"\nSolution Quality Analysis:")

0 commit comments

Comments (0)