@@ -409,11 +409,24 @@ def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/ge
         confidence = "low"
 
     return {
+        # Primary binary result - this is what matters
+        "is_correct": imo25_verification["is_correct"],
+        "verdict": "Correct" if imo25_verification["is_correct"] else "Incorrect",
+
+        # For compatibility with existing analysis code
         "correctness_score": correctness_score,
         "is_likely_correct": imo25_verification["is_correct"],
         "confidence": confidence,
 
-        # Detailed breakdown - simplified for IMO25 style
+        # Verification details for transparency
+        "verification_details": {
+            "stage1_analysis": imo25_verification["judge_response"],
+            "stage2_check": imo25_verification["correctness_check"],
+            "errors_found": imo25_verification["errors_found"],
+            "bug_report": imo25_verification["bug_report"] if imo25_verification["bug_report"] else None
+        },
+
+        # Legacy compatibility for existing analysis code
         "layer_scores": {
             "structural_quality": quality_analysis["completeness_score"],
             "insights_verification": 1.0 if imo25_verification["is_correct"] else 0.0,
@@ -425,7 +438,7 @@ def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/ge
         },
         "score_variance": 0.0,  # No variance in binary assessment
 
-        # Detailed component results
+        # Simplified component results
         "quality_analysis": quality_analysis,
         "insights_check": {
             "required_insights_found": 1 if imo25_verification["is_correct"] else 0,
@@ -436,7 +449,7 @@ def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/ge
         "answer_extraction": answer_extraction,
 
         # Method identifier
-        "evaluation_method": "imo25_two_stage"
+        "evaluation_method": "imo25_two_stage_binary"
     }
 
 def save_result(filename: str, result: Dict):
@@ -469,7 +482,7 @@ def analyze_results(results: List[Dict], approach_name: str = None):
         return
 
     total_problems = len(results)
-    likely_correct = sum(1 for r in results if r['evaluation']['is_likely_correct'])
+    likely_correct = sum(1 for r in results if r['evaluation']['is_correct'])
    high_confidence = sum(1 for r in results if r['evaluation']['confidence'] == 'high')
 
     avg_correctness = sum(r['evaluation']['correctness_score'] for r in results) / total_problems
@@ -497,7 +510,7 @@ def analyze_results(results: List[Dict], approach_name: str = None):
         if prob_type not in type_stats:
             type_stats[prob_type] = {'total': 0, 'correct': 0, 'scores': []}
         type_stats[prob_type]['total'] += 1
-        if result['evaluation']['is_likely_correct']:
+        if result['evaluation']['is_correct']:
             type_stats[prob_type]['correct'] += 1
         type_stats[prob_type]['scores'].append(result['evaluation']['correctness_score'])
 
@@ -512,12 +525,11 @@ def analyze_results(results: List[Dict], approach_name: str = None):
     for result in results:
         prob_id = result['problem_data']['id']
         prob_type = result['problem_data']['type']
-        score = result['evaluation']['correctness_score']
-        confidence = result['evaluation']['confidence']
         tokens = result['response']['reasoning_tokens']
-
-        status = "✓" if result['evaluation']['is_likely_correct'] else "✗"
-        print(f"Problem {prob_id} ({prob_type}): {status} Score: {score:.3f} ({confidence}) - {tokens:,} tokens")
+        is_correct = result['evaluation']['is_correct']
+        verdict = result['evaluation']['verdict']
+        status = "✓" if is_correct else "✗"
+        print(f"Problem {prob_id} ({prob_type}): {status} {verdict} - {tokens:,} tokens")
 
     # Quality analysis summary
     print(f"\n Solution Quality Analysis:")
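As an aside, a minimal sketch (not part of this diff) of how downstream code might consume the new binary result shape; the summarize helper and the results list are hypothetical, but the dict keys match the return value of evaluate_solution above.

def summarize(results):
    # Assumes each entry is a saved result dict with the "evaluation" payload
    # produced by evaluate_solution in this PR (illustrative only).
    correct = sum(1 for r in results if r["evaluation"]["is_correct"])
    print(f"{correct}/{len(results)} solutions judged Correct")
    for r in results:
        # "verification_details" exposes the two-stage judge output for transparency.
        details = r["evaluation"]["verification_details"]
        if details["bug_report"]:
            print(f"Problem {r['problem_data']['id']}: {details['bug_report']}")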