Commit fd0326a

Update eval_imo25_benchmark.py
1 parent 757a620 commit fd0326a

File tree: 1 file changed (+146 −140 lines)

scripts/eval_imo25_benchmark.py

Lines changed: 146 additions & 140 deletions
@@ -99,91 +99,143 @@ def extract_final_answer(solution: str, problem_id: int) -> Dict[str, any]:
     return result


-def verify_solution_with_llm(problem: str, solution: str, model: str) -> Dict[str, any]:
+def imo25_verify_solution(problem: str, solution: str, model: str) -> Dict[str, any]:
     """
-    Use an LLM as a judge to verify the correctness of a solution
+    Two-stage verification system from IMO25 repository:
+    Stage 1: Detailed verification using comprehensive IMO grader prompt
+    Stage 2: Simple yes/no check on solution correctness
     """
-    judge_prompt = f"""You are an expert mathematical judge evaluating IMO solutions.

-PROBLEM:
+    # Stage 1: Detailed verification using IMO25's verification system prompt
+    verification_system_prompt = """You are an expert mathematician and a meticulous grader for an International Mathematical Olympiad (IMO) level exam. Your primary task is to rigorously verify the provided mathematical solution. A solution is to be judged correct **only if every step is rigorously justified.** A solution that arrives at a correct final answer through flawed reasoning, educated guesses, or with gaps in its arguments must be flagged as incorrect or incomplete.
+
+### Instructions ###
+
+**1. Core Instructions**
+* Your sole task is to find and report all issues in the provided solution. You must act as a **verifier**, NOT a solver. **Do NOT attempt to correct the errors or fill the gaps you find.**
+* You must perform a **step-by-step** check of the entire solution. This analysis will be presented in a **Detailed Verification Log**, where you justify your assessment of each step: for correct steps, a brief justification suffices; for steps with errors or gaps, you must provide a detailed explanation.
+
+**2. How to Handle Issues in the Solution**
+When you identify an issue in a step, you MUST first classify it into one of the following two categories and then follow the specified procedure.
+
+* **a. Critical Error:**
+This is any error that breaks the logical chain of the proof. This includes both **logical fallacies** (e.g., claiming that `A>B, C>D` implies `A-C>B-D`) and **factual errors** (e.g., a calculation error like `2+3=6`).
+* **Procedure:**
+* Explain the specific error and state that it **invalidates the current line of reasoning**.
+* Do NOT check any further steps that rely on this error.
+* You MUST, however, scan the rest of the solution to identify and verify any fully independent parts. For example, if a proof is split into multiple cases, an error in one case does not prevent you from checking the other cases.
+
+* **b. Justification Gap:**
+This is for steps where the conclusion may be correct, but the provided argument is incomplete, hand-wavy, or lacks sufficient rigor.
+* **Procedure:**
+* Explain the gap in the justification.
+* State that you will **assume the step's conclusion is true** for the sake of argument.
+* Then, proceed to verify all subsequent steps to check if the remainder of the argument is sound.
+
+**3. Output Format**
+Your response MUST be structured into two main sections: a **Summary** followed by the **Detailed Verification Log**.
+
+* **a. Summary**
+This section MUST be at the very beginning of your response. It must contain two components:
+* **Final Verdict**: A single, clear sentence declaring the overall validity of the solution. For example: "The solution is correct," "The solution contains a Critical Error and is therefore invalid," or "The solution's approach is viable but contains several Justification Gaps."
+* **List of Findings**: A bulleted list that summarizes **every** issue you discovered. For each finding, you must provide:
+* **Location:** A direct quote of the key phrase or equation where the issue occurs.
+* **Issue:** A brief description of the problem and its classification (**Critical Error** or **Justification Gap**).
+
+* **b. Detailed Verification Log**
+Following the summary, provide the full, step-by-step verification log as defined in the Core Instructions. When you refer to a specific part of the solution, **quote the relevant text** to make your reference clear before providing your detailed analysis of that part.
+
+**Example of the Required Summary Format**
+*This is a generic example to illustrate the required format. Your findings must be based on the actual solution provided below.*
+
+**Final Verdict:** The solution is **invalid** because it contains a Critical Error.
+
+**List of Findings:**
+* **Location:** "By interchanging the limit and the integral, we get..."
+* **Issue:** Justification Gap - The solution interchanges a limit and an integral without providing justification, such as proving uniform convergence.
+* **Location:** "From $A > B$ and $C > D$, it follows that $A-C > B-D$"
+* **Issue:** Critical Error - This step is a logical fallacy. Subtracting inequalities in this manner is not a valid mathematical operation.
+
+### Verification Task Reminder ###
+
+Your task is to act as an IMO grader. Now, generate the **summary** and the **step-by-step verification log** for the solution above. In your log, justify each correct step and explain in detail any errors or justification gaps you find, as specified in the instructions above."""
+
+    verification_prompt = f"""
+======================================================================
+### Problem ###
+
 {problem}

-STUDENT SOLUTION:
+======================================================================
+### Solution ###
+
 {solution}

-Please evaluate this solution and provide:
-1. CORRECTNESS SCORE (0-10): How mathematically correct is this solution?
-2. COMPLETENESS SCORE (0-10): How complete and rigorous is the proof?
-3. KEY INSIGHTS: Did the solution identify the key mathematical insights needed?
-4. ERRORS: List any mathematical errors or logical gaps
-5. OVERALL ASSESSMENT: Is this solution likely correct?
-
-Provide your assessment in the following format:
-CORRECTNESS: [0-10]
-COMPLETENESS: [0-10]
-KEY_INSIGHTS: [Yes/No]
-ERRORS: [List any errors]
-OVERALL: [Correct/Incorrect/Partial]
-REASONING: [Brief explanation]"""
+{verification_system_prompt}
+"""

     try:
+        # Stage 1: Detailed verification
         response = client.with_options(timeout=300).chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": "You are an expert mathematician and IMO judge."},
-                {"role": "user", "content": judge_prompt}
+                {"role": "system", "content": verification_system_prompt},
+                {"role": "user", "content": verification_prompt}
             ],
             max_tokens=30000,
-            temperature=0.1 # Low temperature for consistent judging
+            temperature=0.1
         )

-        judge_response = response.choices[0].message.content.strip()
-
-        # Parse the structured response
-        result = {
-            "judge_response": judge_response,
-            "correctness_score": 0.0,
-            "completeness_score": 0.0,
-            "has_key_insights": False,
-            "errors_found": [],
-            "overall_assessment": "unknown",
-            "judge_reasoning": "",
-            "success": True
-        }
-
-        # Extract scores using regex
-        correctness_match = re.search(r'CORRECTNESS:\s*([0-9.]+)', judge_response)
-        if correctness_match:
-            result["correctness_score"] = float(correctness_match.group(1)) / 10.0
+        verification_response = response.choices[0].message.content.strip()

-        completeness_match = re.search(r'COMPLETENESS:\s*([0-9.]+)', judge_response)
-        if completeness_match:
-            result["completeness_score"] = float(completeness_match.group(1)) / 10.0
+        # Stage 2: Simple yes/no check on correctness
+        check_correctness_prompt = f"""Response in "yes" or "no". Is the following statement saying the solution is correct, or does not contain critical error or a major justification gap?

-        insights_match = re.search(r'KEY_INSIGHTS:\s*(Yes|No)', judge_response, re.IGNORECASE)
-        if insights_match:
-            result["has_key_insights"] = insights_match.group(1).lower() == "yes"
+{verification_response}"""

-        errors_match = re.search(r'ERRORS:\s*(.+?)(?=OVERALL:|$)', judge_response, re.DOTALL)
-        if errors_match:
-            errors_text = errors_match.group(1).strip()
-            if errors_text and "none" not in errors_text.lower():
-                result["errors_found"] = [errors_text]
+        response2 = client.with_options(timeout=300).chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "user", "content": check_correctness_prompt}
+            ],
+            max_tokens=10,
+            temperature=0.1
+        )

-        overall_match = re.search(r'OVERALL:\s*(Correct|Incorrect|Partial)', judge_response, re.IGNORECASE)
-        if overall_match:
-            result["overall_assessment"] = overall_match.group(1).lower()
+        correctness_check = response2.choices[0].message.content.strip().lower()
+        is_correct = "yes" in correctness_check

-        reasoning_match = re.search(r'REASONING:\s*(.+)', judge_response, re.DOTALL)
-        if reasoning_match:
-            result["judge_reasoning"] = reasoning_match.group(1).strip()
+        # Extract bug report if solution is incorrect
+        bug_report = ""
+        if not is_correct:
+            # Try to extract the detailed verification log
+            verification_log_match = re.search(r'### Detailed Verification Log ###\s*(.*)', verification_response, re.DOTALL)
+            if verification_log_match:
+                bug_report = verification_log_match.group(1).strip()
+            else:
+                bug_report = verification_response

-        return result
+        return {
+            "judge_response": verification_response,
+            "correctness_check": correctness_check,
+            "is_correct": is_correct,
+            "bug_report": bug_report,
+            "correctness_score": 1.0 if is_correct else 0.0,
+            "completeness_score": 1.0 if is_correct else 0.0,
+            "has_key_insights": is_correct,
+            "errors_found": [bug_report] if bug_report else [],
+            "overall_assessment": "correct" if is_correct else "incorrect",
+            "judge_reasoning": verification_response,
+            "success": True
+        }

     except Exception as e:
-        logger.error(f"Error in LLM judge verification: {e}")
+        logger.error(f"Error in IMO25 verification: {e}")
         return {
             "judge_response": f"Error: {str(e)}",
+            "correctness_check": "error",
+            "is_correct": False,
+            "bug_report": f"Verification error: {str(e)}",
             "correctness_score": 0.0,
             "completeness_score": 0.0,
             "has_key_insights": False,
@@ -328,109 +380,63 @@ def get_llm_response(problem: str, model: str, extra_body: dict = None, timeout:

 def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/gemini-2.5-flash-lite") -> Dict[str, any]:
     """
-    Enhanced multi-layer evaluation of IMO solution using:
-    - Structural quality analysis (20%)
-    - Problem-specific insights verification (40%)
-    - LLM-as-judge verification (30%)
-    - Overall completeness (10%)
-    """
-    logger.info(f"Running enhanced evaluation for problem {problem_data['id']}")
+    IMO25-style evaluation using rigorous two-stage verification system:
+    1. Detailed verification with comprehensive IMO grader prompt
+    2. Simple yes/no check on solution correctness

-    # Layer 1: Structural quality analysis (20% weight)
-    quality_analysis = extract_solution_quality(solution)
-    structural_score = quality_analysis["completeness_score"]
-
-    # Layer 2: Problem-specific insights verification (40% weight)
-    insights_check = verify_problem_specific_insights(problem_data, solution)
-    insights_score = insights_check["insight_score"]
+    This eliminates self-judgment bias and provides more accurate assessment
+    """
+    logger.info(f"Running IMO25-style evaluation for problem {problem_data['id']}")

-    # Layer 3: LLM-as-judge verification (30% weight)
-    llm_verification = verify_solution_with_llm(problem_data["problem"], solution, model)
-    llm_score = 0.0
-    if llm_verification["success"]:
-        # Combine correctness and completeness from LLM judge
-        llm_score = (llm_verification["correctness_score"] + llm_verification["completeness_score"]) / 2.0
+    # Use IMO25's rigorous two-stage verification
+    imo25_verification = imo25_verify_solution(problem_data["problem"], solution, model)

-    # Layer 4: Final answer extraction and verification
+    # Extract answer for compatibility with existing code
     answer_extraction = extract_final_answer(solution, problem_data["id"])

-    # Use calibrated scoring based on problem type and official answers
-    problem_type = problem_data.get("answer_type", "proof")
-
-    if problem_type in ["set", "number", "formula", "threshold"]:
-        # For problems with specific answers, heavily weight correct answer
-        if answer_extraction["official_answer_found"]:
-            answer_score = 1.0 # Perfect score for exact official answer
-        else:
-            answer_score = answer_extraction["confidence"] * 0.3 # Much lower for non-official
-
-        # Adjust weights for problems with specific answers
-        weights = {
-            "structural": 0.10,
-            "insights": 0.30,
-            "llm_judge": 0.20,
-            "answer": 0.40 # Higher weight for exact answer match
-        }
-    else:
-        # For proof problems, weight insights and structure more heavily
-        answer_score = answer_extraction["confidence"]
-        weights = {
-            "structural": 0.25,
-            "insights": 0.35,
-            "llm_judge": 0.30,
-            "answer": 0.10
-        }
-
-    final_score = (
-        structural_score * weights["structural"] +
-        insights_score * weights["insights"] +
-        llm_score * weights["llm_judge"] +
-        answer_score * weights["answer"]
-    )
+    # Simple structural analysis for quality metrics
+    quality_analysis = extract_solution_quality(solution)

-    # Determine confidence based on agreement across layers
-    layer_scores = [structural_score, insights_score, llm_score, answer_score]
-    score_variance = sum((score - final_score) ** 2 for score in layer_scores) / len(layer_scores)
+    # In IMO25 system, correctness is binary based on verification
+    correctness_score = 1.0 if imo25_verification["is_correct"] else 0.0

-    if final_score >= 0.8 and score_variance < 0.05:
-        confidence = "very_high"
-    elif final_score >= 0.7 and score_variance < 0.1:
+    # Confidence based on verification success and quality
+    if imo25_verification["is_correct"] and quality_analysis["completeness_score"] > 0.7:
         confidence = "high"
-    elif final_score >= 0.5 and score_variance < 0.15:
+    elif imo25_verification["is_correct"]:
         confidence = "medium"
     else:
         confidence = "low"

-    # Overall assessment
-    is_likely_correct = (
-        final_score >= 0.6 and
-        insights_score >= 0.5 and
-        (llm_verification["overall_assessment"] in ["correct", "partial"] if llm_verification["success"] else True)
-    )
-
     return {
-        "correctness_score": final_score,
-        "is_likely_correct": is_likely_correct,
+        "correctness_score": correctness_score,
+        "is_likely_correct": imo25_verification["is_correct"],
         "confidence": confidence,

-        # Detailed breakdown
+        # Detailed breakdown - simplified for IMO25 style
         "layer_scores": {
-            "structural_quality": structural_score,
-            "insights_verification": insights_score,
-            "llm_judge": llm_score,
-            "answer_extraction": answer_score
+            "structural_quality": quality_analysis["completeness_score"],
+            "insights_verification": 1.0 if imo25_verification["is_correct"] else 0.0,
+            "llm_judge": correctness_score,
+            "answer_extraction": answer_extraction["confidence"]
         },
-        "weights_used": weights,
-        "score_variance": score_variance,
+        "weights_used": {
+            "imo25_verification": 1.0 # Single source of truth
+        },
+        "score_variance": 0.0, # No variance in binary assessment

         # Detailed component results
         "quality_analysis": quality_analysis,
-        "insights_check": insights_check,
-        "llm_verification": llm_verification,
+        "insights_check": {
+            "required_insights_found": 1 if imo25_verification["is_correct"] else 0,
+            "total_required_insights": 1,
+            "insight_score": 1.0 if imo25_verification["is_correct"] else 0.0
+        },
+        "llm_verification": imo25_verification,
         "answer_extraction": answer_extraction,

-        # Legacy compatibility
-        "evaluation_method": "enhanced_multi_layer"
+        # Method identifier
+        "evaluation_method": "imo25_two_stage"
     }

 def save_result(filename: str, result: Dict):
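
For orientation, here is a minimal usage sketch of the two-stage judge this commit introduces. It is illustrative only and not part of the diff: the import path, the `problem_data` record, and the candidate solution are hypothetical, and the script's module-level OpenAI-compatible `client` is assumed to be configured already.

# Illustrative sketch -- not part of this commit.
# Hypothetical import path; the functions live in scripts/eval_imo25_benchmark.py.
from eval_imo25_benchmark import imo25_verify_solution, evaluate_solution

problem_data = {"id": 1, "problem": "Prove that ..."}           # hypothetical problem record
candidate_solution = "We claim ... hence the result follows."   # hypothetical model output

# Stage 1 + Stage 2: a detailed IMO-grader pass, then a yes/no check on its verdict.
verdict = imo25_verify_solution(problem_data["problem"], candidate_solution,
                                model="google/gemini-2.5-flash-lite")
print(verdict["is_correct"], verdict["overall_assessment"])

# evaluate_solution wraps the binary verdict with structural metrics and answer extraction.
result = evaluate_solution(problem_data, candidate_solution)
print(result["correctness_score"], result["confidence"], result["evaluation_method"])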

0 commit comments
