f

codelion · codelion · commit 757a620539d8 · 2025-09-25T07:57:20.000+08:00
diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
@@ -311,7 +311,7 @@ def get_llm_response(problem: str, model: str, analyze_logits: bool = False, ext
             messages=[
                 {"role": "user", "content": SYSTEM_PROMPT + problem}
             ],
-            max_tokens=8192,
+            max_tokens=30000,
             **kwargs
         )
         
diff --git a/scripts/eval_imo25_benchmark.py b/scripts/eval_imo25_benchmark.py
@@ -133,7 +133,7 @@ def verify_solution_with_llm(problem: str, solution: str, model: str) -> Dict[st
                 {"role": "system", "content": "You are an expert mathematician and IMO judge."},
                 {"role": "user", "content": judge_prompt}
             ],
-            max_tokens=2048,
+            max_tokens=30000,
             temperature=0.1  # Low temperature for consistent judging
         )
 
@@ -302,7 +302,7 @@ def get_llm_response(problem: str, model: str, extra_body: dict = None, timeout:
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": problem}
             ],
-            max_tokens=8192,  # Extended token limit for complex proofs
+            max_tokens=30000,  # Extended token limit for complex proofs
             **kwargs
         )
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -311,7 +311,7 @@ def get_llm_response(problem: str, model: str, analyze_logits: bool = False, ext` |
| `311` | `311` | `messages=[` |
| `312` | `312` | `{"role": "user", "content": SYSTEM_PROMPT + problem}` |
| `313` | `313` | `],` |
| `314` | | `- max_tokens=8192,` |
| | `314` | `+ max_tokens=30000,` |
| `315` | `315` | `**kwargs` |
| `316` | `316` | `)` |
| `317` | `317` | |