
Commit fece917

Commit message: f
1 parent 6d7f57c commit fece917

File tree: 4 files changed, +51 -8 lines changed

  optillm/mars/agent.py
  optillm/mars/mars.py
  optillm/server.py
  scripts/eval_imo25_benchmark.py


optillm/mars/agent.py

Lines changed: 21 additions & 3 deletions
@@ -77,9 +77,18 @@ def generate_solution(self, problem: str, request_id: str = None) -> Tuple[Agent
 
             solution_text = response.choices[0].message.content.strip()
 
+            # ENHANCED LOGGING: Log solution details
+            solution_length = len(solution_text)
+
+            logger.info(f"Agent {self.agent_id} solution details:")
+            logger.info(f"  - Length: {solution_length} characters")
+            logger.info(f"  - Last 100 chars: ...{solution_text[-100:] if solution_length > 100 else solution_text}")
+
             # Extract reasoning tokens from the correct nested structure
             reasoning_tokens = 0
+            total_tokens = 0
             if hasattr(response, 'usage') and response.usage:
+                total_tokens = getattr(response.usage, 'total_tokens', 0)
                 # Check completion_tokens_details first (OpenRouter structure)
                 if hasattr(response.usage, 'completion_tokens_details') and response.usage.completion_tokens_details:
                     reasoning_tokens = getattr(response.usage.completion_tokens_details, 'reasoning_tokens', 0)
@@ -88,10 +97,12 @@ def generate_solution(self, problem: str, request_id: str = None) -> Tuple[Agent
                 if reasoning_tokens == 0:
                     reasoning_tokens = getattr(response.usage, 'reasoning_tokens', 0)
 
+            logger.info(f"Agent {self.agent_id} token usage: reasoning={reasoning_tokens}, total={total_tokens}")
+
             # Extract confidence from solution (heuristic based on response characteristics)
             confidence = self._estimate_confidence(solution_text)
 
-            # Create agent solution object
+            # Create agent solution object with enhanced metadata
             agent_solution = AgentSolution(
                 agent_id=self.agent_id,
                 temperature=self.temperature,
@@ -101,20 +112,27 @@ def generate_solution(self, problem: str, request_id: str = None) -> Tuple[Agent
                 timestamp=datetime.now()
             )
 
+            # Add metadata to solution object
+            agent_solution.solution_length = solution_length
+            agent_solution.total_tokens = total_tokens
+
             logger.info(f"Agent {self.agent_id} generated solution with {reasoning_tokens} reasoning tokens")
             return agent_solution, reasoning_tokens
 
         except Exception as e:
             logger.error(f"Agent {self.agent_id} error generating solution: {str(e)}")
             # Return empty solution with error indication
-            return AgentSolution(
+            error_solution = AgentSolution(
                 agent_id=self.agent_id,
                 temperature=self.temperature,
                 solution=f"Error generating solution: {str(e)}",
                 confidence=0.0,
                 reasoning_tokens=0,
                 timestamp=datetime.now()
-            ), 0
+            )
+            error_solution.solution_length = len(error_solution.solution)
+            error_solution.total_tokens = 0
+            return error_solution, 0
 
     def verify_solution(self, problem: str, solution: str, verifier_id: int, solution_agent_id: int, request_id: str = None) -> VerificationResult:
         """Verify a solution using mathematical reasoning"""

optillm/mars/mars.py

Lines changed: 27 additions & 2 deletions
@@ -35,6 +35,7 @@ def multi_agent_reasoning_system(
     initial_query: str,
     client,
     model: str,
+    request_config: dict = None,
     request_id: str = None
 ) -> Tuple[str, int]:
     """
@@ -51,21 +52,30 @@ def multi_agent_reasoning_system(
         Tuple of (final_solution, total_reasoning_tokens)
     """
     return asyncio.run(_run_mars_parallel(
-        system_prompt, initial_query, client, model, request_id
+        system_prompt, initial_query, client, model, request_config, request_id
     ))
 
 async def _run_mars_parallel(
     system_prompt: str,
     initial_query: str,
     client,
     model: str,
+    request_config: dict = None,
     request_id: str = None
 ) -> Tuple[str, int]:
     """Async implementation of MARS with parallel execution"""
     logger.info(f"Starting MARS with model: {model}")
 
     # Initialize configuration
     config = DEFAULT_CONFIG.copy()
+
+    # Override max_tokens from request_config if provided
+    if request_config and 'max_tokens' in request_config:
+        config['max_tokens'] = request_config['max_tokens']
+        logger.info(f"Using max_tokens from request: {config['max_tokens']}")
+    else:
+        logger.info(f"Using default max_tokens: {config['max_tokens']}")
+
     total_reasoning_tokens = 0
 
     # Calculate optimal worker count for parallel execution
@@ -191,6 +201,14 @@ async def generate_solution_async(agent: MARSAgent):
             workspace.add_solution(solution)
             total_tokens += tokens
             successful_solutions += 1
+
+            # ENHANCED LOGGING: Log individual agent solution details
+            logger.info(f"Agent {agent_id} exploration complete:")
+            logger.info(f"  - Solution length: {solution.solution_length} chars")
+            logger.info(f"  - Total tokens: {solution.total_tokens}")
+            logger.info(f"  - Reasoning tokens: {solution.reasoning_tokens}")
+            logger.info(f"  - Confidence: {solution.confidence:.2f}")
+            logger.info(f"  - Solution preview: {solution.solution[:200]}...")
         else:
             logger.error(f"Agent {agent_id} generated no solution")
 
@@ -274,15 +292,22 @@ def _synthesize_final_solution(
 
         # Extract reasoning tokens from correct nested structure (matching agent.py fix)
         reasoning_tokens = 0
+        total_tokens = 0
         if hasattr(response, 'usage') and response.usage:
+            total_tokens = getattr(response.usage, 'total_tokens', 0)
             # Check completion_tokens_details first (OpenRouter structure)
             if hasattr(response.usage, 'completion_tokens_details') and response.usage.completion_tokens_details:
                 reasoning_tokens = getattr(response.usage.completion_tokens_details, 'reasoning_tokens', 0)
             # Fallback to direct usage field (standard OpenAI structure)
             if reasoning_tokens == 0:
                 reasoning_tokens = getattr(response.usage, 'reasoning_tokens', 0)
 
-        logger.info(f"Synthesis complete with {reasoning_tokens} reasoning tokens")
+        # ENHANCED LOGGING: Log synthesis details
+        logger.info(f"Synthesis complete:")
+        logger.info(f"  - Synthesis solution length: {len(final_solution)} characters")
+        logger.info(f"  - Reasoning tokens: {reasoning_tokens}")
+        logger.info(f"  - Total tokens: {total_tokens}")
+        logger.info(f"  - Final solution preview: {final_solution[:200]}...")
        return final_solution, reasoning_tokens
 
    except Exception as e:
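Note on the max_tokens handling above: DEFAULT_CONFIG is copied before the per-request override is applied, so the module-level defaults are never mutated. A small sketch of that pattern under assumed default values (the real DEFAULT_CONFIG lives in optillm/mars and its values may differ):

# Assumed defaults for illustration only; the real values live in optillm/mars.
DEFAULT_CONFIG = {'max_tokens': 30000, 'num_agents': 3}

def resolve_config(request_config: dict = None) -> dict:
    """Copy the defaults, then apply a per-request max_tokens override."""
    config = DEFAULT_CONFIG.copy()
    if request_config and 'max_tokens' in request_config:
        config['max_tokens'] = request_config['max_tokens']
    return config

print(resolve_config())                       # uses the default max_tokens
print(resolve_config({'max_tokens': 64000}))  # per-request override wins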

optillm/server.py

Lines changed: 1 addition & 1 deletion
@@ -423,7 +423,7 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
     elif approach == 'cepo':
         return cepo(system_prompt, initial_query, client, model, cepo_config, request_id)
     elif approach == 'mars':
-        return multi_agent_reasoning_system(system_prompt, initial_query, client, model, request_id)
+        return multi_agent_reasoning_system(system_prompt, initial_query, client, model, request_config=request_config, request_id=request_id)
     elif approach in plugin_approaches:
         # Check if the plugin accepts request_config
         plugin_func = plugin_approaches[approach]
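Note on the one-line change above: the per-request config is forwarded into MARS by keyword so the request_id argument is not shifted out of its slot. A toy dispatcher showing the same forwarding shape (the function names and defaults here are illustrative, not the server's actual code):

def run_mars(query, request_config=None, request_id=None):
    # Stand-in for multi_agent_reasoning_system: read max_tokens if provided.
    max_tokens = (request_config or {}).get('max_tokens', 30000)
    return f"mars({query!r}, max_tokens={max_tokens})", 0

def execute_single_approach_demo(approach, query, request_config=None, request_id=None):
    if approach == 'mars':
        # Keyword arguments keep request_config and request_id unambiguous.
        return run_mars(query, request_config=request_config, request_id=request_id)
    raise ValueError(f"unknown approach: {approach}")

print(execute_single_approach_demo('mars', 'Prove 1+1=2', request_config={'max_tokens': 64000}))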

scripts/eval_imo25_benchmark.py

Lines changed: 2 additions & 2 deletions
@@ -182,7 +182,7 @@ def imo25_verify_solution(problem: str, solution: str, model: str) -> Dict[str,
             {"role": "system", "content": verification_system_prompt},
             {"role": "user", "content": verification_prompt}
         ],
-        max_tokens=30000,
+        max_tokens=64000,
         temperature=0.1
     )
 
@@ -354,7 +354,7 @@ def get_llm_response(problem: str, model: str, extra_body: dict = None, timeout:
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": problem}
         ],
-        max_tokens=30000,  # Extended token limit for complex proofs
+        max_tokens=64000,  # Extended token limit for complex IMO proofs (increased from 30000)
         **kwargs
     )

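Note: both benchmark calls above simply raise the completion budget from 30000 to 64000 tokens. A hedged sketch of such a call with the OpenAI Python client (the model name and prompts are placeholders, and the chosen model must actually allow a 64000-token completion):

from openai import OpenAI

client = OpenAI()  # API key and base_url come from the environment

response = client.chat.completions.create(
    model="your-long-context-model",  # placeholder; must support 64k completions
    messages=[
        {"role": "system", "content": "You are a careful IMO grader."},
        {"role": "user", "content": "Verify the following proof step by step..."},
    ],
    max_tokens=64000,   # raised from 30000 so long proofs are not truncated
    temperature=0.1,
)
print(response.choices[0].message.content)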