Add parallel execution to MARS and add tests

codelion · codelion · commit 13a4b91b0ac6 · 2025-09-24T17:36:54.000+08:00
diff --git a/optillm/mars/mars.py b/optillm/mars/mars.py
@@ -1,10 +1,12 @@
 """
-MARS: Multi-Agent Reasoning System main orchestration
+MARS: Multi-Agent Reasoning System main orchestration with parallel execution
 """
 
+import asyncio
 import logging
 from typing import Dict, Any, List, Tuple
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
 import optillm
 from optillm import conversation_logger
 
@@ -36,7 +38,7 @@ def multi_agent_reasoning_system(
     request_id: str = None
 ) -> Tuple[str, int]:
     """
-    Main MARS function implementing multi-agent mathematical reasoning
+    Main MARS function implementing multi-agent mathematical reasoning with parallel execution
 
     Args:
         system_prompt: System-level instructions
@@ -48,12 +50,31 @@ def multi_agent_reasoning_system(
     Returns:
         Tuple of (final_solution, total_reasoning_tokens)
     """
+    return asyncio.run(_run_mars_parallel(
+        system_prompt, initial_query, client, model, request_id
+    ))
+
+async def _run_mars_parallel(
+    system_prompt: str,
+    initial_query: str,
+    client,
+    model: str,
+    request_id: str = None
+) -> Tuple[str, int]:
+    """Async implementation of MARS with parallel execution"""
     logger.info(f"Starting MARS with model: {model}")
 
     # Initialize configuration
     config = DEFAULT_CONFIG.copy()
     total_reasoning_tokens = 0
 
+    # Calculate optimal worker count for parallel execution
+    max_workers = max(
+        config['num_agents'],  # For generation phase
+        config['num_agents'] * min(2, config['verification_passes_required'])  # For verification
+    )
+    logger.info(f"Using {max_workers} parallel workers")
+
     # Initialize workspace for collaboration
     workspace = MARSWorkspace(initial_query, config)
 
@@ -66,37 +87,41 @@ def multi_agent_reasoning_system(
 
         logger.info(f"Initialized {len(agents)} agents with diverse temperatures")
 
-        # Phase 2: Multi-Agent Exploration
-        logger.info("Phase 1: Multi-Agent Exploration")
-        exploration_tokens = _run_exploration_phase(agents, workspace, request_id)
-        total_reasoning_tokens += exploration_tokens
+        # Create thread pool executor for parallel API calls
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Phase 2: Multi-Agent Exploration (parallel)
+            logger.info("Phase 1: Multi-Agent Exploration")
+            exploration_tokens = await _run_exploration_phase_parallel(
+                agents, workspace, request_id, executor
+            )
+            total_reasoning_tokens += exploration_tokens
 
-        # Phase 3: Verification System
-        logger.info("Phase 2: Verification System")
-        verifier = MARSVerifier(agents, workspace, config)
-        verification_summary = verifier.verify_solutions(request_id)
+            # Phase 3: Verification System (parallel)
+            logger.info("Phase 2: Verification System")
+            verifier = MARSVerifier(agents, workspace, config)
+            verification_summary = await verifier.verify_solutions_parallel(request_id, executor)
 
-        # Phase 4: Iterative Improvement (if needed)
-        iteration_count = 0
-        while workspace.should_continue_iteration() and iteration_count < config['max_iterations']:
-            iteration_count += 1
-            logger.info(f"Phase 3: Iterative Improvement - Iteration {iteration_count}")
+            # Phase 4: Iterative Improvement (if needed)
+            iteration_count = 0
+            while workspace.should_continue_iteration() and iteration_count < config['max_iterations']:
+                iteration_count += 1
+                logger.info(f"Phase 3: Iterative Improvement - Iteration {iteration_count}")
 
-            # Improve unverified solutions
-            improvement_summary = verifier.iterative_improvement(request_id)
-            total_reasoning_tokens += improvement_summary['total_reasoning_tokens']
+                # Improve unverified solutions (parallel)
+                improvement_summary = await verifier.iterative_improvement_parallel(request_id, executor)
+                total_reasoning_tokens += improvement_summary['total_reasoning_tokens']
 
-            # Re-verify improved solutions
-            verification_summary = verifier.verify_solutions(request_id)
+                # Re-verify improved solutions (parallel)
+                verification_summary = await verifier.verify_solutions_parallel(request_id, executor)
 
-            # Check for early termination
-            if config['early_termination'] and workspace.has_consensus():
-                logger.info("Early termination: consensus reached")
-                break
+                # Check for early termination
+                if config['early_termination'] and workspace.has_consensus():
+                    logger.info("Early termination: consensus reached")
+                    break
 
-            workspace.iteration_count = iteration_count
+                workspace.iteration_count = iteration_count
 
-        # Phase 5: Final Synthesis
+        # Phase 5: Final Synthesis (sequential - needs all results)
         logger.info("Phase 4: Final Synthesis")
         final_solution, synthesis_tokens = _synthesize_final_solution(
             workspace, client, model, config, request_id
@@ -126,24 +151,50 @@ def multi_agent_reasoning_system(
         except:
             return error_response, 0
 
-def _run_exploration_phase(agents: List[MARSAgent], workspace: MARSWorkspace, request_id: str = None) -> int:
-    """Run the multi-agent exploration phase"""
-    total_tokens = 0
-
-    # Generate solutions from all agents in parallel (conceptually)
-    for agent in agents:
-        try:
-            agent_solution, reasoning_tokens = agent.generate_solution(
-                workspace.problem, request_id
-            )
-            workspace.add_solution(agent_solution)
-            total_tokens += reasoning_tokens
-
-        except Exception as e:
-            logger.error(f"Agent {agent.agent_id} failed during exploration: {str(e)}")
-            continue
-
-    logger.info(f"Exploration phase complete: {len(workspace.solutions)} solutions generated")
+async def _run_exploration_phase_parallel(
+    agents: List[MARSAgent],
+    workspace: MARSWorkspace,
+    request_id: str = None,
+    executor: ThreadPoolExecutor = None
+) -> int:
+    """Generate one solution per agent concurrently and add them to the workspace.
+
+    Each agent's ``generate_solution`` runs on ``executor``; failing agents are
+    logged and skipped. Returns the reasoning tokens summed over successes.
+    """
+    # get_running_loop() is preferred over get_event_loop() inside a coroutine
+    loop = asyncio.get_running_loop()
+
+    async def generate_solution_async(agent: MARSAgent):
+        """Return (agent_id, solution, tokens, error) for a single agent."""
+        try:
+            solution, tokens = await loop.run_in_executor(
+                executor, agent.generate_solution, workspace.problem, request_id
+            )
+            return agent.agent_id, solution, tokens, None
+        except Exception as e:
+            logger.error(f"Agent {agent.agent_id} failed during exploration: {str(e)}")
+            return agent.agent_id, None, 0, e
+
+    tasks = [generate_solution_async(agent) for agent in agents]
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    total_tokens = 0
+    successful_solutions = 0
+    for result in results:
+        # BaseException also covers asyncio.CancelledError (not an Exception on 3.8+)
+        if isinstance(result, BaseException):
+            logger.error(f"Agent task failed: {str(result)}")
+            continue
+        agent_id, solution, tokens, error = result
+        if error is None and solution is not None:
+            workspace.add_solution(solution)
+            total_tokens += tokens
+            successful_solutions += 1
+        else:
+            logger.error(f"Agent {agent_id} generated no solution")
+
+    logger.info(f"Exploration phase complete: {successful_solutions} solutions generated in parallel")
     return total_tokens
 
 def _synthesize_final_solution(
diff --git a/optillm/mars/verifier.py b/optillm/mars/verifier.py
@@ -1,10 +1,12 @@
 """
-MARS Verification system implementing 5-pass verification threshold
+MARS Verification system implementing 5-pass verification threshold with parallel execution
 """
 
+import asyncio
 import logging
 from typing import Dict, List, Any, Tuple
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor
 from .workspace import MARSWorkspace, AgentSolution, VerificationResult
 from .agent import MARSAgent
 
@@ -50,6 +52,71 @@ def verify_solutions(self, request_id: str = None) -> Dict[str, Any]:
         logger.info(f"Verification complete: {verification_summary['solutions_verified']} solutions verified")
         return verification_summary
 
+    async def verify_solutions_parallel(
+        self,
+        request_id: str = None,
+        executor: ThreadPoolExecutor = None
+    ) -> Dict[str, Any]:
+        """Verify every workspace solution concurrently and summarize the outcome.
+
+        Each solution is passed to ``_verify_single_solution`` on ``executor``.
+        Returns a dict with 'total_verifications', 'solutions_verified',
+        'consensus_reached' and 'verification_details'.
+        """
+        logger.info(f"Starting parallel verification process with {self.verification_threshold}-pass threshold")
+
+        verification_summary = {
+            'total_verifications': 0,
+            'solutions_verified': 0,
+            'consensus_reached': False,
+            'verification_details': []
+        }
+
+        solutions = self.workspace.solutions
+        if not solutions:
+            logger.warning("No solutions to verify")
+            return verification_summary
+
+        # get_running_loop() is preferred over get_event_loop() inside a coroutine
+        loop = asyncio.get_running_loop()
+
+        async def verify_solution_async(solution: AgentSolution):
+            """Verify one solution off-loop; on failure return an empty result."""
+            try:
+                return await loop.run_in_executor(
+                    executor, self._verify_single_solution, solution, request_id
+                )
+            except Exception as e:
+                logger.error(f"Verification failed for solution from agent {solution.agent_id}: {str(e)}")
+                return {
+                    'solution_agent_id': solution.agent_id,
+                    'verification_count': 0,
+                    'consecutive_passes': 0,
+                    'passes_threshold': False,
+                    'verification_results': []
+                }
+
+        tasks = [verify_solution_async(solution) for solution in solutions]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        for result in results:
+            # BaseException also covers asyncio.CancelledError (not an Exception on 3.8+)
+            if isinstance(result, BaseException):
+                logger.error(f"Verification task failed: {str(result)}")
+                continue
+
+            verification_summary['verification_details'].append(result)
+            verification_summary['total_verifications'] += result['verification_count']
+            if result['passes_threshold']:
+                verification_summary['solutions_verified'] += 1
+
+        # Consensus is judged on what the workspace reports as verified
+        verified_solutions = self.workspace.get_verified_solutions()
+        verification_summary['consensus_reached'] = len(verified_solutions) >= self.config.get('consensus_threshold', 2)
+
+        logger.info(f"Parallel verification complete: {verification_summary['solutions_verified']} solutions verified")
+        return verification_summary
+
     def _verify_single_solution(self, solution: AgentSolution, request_id: str = None) -> Dict[str, Any]:
         """Verify a single solution with multiple passes"""
         logger.info(f"Verifying solution from agent {solution.agent_id}")
@@ -177,6 +244,86 @@ def iterative_improvement(self, request_id: str = None) -> Dict[str, Any]:
 
         return improvement_summary
 
+    async def iterative_improvement_parallel(
+        self,
+        request_id: str = None,
+        executor: ThreadPoolExecutor = None
+    ) -> Dict[str, Any]:
+        """Concurrently retry unverified solutions using their latest verification feedback.
+
+        A solution is retried only when its most recent verification assessment is
+        'INCORRECT' or 'INCOMPLETE' and its originating agent is found in
+        ``self.agents``; a successful retry updates the solution in place.
+        Returns a dict with 'solutions_improved', 'improvement_attempts' and
+        'total_reasoning_tokens'.
+        """
+        logger.info("Starting parallel iterative improvement process")
+
+        improvement_summary = {
+            'solutions_improved': 0,
+            'improvement_attempts': 0,
+            'total_reasoning_tokens': 0
+        }
+
+        # Collect (solution, agent, feedback) triples that are eligible for a retry
+        improvable_solutions = []
+        for solution in self.workspace.solutions:
+            if solution.is_verified or not solution.verification_results:
+                continue
+            latest_verification = solution.verification_results[-1]
+            if latest_verification['assessment'] not in ('INCORRECT', 'INCOMPLETE'):
+                continue
+            original_agent = next((a for a in self.agents if a.agent_id == solution.agent_id), None)
+            if original_agent:
+                improvable_solutions.append((solution, original_agent, latest_verification))
+
+        if not improvable_solutions:
+            logger.info("No solutions need improvement")
+            return improvement_summary
+
+        # get_running_loop() is preferred over get_event_loop() inside a coroutine
+        loop = asyncio.get_running_loop()
+
+        async def improve_solution_async(solution_data):
+            """Return (agent_id, success, tokens, error) for one retry."""
+            solution, agent, verification = solution_data
+            try:
+                improved_solution, reasoning_tokens = await loop.run_in_executor(
+                    executor,
+                    agent.improve_solution,
+                    self.workspace.problem,
+                    solution.solution,
+                    verification['detailed_report'],
+                    verification['issues'],
+                    request_id
+                )
+                # Update solution with improvement
+                solution.solution = improved_solution
+                solution.timestamp = datetime.now()
+                solution.reasoning_tokens += reasoning_tokens
+                logger.info(f"Improved solution from agent {solution.agent_id}")
+                return solution.agent_id, True, reasoning_tokens, None
+            except Exception as e:
+                logger.error(f"Failed to improve solution from agent {solution.agent_id}: {str(e)}")
+                return solution.agent_id, False, 0, e
+
+        tasks = [improve_solution_async(sol_data) for sol_data in improvable_solutions]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        for result in results:
+            improvement_summary['improvement_attempts'] += 1
+            # BaseException also covers asyncio.CancelledError (not an Exception on 3.8+)
+            if isinstance(result, BaseException):
+                logger.error(f"Improvement task failed: {str(result)}")
+                continue
+            _agent_id, success, tokens, _error = result
+            if success:
+                improvement_summary['solutions_improved'] += 1
+                improvement_summary['total_reasoning_tokens'] += tokens
+
+        logger.info(f"Parallel improvement complete: {improvement_summary['solutions_improved']} solutions improved")
+        return improvement_summary
+
     def final_consensus_check(self) -> bool:
         """Final check to determine if consensus has been reached"""
         verified_solutions = self.workspace.get_verified_solutions()
diff --git a/scripts/eval_aime_benchmark.py b/scripts/eval_aime_benchmark.py
@@ -306,7 +306,7 @@ def get_llm_response(problem: str, model: str, analyze_logits: bool = False, ext
         if extra_body:
             kwargs["extra_body"] = extra_body
         
-        response = client.with_options(timeout=3600.0).chat.completions.create(
+        response = client.with_options(timeout=6000.0).chat.completions.create(
             model=model,
             messages=[
                 {"role": "user", "content": SYSTEM_PROMPT + problem}
diff --git a/tests/test.py b/tests/test.py
@@ -26,6 +26,7 @@
 from optillm.plansearch import plansearch
 from optillm.leap import leap
 from optillm.reread import re2_approach
+from optillm.mars import multi_agent_reasoning_system
 from optillm.cepo.cepo import cepo, CepoConfig, init_cepo_config
 
 # Setup logging
@@ -57,6 +58,7 @@ def __init__(self):
     'plansearch': plansearch,
     'leap': leap,
     're2': re2_approach,
+    'mars': multi_agent_reasoning_system,
     'cepo': lambda s, q, c, m: cepo(s,q,c,m,init_cepo_config({'cepo_config_file': './optillm/cepo/configs/cepo_config.yaml'})),
 }
 
diff --git a/tests/test_approaches.py b/tests/test_approaches.py
diff --git a/tests/test_mars_parallel.py b/tests/test_mars_parallel.py