Commit 0de366e

Author: Your Name (committed)
Benchmark and website updates
1 parent f829a33 commit 0de366e

16 files changed

Lines changed: 329 additions & 64 deletions

benchmark/README.md

Lines changed: 0 additions & 5 deletions
@@ -68,11 +68,6 @@ Launch the docker container and run the benchmark inside it:
 # PR's welcome to more effectively grab the keys without causing anxiety.
 ./benchmark/docker.sh
 
-# Inside the container, install aider as a development build.
-# This way you're running the code that you cloned above, including any local changes.
-# TODO: this step should be included in the Dockerfile
-pip install -e .[dev]
-
 # Run the benchmark:
 ./benchmark/benchmark.py a-helpful-name-for-this-run --model gpt-3.5-turbo --edit-format whole --threads 10 --exercises-dir polyglot-benchmark

benchmark/benchmark.py

Lines changed: 59 additions & 4 deletions
@@ -171,6 +171,9 @@ def main(
         ),
     ),
     dry: bool = typer.Option(False, "--dry", help="Run in dry mode (no cecli, no tests)"),
+    stats: bool = typer.Option(
+        False, "--stats", help="Generate statistics YAML file from benchmark results"
+    ),
 ):
     # setup logging and verbosity
     if quiet:
@@ -182,6 +185,33 @@ def main(
 
     logging.basicConfig(level=log_level, format="%(message)s")
 
+    # Handle --stats flag: generate statistics YAML file and exit
+    if stats:
+        # Convert SimpleNamespace to dict for YAML serialization
+        def simple_namespace_to_dict(obj):
+            if isinstance(obj, SimpleNamespace):
+                return {k: simple_namespace_to_dict(v) for k, v in vars(obj).items()}
+            elif isinstance(obj, dict):
+                return {k: simple_namespace_to_dict(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [simple_namespace_to_dict(item) for item in obj]
+            else:
+                return obj
+
+        # Get statistics
+        stats_result = summarize_results(results_dir, verbose, stats_languages=languages)
+
+        # Convert to dict
+        stats_dict = simple_namespace_to_dict(stats_result)
+
+        # Write to results.yaml
+        results_yaml_path = Path(results_dir) / "results.yaml"
+        with open(results_yaml_path, "w") as f:
+            yaml.dump(stats_dict, f, default_flow_style=False)
+
+        print(f"Statistics written to: {results_yaml_path}")
+        return 0
+
     from cecli import models
 
     if dry:
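The new --stats path hinges on the recursive SimpleNamespace-to-dict conversion, since yaml.dump cannot serialize namespaces directly. A minimal standalone sketch of the round trip; the nested shape here is invented for illustration, not the actual schema summarize_results produces:

    from types import SimpleNamespace

    import yaml

    def simple_namespace_to_dict(obj):
        # Recurse through namespaces, dicts, and lists; scalars pass through
        if isinstance(obj, SimpleNamespace):
            return {k: simple_namespace_to_dict(v) for k, v in vars(obj).items()}
        if isinstance(obj, dict):
            return {k: simple_namespace_to_dict(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [simple_namespace_to_dict(item) for item in obj]
        return obj

    res = SimpleNamespace(total_tests=16, langs=[SimpleNamespace(name="rust", passed=9)])
    text = yaml.dump(simple_namespace_to_dict(res), default_flow_style=False)
    assert yaml.safe_load(text) == {"total_tests": 16, "langs": [{"name": "rust", "passed": 9}]}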
@@ -449,10 +479,13 @@ def load_results(results_dir, stats_languages=None):
 
 
 def summarize_results(results_dir, verbose, stats_languages=None):
+    # Convert results_dir to Path object if it's a string
+    results_dir = Path(results_dir)
+
     lang_to_results = load_results(results_dir, stats_languages)
 
     res = SimpleNamespace()
-    res.total_tests = len(list(Path(results_dir).glob("*/.cecli.results.json")))
+    res.total_tests = len(list(results_dir.glob("*/.cecli.results.json")))
 
     try:
         tries = max(
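Path() is a no-op on values that are already Path instances, which is why the unconditional conversion above is safe whether callers pass a str or a Path — a one-line check:

    from pathlib import Path

    p = Path("tmp.benchmarks/run-1")
    assert Path(p) == p == Path("tmp.benchmarks/run-1")  # re-wrapping never changes the path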
@@ -838,7 +871,6 @@ async def run_test_real(
     # Lazy imports: only needed in the actual benchmark execution path
     import git
 
-    import cecli.prompts.utils.system as prompts
     from cecli import models
     from cecli.coders import Coder
     from cecli.io import InputOutput
@@ -942,7 +974,23 @@ async def run_test_real(
     if instructions_append.exists():
         instructions += instructions_append.read_text()
 
-    instructions += prompts.instructions_addendum.format(file_list=file_list)
+    instructions_addendum = """
+####
+
+Use the above instructions to modify the supplied files: {file_list}
+Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.
+Only use standard libraries, don't suggest installing any packages.
+"""  # noqa: E501
+
+    test_failures = """
+####
+
+See the testing errors above.
+The tests are correct, don't try and change them.
+Fix the code in {file_list} to resolve the errors.
+"""  # noqa: E501
+
+    instructions += instructions_addendum.format(file_list=file_list)
 
     io = InputOutput(
         pretty=False,
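The inlined templates are plain str.format templates whose only placeholder is {file_list}; since neither contains any other braces, no escaping is needed. A one-liner with a made-up file name shows the substitution:

    template = "Fix the code in {file_list} to resolve the errors."
    print(template.format(file_list="bowling.py"))  # Fix the code in bowling.py to resolve the errors.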
@@ -1014,7 +1062,14 @@ async def run_test_real(
         # Reduce repo map contention and size for benchmarks
         map_cache_dir=str(testdir),
         repomap_in_memory=repomap_in_memory,
+        args=SimpleNamespace(
+            agent_config='{"skip_cli_confirmations": true}',
+            use_enhanced_map=True,
+            verbose=verbose,
+            yes_always_commands=True,
+        ),
         map_mul_no_files=4,
+        mcp_manager=None,
     )
     if map_tokens is not None:
         coder_kwargs["map_tokens"] = map_tokens
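The args namespace mimics a parsed CLI namespace, so downstream code can read attributes the same way whether they came from a real parser or from the benchmark. A sketch of how a consumer might read it — the attribute names come from the diff, but the json.loads step on agent_config is an assumption:

    import json
    from types import SimpleNamespace

    args = SimpleNamespace(
        agent_config='{"skip_cli_confirmations": true}',
        use_enhanced_map=True,
        verbose=False,
        yes_always_commands=True,
    )

    config = json.loads(args.agent_config)  # assumed consumer-side parsing
    assert config["skip_cli_confirmations"] is True
    assert args.yes_always_commands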
@@ -1091,7 +1146,7 @@ async def run_test_real(
             logger.info(errors[-1])
         errors = "\n".join(errors)
         instructions = errors
-        instructions += prompts.test_failures.format(file_list=file_list)
+        instructions += test_failures.format(file_list=file_list)
 
     if not dry:
         # Clean up build directories after all attempts

benchmark/benchmark_classic.py

Lines changed: 18 additions & 3 deletions
@@ -870,7 +870,6 @@ def run_test_real(
     # Lazy imports: only needed in the actual benchmark execution path
     import git
 
-    import cecli.prompts.utils.system as prompts
     from cecli import models
     from cecli.coders import Coder
     from cecli.io import InputOutput

@@ -961,7 +960,23 @@ def run_test_real(
     if instructions_append.exists():
         instructions += instructions_append.read_text()
 
-    instructions += prompts.instructions_addendum.format(file_list=file_list)
+    instructions_addendum = """
+####
+
+Use the above instructions to modify the supplied files: {file_list}
+Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.
+Only use standard libraries, don't suggest installing any packages.
+"""  # noqa: E501
+
+    test_failures = """
+####
+
+See the testing errors above.
+The tests are correct, don't try and change them.
+Fix the code in {file_list} to resolve the errors.
+"""  # noqa: E501
+
+    instructions += instructions_addendum.format(file_list=file_list)
 
     io = InputOutput(
         pretty=False,

@@ -1108,7 +1123,7 @@ def run_test_real(
         print(errors[-1])
         errors = "\n".join(errors)
         instructions = errors
-        instructions += prompts.test_failures.format(file_list=file_list)
+        instructions += test_failures.format(file_list=file_list)
 
     # Clean up build directories after all attempts
     # Rust target/debug

benchmark/npm-test.sh

Lines changed: 1 addition & 1 deletion
@@ -9,5 +9,5 @@ set -e
 
 
 sed -i 's/\bxtest(/test(/g' *.spec.js
-npm run test
+timeout 5m npm run test
 
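GNU timeout exits with status 124 when the limit expires, so under set -e a hung JavaScript test suite now fails the exercise instead of stalling the whole benchmark. For comparison, the same five-minute bound expressed in Python (the command is illustrative):

    import subprocess

    try:
        # Kill the test run if it exceeds the 5 minute budget
        subprocess.run(["npm", "run", "test"], timeout=300, check=True)
    except subprocess.TimeoutExpired:
        print("test suite exceeded the 5 minute budget")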
benchmark/primary_variations.sh

Lines changed: 201 additions & 0 deletions
@@ -0,0 +1,201 @@
+#!/bin/bash
+# Benchmark runner script for testing multiple OpenRouter models
+# Usage: ./benchmark/primary_variations.sh [OPTIONS]
+
+set -e  # Exit on error
+
+# Default values
+BASE_NAME="primary-variation"
+EDIT_FORMAT="diff"
+MAP_TOKENS="512"
+THREADS="1"
+HASH_RE="^4"
+NUM_TESTS="16"
+EXERCISES_DIR="polyglot-benchmark"
+OUTPUT_DIR="tmp.benchmarks"
+SLEEP_BETWEEN=30  # Seconds to sleep between runs
+
+# List of models to test
+# RERUN
+#   "openrouter/minimax/minimax-m2.1"
+#   "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
+MODELS=(
+    # "openrouter/deepseek/deepseek-v3.2"
+    # "openrouter/moonshotai/kimi-k2.5"
+    # "openrouter/minimax/minimax-m2.1"
+    # "openrouter/minimax/minimax-m2.1"
+    # "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
+    # "openrouter/openai/gpt-oss-120b"
+    "openrouter/openai/gpt-5.2"
+    # "openrouter/google/gemini-3-flash-preview"
+    # "openrouter/google/gemini-3-pro-preview"
+    # "openrouter/anthropic/claude-haiku-4.5"
+    # "openrouter/anthropic/claude-sonnet-4.5"
+)
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --base-name)
+            BASE_NAME="$2"
+            shift 2
+            ;;
+        --edit-format)
+            EDIT_FORMAT="$2"
+            shift 2
+            ;;
+        --map-tokens)
+            MAP_TOKENS="$2"
+            shift 2
+            ;;
+        --threads)
+            THREADS="$2"
+            shift 2
+            ;;
+        --hash-re)
+            HASH_RE="$2"
+            shift 2
+            ;;
+        --num-tests)
+            NUM_TESTS="$2"
+            shift 2
+            ;;
+        --exercises-dir)
+            EXERCISES_DIR="$2"
+            shift 2
+            ;;
+        --output-dir)
+            OUTPUT_DIR="$2"
+            shift 2
+            ;;
+        --sleep)
+            SLEEP_BETWEEN="$2"
+            shift 2
+            ;;
+        --help)
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --base-name NAME       Base name for benchmark runs (default: $BASE_NAME)"
+            echo "  --edit-format FORMAT   Edit format to use (default: $EDIT_FORMAT)"
+            echo "  --map-tokens TOKENS    Map tokens (default: $MAP_TOKENS)"
+            echo "  --threads N            Number of threads (default: $THREADS)"
+            echo "  --hash-re REGEX        Hash regex filter (default: $HASH_RE)"
+            echo "  --num-tests N          Number of tests to run (default: $NUM_TESTS)"
+            echo "  --exercises-dir DIR    Exercises directory (default: $EXERCISES_DIR)"
+            echo "  --output-dir DIR       Output directory (default: $OUTPUT_DIR)"
+            echo "  --sleep SECONDS        Sleep between runs in seconds (default: $SLEEP_BETWEEN)"
+            echo "  --help                 Show this help message"
+            echo ""
+            echo "Example:"
+            echo "  $0 --threads 2 --num-tests 5"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Function to run a single benchmark
+run_benchmark() {
+    local model="$1"
+    local run_name="$2"
+
+    echo "========================================================================"
+    echo "Starting benchmark: $run_name"
+    echo "Model: $model"
+    echo "Time: $(date)"
+    echo "========================================================================"
+
+    # Create the benchmark command
+    ./benchmark/benchmark.py "$run_name" \
+        --model "$model" \
+        --edit-format "$EDIT_FORMAT" \
+        --map-tokens "$MAP_TOKENS" \
+        --threads "$THREADS" \
+        --hash-re "$HASH_RE" \
+        --num-tests "$NUM_TESTS" \
+        --exercises-dir "$EXERCISES_DIR"
+
+    echo "Benchmark completed: $run_name"
+    echo "Results directory: $OUTPUT_DIR/$(ls -t $OUTPUT_DIR | grep "$run_name" | head -1)"
+    echo ""
+}
+
+# Function to generate statistics for all completed runs
+generate_stats() {
+    echo "========================================================================"
+    echo "Generating statistics for all completed runs"
+    echo "========================================================================"
+
+    for dir in "$OUTPUT_DIR"/*; do
+        if [ -d "$dir" ] && [ -f "$dir/.cecli.results.json" ]; then
+            echo "Processing: $(basename "$dir")"
+            ./benchmark/benchmark.py --stats "$dir" || true
+            echo ""
+        fi
+    done
+}
+
+# Main execution
+main() {
+    echo "========================================================================"
+    echo "OpenRouter Model Benchmark Runner"
+    echo "========================================================================"
+    echo "Configuration:"
+    echo "  Base name:      $BASE_NAME"
+    echo "  Edit format:    $EDIT_FORMAT"
+    echo "  Map tokens:     $MAP_TOKENS"
+    echo "  Threads:        $THREADS"
+    echo "  Hash regex:     $HASH_RE"
+    echo "  Num tests:      $NUM_TESTS"
+    echo "  Exercises dir:  $EXERCISES_DIR"
+    echo "  Output dir:     $OUTPUT_DIR"
+    echo "  Sleep between:  ${SLEEP_BETWEEN}s"
+    echo "  Models to test: ${#MODELS[@]}"
+    echo ""
+
+    # Create output directory if it doesn't exist
+    mkdir -p "$OUTPUT_DIR"
+
+    # Run benchmarks for each model
+    for model in "${MODELS[@]}"; do
+        # Create a run name by replacing slashes with hyphens
+        local model_slug=$(echo "$model" | sed 's|/|-|g')
+        local run_name="${BASE_NAME}-${model_slug}"
+
+        run_benchmark "$model" "$run_name"
+
+        # Sleep between runs to avoid rate limiting
+        if [ "$SLEEP_BETWEEN" -gt 0 ]; then
+            echo "Sleeping for ${SLEEP_BETWEEN} seconds before next run..."
+            sleep "$SLEEP_BETWEEN"
+            echo ""
+        fi
+    done
+
+    # Generate statistics
+    generate_stats
+
+    echo "========================================================================"
+    echo "All benchmarks completed!"
+    echo "========================================================================"
+    echo ""
+    echo "Summary of results directories:"
+    ls -la "$OUTPUT_DIR" | grep "$BASE_NAME"
+    echo ""
+    echo "To view statistics for a specific run:"
+    echo "  ./benchmark/benchmark.py --stats $OUTPUT_DIR/<run-directory>"
+    echo ""
+    echo "To compare all results:"
+    echo "  for dir in $OUTPUT_DIR/*$BASE_NAME*; do"
+    echo "    echo \"=== \$(basename \$dir) ===\""
+    echo "    ./benchmark/benchmark.py --stats \"\$dir\" 2>/dev/null | grep -E '(pass_rate|total_cost|completed_tests)' || true"
+    echo "  done"
+}
+
+# Run main function
+main
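After generate_stats runs, each results directory should hold the results.yaml written by the new --stats mode. A sketch of comparing runs from Python; the pass_rate and total_cost keys are inferred from the grep hint above, not from a documented schema:

    from pathlib import Path

    import yaml

    for path in sorted(Path("tmp.benchmarks").glob("*primary-variation*/results.yaml")):
        stats = yaml.safe_load(path.read_text())
        # Key names assumed from the grep filter in the script
        print(path.parent.name, stats.get("pass_rate"), stats.get("total_cost"))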

cecli/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from packaging import version
 
-__version__ = "0.96.9.dev"
+__version__ = "0.96.10.dev"
 safe_version = __version__
 
 try:
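packaging normalizes a bare .dev suffix to .dev0 under PEP 440, so the bumped string parses cleanly; presumably the try/except below this hunk guards that parse when computing safe_version (an assumption, since its body is outside the diff):

    from packaging import version

    v = version.parse("0.96.10.dev")
    print(v)                # 0.96.10.dev0
    print(v.is_devrelease)  # True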

cecli/coders/agent_coder.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def get_local_tool_schemas(self):
 
     async def initialize_mcp_tools(self):
         if not self.mcp_manager:
-            self.mcp_manager = McpServerManager()
+            self.mcp_manager = McpServerManager([], self.io, self.args.verbose)
 
         server_name = "Local"
         server = self.mcp_manager.get_server(server_name)
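The fix passes an explicit server list, IO handle, and verbosity rather than relying on a zero-argument constructor. A minimal sketch of the lazy-initialization pattern with stand-in classes — the parameter names are guesses; only the positional call mirrors the diff:

    class StubMcpServerManager:
        # Hypothetical stand-in for cecli's McpServerManager
        def __init__(self, servers, io, verbose):
            self.servers = list(servers)  # start with only explicitly passed servers
            self.io = io
            self.verbose = verbose

    class StubCoder:
        def __init__(self, io, args):
            self.io = io
            self.args = args
            self.mcp_manager = None  # the benchmark now passes mcp_manager=None explicitly

        def initialize_mcp_tools(self):
            if not self.mcp_manager:
                # Built on first use, mirroring the corrected call
                self.mcp_manager = StubMcpServerManager([], self.io, self.args.verbose)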
