Commit 0de366e

Author: Your Name (committed)
Benchmark and website updates
1 parent f829a33 commit 0de366e

16 files changed

Lines changed: 329 additions & 64 deletions

benchmark/README.md

Lines changed: 0 additions & 5 deletions
@@ -68,11 +68,6 @@ Launch the docker container and run the benchmark inside it:
 # PR's welcome to more effectively grab the keys without causing anxiety.
 ./benchmark/docker.sh
 
-# Inside the container, install aider as a development build.
-# This way you're running the code that you cloned above, including any local changes.
-# TODO: this step should be included in the Dockerfile
-pip install -e .[dev]
-
 # Run the benchmark:
 ./benchmark/benchmark.py a-helpful-name-for-this-run --model gpt-3.5-turbo --edit-format whole --threads 10 --exercises-dir polyglot-benchmark

benchmark/benchmark.py

Lines changed: 59 additions & 4 deletions
@@ -171,6 +171,9 @@ def main(
         ),
     ),
     dry: bool = typer.Option(False, "--dry", help="Run in dry mode (no cecli, no tests)"),
+    stats: bool = typer.Option(
+        False, "--stats", help="Generate statistics YAML file from benchmark results"
+    ),
 ):
     # setup logging and verbosity
     if quiet:
@@ -182,6 +185,33 @@ def main(
 
     logging.basicConfig(level=log_level, format="%(message)s")
 
+    # Handle --stats flag: generate statistics YAML file and exit
+    if stats:
+        # Convert SimpleNamespace to dict for YAML serialization
+        def simple_namespace_to_dict(obj):
+            if isinstance(obj, SimpleNamespace):
+                return {k: simple_namespace_to_dict(v) for k, v in vars(obj).items()}
+            elif isinstance(obj, dict):
+                return {k: simple_namespace_to_dict(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [simple_namespace_to_dict(item) for item in obj]
+            else:
+                return obj
+
+        # Get statistics
+        stats_result = summarize_results(results_dir, verbose, stats_languages=languages)
+
+        # Convert to dict
+        stats_dict = simple_namespace_to_dict(stats_result)
+
+        # Write to results.yaml
+        results_yaml_path = Path(results_dir) / "results.yaml"
+        with open(results_yaml_path, "w") as f:
+            yaml.dump(stats_dict, f, default_flow_style=False)
+
+        print(f"Statistics written to: {results_yaml_path}")
+        return 0
+
     from cecli import models
 
     if dry:
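The new --stats path hinges on the recursive SimpleNamespace-to-dict conversion, since yaml.dump cannot serialize namespaces directly. A minimal standalone sketch of the round trip; the nested shape here is invented for illustration, not the actual schema summarize_results produces:

    from types import SimpleNamespace

    import yaml

    def simple_namespace_to_dict(obj):
        # Recurse through namespaces, dicts, and lists; scalars pass through
        if isinstance(obj, SimpleNamespace):
            return {k: simple_namespace_to_dict(v) for k, v in vars(obj).items()}
        if isinstance(obj, dict):
            return {k: simple_namespace_to_dict(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [simple_namespace_to_dict(item) for item in obj]
        return obj

    res = SimpleNamespace(total_tests=16, langs=[SimpleNamespace(name="rust", passed=9)])
    text = yaml.dump(simple_namespace_to_dict(res), default_flow_style=False)
    assert yaml.safe_load(text) == {"total_tests": 16, "langs": [{"name": "rust", "passed": 9}]}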
@@ -449,10 +479,13 @@ def load_results(results_dir, stats_languages=None):
 
 
 def summarize_results(results_dir, verbose, stats_languages=None):
+    # Convert results_dir to Path object if it's a string
+    results_dir = Path(results_dir)
+
     lang_to_results = load_results(results_dir, stats_languages)
 
     res = SimpleNamespace()
-    res.total_tests = len(list(Path(results_dir).glob("*/.cecli.results.json")))
+    res.total_tests = len(list(results_dir.glob("*/.cecli.results.json")))
 
     try:
         tries = max(
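Path() is a no-op on values that are already Path instances, which is why the unconditional conversion above is safe whether callers pass a str or a Path — a one-line check:

    from pathlib import Path

    p = Path("tmp.benchmarks/run-1")
    assert Path(p) == p == Path("tmp.benchmarks/run-1")  # re-wrapping never changes the path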
@@ -838,7 +871,6 @@ async def run_test_real(
     # Lazy imports: only needed in the actual benchmark execution path
     import git
 
-    import cecli.prompts.utils.system as prompts
     from cecli import models
     from cecli.coders import Coder
     from cecli.io import InputOutput
@@ -942,7 +974,23 @@ async def run_test_real(
     if instructions_append.exists():
         instructions += instructions_append.read_text()
 
-    instructions += prompts.instructions_addendum.format(file_list=file_list)
+    instructions_addendum = """
+####
+
+Use the above instructions to modify the supplied files: {file_list}
+Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.
+Only use standard libraries, don't suggest installing any packages.
+"""  # noqa: E501
+
+    test_failures = """
+####
+
+See the testing errors above.
+The tests are correct, don't try and change them.
+Fix the code in {file_list} to resolve the errors.
+"""  # noqa: E501
+
+    instructions += instructions_addendum.format(file_list=file_list)
 
     io = InputOutput(
         pretty=False,
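The inlined templates are plain str.format templates whose only placeholder is {file_list}; since neither contains any other braces, no escaping is needed. A one-liner with a made-up file name shows the substitution:

    template = "Fix the code in {file_list} to resolve the errors."
    print(template.format(file_list="bowling.py"))  # Fix the code in bowling.py to resolve the errors.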
@@ -1014,7 +1062,14 @@ async def run_test_real(
         # Reduce repo map contention and size for benchmarks
         map_cache_dir=str(testdir),
         repomap_in_memory=repomap_in_memory,
+        args=SimpleNamespace(
+            agent_config='{"skip_cli_confirmations": true}',
+            use_enhanced_map=True,
+            verbose=verbose,
+            yes_always_commands=True,
+        ),
         map_mul_no_files=4,
+        mcp_manager=None,
     )
     if map_tokens is not None:
         coder_kwargs["map_tokens"] = map_tokens
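The args namespace mimics a parsed CLI namespace, so downstream code can read attributes the same way whether they came from a real parser or from the benchmark. A sketch of how a consumer might read it — the attribute names come from the diff, but the json.loads step on agent_config is an assumption:

    import json
    from types import SimpleNamespace

    args = SimpleNamespace(
        agent_config='{"skip_cli_confirmations": true}',
        use_enhanced_map=True,
        verbose=False,
        yes_always_commands=True,
    )

    config = json.loads(args.agent_config)  # assumed consumer-side parsing
    assert config["skip_cli_confirmations"] is True
    assert args.yes_always_commands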
@@ -1091,7 +1146,7 @@ async def run_test_real(
             logger.info(errors[-1])
         errors = "\n".join(errors)
         instructions = errors
-        instructions += prompts.test_failures.format(file_list=file_list)
+        instructions += test_failures.format(file_list=file_list)
 
     if not dry:
         # Clean up build directories after all attempts

benchmark/benchmark_classic.py

Lines changed: 18 additions & 3 deletions
@@ -870,7 +870,6 @@ def run_test_real(
     # Lazy imports: only needed in the actual benchmark execution path
     import git
 
-    import cecli.prompts.utils.system as prompts
     from cecli import models
     from cecli.coders import Coder
     from cecli.io import InputOutput

@@ -961,7 +960,23 @@ def run_test_real(
     if instructions_append.exists():
         instructions += instructions_append.read_text()
 
-    instructions += prompts.instructions_addendum.format(file_list=file_list)
+    instructions_addendum = """
+####
+
+Use the above instructions to modify the supplied files: {file_list}
+Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.
+Only use standard libraries, don't suggest installing any packages.
+"""  # noqa: E501
+
+    test_failures = """
+####
+
+See the testing errors above.
+The tests are correct, don't try and change them.
+Fix the code in {file_list} to resolve the errors.
+"""  # noqa: E501
+
+    instructions += instructions_addendum.format(file_list=file_list)
 
     io = InputOutput(
         pretty=False,

@@ -1108,7 +1123,7 @@ def run_test_real(
         print(errors[-1])
         errors = "\n".join(errors)
         instructions = errors
-        instructions += prompts.test_failures.format(file_list=file_list)
+        instructions += test_failures.format(file_list=file_list)
 
     # Clean up build directories after all attempts
     # Rust target/debug

benchmark/npm-test.sh

Lines changed: 1 addition & 1 deletion
@@ -9,5 +9,5 @@ set -e
 
 
 sed -i 's/\bxtest(/test(/g' *.spec.js
-npm run test
+timeout 5m npm run test
 
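GNU timeout exits with status 124 when the limit expires, so under set -e a hung JavaScript test suite now fails the exercise instead of stalling the whole benchmark. For comparison, the same five-minute bound expressed in Python (the command is illustrative):

    import subprocess

    try:
        # Kill the test run if it exceeds the 5 minute budget
        subprocess.run(["npm", "run", "test"], timeout=300, check=True)
    except subprocess.TimeoutExpired:
        print("test suite exceeded the 5 minute budget")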
benchmark/primary_variations.sh

Lines changed: 201 additions & 0 deletions
@@ -0,0 +1,201 @@
+#!/bin/bash
+# Benchmark runner script for testing multiple OpenRouter models
+# Usage: ./benchmark/primary_variations.sh [OPTIONS]
+
+set -e  # Exit on error
+
+# Default values
+BASE_NAME="primary-variation"
+EDIT_FORMAT="diff"
+MAP_TOKENS="512"
+THREADS="1"
+HASH_RE="^4"
+NUM_TESTS="16"
+EXERCISES_DIR="polyglot-benchmark"
+OUTPUT_DIR="tmp.benchmarks"
+SLEEP_BETWEEN=30  # Seconds to sleep between runs
+
+# List of models to test
+# RERUN
+#   "openrouter/minimax/minimax-m2.1"
+#   "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
+MODELS=(
+    # "openrouter/deepseek/deepseek-v3.2"
+    # "openrouter/moonshotai/kimi-k2.5"
+    # "openrouter/minimax/minimax-m2.1"
+    # "openrouter/minimax/minimax-m2.1"
+    # "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
+    # "openrouter/openai/gpt-oss-120b"
+    "openrouter/openai/gpt-5.2"
+    # "openrouter/google/gemini-3-flash-preview"
+    # "openrouter/google/gemini-3-pro-preview"
+    # "openrouter/anthropic/claude-haiku-4.5"
+    # "openrouter/anthropic/claude-sonnet-4.5"
+)
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --base-name)
+            BASE_NAME="$2"
+            shift 2
+            ;;
+        --edit-format)
+            EDIT_FORMAT="$2"
+            shift 2
+            ;;
+        --map-tokens)
+            MAP_TOKENS="$2"
+            shift 2
+            ;;
+        --threads)
+            THREADS="$2"
+            shift 2
+            ;;
+        --hash-re)
+            HASH_RE="$2"
+            shift 2
+            ;;
+        --num-tests)
+            NUM_TESTS="$2"
+            shift 2
+            ;;
+        --exercises-dir)
+            EXERCISES_DIR="$2"
+            shift 2
+            ;;
+        --output-dir)
+            OUTPUT_DIR="$2"
+            shift 2
+            ;;
+        --sleep)
+            SLEEP_BETWEEN="$2"
+            shift 2
+            ;;
+        --help)
+            echo "Usage: $0 [OPTIONS]"
+            echo ""
+            echo "Options:"
+            echo "  --base-name NAME       Base name for benchmark runs (default: $BASE_NAME)"
+            echo "  --edit-format FORMAT   Edit format to use (default: $EDIT_FORMAT)"
+            echo "  --map-tokens TOKENS    Map tokens (default: $MAP_TOKENS)"
+            echo "  --threads N            Number of threads (default: $THREADS)"
+            echo "  --hash-re REGEX        Hash regex filter (default: $HASH_RE)"
+            echo "  --num-tests N          Number of tests to run (default: $NUM_TESTS)"
+            echo "  --exercises-dir DIR    Exercises directory (default: $EXERCISES_DIR)"
+            echo "  --output-dir DIR       Output directory (default: $OUTPUT_DIR)"
+            echo "  --sleep SECONDS        Sleep between runs in seconds (default: $SLEEP_BETWEEN)"
+            echo "  --help                 Show this help message"
+            echo ""
+            echo "Example:"
+            echo "  $0 --threads 2 --num-tests 5"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Function to run a single benchmark
+run_benchmark() {
+    local model="$1"
+    local run_name="$2"
+
+    echo "========================================================================"
+    echo "Starting benchmark: $run_name"
+    echo "Model: $model"
+    echo "Time: $(date)"
+    echo "========================================================================"
+
+    # Create the benchmark command
+    ./benchmark/benchmark.py "$run_name" \
+        --model "$model" \
+        --edit-format "$EDIT_FORMAT" \
+        --map-tokens "$MAP_TOKENS" \
+        --threads "$THREADS" \
+        --hash-re "$HASH_RE" \
+        --num-tests "$NUM_TESTS" \
+        --exercises-dir "$EXERCISES_DIR"
+
+    echo "Benchmark completed: $run_name"
+    echo "Results directory: $OUTPUT_DIR/$(ls -t $OUTPUT_DIR | grep "$run_name" | head -1)"
+    echo ""
+}
+
+# Function to generate statistics for all completed runs
+generate_stats() {
+    echo "========================================================================"
+    echo "Generating statistics for all completed runs"
+    echo "========================================================================"
+
+    for dir in "$OUTPUT_DIR"/*; do
+        if [ -d "$dir" ] && [ -f "$dir/.cecli.results.json" ]; then
+            echo "Processing: $(basename "$dir")"
+            ./benchmark/benchmark.py --stats "$dir" || true
+            echo ""
+        fi
+    done
+}
+
+# Main execution
+main() {
+    echo "========================================================================"
+    echo "OpenRouter Model Benchmark Runner"
+    echo "========================================================================"
+    echo "Configuration:"
+    echo "  Base name:      $BASE_NAME"
+    echo "  Edit format:    $EDIT_FORMAT"
+    echo "  Map tokens:     $MAP_TOKENS"
+    echo "  Threads:        $THREADS"
+    echo "  Hash regex:     $HASH_RE"
+    echo "  Num tests:      $NUM_TESTS"
+    echo "  Exercises dir:  $EXERCISES_DIR"
+    echo "  Output dir:     $OUTPUT_DIR"
+    echo "  Sleep between:  ${SLEEP_BETWEEN}s"
+    echo "  Models to test: ${#MODELS[@]}"
+    echo ""
+
+    # Create output directory if it doesn't exist
+    mkdir -p "$OUTPUT_DIR"
+
+    # Run benchmarks for each model
+    for model in "${MODELS[@]}"; do
+        # Create a run name by replacing slashes with hyphens
+        local model_slug=$(echo "$model" | sed 's|/|-|g')
+        local run_name="${BASE_NAME}-${model_slug}"
+
+        run_benchmark "$model" "$run_name"
+
+        # Sleep between runs to avoid rate limiting
+        if [ "$SLEEP_BETWEEN" -gt 0 ]; then
+            echo "Sleeping for ${SLEEP_BETWEEN} seconds before next run..."
+            sleep "$SLEEP_BETWEEN"
+            echo ""
+        fi
+    done
+
+    # Generate statistics
+    generate_stats
+
+    echo "========================================================================"
+    echo "All benchmarks completed!"
+    echo "========================================================================"
+    echo ""
+    echo "Summary of results directories:"
+    ls -la "$OUTPUT_DIR" | grep "$BASE_NAME"
+    echo ""
+    echo "To view statistics for a specific run:"
+    echo "  ./benchmark/benchmark.py --stats $OUTPUT_DIR/<run-directory>"
+    echo ""
+    echo "To compare all results:"
+    echo "  for dir in $OUTPUT_DIR/*$BASE_NAME*; do"
+    echo "    echo \"=== \$(basename \$dir) ===\""
+    echo "    ./benchmark/benchmark.py --stats \"\$dir\" 2>/dev/null | grep -E '(pass_rate|total_cost|completed_tests)' || true"
+    echo "  done"
+}
+
+# Run main function
+main
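After generate_stats runs, each results directory should hold the results.yaml written by the new --stats mode. A sketch of comparing runs from Python; the pass_rate and total_cost keys are inferred from the grep hint above, not from a documented schema:

    from pathlib import Path

    import yaml

    for path in sorted(Path("tmp.benchmarks").glob("*primary-variation*/results.yaml")):
        stats = yaml.safe_load(path.read_text())
        # Key names assumed from the grep filter in the script
        print(path.parent.name, stats.get("pass_rate"), stats.get("total_cost"))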

cecli/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from packaging import version
 
-__version__ = "0.96.9.dev"
+__version__ = "0.96.10.dev"
 safe_version = __version__
 
 try:
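packaging normalizes a bare .dev suffix to .dev0 under PEP 440, so the bumped string parses cleanly; presumably the try/except below this hunk guards that parse when computing safe_version (an assumption, since its body is outside the diff):

    from packaging import version

    v = version.parse("0.96.10.dev")
    print(v)                # 0.96.10.dev0
    print(v.is_devrelease)  # True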

cecli/coders/agent_coder.py

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ def get_local_tool_schemas(self):
 
     async def initialize_mcp_tools(self):
         if not self.mcp_manager:
-            self.mcp_manager = McpServerManager()
+            self.mcp_manager = McpServerManager([], self.io, self.args.verbose)
 
         server_name = "Local"
         server = self.mcp_manager.get_server(server_name)
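The fix passes an explicit server list, IO handle, and verbosity rather than relying on a zero-argument constructor. A minimal sketch of the lazy-initialization pattern with stand-in classes — the parameter names are guesses; only the positional call mirrors the diff:

    class StubMcpServerManager:
        # Hypothetical stand-in for cecli's McpServerManager
        def __init__(self, servers, io, verbose):
            self.servers = list(servers)  # start with only explicitly passed servers
            self.io = io
            self.verbose = verbose

    class StubCoder:
        def __init__(self, io, args):
            self.io = io
            self.args = args
            self.mcp_manager = None  # the benchmark now passes mcp_manager=None explicitly

        def initialize_mcp_tools(self):
            if not self.mcp_manager:
                # Built on first use, mirroring the corrected call
                self.mcp_manager = StubMcpServerManager([], self.io, self.args.verbose)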
