#!/bin/bash
# Benchmark runner script for testing multiple OpenRouter models
# Usage: ./run_benchmark_variations.sh [OPTIONS]

# Exit on error (-e), on use of unset variables (-u), and fail a pipeline
# when any stage fails (-o pipefail) instead of only the last one.
set -euo pipefail

# Default values — each can be overridden by the command-line options below.
BASE_NAME="primary-variation"
EDIT_FORMAT="diff"
MAP_TOKENS="512"
THREADS="1"
HASH_RE="^4"
NUM_TESTS="16"
EXERCISES_DIR="polyglot-benchmark"
OUTPUT_DIR="tmp.benchmarks"
SLEEP_BETWEEN=30 # Seconds to sleep between runs

# List of models to test
# RERUN
# "openrouter/minimax/minimax-m2.1"
# "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
MODELS=(
# "openrouter/deepseek/deepseek-v3.2"
# "openrouter/moonshotai/kimi-k2.5"
# "openrouter/minimax/minimax-m2.1"
# "openrouter/minimax/minimax-m2.1"
# "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
# "openrouter/openai/gpt-oss-120b"
    "openrouter/openai/gpt-5.2"
# "openrouter/google/gemini-3-flash-preview"
# "openrouter/google/gemini-3-pro-preview"
# "openrouter/anthropic/claude-haiku-4.5"
# "openrouter/anthropic/claude-sonnet-4.5"
)
# Abort with a clear message when a value-taking option is missing its value.
# Arguments: "$@" — the remaining positional parameters ($1 is the option name).
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: option $1 requires a value" >&2
    exit 1
  fi
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    --base-name)
      require_value "$@"
      BASE_NAME="$2"
      shift 2
      ;;
    --edit-format)
      require_value "$@"
      EDIT_FORMAT="$2"
      shift 2
      ;;
    --map-tokens)
      require_value "$@"
      MAP_TOKENS="$2"
      shift 2
      ;;
    --threads)
      require_value "$@"
      THREADS="$2"
      shift 2
      ;;
    --hash-re)
      require_value "$@"
      HASH_RE="$2"
      shift 2
      ;;
    --num-tests)
      require_value "$@"
      NUM_TESTS="$2"
      shift 2
      ;;
    --exercises-dir)
      require_value "$@"
      EXERCISES_DIR="$2"
      shift 2
      ;;
    --output-dir)
      require_value "$@"
      OUTPUT_DIR="$2"
      shift 2
      ;;
    --sleep)
      require_value "$@"
      SLEEP_BETWEEN="$2"
      shift 2
      ;;
    --help)
      echo "Usage: $0 [OPTIONS]"
      echo ""
      echo "Options:"
      echo "  --base-name NAME      Base name for benchmark runs (default: $BASE_NAME)"
      echo "  --edit-format FORMAT  Edit format to use (default: $EDIT_FORMAT)"
      echo "  --map-tokens TOKENS   Map tokens (default: $MAP_TOKENS)"
      echo "  --threads N           Number of threads (default: $THREADS)"
      echo "  --hash-re REGEX       Hash regex filter (default: $HASH_RE)"
      echo "  --num-tests N         Number of tests to run (default: $NUM_TESTS)"
      echo "  --exercises-dir DIR   Exercises directory (default: $EXERCISES_DIR)"
      echo "  --output-dir DIR      Output directory (default: $OUTPUT_DIR)"
      echo "  --sleep SECONDS       Sleep between runs in seconds (default: $SLEEP_BETWEEN)"
      echo "  --help                Show this help message"
      echo ""
      echo "Example:"
      echo "  $0 --threads 2 --num-tests 5"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      echo "Use --help for usage information"
      exit 1
      ;;
  esac
done
| 101 | + |
# Run a single benchmark for one model and report where its results landed.
# Globals:   EDIT_FORMAT, MAP_TOKENS, THREADS, HASH_RE, NUM_TESTS,
#            EXERCISES_DIR, OUTPUT_DIR (all read)
# Arguments: $1 - model identifier, $2 - name for this benchmark run
# Outputs:   progress banners and the newest matching results directory
run_benchmark() {
  local model="$1"
  local run_name="$2"
  local latest

  echo "========================================================================"
  echo "Starting benchmark: $run_name"
  echo "Model: $model"
  echo "Time: $(date)"
  echo "========================================================================"

  # Create the benchmark command
  ./benchmark/benchmark.py "$run_name" \
    --model "$model" \
    --edit-format "$EDIT_FORMAT" \
    --map-tokens "$MAP_TOKENS" \
    --threads "$THREADS" \
    --hash-re "$HASH_RE" \
    --num-tests "$NUM_TESTS" \
    --exercises-dir "$EXERCISES_DIR"

  echo "Benchmark completed: $run_name"
  # Quote $OUTPUT_DIR (SC2086), protect the grep pattern with --, and guard
  # the pipeline so "no match yet" cannot abort the script under set -e /
  # pipefail — this line only prints a convenience path.
  latest=$(ls -t -- "$OUTPUT_DIR" | grep -- "$run_name" | head -n 1 || true)
  echo "Results directory: $OUTPUT_DIR/$latest"
  echo ""
}
| 127 | + |
# Print statistics for every benchmark run directory that produced results.
# Globals:   OUTPUT_DIR (read)
# Outputs:   a "Processing:" line plus benchmark.py --stats output per run
generate_stats() {
  local run_dir

  echo "========================================================================"
  echo "Generating statistics for all completed runs"
  echo "========================================================================"

  for run_dir in "$OUTPUT_DIR"/*; do
    # Only directories containing a results file count as completed runs.
    [ -d "$run_dir" ] || continue
    [ -f "$run_dir/.cecli.results.json" ] || continue
    echo "Processing: $(basename "$run_dir")"
    ./benchmark/benchmark.py --stats "$run_dir" || true
    echo ""
  done
}
| 142 | + |
# Main execution: print configuration, run every configured model through
# run_benchmark, then summarize with generate_stats.
# Globals: all configuration variables and MODELS (read)
main() {
  local model model_slug run_name

  echo "========================================================================"
  echo "OpenRouter Model Benchmark Runner"
  echo "========================================================================"
  echo "Configuration:"
  echo "  Base name:      $BASE_NAME"
  echo "  Edit format:    $EDIT_FORMAT"
  echo "  Map tokens:     $MAP_TOKENS"
  echo "  Threads:        $THREADS"
  echo "  Hash regex:     $HASH_RE"
  echo "  Num tests:      $NUM_TESTS"
  echo "  Exercises dir:  $EXERCISES_DIR"
  echo "  Output dir:     $OUTPUT_DIR"
  echo "  Sleep between:  ${SLEEP_BETWEEN}s"
  echo "  Models to test: ${#MODELS[@]}"
  echo ""

  # Create output directory if it doesn't exist
  mkdir -p "$OUTPUT_DIR"

  # Run benchmarks for each model
  for model in "${MODELS[@]}"; do
    # Derive a filesystem-safe run name: replace slashes with hyphens.
    # Parameter expansion avoids the echo|sed subshell, and assigning
    # separately from 'local' avoids masking a substitution failure.
    model_slug=${model//\//-}
    run_name="${BASE_NAME}-${model_slug}"

    run_benchmark "$model" "$run_name"

    # Sleep between runs to avoid rate limiting
    if [ "$SLEEP_BETWEEN" -gt 0 ]; then
      echo "Sleeping for ${SLEEP_BETWEEN} seconds before next run..."
      sleep "$SLEEP_BETWEEN"
      echo ""
    fi
  done

  # Generate statistics
  generate_stats

  echo "========================================================================"
  echo "All benchmarks completed!"
  echo "========================================================================"
  echo ""
  echo "Summary of results directories:"
  # Guard the grep: zero matches must not abort the script under set -e.
  ls -la "$OUTPUT_DIR" | grep "$BASE_NAME" || true
  echo ""
  echo "To view statistics for a specific run:"
  echo "  ./benchmark/benchmark.py --stats $OUTPUT_DIR/<run-directory>"
  echo ""
  echo "To compare all results:"
  echo "  for dir in $OUTPUT_DIR/*$BASE_NAME*; do"
  echo "    echo \"=== \$(basename \$dir) ===\""
  echo "    ./benchmark/benchmark.py --stats \"\$dir\" 2>/dev/null | grep -E '(pass_rate|total_cost|completed_tests)' || true"
  echo "  done"
}

# Run main function, forwarding any arguments left after option parsing.
# (The trailing "0 commit comments" scrape artifact was removed — as a shell
# statement it would have run the nonexistent command `0` and failed.)
main "$@"