From f55e7250805667f2f770c8ceee5686ed046bd4f4 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Sat, 25 Apr 2026 01:14:31 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=96=20bench:=20use=20GPT-5.5=20for=20t?= =?UTF-8?q?bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch nightly Terminal-Bench defaults to GPT-5.5 and Opus 4.7 with xhigh thinking. Add leaderboard metadata for both models and update tbench examples. --- _Generated with `mux` • Model: `openai:gpt-5.5` • Thinking: `xhigh` • Cost: `$9.21`_ --- .github/workflows/nightly-terminal-bench.yml | 8 ++++---- .github/workflows/terminal-bench.yml | 4 ++-- .mux/skills/tbench/SKILL.md | 18 +++++++++--------- .../prepare_leaderboard_submission.py | 18 ++++++++++++++++-- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml index 2cc03fcac8..f442719c4c 100644 --- a/.github/workflows/nightly-terminal-bench.yml +++ b/.github/workflows/nightly-terminal-bench.yml @@ -10,7 +10,7 @@ on: workflow_dispatch: inputs: models: - description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.3-codex + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)' + description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)' required: false default: "all" type: string @@ -120,7 +120,7 @@ jobs: INPUT_MODELS: ${{ inputs.models }} run: | if [ "$INPUT_MODELS" = "all" ] || [ -z "$INPUT_MODELS" ]; then - echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.3-codex","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT" + echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT" else # Convert comma-separated to JSON array models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))') @@ -138,8 +138,8 @@ jobs: uses: ./.github/workflows/terminal-bench.yml with: model_name: ${{ matrix.model }} - # gpt-5 class and any Claude Opus (4.6/4.7/future) use xhigh thinking; others use high - mux_run_args: "--thinking ${{ (contains(matrix.model, 'gpt-5') || contains(matrix.model, 'claude-opus')) && 'xhigh' || 'high' }}" + # GPT-5.5 and Claude Opus use xhigh thinking; others use high. + mux_run_args: "--thinking ${{ (contains(matrix.model, 'gpt-5.5') || contains(matrix.model, 'claude-opus')) && 'xhigh' || 'high' }}" dataset: "terminal-bench@2.0" concurrency: "48" env: "daytona" diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 8aff72f6c1..a84fead0af 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -4,7 +4,7 @@ on: workflow_call: inputs: model_name: - description: "Model to use (e.g., anthropic/claude-opus-4-5)" + description: "Model to use (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)" required: false type: string dataset: @@ -88,7 +88,7 @@ on: required: false type: string model_name: - description: "Model to use (e.g., anthropic/claude-opus-4-5, openai/gpt-5.5)" + description: "Model to use (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)" required: false type: string mux_run_args: diff --git a/.mux/skills/tbench/SKILL.md b/.mux/skills/tbench/SKILL.md index 0edc915517..248f03d41e 100644 --- a/.mux/skills/tbench/SKILL.md +++ b/.mux/skills/tbench/SKILL.md @@ -18,8 +18,8 @@ make benchmark-terminal # Run specific tasks make benchmark-terminal TB_TASK_NAMES="hello-world chess-best-move" -# Run with specific model -make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-5" +# Run with specific model and xhigh thinking +MUX_RUN_ARGS="--thinking xhigh" make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-7" # Run on Daytona cloud (high parallelism) TB_ENV=daytona TB_CONCURRENCY=48 make benchmark-terminal @@ -90,7 +90,7 @@ TB_TIMEOUT=600 make benchmark-terminal TB_SAMPLE_SIZE=5 The agent adapter accepts a few Harbor kwargs (passed via `--agent-kwarg`): -- `model_name`: Model to use (e.g., `anthropic/claude-sonnet-4-5`, `openai/gpt-5-codex`) +- `model_name`: Model to use (e.g., `anthropic/claude-opus-4-7`, `openai/gpt-5.5`) - `experiments`: Experiments to enable, comma-separated (e.g., `programmatic-tool-calling`) All other `mux run` CLI flags (thinking level, mode, runtime, budget, etc.) are passed via `MUX_RUN_ARGS` — no per-flag plumbing needed. @@ -100,23 +100,23 @@ All other `mux run` CLI flags (thinking level, mode, runtime, budget, etc.) are ```bash # Run with model, thinking, and 1M context gh workflow run terminal-bench.yml \ - -f model_name=anthropic/claude-opus-4-6 \ + -f model_name=anthropic/claude-opus-4-7 \ -f mux_run_args="--thinking xhigh --use-1m" # Run with budget cap gh workflow run terminal-bench.yml \ - -f model_name=anthropic/claude-opus-4-6 \ - -f mux_run_args="--thinking high --budget 5.00" + -f model_name=openai/gpt-5.5 \ + -f mux_run_args="--thinking xhigh --budget 5.00" ``` **Local runs:** ```bash # Pass flags via MUX_RUN_ARGS env var -MUX_RUN_ARGS="--thinking high --use-1m" make benchmark-terminal +MUX_RUN_ARGS="--thinking xhigh --use-1m" make benchmark-terminal # Model and experiments via TB_ARGS -make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5-codex --agent-kwarg experiments=programmatic-tool-calling" +make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5.5 --agent-kwarg experiments=programmatic-tool-calling" ``` ## Results @@ -168,7 +168,7 @@ python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --artifacts- python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py # Only prepare specific models -python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-5 +python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-7 ``` This creates a properly structured submission folder at `leaderboard_submission/` containing: diff --git a/benchmarks/terminal_bench/prepare_leaderboard_submission.py b/benchmarks/terminal_bench/prepare_leaderboard_submission.py index df9b1ac262..89a099e1b5 100755 --- a/benchmarks/terminal_bench/prepare_leaderboard_submission.py +++ b/benchmarks/terminal_bench/prepare_leaderboard_submission.py @@ -102,7 +102,14 @@ "model_org_display_name": "Anthropic", "folder_name": "Claude-Opus-4.6", }, - # Keep historical GPT-5.2 metadata alongside the new GPT-5.4 bench target + "anthropic/claude-opus-4-7": { + "model_name": "claude-opus-4-7", + "model_provider": "anthropic", + "model_display_name": "Claude Opus 4.7", + "model_org_display_name": "Anthropic", + "folder_name": "Claude-Opus-4.7", + }, + # Keep historical GPT metadata alongside the current GPT-5.5 bench target # so mixed or older artifact sets still map to the canonical leaderboard names. "openai/gpt-5.2": { "model_name": "gpt-5.2", @@ -118,6 +125,13 @@ "model_org_display_name": "OpenAI", "folder_name": "GPT-5.4", }, + "openai/gpt-5.5": { + "model_name": "gpt-5.5", + "model_provider": "openai", + "model_display_name": "GPT-5.5", + "model_org_display_name": "OpenAI", + "folder_name": "GPT-5.5", + }, "openai/gpt-5-codex": { "model_name": "gpt-5-codex", "model_provider": "openai", @@ -428,7 +442,7 @@ def main(): parser.add_argument( "--models", nargs="+", - help="Only process specific models (e.g., anthropic/claude-opus-4-5)", + help="Only process specific models (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)", ) args = parser.parse_args()