From f55e7250805667f2f770c8ceee5686ed046bd4f4 Mon Sep 17 00:00:00 2001
From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com>
Date: Sat, 25 Apr 2026 01:14:31 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=96=20bench:=20use=20GPT-5.5=20for=20t?=
 =?UTF-8?q?bench?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

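Switch nightly Terminal-Bench defaults to GPT-5.5 and Opus 4.7 with xhigh thinking: drop `openai/gpt-5.3-codex` from the default "all" model set (leaving Opus 4.7, GPT-5.5, and the two Gemini 3 previews), and narrow the xhigh-thinking rule from any `gpt-5` model to `gpt-5.5` and `claude-opus`; every other model runs with high thinking. Add leaderboard metadata for both models (Claude Opus 4.7 and GPT-5.5) and update tbench examples and help text to the new model names.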

---

_Generated with `mux` • Model: `openai:gpt-5.5` • Thinking: `xhigh` • Cost: `$9.21`_

<!-- mux-attribution: model=openai:gpt-5.5 thinking=xhigh costs=9.21 -->
---
 .github/workflows/nightly-terminal-bench.yml   |  8 ++++----
 .github/workflows/terminal-bench.yml           |  4 ++--
 .mux/skills/tbench/SKILL.md                    | 18 +++++++++---------
 .../prepare_leaderboard_submission.py          | 18 ++++++++++++++++--
 4 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
index 2cc03fcac8..f442719c4c 100644
--- a/.github/workflows/nightly-terminal-bench.yml
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -10,7 +10,7 @@ on:
   workflow_dispatch:
     inputs:
       models:
-        description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.3-codex + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)'
+        description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)'
         required: false
         default: "all"
         type: string
@@ -120,7 +120,7 @@ jobs:
           INPUT_MODELS: ${{ inputs.models }}
         run: |
           if [ "$INPUT_MODELS" = "all" ] || [ -z "$INPUT_MODELS" ]; then
-            echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.3-codex","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT"
+            echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT"
           else
             # Convert comma-separated to JSON array
             models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))')
@@ -138,8 +138,8 @@ jobs:
     uses: ./.github/workflows/terminal-bench.yml
     with:
       model_name: ${{ matrix.model }}
-      # gpt-5 class and any Claude Opus (4.6/4.7/future) use xhigh thinking; others use high
-      mux_run_args: "--thinking ${{ (contains(matrix.model, 'gpt-5') || contains(matrix.model, 'claude-opus')) && 'xhigh' || 'high' }}"
+      # GPT-5.5 and Claude Opus use xhigh thinking; others use high.
+      mux_run_args: "--thinking ${{ (contains(matrix.model, 'gpt-5.5') || contains(matrix.model, 'claude-opus')) && 'xhigh' || 'high' }}"
       dataset: "terminal-bench@2.0"
       concurrency: "48"
       env: "daytona"
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 8aff72f6c1..a84fead0af 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -4,7 +4,7 @@ on:
   workflow_call:
     inputs:
       model_name:
-        description: "Model to use (e.g., anthropic/claude-opus-4-5)"
+        description: "Model to use (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)"
         required: false
         type: string
       dataset:
@@ -88,7 +88,7 @@ on:
         required: false
         type: string
       model_name:
-        description: "Model to use (e.g., anthropic/claude-opus-4-5, openai/gpt-5.5)"
+        description: "Model to use (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)"
         required: false
         type: string
       mux_run_args:
diff --git a/.mux/skills/tbench/SKILL.md b/.mux/skills/tbench/SKILL.md
index 0edc915517..248f03d41e 100644
--- a/.mux/skills/tbench/SKILL.md
+++ b/.mux/skills/tbench/SKILL.md
@@ -18,8 +18,8 @@ make benchmark-terminal
 # Run specific tasks
 make benchmark-terminal TB_TASK_NAMES="hello-world chess-best-move"
 
-# Run with specific model
-make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-5"
+# Run with specific model and xhigh thinking
+MUX_RUN_ARGS="--thinking xhigh" make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-7"
 
 # Run on Daytona cloud (high parallelism)
 TB_ENV=daytona TB_CONCURRENCY=48 make benchmark-terminal
@@ -90,7 +90,7 @@ TB_TIMEOUT=600 make benchmark-terminal TB_SAMPLE_SIZE=5
 
 The agent adapter accepts a few Harbor kwargs (passed via `--agent-kwarg`):
 
-- `model_name`: Model to use (e.g., `anthropic/claude-sonnet-4-5`, `openai/gpt-5-codex`)
+- `model_name`: Model to use (e.g., `anthropic/claude-opus-4-7`, `openai/gpt-5.5`)
 - `experiments`: Experiments to enable, comma-separated (e.g., `programmatic-tool-calling`)
 
 All other `mux run` CLI flags (thinking level, mode, runtime, budget, etc.) are passed via `MUX_RUN_ARGS` — no per-flag plumbing needed.
@@ -100,23 +100,23 @@ All other `mux run` CLI flags (thinking level, mode, runtime, budget, etc.) are
 ```bash
 # Run with model, thinking, and 1M context
 gh workflow run terminal-bench.yml \
-  -f model_name=anthropic/claude-opus-4-6 \
+  -f model_name=anthropic/claude-opus-4-7 \
   -f mux_run_args="--thinking xhigh --use-1m"
 
 # Run with budget cap
 gh workflow run terminal-bench.yml \
-  -f model_name=anthropic/claude-opus-4-6 \
-  -f mux_run_args="--thinking high --budget 5.00"
+  -f model_name=openai/gpt-5.5 \
+  -f mux_run_args="--thinking xhigh --budget 5.00"
 ```
 
 **Local runs:**
 
 ```bash
 # Pass flags via MUX_RUN_ARGS env var
-MUX_RUN_ARGS="--thinking high --use-1m" make benchmark-terminal
+MUX_RUN_ARGS="--thinking xhigh --use-1m" make benchmark-terminal
 
 # Model and experiments via TB_ARGS
-make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5-codex --agent-kwarg experiments=programmatic-tool-calling"
+make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5.5 --agent-kwarg experiments=programmatic-tool-calling"
 ```
 
 ## Results
@@ -168,7 +168,7 @@ python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --artifacts-
 python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py
 
 # Only prepare specific models
-python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-5
+python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-7
 ```
 
 This creates a properly structured submission folder at `leaderboard_submission/` containing:
diff --git a/benchmarks/terminal_bench/prepare_leaderboard_submission.py b/benchmarks/terminal_bench/prepare_leaderboard_submission.py
index df9b1ac262..89a099e1b5 100755
--- a/benchmarks/terminal_bench/prepare_leaderboard_submission.py
+++ b/benchmarks/terminal_bench/prepare_leaderboard_submission.py
@@ -102,7 +102,14 @@
         "model_org_display_name": "Anthropic",
         "folder_name": "Claude-Opus-4.6",
     },
-    # Keep historical GPT-5.2 metadata alongside the new GPT-5.4 bench target
+    "anthropic/claude-opus-4-7": {
+        "model_name": "claude-opus-4-7",
+        "model_provider": "anthropic",
+        "model_display_name": "Claude Opus 4.7",
+        "model_org_display_name": "Anthropic",
+        "folder_name": "Claude-Opus-4.7",
+    },
+    # Keep historical GPT metadata alongside the current GPT-5.5 bench target
     # so mixed or older artifact sets still map to the canonical leaderboard names.
     "openai/gpt-5.2": {
         "model_name": "gpt-5.2",
@@ -118,6 +125,13 @@
         "model_org_display_name": "OpenAI",
         "folder_name": "GPT-5.4",
     },
+    "openai/gpt-5.5": {
+        "model_name": "gpt-5.5",
+        "model_provider": "openai",
+        "model_display_name": "GPT-5.5",
+        "model_org_display_name": "OpenAI",
+        "folder_name": "GPT-5.5",
+    },
     "openai/gpt-5-codex": {
         "model_name": "gpt-5-codex",
         "model_provider": "openai",
@@ -428,7 +442,7 @@ def main():
     parser.add_argument(
         "--models",
         nargs="+",
-        help="Only process specific models (e.g., anthropic/claude-opus-4-5)",
+        help="Only process specific models (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)",
     )
     args = parser.parse_args()