8 changes: 4 additions & 4 deletions .github/workflows/nightly-terminal-bench.yml
@@ -10,7 +10,7 @@ on:
   workflow_dispatch:
     inputs:
       models:
-        description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.3-codex + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)'
+        description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)'
         required: false
         default: "all"
         type: string
@@ -120,7 +120,7 @@ jobs:
           INPUT_MODELS: ${{ inputs.models }}
         run: |
           if [ "$INPUT_MODELS" = "all" ] || [ -z "$INPUT_MODELS" ]; then
-            echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.3-codex","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT"
+            echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT"
           else
             # Convert comma-separated to JSON array
             models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))')
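The jq filter in that branch can be exercised on its own. A minimal sketch (assuming `jq` is installed; the model list here is a hypothetical input):

```shell
# Convert a comma-separated model list (with stray spaces) into a compact
# JSON array, trimming whitespace around each entry. Same filter as above:
# -R reads raw text, -s slurps it into one string, -c emits compact output.
INPUT_MODELS="anthropic/claude-opus-4-7, openai/gpt-5.5"
models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))')
echo "$models_json"   # ["anthropic/claude-opus-4-7","openai/gpt-5.5"]
```

The `gsub` trim also strips the trailing newline that `echo` appends, since `\s` matches newlines in jq's regex engine.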
@@ -138,8 +138,8 @@ jobs:
     uses: ./.github/workflows/terminal-bench.yml
     with:
       model_name: ${{ matrix.model }}
-      # gpt-5 class and any Claude Opus (4.6/4.7/future) use xhigh thinking; others use high
-      mux_run_args: "--thinking ${{ (contains(matrix.model, 'gpt-5') || contains(matrix.model, 'claude-opus')) && 'xhigh' || 'high' }}"
+      # GPT-5.5 and Claude Opus use xhigh thinking; others use high.
+      mux_run_args: "--thinking ${{ (contains(matrix.model, 'gpt-5.5') || contains(matrix.model, 'claude-opus')) && 'xhigh' || 'high' }}"
       dataset: "terminal-bench@2.0"
       concurrency: "48"
       env: "daytona"
4 changes: 2 additions & 2 deletions .github/workflows/terminal-bench.yml
@@ -4,7 +4,7 @@ on:
   workflow_call:
     inputs:
       model_name:
-        description: "Model to use (e.g., anthropic/claude-opus-4-5)"
+        description: "Model to use (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)"
         required: false
         type: string
       dataset:
@@ -88,7 +88,7 @@ on:
         required: false
         type: string
       model_name:
-        description: "Model to use (e.g., anthropic/claude-opus-4-5, openai/gpt-5.5)"
+        description: "Model to use (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)"
         required: false
         type: string
       mux_run_args:
18 changes: 9 additions & 9 deletions .mux/skills/tbench/SKILL.md
@@ -18,8 +18,8 @@ make benchmark-terminal
 # Run specific tasks
 make benchmark-terminal TB_TASK_NAMES="hello-world chess-best-move"
 
-# Run with specific model
-make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-5"
+# Run with specific model and xhigh thinking
+MUX_RUN_ARGS="--thinking xhigh" make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic/claude-opus-4-7"
 
 # Run on Daytona cloud (high parallelism)
 TB_ENV=daytona TB_CONCURRENCY=48 make benchmark-terminal
@@ -90,7 +90,7 @@ TB_TIMEOUT=600 make benchmark-terminal TB_SAMPLE_SIZE=5
 
 The agent adapter accepts a few Harbor kwargs (passed via `--agent-kwarg`):
 
-- `model_name`: Model to use (e.g., `anthropic/claude-sonnet-4-5`, `openai/gpt-5-codex`)
+- `model_name`: Model to use (e.g., `anthropic/claude-opus-4-7`, `openai/gpt-5.5`)
 - `experiments`: Experiments to enable, comma-separated (e.g., `programmatic-tool-calling`)
 
 All other `mux run` CLI flags (thinking level, mode, runtime, budget, etc.) are passed via `MUX_RUN_ARGS` — no per-flag plumbing needed.
@@ -100,23 +100,23 @@ All other `mux run` CLI flags (thinking level, mode, runtime, budget, etc.) are
 ```bash
 # Run with model, thinking, and 1M context
 gh workflow run terminal-bench.yml \
-  -f model_name=anthropic/claude-opus-4-6 \
+  -f model_name=anthropic/claude-opus-4-7 \
   -f mux_run_args="--thinking xhigh --use-1m"
 
 # Run with budget cap
 gh workflow run terminal-bench.yml \
-  -f model_name=anthropic/claude-opus-4-6 \
-  -f mux_run_args="--thinking high --budget 5.00"
+  -f model_name=openai/gpt-5.5 \
+  -f mux_run_args="--thinking xhigh --budget 5.00"
 ```
 
 **Local runs:**
 
 ```bash
 # Pass flags via MUX_RUN_ARGS env var
-MUX_RUN_ARGS="--thinking high --use-1m" make benchmark-terminal
+MUX_RUN_ARGS="--thinking xhigh --use-1m" make benchmark-terminal
 
 # Model and experiments via TB_ARGS
-make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5-codex --agent-kwarg experiments=programmatic-tool-calling"
+make benchmark-terminal TB_ARGS="--agent-kwarg model_name=openai/gpt-5.5 --agent-kwarg experiments=programmatic-tool-calling"
 ```
 
 ## Results
@@ -168,7 +168,7 @@ python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --artifacts-
 python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py
 
 # Only prepare specific models
-python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-5
+python3 benchmarks/terminal_bench/prepare_leaderboard_submission.py --n-runs 5 --models anthropic/claude-opus-4-7
 ```
 
 This creates a properly structured submission folder at `leaderboard_submission/` containing:
18 changes: 16 additions & 2 deletions benchmarks/terminal_bench/prepare_leaderboard_submission.py
@@ -102,7 +102,14 @@
         "model_org_display_name": "Anthropic",
         "folder_name": "Claude-Opus-4.6",
     },
-    # Keep historical GPT-5.2 metadata alongside the new GPT-5.4 bench target
+    "anthropic/claude-opus-4-7": {
+        "model_name": "claude-opus-4-7",
+        "model_provider": "anthropic",
+        "model_display_name": "Claude Opus 4.7",
+        "model_org_display_name": "Anthropic",
+        "folder_name": "Claude-Opus-4.7",
+    },
+    # Keep historical GPT metadata alongside the current GPT-5.5 bench target
+    # so mixed or older artifact sets still map to the canonical leaderboard names.
     "openai/gpt-5.2": {
         "model_name": "gpt-5.2",
@@ -118,6 +118,13 @@
         "model_org_display_name": "OpenAI",
         "folder_name": "GPT-5.4",
     },
+    "openai/gpt-5.5": {
+        "model_name": "gpt-5.5",
+        "model_provider": "openai",
+        "model_display_name": "GPT-5.5",
+        "model_org_display_name": "OpenAI",
+        "folder_name": "GPT-5.5",
+    },
     "openai/gpt-5-codex": {
         "model_name": "gpt-5-codex",
         "model_provider": "openai",
@@ -428,7 +442,7 @@ def main():
     parser.add_argument(
         "--models",
         nargs="+",
-        help="Only process specific models (e.g., anthropic/claude-opus-4-5)",
+        help="Only process specific models (e.g., anthropic/claude-opus-4-7, openai/gpt-5.5)",
     )
     args = parser.parse_args()
