From ee75bd33634f468d4f69658e9e3d9f99b843d143 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 9 May 2026 11:28:10 -0400
Subject: [PATCH 1/5] Bump AI PR review to gpt-5.5 + add Single-Pass
 Completeness Mandate

Updates the OpenAI Codex model from gpt-5.4 to gpt-5.5 across both the
CI workflow and the local review script (PRICING entries and reasoning-
model detection extended to cover the gpt-5.x family).

Adds a Single-Pass Completeness Mandate to the CI prompt that instructs
the reviewer to enumerate sibling-surface mirrors (BR/DR, schema/render,
default/precomputed, summary/full), run pattern-wide greps, check
reciprocal/symmetry directions, and sweep transitive workflow deps in
the initial pass rather than across multiple /ai-review rounds. Re-review
Scope amended to apply the same audits to newly-added code while
preserving the existing [Newly identified] convention for unchanged code.

Behavior change: _is_reasoning_model() now correctly classifies gpt-5.4
and gpt-5.5 as reasoning models (latent bug fix per OpenAI docs). For
local-script users still passing --model gpt-5.4: max_output_tokens
doubles to 32768, temperature=0 is omitted from the API payload, and
the "consider --timeout 900" advisory newly fires.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/commands/ai-review-local.md | 14 ++++-----
 .claude/scripts/openai_review.py    | 10 ++++---
 .github/codex/prompts/pr_review.md  | 46 +++++++++++++++++++++++++++--
 .github/workflows/ai_pr_review.yml  |  2 +-
 tests/test_openai_review.py         | 32 ++++++++++++++++----
 5 files changed, 85 insertions(+), 19 deletions(-)
diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md
index 1cb2f22b..553097eb 100644
--- a/.claude/commands/ai-review-local.md
+++ b/.claude/commands/ai-review-local.md
@@ -22,14 +22,14 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR
   files (default: 200000). Changed source files are always included regardless of budget.
 - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists
 - `--full-registry`: Include the entire REGISTRY.md instead of selective sections
-- `--model <name>`: Override the OpenAI model (default: `gpt-5.4`)
+- `--model <name>`: Override the OpenAI model (default: `gpt-5.5`)
 - `--timeout <seconds>`: HTTP request timeout (default: 300). Use 900 for reasoning models.
 - `--dry-run`: Print the compiled prompt without calling the API
 
-**Reasoning models** (`gpt-5.4-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15
+**Reasoning models** (`gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15
 minutes. For deep reviews with reasoning models, combine `--token-budget` with `--model`:
 ```
-/ai-review-local --model gpt-5.4-pro --token-budget 500000 --context deep
+/ai-review-local --model gpt-5.5-pro --token-budget 500000 --context deep
 ```
 
 ## Constraints
@@ -47,7 +47,7 @@ before any data is sent externally.
 ### Step 1: Parse Arguments
 
 Parse `$ARGUMENTS` for the optional flags listed above. All flags are optional —
-the default behavior (standard context, selective registry, gpt-5.4, live API call)
+the default behavior (standard context, selective registry, gpt-5.5, live API call)
 requires no arguments.
 
 ### Step 2: Validate Prerequisites
@@ -334,8 +334,8 @@ python3 .claude/scripts/openai_review.py \
 Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are
 generated in Step 4 and is NOT passed to the script.
 
-**Reasoning model handling:** If the model contains `-pro` or starts with `o1`/`o3`/`o4`
-(e.g., `gpt-5.4-pro`, `o3`, `o4-mini`):
+**Reasoning model handling:** If the model contains `-pro`, starts with `o1`/`o3`/`o4`, or starts with `gpt-5.4`/`gpt-5.5`
+(e.g., `gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`):
 - Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`)
 - Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap)
 - After the background command completes, continue to Step 6
@@ -466,7 +466,7 @@ runs `--force-fresh` or when a rebase invalidates the tracked commit.
 /ai-review-local --model gpt-4.1 --full-registry
 
 # Deep review with reasoning model (may take 10-15 minutes)
-/ai-review-local --model gpt-5.4-pro --token-budget 500000 --context deep
+/ai-review-local --model gpt-5.5-pro --token-budget 500000 --context deep
 
 # Limit token budget for faster/cheaper reviews
 /ai-review-local --token-budget 100000
diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py
index ac8487f8..ccba6997 100644
--- a/.claude/scripts/openai_review.py
+++ b/.claude/scripts/openai_review.py
@@ -853,6 +853,8 @@ def apply_token_budget(
 # Source: https://platform.openai.com/docs/pricing
 # MAINTENANCE: Update when OpenAI changes pricing.
 PRICING = {
+    "gpt-5.5": (5.00, 30.00),
+    "gpt-5.5-pro": (30.00, 180.00),
     "gpt-5.4": (2.50, 15.00),
     "gpt-5.4-pro": (30.00, 180.00),
     "gpt-4.1": (2.00, 8.00),
@@ -900,8 +902,8 @@ def estimate_cost(
         "code changes that have not yet been submitted as a pull request.",
     ),
     (
-        "Review ONLY the changes introduced by this PR (diff)",
-        "Review ONLY the changes shown in the diff below",
+        "Review the changes introduced by this PR (diff)",
+        "Review the code changes shown in the diff below",
     ),
     (
         "If the PR changes an estimator",
@@ -1095,7 +1097,7 @@ def compile_prompt(
 # ---------------------------------------------------------------------------
 
 ENDPOINT = "https://api.openai.com/v1/responses"
-DEFAULT_MODEL = "gpt-5.4"
+DEFAULT_MODEL = "gpt-5.5"
 DEFAULT_TIMEOUT = 300  # seconds
 DEFAULT_MAX_TOKENS = 16384
 REASONING_MAX_TOKENS = 32768
@@ -1103,7 +1105,7 @@ def compile_prompt(
 
 def _is_reasoning_model(model: str) -> bool:
     """Return True for models that use internal chain-of-thought reasoning."""
-    return model.startswith(("o1", "o3", "o4")) or "-pro" in model
+    return model.startswith(("o1", "o3", "o4", "gpt-5.4", "gpt-5.5")) or "-pro" in model
 
 
 def estimate_tokens(text: str) -> int:
diff --git a/.github/codex/prompts/pr_review.md b/.github/codex/prompts/pr_review.md
index a64124de..783dd777 100644
--- a/.github/codex/prompts/pr_review.md
+++ b/.github/codex/prompts/pr_review.md
@@ -54,6 +54,42 @@ When reviewing new features or code paths, specifically check:
    - Command to check: `grep -n "pattern" diff_diff/*.py`
    - Flag as P1 if only partial fixes were made
 
+## Single-Pass Completeness Mandate (Initial Review Only)
+
+This is an INITIAL review. Treat this as the only chance to enumerate findings.
+Follow-up rounds are expensive — find ALL P0/P1/P2 issues in this pass.
+
+Before finalizing, confirm you have run each of these audits on the diff:
+
+1. **Sibling-surface mirror audit**: For every fix or change in a method, schema,
+   default-value path, or report block, identify the parallel surface in the same
+   codebase (BR ↔ DR, schema ↔ renderer, default ↔ precomputed, summary ↔ full)
+   and check whether the same change applies there. Flag the unmirrored side as P1.
+
+2. **Pattern-wide grep**: When you flag any anti-pattern or bug class, use `grep`
+   on `diff_diff/**.py` to identify sibling occurrences of the same pattern and
+   enumerate them in the SAME finding. Only LOAD a sibling file's full contents
+   if grep returns a hit and you need surrounding context to verify the issue.
+   Do not defer pattern-class findings to a follow-up round.
+
+3. **Reciprocal/symmetry check**: For dispatch code, validation, or guards in
+   one direction (A-on-B), explicitly enumerate the reciprocal direction (B-on-A)
+   and confirm coverage.
+
+4. **Transitive workflow deps**: For GH Actions workflow `paths:` or pytest
+   selection changes, sweep transitive auto-loaded files (conftest.py,
+   pyproject.toml, ancestor conftests) and confirm they are included.
+
+5. **Scope override (with carve-outs)**: The audits above explicitly authorize
+   loading files outside the diff to verify completeness. This overrides the
+   "minimum surrounding context" default in the Rules section below.
+
+   **DO NOT load these paths** (the workflow's diff-build deliberately excludes
+   them; they are noise or out-of-scope):
+   - `docs/tutorials/*.ipynb` (notebook outputs are large JSON blobs)
+   - `benchmarks/data/real/*.json`
+   - `benchmarks/data/real/*.csv`
+
 ## Deferred Work Acceptance
 
 This project tracks deferred technical debt in `TODO.md` under "Tech Debt from Code Reviews."
@@ -68,7 +104,9 @@ This project tracks deferred technical debt in `TODO.md` under "Tech Debt from C
   and incorrect statistical output are not.
 
 Rules:
-- Review ONLY the changes introduced by this PR (diff) and the minimum surrounding context needed.
+- Review the changes introduced by this PR (diff). The Single-Pass Completeness
+  Mandate above authorizes broader audits (sibling surfaces, pattern-wide greps,
+  reciprocal checks, transitive deps) — do those upfront rather than deferring.
 - Provide a single Markdown report with:
   - Overall assessment (see Assessment Criteria below)
   - Executive summary (3–6 bullets)
@@ -116,7 +154,11 @@ When this is a re-review (the PR has prior AI review comments):
 - New P1+ findings on unchanged code MAY be raised but must be marked "[Newly identified]"
   to distinguish from moving goalposts. Limit these to clear, concrete issues — not
   speculative concerns or stylistic preferences.
-- New code added since the last review IS in scope for new findings.
+- New code added since the last review IS in scope for new findings — apply the
+  Single-Pass Completeness Mandate's audits (sibling surfaces, pattern-wide greps,
+  reciprocal checks) to that new code in this re-review pass. For UNCHANGED code,
+  the existing [Newly identified] convention from the bullet above still applies:
+  new P1+ findings MAY be raised but must be marked "[Newly identified]".
 - If all previous P1+ findings are resolved, the assessment should be ✅ even if new
   P2/P3 items are noticed.
 
diff --git a/.github/workflows/ai_pr_review.yml b/.github/workflows/ai_pr_review.yml
index cb81f564..6307fb3e 100644
--- a/.github/workflows/ai_pr_review.yml
+++ b/.github/workflows/ai_pr_review.yml
@@ -165,7 +165,7 @@ jobs:
           sandbox: read-only
           safety-strategy: drop-sudo
           # Recommended by OpenAI for review quality/consistency:
-          model: gpt-5.4
+          model: gpt-5.5
           effort: xhigh
 
       - name: Post PR comment (new on /ai-review, update on opened)
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py
index 9e8e9d91..0d68d552 100644
--- a/tests/test_openai_review.py
+++ b/tests/test_openai_review.py
@@ -1546,8 +1546,14 @@ def test_pro_is_reasoning(self, review_mod):
     def test_pro_snapshot_is_reasoning(self, review_mod):
         assert review_mod._is_reasoning_model("gpt-5.4-pro-2026-03-05") is True
 
-    def test_gpt54_is_not_reasoning(self, review_mod):
-        assert review_mod._is_reasoning_model("gpt-5.4") is False
+    def test_gpt54_is_reasoning(self, review_mod):
+        assert review_mod._is_reasoning_model("gpt-5.4") is True
+
+    def test_gpt55_is_reasoning(self, review_mod):
+        assert review_mod._is_reasoning_model("gpt-5.5") is True
+
+    def test_gpt55_pro_is_reasoning(self, review_mod):
+        assert review_mod._is_reasoning_model("gpt-5.5-pro") is True
 
     def test_gpt41_is_not_reasoning(self, review_mod):
         assert review_mod._is_reasoning_model("gpt-4.1") is False
@@ -1571,6 +1577,22 @@ def test_pro_snapshot_matches_pro(self, review_mod):
         base = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4-pro")
         assert snapshot == base
 
+    def test_gpt55_has_own_pricing(self, review_mod):
+        """gpt-5.5 should not fall back to gpt-5.4 pricing via prefix."""
+        gpt55_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5")
+        gpt54_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4")
+        assert gpt55_cost is not None
+        assert gpt54_cost is not None
+        assert gpt55_cost != gpt54_cost
+
+    def test_gpt55_pro_has_own_pricing(self, review_mod):
+        """gpt-5.5-pro should not fall back to gpt-5.5 pricing via prefix."""
+        pro_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5-pro")
+        base_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5")
+        assert pro_cost is not None
+        assert base_cost is not None
+        assert pro_cost != base_cost
+
 
 class TestExtractResponseText:
     def test_prefers_output_text_field(self, review_mod):
@@ -1656,8 +1678,8 @@ def fake_urlopen(req, timeout=None):
         return captured
 
     def test_standard_model_payload(self, review_mod, mock_urlopen):
-        """Standard model sends input, max_output_tokens, and temperature=0."""
-        content, usage = review_mod.call_openai("test prompt", "gpt-5.4", "fake-key")
+        """Standard (non-reasoning) model sends input, max_output_tokens, and temperature=0."""
+        content, usage = review_mod.call_openai("test prompt", "gpt-4.1", "fake-key")
         payload = mock_urlopen["payload"]
         assert payload["input"] == "test prompt"
         assert payload["max_output_tokens"] == review_mod.DEFAULT_MAX_TOKENS
@@ -1669,7 +1691,7 @@ def test_standard_model_payload(self, review_mod, mock_urlopen):
 
     def test_reasoning_model_payload(self, review_mod, mock_urlopen):
         """Reasoning model omits temperature and uses REASONING_MAX_TOKENS."""
-        content, _ = review_mod.call_openai("test prompt", "gpt-5.4-pro", "fake-key")
+        content, _ = review_mod.call_openai("test prompt", "gpt-5.5", "fake-key")
         payload = mock_urlopen["payload"]
         assert payload["max_output_tokens"] == review_mod.REASONING_MAX_TOKENS
         assert "temperature" not in payload

From 9b76cd49015c2dc9d83879538acbbb7d1c2e29ad Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 9 May 2026 11:44:16 -0400
Subject: [PATCH 2/5] Address local-script execution-context gaps from CI
 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related fixes recognizing that .claude/scripts/openai_review.py runs
as a static-prompt API call without shell or file-loading access — unlike
CI Codex which has sandbox tool access.

1. Reasoning-aware default --timeout. Previously the script defaulted to
   300s regardless of model, so direct script invocations on the new
   gpt-5.5 default could time out on reasoning runs (which can take
   10-15 min). New _resolve_timeout() helper picks 900s for reasoning
   models and 300s for standard models when --timeout is omitted;
   explicit --timeout values are preserved unchanged.

2. Soften the Single-Pass Completeness Mandate for local mode. The CI
   Mandate instructs the reviewer to run shell greps, load sibling
   files, and sweep transitive paths. The local raw-API path cannot do
   any of that, so leaving the CI wording in place caused the model to
   claim audits it could not perform. New _SUBSTITUTIONS entry replaces
   the CI Mandate block with a local-mode note that scopes audits to
   the loaded prompt content and explicitly forbids claiming
   shell/file-loading tool runs.

Adds 8 regression tests: TestResolveTimeout (6 cases covering omitted
+ reasoning, omitted + non-reasoning, explicit-preserved both paths,
zero-preserved, gpt-5.4 routing) and two additions to
TestAdaptReviewCriteria asserting the local prompt strips the CI
audit instructions and surfaces the local-mode note.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/scripts/openai_review.py | 93 ++++++++++++++++++++++++++++----
 tests/test_openai_review.py      | 55 +++++++++++++++++++
 2 files changed, 138 insertions(+), 10 deletions(-)

diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py
index ccba6997..d01dcb74 100644
--- a/.claude/scripts/openai_review.py
+++ b/.claude/scripts/openai_review.py
@@ -935,6 +935,62 @@ def estimate_cost(
         "Use the branch name only to understand which "
         "methods/papers are intended.",
     ),
+    # Replace the CI Single-Pass Completeness Mandate with a local-mode note.
+    # The CI Mandate instructs the reviewer to run shell greps, load sibling
+    # files, and sweep transitive paths — none of which the local raw API
+    # path can do. Leaving the CI wording in place would cause the model to
+    # claim audits it cannot perform.
+    (
+        """## Single-Pass Completeness Mandate (Initial Review Only)
+
+This is an INITIAL review. Treat this as the only chance to enumerate findings.
+Follow-up rounds are expensive — find ALL P0/P1/P2 issues in this pass.
+
+Before finalizing, confirm you have run each of these audits on the diff:
+
+1. **Sibling-surface mirror audit**: For every fix or change in a method, schema,
+   default-value path, or report block, identify the parallel surface in the same
+   codebase (BR ↔ DR, schema ↔ renderer, default ↔ precomputed, summary ↔ full)
+   and check whether the same change applies there. Flag the unmirrored side as P1.
+
+2. **Pattern-wide grep**: When you flag any anti-pattern or bug class, use `grep`
+   on `diff_diff/**.py` to identify sibling occurrences of the same pattern and
+   enumerate them in the SAME finding. Only LOAD a sibling file's full contents
+   if grep returns a hit and you need surrounding context to verify the issue.
+   Do not defer pattern-class findings to a follow-up round.
+
+3. **Reciprocal/symmetry check**: For dispatch code, validation, or guards in
+   one direction (A-on-B), explicitly enumerate the reciprocal direction (B-on-A)
+   and confirm coverage.
+
+4. **Transitive workflow deps**: For GH Actions workflow `paths:` or pytest
+   selection changes, sweep transitive auto-loaded files (conftest.py,
+   pyproject.toml, ancestor conftests) and confirm they are included.
+
+5. **Scope override (with carve-outs)**: The audits above explicitly authorize
+   loading files outside the diff to verify completeness. This overrides the
+   "minimum surrounding context" default in the Rules section below.
+
+   **DO NOT load these paths** (the workflow's diff-build deliberately excludes
+   them; they are noise or out-of-scope):
+   - `docs/tutorials/*.ipynb` (notebook outputs are large JSON blobs)
+   - `benchmarks/data/real/*.json`
+   - `benchmarks/data/real/*.csv`""",
+        """## Single-Pass Completeness Audit (Local Review)
+
+This is a local review running as a static-prompt API call. You do NOT have
+shell or file-loading access — only the prompt content below is available
+(diff + changed source files + first-level imports).
+
+Find ALL P0/P1/P2 issues within the loaded context. Audit sibling surfaces,
+parallel patterns, and reciprocal directions THAT ARE VISIBLE in the loaded
+files.
+
+Do NOT claim to have run shell greps, loaded sibling files outside the
+prompt, or audited paths not present here. If a relevant audit is impossible
+because the necessary context is not in the prompt, say so explicitly rather
+than asserting completeness.""",
+    ),
 ]
 
 
@@ -1098,7 +1154,8 @@ def compile_prompt(
 
 ENDPOINT = "https://api.openai.com/v1/responses"
 DEFAULT_MODEL = "gpt-5.5"
-DEFAULT_TIMEOUT = 300  # seconds
+DEFAULT_TIMEOUT = 300  # seconds — non-reasoning models
+REASONING_TIMEOUT = 900  # seconds — reasoning models can take 10-15 min
 DEFAULT_MAX_TOKENS = 16384
 REASONING_MAX_TOKENS = 32768
 
@@ -1108,6 +1165,20 @@ def _is_reasoning_model(model: str) -> bool:
     return model.startswith(("o1", "o3", "o4", "gpt-5.4", "gpt-5.5")) or "-pro" in model
 
 
+def _resolve_timeout(timeout: "int | None", model: str) -> int:
+    """Resolve the effective HTTP timeout for an API call.
+
+    If --timeout was explicitly provided, use it. Otherwise pick
+    REASONING_TIMEOUT (900s) for reasoning models and DEFAULT_TIMEOUT
+    (300s) for standard models. This prevents reasoning-model reviews
+    from hitting a too-short default when the wrapper command does not
+    pass --timeout.
+    """
+    if timeout is not None:
+        return timeout
+    return REASONING_TIMEOUT if _is_reasoning_model(model) else DEFAULT_TIMEOUT
+
+
 def estimate_tokens(text: str) -> int:
     """Rough token estimate (~4 chars per token). May vary +/- 50% for code."""
     return len(text) // 4
@@ -1345,8 +1416,12 @@ def main() -> None:
     parser.add_argument(
         "--timeout",
         type=int,
-        default=DEFAULT_TIMEOUT,
-        help=f"HTTP request timeout in seconds (default: {DEFAULT_TIMEOUT})",
+        default=None,
+        help=(
+            f"HTTP request timeout in seconds. If omitted, defaults to "
+            f"{REASONING_TIMEOUT} for reasoning models and {DEFAULT_TIMEOUT} for "
+            f"standard models."
+        ),
     )
     parser.add_argument(
         "--delta-diff",
@@ -1376,6 +1451,10 @@ def main() -> None:
 
     args = parser.parse_args()
 
+    # Resolve --timeout: reasoning models default to REASONING_TIMEOUT (900s),
+    # standard models to DEFAULT_TIMEOUT (300s). Explicit --timeout overrides.
+    args.timeout = _resolve_timeout(args.timeout, args.model)
+
     # Post-parse validation
     if args.context != "minimal" and not args.repo_root:
         parser.error(
@@ -1616,13 +1695,7 @@ def main() -> None:
         sys.exit(0)
 
     # Call OpenAI API
-    if _is_reasoning_model(args.model) and args.timeout == DEFAULT_TIMEOUT:
-        print(
-            f"Note: {args.model} is a reasoning model. Consider --timeout 900 "
-            "for large reviews.",
-            file=sys.stderr,
-        )
-    print(f"Sending review to {args.model}...", file=sys.stderr)
+    print(f"Sending review to {args.model} (timeout: {args.timeout}s)...", file=sys.stderr)
     print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr)
     if cost_str:
         print(f"Estimated cost: {cost_str}", file=sys.stderr)
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py
index 0d68d552..dd9e7740 100644
--- a/tests/test_openai_review.py
+++ b/tests/test_openai_review.py
@@ -221,6 +221,34 @@ def test_all_substitutions_apply_to_real_prompt(self, review_mod, capsys):
         captured = capsys.readouterr()
         assert "Warning: prompt substitution did not match" not in captured.err
 
+    def test_local_prompt_strips_ci_mandate_audit_instructions(self, review_mod):
+        """Local mode must not instruct the model to run shell greps or load
+        files outside the prompt — those are CI-Codex-only capabilities."""
+        assert _SCRIPT_PATH is not None
+        repo_root = _SCRIPT_PATH.parent.parent.parent
+        prompt_path = repo_root / ".github" / "codex" / "prompts" / "pr_review.md"
+        if not prompt_path.exists():
+            pytest.skip("pr_review.md not found")
+        source = prompt_path.read_text()
+        adapted = review_mod._adapt_review_criteria(source)
+        assert "use `grep`\n   on `diff_diff/**.py`" not in adapted
+        assert "Transitive workflow deps" not in adapted
+        assert "Scope override (with carve-outs)" not in adapted
+
+    def test_local_prompt_has_local_audit_note(self, review_mod):
+        """Local mode adds an explicit no-tool-access note in place of the
+        CI Mandate, so the model does not claim audits it cannot perform."""
+        assert _SCRIPT_PATH is not None
+        repo_root = _SCRIPT_PATH.parent.parent.parent
+        prompt_path = repo_root / ".github" / "codex" / "prompts" / "pr_review.md"
+        if not prompt_path.exists():
+            pytest.skip("pr_review.md not found")
+        source = prompt_path.read_text()
+        adapted = review_mod._adapt_review_criteria(source)
+        assert "Single-Pass Completeness Audit (Local Review)" in adapted
+        assert "static-prompt API call" in adapted
+        assert "Do NOT claim to have run shell greps" in adapted
+
 
 # ---------------------------------------------------------------------------
 # compile_prompt
@@ -1562,6 +1590,33 @@ def test_gpt41_mini_is_not_reasoning(self, review_mod):
         assert review_mod._is_reasoning_model("gpt-4.1-mini") is False
 
 
+class TestResolveTimeout:
+    """The script must auto-select a 900s timeout for reasoning models when
+    --timeout is omitted; the old 300s default would time out reasoning runs."""
+
+    def test_omitted_reasoning_defaults_to_900(self, review_mod):
+        assert review_mod._resolve_timeout(None, "gpt-5.5") == review_mod.REASONING_TIMEOUT
+        assert review_mod._resolve_timeout(None, "gpt-5.5") == 900
+
+    def test_omitted_non_reasoning_defaults_to_300(self, review_mod):
+        assert review_mod._resolve_timeout(None, "gpt-4.1") == review_mod.DEFAULT_TIMEOUT
+        assert review_mod._resolve_timeout(None, "gpt-4.1") == 300
+
+    def test_explicit_timeout_preserved_for_reasoning(self, review_mod):
+        assert review_mod._resolve_timeout(1200, "gpt-5.5") == 1200
+
+    def test_explicit_timeout_preserved_for_non_reasoning(self, review_mod):
+        assert review_mod._resolve_timeout(60, "gpt-4.1") == 60
+
+    def test_explicit_zero_preserved(self, review_mod):
+        """Zero is a valid explicit value (not the same as None)."""
+        assert review_mod._resolve_timeout(0, "gpt-5.5") == 0
+
+    def test_gpt54_routes_to_reasoning_default(self, review_mod):
+        """gpt-5.4 is also reasoning-classified post-fix; should get 900s."""
+        assert review_mod._resolve_timeout(None, "gpt-5.4") == 900
+
+
 class TestProModelPricing:
     def test_pro_gets_own_pricing(self, review_mod):
         """gpt-5.4-pro should not fall back to gpt-5.4 pricing."""

From 8fea196403c6387e5e6e62456f95c5772ed48a13 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 9 May 2026 11:54:41 -0400
Subject: [PATCH 3/5] Resolve timeout in call_openai() and update skill-doc
 default text

Two P2 polish items from CI review on 9b76cd4:

1. call_openai() signature still defaulted to timeout=DEFAULT_TIMEOUT,
   leaving any future direct caller exposed to the legacy 300s default
   even after the CLI gained model-aware resolution. Route the helper's
   default through _resolve_timeout() so the model-aware behavior
   applies uniformly regardless of caller.

2. .claude/commands/ai-review-local.md described --timeout as having a
   fixed 300s default, which is stale now that the script auto-selects
   900s for reasoning models. Update the description to reflect the
   dynamic default and enumerate the reasoning-model prefixes.

Adds 2 regression tests asserting omitted-timeout call_openai calls
propagate 900s on gpt-5.5 and 300s on gpt-4.1 to urlopen.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/commands/ai-review-local.md |  2 +-
 .claude/scripts/openai_review.py    |  8 +++++++-
 tests/test_openai_review.py         | 11 +++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md
index 553097eb..64c469bd 100644
--- a/.claude/commands/ai-review-local.md
+++ b/.claude/commands/ai-review-local.md
@@ -23,7 +23,7 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR
 - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists
 - `--full-registry`: Include the entire REGISTRY.md instead of selective sections
 - `--model <name>`: Override the OpenAI model (default: `gpt-5.5`)
-- `--timeout <seconds>`: HTTP request timeout (default: 300). Use 900 for reasoning models.
+- `--timeout <seconds>`: HTTP request timeout. If omitted, defaults to 900 for reasoning models (gpt-5.4, gpt-5.5, *-pro, o1/o3/o4) and 300 otherwise.
 - `--dry-run`: Print the compiled prompt without calling the API
 
 **Reasoning models** (`gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15
diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py
index d01dcb74..91b7a999 100644
--- a/.claude/scripts/openai_review.py
+++ b/.claude/scripts/openai_review.py
@@ -1207,13 +1207,19 @@ def call_openai(
     prompt: str,
     model: str,
     api_key: str,
-    timeout: int = DEFAULT_TIMEOUT,
+    timeout: "int | None" = None,
 ) -> "tuple[str, dict]":
     """Call the OpenAI Responses API.
 
+    If ``timeout`` is None, resolves to REASONING_TIMEOUT (900s) for
+    reasoning models and DEFAULT_TIMEOUT (300s) otherwise — same logic
+    as the CLI ``--timeout`` flag. This guards future direct callers
+    against the old 300s-everywhere default.
+
     Returns (content, usage) where usage is the API response's usage dict
     containing input_tokens and output_tokens.
     """
+    timeout = _resolve_timeout(timeout, model)
     reasoning = _is_reasoning_model(model)
     max_tokens = REASONING_MAX_TOKENS if reasoning else DEFAULT_MAX_TOKENS
 
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py
index dd9e7740..005afbb6 100644
--- a/tests/test_openai_review.py
+++ b/tests/test_openai_review.py
@@ -1760,6 +1760,17 @@ def test_timeout_passed_through(self, review_mod, mock_urlopen):
         review_mod.call_openai("test", "gpt-5.4", "fake-key", timeout=900)
         assert mock_urlopen["timeout"] == 900
 
+    def test_omitted_timeout_resolves_for_reasoning_model(self, review_mod, mock_urlopen):
+        """Direct callers of call_openai() that omit timeout get the
+        model-aware default (900s for reasoning) — not the legacy 300s."""
+        review_mod.call_openai("test", "gpt-5.5", "fake-key")
+        assert mock_urlopen["timeout"] == 900
+
+    def test_omitted_timeout_resolves_for_non_reasoning_model(self, review_mod, mock_urlopen):
+        """Direct callers omitting timeout on non-reasoning models still get 300s."""
+        review_mod.call_openai("test", "gpt-4.1", "fake-key")
+        assert mock_urlopen["timeout"] == 300
+
     def test_missing_status_with_valid_output_succeeds(self, review_mod, mock_urlopen):
         """Valid content should be accepted even when status field is absent."""
         mock_urlopen["response_data"] = {

From 600a4acfc891e2b56a0d825b937e61f4106a321d Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 9 May 2026 12:12:50 -0400
Subject: [PATCH 4/5] Define effective_model in skill doc; assert exact gpt-5.5
 pricing

Two P2 polish items from CI review on 8fea196:

1. Skill doc Step 5 reasoning-model handling referred to "the model"
   without defining it, leaving the wrapper/script contract ambiguous
   for default local runs. Define effective_model = parsed --model OR
   default gpt-5.5, and note that --model/--timeout/--dry-run pass
   through. Also reflect that the script's _resolve_timeout() now
   handles the 900s timeout selection internally, making explicit
   --timeout 900 pass-through optional.

2. Pricing tests previously only asserted gpt-5.5 != gpt-5.4 and
   gpt-5.5-pro != gpt-5.5, which would silently accept incorrect
   distinct rates. Add exact-tuple assertions matching the OpenAI
   published rates ($5/$30 for gpt-5.5, $30/$180 for gpt-5.5-pro)
   and a snapshot-prefix test confirming gpt-5.5-pro-2026-04-23
   resolves to the same rate as gpt-5.5-pro.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/commands/ai-review-local.md | 12 +++++++++---
 tests/test_openai_review.py         | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md
index 64c469bd..4cad7587 100644
--- a/.claude/commands/ai-review-local.md
+++ b/.claude/commands/ai-review-local.md
@@ -334,9 +334,15 @@ python3 .claude/scripts/openai_review.py \
 Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are
 generated in Step 4 and is NOT passed to the script.
 
-**Reasoning model handling:** If the model contains `-pro`, starts with `o1`/`o3`/`o4`, or starts with `gpt-5.4`/`gpt-5.5`
-(e.g., `gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`):
-- Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`)
+**Reasoning model handling:** Resolve the effective model first — `effective_model` is
+the value of `--model` if the user provided one, otherwise the script default `gpt-5.5`.
+The `--model`, `--timeout`, and `--dry-run` flags pass through to the script when provided.
+
+If `effective_model` contains `-pro`, starts with `o1`/`o3`/`o4`, or starts with
+`gpt-5.4`/`gpt-5.5` (e.g., `gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`):
+- The script's `_resolve_timeout()` already auto-selects 900s for these models when
+  `--timeout` is omitted, so no wrapper timeout pass-through is required. (Passing
+  `--timeout 900` explicitly remains harmless and is fine for backward compatibility.)
 - Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap)
 - After the background command completes, continue to Step 6
 
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py
index 005afbb6..366ff023 100644
--- a/tests/test_openai_review.py
+++ b/tests/test_openai_review.py
@@ -1648,6 +1648,20 @@ def test_gpt55_pro_has_own_pricing(self, review_mod):
         assert base_cost is not None
         assert pro_cost != base_cost
 
+    def test_gpt55_exact_rates(self, review_mod):
+        """gpt-5.5 PRICING entry must match published OpenAI rates."""
+        assert review_mod.PRICING["gpt-5.5"] == (5.00, 30.00)
+
+    def test_gpt55_pro_exact_rates(self, review_mod):
+        """gpt-5.5-pro PRICING entry must match published OpenAI rates."""
+        assert review_mod.PRICING["gpt-5.5-pro"] == (30.00, 180.00)
+
+    def test_gpt55_pro_snapshot_matches_pro(self, review_mod):
+        """gpt-5.5-pro-2026-04-23 should match gpt-5.5-pro via prefix."""
+        snapshot = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5-pro-2026-04-23")
+        base = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5-pro")
+        assert snapshot == base
+
 
 class TestExtractResponseText:
     def test_prefers_output_text_field(self, review_mod):

From dacba16a923447cab5b509ecfc95bfb1663c3beb Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 9 May 2026 12:21:21 -0400
Subject: [PATCH 5/5] Fix stale Chat Completions API reference in skill doc

The data-transmission note in .claude/commands/ai-review-local.md said
the script POSTs to OpenAI's Chat Completions API. The script has used
the Responses API (ENDPOINT = .../v1/responses) for some time; this is
a leftover from before that migration.

Adds a TestSkillDocAPIConsistency regression test that grep-asserts the
skill doc never says "Chat Completions API" again.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .claude/commands/ai-review-local.md |  2 +-
 tests/test_openai_review.py         | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md
index 4cad7587..a3e965a2 100644
--- a/.claude/commands/ai-review-local.md
+++ b/.claude/commands/ai-review-local.md
@@ -507,7 +507,7 @@ runs `--force-fresh` or when a rebase invalidates the tracked commit.
 - **Data transmission**: In non-dry-run mode, this skill transmits the unified diff,
   changed-file metadata, full source file contents (in standard/deep mode),
   import-context files (in deep mode), selected methodology registry text, and
-  prior review context (if present) to OpenAI via the Chat Completions API.
+  prior review context (if present) to OpenAI via the Responses API.
   Use `--dry-run` to preview exactly what would be sent.
 - This skill pairs naturally with the iterative workflow:
   `/ai-review-local` -> address findings -> `/ai-review-local` -> `/submit-pr`
diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py
index 366ff023..fa1f0996 100644
--- a/tests/test_openai_review.py
+++ b/tests/test_openai_review.py
@@ -1663,6 +1663,24 @@ def test_gpt55_pro_snapshot_matches_pro(self, review_mod):
         assert snapshot == base
 
 
+class TestSkillDocAPIConsistency:
+    """Catch doc drift between the script's API endpoint and the skill doc's
+    user-facing data-transmission note."""
+
+    def test_skill_doc_does_not_reference_chat_completions(self):
+        """Skill doc must not say "Chat Completions API" — script uses Responses API."""
+        assert _SCRIPT_PATH is not None
+        repo_root = _SCRIPT_PATH.parent.parent.parent
+        doc_path = repo_root / ".claude" / "commands" / "ai-review-local.md"
+        if not doc_path.exists():
+            pytest.skip("ai-review-local.md not found")
+        text = doc_path.read_text()
+        assert "Chat Completions API" not in text, (
+            "Skill doc references stale Chat Completions API; "
+            "script uses Responses API at openai_review.py:ENDPOINT"
+        )
+
+
 class TestExtractResponseText:
     def test_prefers_output_text_field(self, review_mod):
         result = {"output_text": "Direct text.", "output": []}