From ee75bd33634f468d4f69658e9e3d9f99b843d143 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 11:28:10 -0400 Subject: [PATCH 1/5] Bump AI PR review to gpt-5.5 + add Single-Pass Completeness Mandate Updates the OpenAI Codex model from gpt-5.4 to gpt-5.5 across both the CI workflow and the local review script (PRICING entries and reasoning- model detection extended to cover the gpt-5.x family). Adds a Single-Pass Completeness Mandate to the CI prompt that instructs the reviewer to enumerate sibling-surface mirrors (BR/DR, schema/render, default/precomputed, summary/full), run pattern-wide greps, check reciprocal/symmetry directions, and sweep transitive workflow deps in the initial pass rather than across multiple /ai-review rounds. Re-review Scope amended to apply the same audits to newly-added code while preserving the existing [Newly identified] convention for unchanged code. Behavior change: _is_reasoning_model() now correctly classifies gpt-5.4 and gpt-5.5 as reasoning models (latent bug fix per OpenAI docs). For local-script users still passing --model gpt-5.4: max_output_tokens doubles to 32768, temperature=0 is omitted from the API payload, and the "consider --timeout 900" advisory newly fires. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/commands/ai-review-local.md | 14 ++++----- .claude/scripts/openai_review.py | 10 ++++--- .github/codex/prompts/pr_review.md | 46 +++++++++++++++++++++++++++-- .github/workflows/ai_pr_review.yml | 2 +- tests/test_openai_review.py | 32 ++++++++++++++++---- 5 files changed, 85 insertions(+), 19 deletions(-) diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index 1cb2f22b..553097eb 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -22,14 +22,14 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR files (default: 200000). Changed source files are always included regardless of budget. - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists - `--full-registry`: Include the entire REGISTRY.md instead of selective sections -- `--model `: Override the OpenAI model (default: `gpt-5.4`) +- `--model `: Override the OpenAI model (default: `gpt-5.5`) - `--timeout `: HTTP request timeout (default: 300). Use 900 for reasoning models. - `--dry-run`: Print the compiled prompt without calling the API -**Reasoning models** (`gpt-5.4-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15 +**Reasoning models** (`gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15 minutes. For deep reviews with reasoning models, combine `--token-budget` with `--model`: ``` -/ai-review-local --model gpt-5.4-pro --token-budget 500000 --context deep +/ai-review-local --model gpt-5.5-pro --token-budget 500000 --context deep ``` ## Constraints @@ -47,7 +47,7 @@ before any data is sent externally. ### Step 1: Parse Arguments Parse `$ARGUMENTS` for the optional flags listed above. All flags are optional — -the default behavior (standard context, selective registry, gpt-5.4, live API call) +the default behavior (standard context, selective registry, gpt-5.5, live API call) requires no arguments. ### Step 2: Validate Prerequisites @@ -334,8 +334,8 @@ python3 .claude/scripts/openai_review.py \ Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are generated in Step 4 and is NOT passed to the script. -**Reasoning model handling:** If the model contains `-pro` or starts with `o1`/`o3`/`o4` -(e.g., `gpt-5.4-pro`, `o3`, `o4-mini`): +**Reasoning model handling:** If the model contains `-pro`, starts with `o1`/`o3`/`o4`, or starts with `gpt-5.4`/`gpt-5.5` +(e.g., `gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`): - Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`) - Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap) - After the background command completes, continue to Step 6 @@ -466,7 +466,7 @@ runs `--force-fresh` or when a rebase invalidates the tracked commit. /ai-review-local --model gpt-4.1 --full-registry # Deep review with reasoning model (may take 10-15 minutes) -/ai-review-local --model gpt-5.4-pro --token-budget 500000 --context deep +/ai-review-local --model gpt-5.5-pro --token-budget 500000 --context deep # Limit token budget for faster/cheaper reviews /ai-review-local --token-budget 100000 diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index ac8487f8..ccba6997 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -853,6 +853,8 @@ def apply_token_budget( # Source: https://platform.openai.com/docs/pricing # MAINTENANCE: Update when OpenAI changes pricing. PRICING = { + "gpt-5.5": (5.00, 30.00), + "gpt-5.5-pro": (30.00, 180.00), "gpt-5.4": (2.50, 15.00), "gpt-5.4-pro": (30.00, 180.00), "gpt-4.1": (2.00, 8.00), @@ -900,8 +902,8 @@ def estimate_cost( "code changes that have not yet been submitted as a pull request.", ), ( - "Review ONLY the changes introduced by this PR (diff)", - "Review ONLY the changes shown in the diff below", + "Review the changes introduced by this PR (diff)", + "Review the code changes shown in the diff below", ), ( "If the PR changes an estimator", @@ -1095,7 +1097,7 @@ def compile_prompt( # --------------------------------------------------------------------------- ENDPOINT = "https://api.openai.com/v1/responses" -DEFAULT_MODEL = "gpt-5.4" +DEFAULT_MODEL = "gpt-5.5" DEFAULT_TIMEOUT = 300 # seconds DEFAULT_MAX_TOKENS = 16384 REASONING_MAX_TOKENS = 32768 @@ -1103,7 +1105,7 @@ def compile_prompt( def _is_reasoning_model(model: str) -> bool: """Return True for models that use internal chain-of-thought reasoning.""" - return model.startswith(("o1", "o3", "o4")) or "-pro" in model + return model.startswith(("o1", "o3", "o4", "gpt-5.4", "gpt-5.5")) or "-pro" in model def estimate_tokens(text: str) -> int: diff --git a/.github/codex/prompts/pr_review.md b/.github/codex/prompts/pr_review.md index a64124de..783dd777 100644 --- a/.github/codex/prompts/pr_review.md +++ b/.github/codex/prompts/pr_review.md @@ -54,6 +54,42 @@ When reviewing new features or code paths, specifically check: - Command to check: `grep -n "pattern" diff_diff/*.py` - Flag as P1 if only partial fixes were made +## Single-Pass Completeness Mandate (Initial Review Only) + +This is an INITIAL review. Treat this as the only chance to enumerate findings. +Follow-up rounds are expensive — find ALL P0/P1/P2 issues in this pass. + +Before finalizing, confirm you have run each of these audits on the diff: + +1. **Sibling-surface mirror audit**: For every fix or change in a method, schema, + default-value path, or report block, identify the parallel surface in the same + codebase (BR ↔ DR, schema ↔ renderer, default ↔ precomputed, summary ↔ full) + and check whether the same change applies there. Flag the unmirrored side as P1. + +2. **Pattern-wide grep**: When you flag any anti-pattern or bug class, use `grep` + on `diff_diff/**.py` to identify sibling occurrences of the same pattern and + enumerate them in the SAME finding. Only LOAD a sibling file's full contents + if grep returns a hit and you need surrounding context to verify the issue. + Do not defer pattern-class findings to a follow-up round. + +3. **Reciprocal/symmetry check**: For dispatch code, validation, or guards in + one direction (A-on-B), explicitly enumerate the reciprocal direction (B-on-A) + and confirm coverage. + +4. **Transitive workflow deps**: For GH Actions workflow `paths:` or pytest + selection changes, sweep transitive auto-loaded files (conftest.py, + pyproject.toml, ancestor conftests) and confirm they are included. + +5. **Scope override (with carve-outs)**: The audits above explicitly authorize + loading files outside the diff to verify completeness. This overrides the + "minimum surrounding context" default in the Rules section below. + + **DO NOT load these paths** (the workflow's diff-build deliberately excludes + them; they are noise or out-of-scope): + - `docs/tutorials/*.ipynb` (notebook outputs are large JSON blobs) + - `benchmarks/data/real/*.json` + - `benchmarks/data/real/*.csv` + ## Deferred Work Acceptance This project tracks deferred technical debt in `TODO.md` under "Tech Debt from Code Reviews." @@ -68,7 +104,9 @@ This project tracks deferred technical debt in `TODO.md` under "Tech Debt from C and incorrect statistical output are not. Rules: -- Review ONLY the changes introduced by this PR (diff) and the minimum surrounding context needed. +- Review the changes introduced by this PR (diff). The Single-Pass Completeness + Mandate above authorizes broader audits (sibling surfaces, pattern-wide greps, + reciprocal checks, transitive deps) — do those upfront rather than deferring. - Provide a single Markdown report with: - Overall assessment (see Assessment Criteria below) - Executive summary (3–6 bullets) @@ -116,7 +154,11 @@ When this is a re-review (the PR has prior AI review comments): - New P1+ findings on unchanged code MAY be raised but must be marked "[Newly identified]" to distinguish from moving goalposts. Limit these to clear, concrete issues — not speculative concerns or stylistic preferences. -- New code added since the last review IS in scope for new findings. +- New code added since the last review IS in scope for new findings — apply the + Single-Pass Completeness Mandate's audits (sibling surfaces, pattern-wide greps, + reciprocal checks) to that new code in this re-review pass. For UNCHANGED code, + the existing [Newly identified] convention from the bullet above still applies: + new P1+ findings MAY be raised but must be marked "[Newly identified]". - If all previous P1+ findings are resolved, the assessment should be ✅ even if new P2/P3 items are noticed. diff --git a/.github/workflows/ai_pr_review.yml b/.github/workflows/ai_pr_review.yml index cb81f564..6307fb3e 100644 --- a/.github/workflows/ai_pr_review.yml +++ b/.github/workflows/ai_pr_review.yml @@ -165,7 +165,7 @@ jobs: sandbox: read-only safety-strategy: drop-sudo # Recommended by OpenAI for review quality/consistency: - model: gpt-5.4 + model: gpt-5.5 effort: xhigh - name: Post PR comment (new on /ai-review, update on opened) diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 9e8e9d91..0d68d552 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1546,8 +1546,14 @@ def test_pro_is_reasoning(self, review_mod): def test_pro_snapshot_is_reasoning(self, review_mod): assert review_mod._is_reasoning_model("gpt-5.4-pro-2026-03-05") is True - def test_gpt54_is_not_reasoning(self, review_mod): - assert review_mod._is_reasoning_model("gpt-5.4") is False + def test_gpt54_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-5.4") is True + + def test_gpt55_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-5.5") is True + + def test_gpt55_pro_is_reasoning(self, review_mod): + assert review_mod._is_reasoning_model("gpt-5.5-pro") is True def test_gpt41_is_not_reasoning(self, review_mod): assert review_mod._is_reasoning_model("gpt-4.1") is False @@ -1571,6 +1577,22 @@ def test_pro_snapshot_matches_pro(self, review_mod): base = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4-pro") assert snapshot == base + def test_gpt55_has_own_pricing(self, review_mod): + """gpt-5.5 should not fall back to gpt-5.4 pricing via prefix.""" + gpt55_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5") + gpt54_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.4") + assert gpt55_cost is not None + assert gpt54_cost is not None + assert gpt55_cost != gpt54_cost + + def test_gpt55_pro_has_own_pricing(self, review_mod): + """gpt-5.5-pro should not fall back to gpt-5.5 pricing via prefix.""" + pro_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5-pro") + base_cost = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5") + assert pro_cost is not None + assert base_cost is not None + assert pro_cost != base_cost + class TestExtractResponseText: def test_prefers_output_text_field(self, review_mod): @@ -1656,8 +1678,8 @@ def fake_urlopen(req, timeout=None): return captured def test_standard_model_payload(self, review_mod, mock_urlopen): - """Standard model sends input, max_output_tokens, and temperature=0.""" - content, usage = review_mod.call_openai("test prompt", "gpt-5.4", "fake-key") + """Standard (non-reasoning) model sends input, max_output_tokens, and temperature=0.""" + content, usage = review_mod.call_openai("test prompt", "gpt-4.1", "fake-key") payload = mock_urlopen["payload"] assert payload["input"] == "test prompt" assert payload["max_output_tokens"] == review_mod.DEFAULT_MAX_TOKENS @@ -1669,7 +1691,7 @@ def test_standard_model_payload(self, review_mod, mock_urlopen): def test_reasoning_model_payload(self, review_mod, mock_urlopen): """Reasoning model omits temperature and uses REASONING_MAX_TOKENS.""" - content, _ = review_mod.call_openai("test prompt", "gpt-5.4-pro", "fake-key") + content, _ = review_mod.call_openai("test prompt", "gpt-5.5", "fake-key") payload = mock_urlopen["payload"] assert payload["max_output_tokens"] == review_mod.REASONING_MAX_TOKENS assert "temperature" not in payload From 9b76cd49015c2dc9d83879538acbbb7d1c2e29ad Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 11:44:16 -0400 Subject: [PATCH 2/5] Address local-script execution-context gaps from CI review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes recognizing that .claude/scripts/openai_review.py runs as a static-prompt API call without shell or file-loading access — unlike CI Codex which has sandbox tool access. 1. Reasoning-aware default --timeout. Previously the script defaulted to 300s regardless of model, so direct script invocations on the new gpt-5.5 default could time out on reasoning runs (which can take 10-15 min). New _resolve_timeout() helper picks 900s for reasoning models and 300s for standard models when --timeout is omitted; explicit --timeout values are preserved unchanged. 2. Soften the Single-Pass Completeness Mandate for local mode. The CI Mandate instructs the reviewer to run shell greps, load sibling files, and sweep transitive paths. The local raw-API path cannot do any of that, so leaving the CI wording in place caused the model to claim audits it could not perform. New _SUBSTITUTIONS entry replaces the CI Mandate block with a local-mode note that scopes audits to the loaded prompt content and explicitly forbids claiming shell/file-loading tool runs. Adds 8 regression tests: TestResolveTimeout (6 cases covering omitted + reasoning, omitted + non-reasoning, explicit-preserved both paths, zero-preserved, gpt-5.4 routing) and two additions to TestAdaptReviewCriteria asserting the local prompt strips the CI audit instructions and surfaces the local-mode note. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/scripts/openai_review.py | 93 ++++++++++++++++++++++++++++---- tests/test_openai_review.py | 55 +++++++++++++++++++ 2 files changed, 138 insertions(+), 10 deletions(-) diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index ccba6997..d01dcb74 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -935,6 +935,62 @@ def estimate_cost( "Use the branch name only to understand which " "methods/papers are intended.", ), + # Replace the CI Single-Pass Completeness Mandate with a local-mode note. + # The CI Mandate instructs the reviewer to run shell greps, load sibling + # files, and sweep transitive paths — none of which the local raw API + # path can do. Leaving the CI wording in place would cause the model to + # claim audits it cannot perform. + ( + """## Single-Pass Completeness Mandate (Initial Review Only) + +This is an INITIAL review. Treat this as the only chance to enumerate findings. +Follow-up rounds are expensive — find ALL P0/P1/P2 issues in this pass. + +Before finalizing, confirm you have run each of these audits on the diff: + +1. **Sibling-surface mirror audit**: For every fix or change in a method, schema, + default-value path, or report block, identify the parallel surface in the same + codebase (BR ↔ DR, schema ↔ renderer, default ↔ precomputed, summary ↔ full) + and check whether the same change applies there. Flag the unmirrored side as P1. + +2. **Pattern-wide grep**: When you flag any anti-pattern or bug class, use `grep` + on `diff_diff/**.py` to identify sibling occurrences of the same pattern and + enumerate them in the SAME finding. Only LOAD a sibling file's full contents + if grep returns a hit and you need surrounding context to verify the issue. + Do not defer pattern-class findings to a follow-up round. + +3. **Reciprocal/symmetry check**: For dispatch code, validation, or guards in + one direction (A-on-B), explicitly enumerate the reciprocal direction (B-on-A) + and confirm coverage. + +4. **Transitive workflow deps**: For GH Actions workflow `paths:` or pytest + selection changes, sweep transitive auto-loaded files (conftest.py, + pyproject.toml, ancestor conftests) and confirm they are included. + +5. **Scope override (with carve-outs)**: The audits above explicitly authorize + loading files outside the diff to verify completeness. This overrides the + "minimum surrounding context" default in the Rules section below. + + **DO NOT load these paths** (the workflow's diff-build deliberately excludes + them; they are noise or out-of-scope): + - `docs/tutorials/*.ipynb` (notebook outputs are large JSON blobs) + - `benchmarks/data/real/*.json` + - `benchmarks/data/real/*.csv`""", + """## Single-Pass Completeness Audit (Local Review) + +This is a local review running as a static-prompt API call. You do NOT have +shell or file-loading access — only the prompt content below is available +(diff + changed source files + first-level imports). + +Find ALL P0/P1/P2 issues within the loaded context. Audit sibling surfaces, +parallel patterns, and reciprocal directions THAT ARE VISIBLE in the loaded +files. + +Do NOT claim to have run shell greps, loaded sibling files outside the +prompt, or audited paths not present here. If a relevant audit is impossible +because the necessary context is not in the prompt, say so explicitly rather +than asserting completeness.""", + ), ] @@ -1098,7 +1154,8 @@ def compile_prompt( ENDPOINT = "https://api.openai.com/v1/responses" DEFAULT_MODEL = "gpt-5.5" -DEFAULT_TIMEOUT = 300 # seconds +DEFAULT_TIMEOUT = 300 # seconds — non-reasoning models +REASONING_TIMEOUT = 900 # seconds — reasoning models can take 10-15 min DEFAULT_MAX_TOKENS = 16384 REASONING_MAX_TOKENS = 32768 @@ -1108,6 +1165,20 @@ def _is_reasoning_model(model: str) -> bool: return model.startswith(("o1", "o3", "o4", "gpt-5.4", "gpt-5.5")) or "-pro" in model +def _resolve_timeout(timeout: "int | None", model: str) -> int: + """Resolve the effective HTTP timeout for an API call. + + If --timeout was explicitly provided, use it. Otherwise pick + REASONING_TIMEOUT (900s) for reasoning models and DEFAULT_TIMEOUT + (300s) for standard models. This prevents reasoning-model reviews + from hitting a too-short default when the wrapper command does not + pass --timeout. + """ + if timeout is not None: + return timeout + return REASONING_TIMEOUT if _is_reasoning_model(model) else DEFAULT_TIMEOUT + + def estimate_tokens(text: str) -> int: """Rough token estimate (~4 chars per token). May vary +/- 50% for code.""" return len(text) // 4 @@ -1345,8 +1416,12 @@ def main() -> None: parser.add_argument( "--timeout", type=int, - default=DEFAULT_TIMEOUT, - help=f"HTTP request timeout in seconds (default: {DEFAULT_TIMEOUT})", + default=None, + help=( + f"HTTP request timeout in seconds. If omitted, defaults to " + f"{REASONING_TIMEOUT} for reasoning models and {DEFAULT_TIMEOUT} for " + f"standard models." + ), ) parser.add_argument( "--delta-diff", @@ -1376,6 +1451,10 @@ def main() -> None: args = parser.parse_args() + # Resolve --timeout: reasoning models default to REASONING_TIMEOUT (900s), + # standard models to DEFAULT_TIMEOUT (300s). Explicit --timeout overrides. + args.timeout = _resolve_timeout(args.timeout, args.model) + # Post-parse validation if args.context != "minimal" and not args.repo_root: parser.error( @@ -1616,13 +1695,7 @@ def main() -> None: sys.exit(0) # Call OpenAI API - if _is_reasoning_model(args.model) and args.timeout == DEFAULT_TIMEOUT: - print( - f"Note: {args.model} is a reasoning model. Consider --timeout 900 " - "for large reviews.", - file=sys.stderr, - ) - print(f"Sending review to {args.model}...", file=sys.stderr) + print(f"Sending review to {args.model} (timeout: {args.timeout}s)...", file=sys.stderr) print(f"Estimated input tokens: ~{est_tokens:,}", file=sys.stderr) if cost_str: print(f"Estimated cost: {cost_str}", file=sys.stderr) diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 0d68d552..dd9e7740 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -221,6 +221,34 @@ def test_all_substitutions_apply_to_real_prompt(self, review_mod, capsys): captured = capsys.readouterr() assert "Warning: prompt substitution did not match" not in captured.err + def test_local_prompt_strips_ci_mandate_audit_instructions(self, review_mod): + """Local mode must not instruct the model to run shell greps or load + files outside the prompt — those are CI-Codex-only capabilities.""" + assert _SCRIPT_PATH is not None + repo_root = _SCRIPT_PATH.parent.parent.parent + prompt_path = repo_root / ".github" / "codex" / "prompts" / "pr_review.md" + if not prompt_path.exists(): + pytest.skip("pr_review.md not found") + source = prompt_path.read_text() + adapted = review_mod._adapt_review_criteria(source) + assert "use `grep`\n on `diff_diff/**.py`" not in adapted + assert "Transitive workflow deps" not in adapted + assert "Scope override (with carve-outs)" not in adapted + + def test_local_prompt_has_local_audit_note(self, review_mod): + """Local mode adds an explicit no-tool-access note in place of the + CI Mandate, so the model does not claim audits it cannot perform.""" + assert _SCRIPT_PATH is not None + repo_root = _SCRIPT_PATH.parent.parent.parent + prompt_path = repo_root / ".github" / "codex" / "prompts" / "pr_review.md" + if not prompt_path.exists(): + pytest.skip("pr_review.md not found") + source = prompt_path.read_text() + adapted = review_mod._adapt_review_criteria(source) + assert "Single-Pass Completeness Audit (Local Review)" in adapted + assert "static-prompt API call" in adapted + assert "Do NOT claim to have run shell greps" in adapted + # --------------------------------------------------------------------------- # compile_prompt @@ -1562,6 +1590,33 @@ def test_gpt41_mini_is_not_reasoning(self, review_mod): assert review_mod._is_reasoning_model("gpt-4.1-mini") is False +class TestResolveTimeout: + """The script must auto-select a 900s timeout for reasoning models when + --timeout is omitted; the old 300s default would time out reasoning runs.""" + + def test_omitted_reasoning_defaults_to_900(self, review_mod): + assert review_mod._resolve_timeout(None, "gpt-5.5") == review_mod.REASONING_TIMEOUT + assert review_mod._resolve_timeout(None, "gpt-5.5") == 900 + + def test_omitted_non_reasoning_defaults_to_300(self, review_mod): + assert review_mod._resolve_timeout(None, "gpt-4.1") == review_mod.DEFAULT_TIMEOUT + assert review_mod._resolve_timeout(None, "gpt-4.1") == 300 + + def test_explicit_timeout_preserved_for_reasoning(self, review_mod): + assert review_mod._resolve_timeout(1200, "gpt-5.5") == 1200 + + def test_explicit_timeout_preserved_for_non_reasoning(self, review_mod): + assert review_mod._resolve_timeout(60, "gpt-4.1") == 60 + + def test_explicit_zero_preserved(self, review_mod): + """Zero is a valid explicit value (not the same as None).""" + assert review_mod._resolve_timeout(0, "gpt-5.5") == 0 + + def test_gpt54_routes_to_reasoning_default(self, review_mod): + """gpt-5.4 is also reasoning-classified post-fix; should get 900s.""" + assert review_mod._resolve_timeout(None, "gpt-5.4") == 900 + + class TestProModelPricing: def test_pro_gets_own_pricing(self, review_mod): """gpt-5.4-pro should not fall back to gpt-5.4 pricing.""" From 8fea196403c6387e5e6e62456f95c5772ed48a13 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 11:54:41 -0400 Subject: [PATCH 3/5] Resolve timeout in call_openai() and update skill-doc default text Two P2 polish items from CI review on 9b76cd4: 1. call_openai() signature still defaulted to timeout=DEFAULT_TIMEOUT, leaving any future direct caller exposed to the legacy 300s default even after the CLI gained model-aware resolution. Route the helper's default through _resolve_timeout() so the model-aware behavior applies uniformly regardless of caller. 2. .claude/commands/ai-review-local.md described --timeout as having a fixed 300s default, which is stale now that the script auto-selects 900s for reasoning models. Update the description to reflect the dynamic default and enumerate the reasoning-model prefixes. Adds 2 regression tests asserting omitted-timeout call_openai calls propagate 900s on gpt-5.5 and 300s on gpt-4.1 to urlopen. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/commands/ai-review-local.md | 2 +- .claude/scripts/openai_review.py | 8 +++++++- tests/test_openai_review.py | 11 +++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index 553097eb..64c469bd 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -23,7 +23,7 @@ pre-PR use. Designed for iterative review/revision cycles before submitting a PR - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists - `--full-registry`: Include the entire REGISTRY.md instead of selective sections - `--model `: Override the OpenAI model (default: `gpt-5.5`) -- `--timeout `: HTTP request timeout (default: 300). Use 900 for reasoning models. +- `--timeout `: HTTP request timeout. If omitted, defaults to 900 for reasoning models (gpt-5.4, gpt-5.5, *-pro, o1/o3/o4) and 300 otherwise. - `--dry-run`: Print the compiled prompt without calling the API **Reasoning models** (`gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`, etc.): Reviews may take 10-15 diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index d01dcb74..91b7a999 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -1207,13 +1207,19 @@ def call_openai( prompt: str, model: str, api_key: str, - timeout: int = DEFAULT_TIMEOUT, + timeout: "int | None" = None, ) -> "tuple[str, dict]": """Call the OpenAI Responses API. + If ``timeout`` is None, resolves to REASONING_TIMEOUT (900s) for + reasoning models and DEFAULT_TIMEOUT (300s) otherwise — same logic + as the CLI ``--timeout`` flag. This guards future direct callers + against the old 300s-everywhere default. + Returns (content, usage) where usage is the API response's usage dict containing input_tokens and output_tokens. """ + timeout = _resolve_timeout(timeout, model) reasoning = _is_reasoning_model(model) max_tokens = REASONING_MAX_TOKENS if reasoning else DEFAULT_MAX_TOKENS diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index dd9e7740..005afbb6 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1760,6 +1760,17 @@ def test_timeout_passed_through(self, review_mod, mock_urlopen): review_mod.call_openai("test", "gpt-5.4", "fake-key", timeout=900) assert mock_urlopen["timeout"] == 900 + def test_omitted_timeout_resolves_for_reasoning_model(self, review_mod, mock_urlopen): + """Direct callers of call_openai() that omit timeout get the + model-aware default (900s for reasoning) — not the legacy 300s.""" + review_mod.call_openai("test", "gpt-5.5", "fake-key") + assert mock_urlopen["timeout"] == 900 + + def test_omitted_timeout_resolves_for_non_reasoning_model(self, review_mod, mock_urlopen): + """Direct callers omitting timeout on non-reasoning models still get 300s.""" + review_mod.call_openai("test", "gpt-4.1", "fake-key") + assert mock_urlopen["timeout"] == 300 + def test_missing_status_with_valid_output_succeeds(self, review_mod, mock_urlopen): """Valid content should be accepted even when status field is absent.""" mock_urlopen["response_data"] = { From 600a4acfc891e2b56a0d825b937e61f4106a321d Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 12:12:50 -0400 Subject: [PATCH 4/5] Define effective_model in skill doc; assert exact gpt-5.5 pricing Two P2 polish items from CI review on 8fea196: 1. Skill doc Step 5 reasoning-model handling referred to "the model" without defining it, leaving the wrapper/script contract ambiguous for default local runs. Define effective_model = parsed --model OR default gpt-5.5, and note that --model/--timeout/--dry-run pass through. Also reflect that the script's _resolve_timeout() now handles the 900s timeout selection internally, making explicit --timeout 900 pass-through optional. 2. Pricing tests previously only asserted gpt-5.5 != gpt-5.4 and gpt-5.5-pro != gpt-5.5, which would silently accept incorrect distinct rates. Add exact-tuple assertions matching the OpenAI published rates ($5/$30 for gpt-5.5, $30/$180 for gpt-5.5-pro) and a snapshot-prefix test confirming gpt-5.5-pro-2026-04-23 resolves to the same rate as gpt-5.5-pro. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/commands/ai-review-local.md | 12 +++++++++--- tests/test_openai_review.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index 64c469bd..4cad7587 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -334,9 +334,15 @@ python3 .claude/scripts/openai_review.py \ Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are generated in Step 4 and is NOT passed to the script. -**Reasoning model handling:** If the model contains `-pro`, starts with `o1`/`o3`/`o4`, or starts with `gpt-5.4`/`gpt-5.5` -(e.g., `gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`): -- Pass `--timeout 900` to the script (unless the user explicitly specified `--timeout`) +**Reasoning model handling:** Resolve the effective model first — `effective_model` is +the value of `--model` if the user provided one, otherwise the script default `gpt-5.5`. +The `--model`, `--timeout`, and `--dry-run` flags pass through to the script when provided. + +If `effective_model` contains `-pro`, starts with `o1`/`o3`/`o4`, or starts with +`gpt-5.4`/`gpt-5.5` (e.g., `gpt-5.5`, `gpt-5.5-pro`, `o3`, `o4-mini`): +- The script's `_resolve_timeout()` already auto-selects 900s for these models when + `--timeout` is omitted, so no wrapper timeout pass-through is required. (Passing + `--timeout 900` explicitly remains harmless and is fine for backward compatibility.) - Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap) - After the background command completes, continue to Step 6 diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 005afbb6..366ff023 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1648,6 +1648,20 @@ def test_gpt55_pro_has_own_pricing(self, review_mod): assert base_cost is not None assert pro_cost != base_cost + def test_gpt55_exact_rates(self, review_mod): + """gpt-5.5 PRICING entry must match published OpenAI rates.""" + assert review_mod.PRICING["gpt-5.5"] == (5.00, 30.00) + + def test_gpt55_pro_exact_rates(self, review_mod): + """gpt-5.5-pro PRICING entry must match published OpenAI rates.""" + assert review_mod.PRICING["gpt-5.5-pro"] == (30.00, 180.00) + + def test_gpt55_pro_snapshot_matches_pro(self, review_mod): + """gpt-5.5-pro-2026-04-23 should match gpt-5.5-pro via prefix.""" + snapshot = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5-pro-2026-04-23") + base = review_mod.estimate_cost(1_000_000, 1_000_000, "gpt-5.5-pro") + assert snapshot == base + class TestExtractResponseText: def test_prefers_output_text_field(self, review_mod): From dacba16a923447cab5b509ecfc95bfb1663c3beb Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 9 May 2026 12:21:21 -0400 Subject: [PATCH 5/5] Fix stale Chat Completions API reference in skill doc The data-transmission note in .claude/commands/ai-review-local.md said the script POSTs to OpenAI's Chat Completions API. The script has used the Responses API (ENDPOINT = .../v1/responses) for some time; this is a leftover from before that migration. Adds a TestSkillDocAPIConsistency regression test that grep-asserts the skill doc never says "Chat Completions API" again. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/commands/ai-review-local.md | 2 +- tests/test_openai_review.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index 4cad7587..a3e965a2 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -507,7 +507,7 @@ runs `--force-fresh` or when a rebase invalidates the tracked commit. - **Data transmission**: In non-dry-run mode, this skill transmits the unified diff, changed-file metadata, full source file contents (in standard/deep mode), import-context files (in deep mode), selected methodology registry text, and - prior review context (if present) to OpenAI via the Chat Completions API. + prior review context (if present) to OpenAI via the Responses API. Use `--dry-run` to preview exactly what would be sent. - This skill pairs naturally with the iterative workflow: `/ai-review-local` -> address findings -> `/ai-review-local` -> `/submit-pr` diff --git a/tests/test_openai_review.py b/tests/test_openai_review.py index 366ff023..fa1f0996 100644 --- a/tests/test_openai_review.py +++ b/tests/test_openai_review.py @@ -1663,6 +1663,24 @@ def test_gpt55_pro_snapshot_matches_pro(self, review_mod): assert snapshot == base +class TestSkillDocAPIConsistency: + """Catch doc drift between the script's API endpoint and the skill doc's + user-facing data-transmission note.""" + + def test_skill_doc_does_not_reference_chat_completions(self): + """Skill doc must not say "Chat Completions API" — script uses Responses API.""" + assert _SCRIPT_PATH is not None + repo_root = _SCRIPT_PATH.parent.parent.parent + doc_path = repo_root / ".claude" / "commands" / "ai-review-local.md" + if not doc_path.exists(): + pytest.skip("ai-review-local.md not found") + text = doc_path.read_text() + assert "Chat Completions API" not in text, ( + "Skill doc references stale Chat Completions API; " + "script uses Responses API at openai_review.py:ENDPOINT" + ) + + class TestExtractResponseText: def test_prefers_output_text_field(self, review_mod): result = {"output_text": "Direct text.", "output": []}