From 873eddaaf67de360d4d9a2427236d9988d6ade8a Mon Sep 17 00:00:00 2001
From: mattgodbolt-molty <mattgodbolt-molty@users.noreply.github.com>
Date: Wed, 6 May 2026 13:34:26 -0500
Subject: [PATCH] Bump prompt-testing reviewer to Opus 4.7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update default reviewer model in `reviewer.py`, `cli.py run --review`,
  and the standalone `cli.py review` command from `claude-opus-4-6` to
  `claude-opus-4-7`. Same `$5/$25` price tier (USD per million
  input/output tokens), with stronger reasoning.
- Drop the hard-coded `temperature=0.0` from the reviewer's API call:
  Opus 4.7 rejects the parameter (`temperature is deprecated for this
  model`).
- Replace the hard-coded `$15/$75` (per million input/output tokens)
  cost calculation in `_run_reviews` with a lookup via
  `app.model_costs.get_model_cost(model)`. The previous calculation
  would have over-reported review cost by 3x against the new Opus
  pricing and would silently drift again on any future model bump.

Smoke-tested locally against three live cases (square_cpp_o1,
basic_inline_001, factorial_beginner_assembly) — Opus 4.7 ran cleanly
and flagged a real factual error in the Sonnet 4.6 explanation for
square_cpp_o1 (incorrect three-operand `imul` form).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 prompt_testing/cli.py      | 12 ++++++++----
 prompt_testing/reviewer.py |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py
index 82a39ed..e5d3af0 100644
--- a/prompt_testing/cli.py
+++ b/prompt_testing/cli.py
@@ -20,6 +20,7 @@
 import click
 from dotenv import load_dotenv
 
+from app.model_costs import get_model_cost
 from prompt_testing.ce_api import CompilerExplorerClient
 from prompt_testing.enricher import TestCaseEnricher
 from prompt_testing.file_utils import load_all_test_cases
@@ -43,7 +44,7 @@ def cli(ctx, project_root):
 @click.option("--output", help="Output filename")
 @click.option("--max-concurrent", type=int, default=5)
 @click.option("--review", is_flag=True, help="Also run Opus correctness review on results")
-@click.option("--review-model", default="claude-opus-4-6", help="Model for correctness review")
+@click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review")
 @click.pass_context
 def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model):
     """Run test cases and save results for review."""
@@ -208,6 +209,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
 
     review_cost = 0.0
     errors_found = 0
+    cost_per_input_token, cost_per_output_token = get_model_cost(model)
 
     for i, result in enumerate(successful, 1):
         case = cases_by_id.get(result["case_id"])
@@ -221,8 +223,10 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
         n_issues = len(review.get("issues", []))
         if not review.get("correct"):
             errors_found += 1
-        # Opus pricing: $15/M in, $75/M out
-        cost = review.get("reviewer_input_tokens", 0) * 15 / 1e6 + review.get("reviewer_output_tokens", 0) * 75 / 1e6
+        cost = (
+            review.get("reviewer_input_tokens", 0) * cost_per_input_token
+            + review.get("reviewer_output_tokens", 0) * cost_per_output_token
+        )
         review_cost += cost
         click.echo(f"  [{i}/{len(successful)}] {status} {result['case_id']} ({n_issues} issues, ${cost:.4f})")
 
@@ -256,7 +260,7 @@ def _print_review_summary(results: dict) -> None:
 
 @cli.command()
 @click.argument("results_file")
-@click.option("--model", default="claude-opus-4-6", help="Reviewer model")
+@click.option("--model", default="claude-opus-4-7", help="Reviewer model")
 @click.pass_context
 def review(ctx, results_file, model):
     """Run Opus correctness review on existing results."""
diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py
index 4b90261..a329317 100644
--- a/prompt_testing/reviewer.py
+++ b/prompt_testing/reviewer.py
@@ -66,7 +66,7 @@
 class CorrectnessReviewer:
     """Reviews explanations for factual correctness using a powerful model."""
 
-    def __init__(self, model: str = "claude-opus-4-6"):
+    def __init__(self, model: str = "claude-opus-4-7"):
         self.model = model
         self.client = AsyncAnthropic()
 
@@ -95,10 +95,10 @@ async def review(
             explanation=explanation,
         )
 
+        # Opus 4.7+ rejects `temperature`; rely on the model's own default.
         msg = await self.client.messages.create(
             model=self.model,
             max_tokens=2048,
-            temperature=0.0,
             system=REVIEW_SYSTEM_PROMPT,
             messages=[{"role": "user", "content": user_prompt}],
         )