From 873eddaaf67de360d4d9a2427236d9988d6ade8a Mon Sep 17 00:00:00 2001 From: mattgodbolt-molty Date: Wed, 6 May 2026 13:34:26 -0500 Subject: [PATCH] Bump prompt-testing reviewer to Opus 4.7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update default reviewer model in `reviewer.py`, `cli.py run --review`, and the standalone `cli.py review` command from `claude-opus-4-6` to `claude-opus-4-7`. Same `$5/$25` price tier with stronger reasoning. - Drop the hard-coded `temperature=0.0` from the reviewer's API call: Opus 4.7 rejects the parameter (`temperature is deprecated for this model`). - Replace the hard-coded `$15/$75` cost calc in `_run_reviews` with a lookup via `app.model_costs.get_model_cost(model)`. The previous calc would have over-reported review cost by 3x against the new Opus pricing and would silently drift again on any future model bump. Smoke-tested locally against three live cases (square_cpp_o1, basic_inline_001, factorial_beginner_assembly) — Opus 4.7 ran cleanly and flagged a real factual error in the Sonnet 4.6 explanation for square_cpp_o1 (incorrect three-operand `imul` form). Co-Authored-By: Claude Opus 4.7 (1M context) --- prompt_testing/cli.py | 12 ++++++++---- prompt_testing/reviewer.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/prompt_testing/cli.py b/prompt_testing/cli.py index 82a39ed..e5d3af0 100644 --- a/prompt_testing/cli.py +++ b/prompt_testing/cli.py @@ -20,6 +20,7 @@ import click from dotenv import load_dotenv +from app.model_costs import get_model_cost from prompt_testing.ce_api import CompilerExplorerClient from prompt_testing.enricher import TestCaseEnricher from prompt_testing.file_utils import load_all_test_cases @@ -43,7 +44,7 @@ def cli(ctx, project_root): @click.option("--output", help="Output filename") @click.option("--max-concurrent", type=int, default=5) @click.option("--review", is_flag=True, help="Also run Opus correctness review on results") -@click.option("--review-model", default="claude-opus-4-6", help="Model for correctness review") +@click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review") @click.pass_context def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model): """Run test cases and save results for review.""" @@ -208,6 +209,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: review_cost = 0.0 errors_found = 0 + cost_per_input_token, cost_per_output_token = get_model_cost(model) for i, result in enumerate(successful, 1): case = cases_by_id.get(result["case_id"]) @@ -221,8 +223,10 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict: n_issues = len(review.get("issues", [])) if not review.get("correct"): errors_found += 1 - # Opus pricing: $15/M in, $75/M out - cost = review.get("reviewer_input_tokens", 0) * 15 / 1e6 + review.get("reviewer_output_tokens", 0) * 75 / 1e6 + cost = ( + review.get("reviewer_input_tokens", 0) * cost_per_input_token + + review.get("reviewer_output_tokens", 0) * cost_per_output_token + ) review_cost += cost click.echo(f" [{i}/{len(successful)}] {status} {result['case_id']} ({n_issues} issues, ${cost:.4f})") @@ -256,7 +260,7 @@ def _print_review_summary(results: dict) -> None: @cli.command() @click.argument("results_file") -@click.option("--model", default="claude-opus-4-6", help="Reviewer model") +@click.option("--model", default="claude-opus-4-7", help="Reviewer model") @click.pass_context def review(ctx, results_file, model): """Run Opus correctness review on existing results.""" diff --git a/prompt_testing/reviewer.py b/prompt_testing/reviewer.py index 4b90261..a329317 100644 --- a/prompt_testing/reviewer.py +++ b/prompt_testing/reviewer.py @@ -66,7 +66,7 @@ class CorrectnessReviewer: """Reviews explanations for factual correctness using a powerful model.""" - def __init__(self, model: str = "claude-opus-4-6"): + def __init__(self, model: str = "claude-opus-4-7"): self.model = model self.client = AsyncAnthropic() @@ -95,10 +95,10 @@ async def review( explanation=explanation, ) + # Opus 4.7+ rejects `temperature`; rely on the model's own default. msg = await self.client.messages.create( model=self.model, max_tokens=2048, - temperature=0.0, system=REVIEW_SYSTEM_PROMPT, messages=[{"role": "user", "content": user_prompt}], )