From 248320adbed092065079cefdbe2914a78be2e797 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Tue, 12 May 2026 13:13:14 -0500 Subject: [PATCH] WIP: record actual OpenRouter cost in ReviewResult MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenRouter returns the real (post-discount, post-routing) cost in the usage block when asked. Plumb that through so reviews report ground-truth spend instead of a token-count × hardcoded-price estimate. - client.py: send extra_body.usage.include=true on OpenRouter calls; read resp.usage.cost (with model_extra fallback) and add it to the returned usage dict as cost_usd + cost_source. - models.py: ReviewResult gains total_cost_usd and cost_source fields, plus an add_usage() helper that consolidates the per-call token accumulation that was previously duplicated at every call site and also accumulates the reported cost. - method_local / method_progressive / method_zero_shot: 9 call sites switched from inline token-accumulation pairs to result.add_usage(). - evaluate.compute_cost: prefer the reported value when cost_source is set and the accumulated total_cost_usd is non-zero; the estimator stays as the fallback otherwise (other providers, or no cost reported). - cli.py: viz JSON now records "cost_method" ("openrouter_actual" or "estimated", derived from result.cost_source) so downstream tooling can tell real numbers apart from estimates. Non-OpenRouter providers (OpenAI, Anthropic, Gemini, Mistral) are unaffected — they keep using the token-price-table estimate. 
--- src/reviewer/cli.py | 2 ++ src/reviewer/client.py | 18 +++++++++++++++++- src/reviewer/evaluate.py | 6 +++++- src/reviewer/method_local.py | 6 ++---- src/reviewer/method_progressive.py | 15 +++++---------- src/reviewer/method_zero_shot.py | 6 ++---- src/reviewer/models.py | 13 +++++++++++++ 7 files changed, 46 insertions(+), 20 deletions(-) diff --git a/src/reviewer/cli.py b/src/reviewer/cli.py index c5828d7..d36b7d1 100644 --- a/src/reviewer/cli.py +++ b/src/reviewer/cli.py @@ -205,12 +205,14 @@ def _build_paper_json( from .evaluate import compute_cost cost_usd = compute_cost(result) + cost_method = "openrouter_actual" if result.cost_source == "openrouter" else "estimated" method_data = { "label": label, "model": result.model, "overall_feedback": result.overall_feedback, "comments": comments, "cost_usd": round(cost_usd, 4), + "cost_method": cost_method, "prompt_tokens": result.total_prompt_tokens, "completion_tokens": result.total_completion_tokens, } diff --git a/src/reviewer/client.py b/src/reviewer/client.py index fdf7a29..543b917 100644 --- a/src/reviewer/client.py +++ b/src/reviewer/client.py @@ -187,7 +187,8 @@ def chat( api_model = api_model[len(prefix_to_strip):] current_max_tokens = max_tokens - total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "model": model} + total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "model": model, + "cost_usd": 0.0, "cost_source": None} for empty_attempt in range(EMPTY_RESPONSE_MAX_RETRIES): for attempt in range(retries): @@ -210,8 +211,20 @@ def chat( kwargs["temperature"] = temperature if reasoning_effort is not None and reasoning_effort != "none": _apply_reasoning(kwargs, resolved_provider, reasoning_effort, current_max_tokens) + # Ask OpenRouter to include actual cost in the usage block. 
+ if resolved_provider == "openrouter": + eb = kwargs.setdefault("extra_body", {}) + eb.setdefault("usage", {"include": True}) resp = client.chat.completions.create(**kwargs) print(f" [DEBUG] finish_reason={resp.choices[0].finish_reason}, completion_tokens={resp.usage.completion_tokens if resp.usage else 'N/A'}") + # Pull actual cost from OpenRouter's usage block when present. + resp_cost = None + if resp.usage is not None: + resp_cost = getattr(resp.usage, "cost", None) + if resp_cost is None: + # OpenAI SDK exposes unknown fields via model_extra + extra = getattr(resp.usage, "model_extra", None) or {} + resp_cost = extra.get("cost") usage = { "prompt_tokens": resp.usage.prompt_tokens if resp.usage else 0, "completion_tokens": resp.usage.completion_tokens if resp.usage else 0, @@ -222,6 +235,9 @@ def chat( # Accumulate tokens across retries total_usage["prompt_tokens"] += usage["prompt_tokens"] total_usage["completion_tokens"] += usage["completion_tokens"] + if resp_cost is not None: + total_usage["cost_usd"] += float(resp_cost) + total_usage["cost_source"] = "openrouter" if content.strip(): return content, total_usage diff --git a/src/reviewer/evaluate.py b/src/reviewer/evaluate.py index f4ecad1..8c1f1e0 100644 --- a/src/reviewer/evaluate.py +++ b/src/reviewer/evaluate.py @@ -47,7 +47,11 @@ def compute_cost(result: ReviewResult) -> float: - """Estimate USD cost of a review.""" + """USD cost of a review. 
Prefers the actual value reported by the + provider (currently OpenRouter) when available; otherwise falls back + to a token-count × hardcoded-price-table estimate.""" + if getattr(result, "cost_source", None) and getattr(result, "total_cost_usd", 0): + return float(result.total_cost_usd) pricing = None for key in COST_PER_1M: if key in result.model: diff --git a/src/reviewer/method_local.py b/src/reviewer/method_local.py index d5f7113..dcb6804 100644 --- a/src/reviewer/method_local.py +++ b/src/reviewer/method_local.py @@ -94,8 +94,7 @@ def review_local( reasoning_effort=reasoning_effort, ) result.raw_responses.append(response) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) if not response.strip(): print(f" WARNING: Empty response for chunk {chunk_idx+1}/{len(chunks)} " @@ -126,7 +125,6 @@ def review_local( reasoning_effort=reasoning_effort, ) result.overall_feedback = feedback_response.strip() - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) return result diff --git a/src/reviewer/method_progressive.py b/src/reviewer/method_progressive.py index e0012b0..e6aac14 100644 --- a/src/reviewer/method_progressive.py +++ b/src/reviewer/method_progressive.py @@ -104,8 +104,7 @@ def update_running_summary( max_tokens=3000, reasoning_effort=reasoning_effort, ) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) updated = response.strip() if count_tokens(updated) > max_summary_tokens: @@ -128,8 +127,7 @@ def is_substantial_passage( max_tokens=8, reasoning_effort=reasoning_effort, ) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) return response.strip().lower().startswith("yes") @@ -154,8 +152,7 @@ def 
consolidate_comments( max_tokens=output_cap, reasoning_effort=reasoning_effort, ) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) arr_match = re.search(r"\[.*\]", response, re.DOTALL) if arr_match: @@ -259,8 +256,7 @@ def review_progressive( reasoning_effort=reasoning_effort, ) result.raw_responses.append(response) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) # Parse comments new_comments = [] @@ -308,8 +304,7 @@ def review_progressive( reasoning_effort=reasoning_effort, ) result.overall_feedback = feedback_response.strip() - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) # Step 3: Consolidation pass print(f" Consolidating {len(all_comments)} comments...") diff --git a/src/reviewer/method_zero_shot.py b/src/reviewer/method_zero_shot.py index 3c232bc..bfa749c 100644 --- a/src/reviewer/method_zero_shot.py +++ b/src/reviewer/method_zero_shot.py @@ -32,8 +32,7 @@ def review_zero_shot( reasoning_effort=reasoning_effort, ) result.raw_responses.append(response) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) overall, comments = parse_review_response(response) result.overall_feedback = overall result.comments = comments @@ -58,8 +57,7 @@ def review_zero_shot( reasoning_effort=reasoning_effort, ) result.raw_responses.append(response) - result.total_prompt_tokens += usage["prompt_tokens"] - result.total_completion_tokens += usage["completion_tokens"] + result.add_usage(usage) overall, comments = parse_review_response(response) if overall: overall_parts.append(overall) diff --git a/src/reviewer/models.py b/src/reviewer/models.py index 671bbc0..7991344 100644 --- a/src/reviewer/models.py +++ 
b/src/reviewer/models.py @@ -33,6 +33,8 @@ class ReviewResult: overall_feedback: str = "" total_prompt_tokens: int = 0 total_completion_tokens: int = 0 + total_cost_usd: float = 0.0 # actual cost summed from API responses (OpenRouter only) + cost_source: str | None = None # "openrouter" if total_cost_usd is real; else None model: str = "" reasoning_effort: str | None = None raw_responses: list[str] = field(default_factory=list) @@ -41,6 +43,17 @@ class ReviewResult: def num_comments(self) -> int: return len(self.comments) + def add_usage(self, usage: dict) -> None: + """Accumulate one chat() usage dict into this result.""" + self.total_prompt_tokens += usage.get("prompt_tokens", 0) + self.total_completion_tokens += usage.get("completion_tokens", 0) + cost = usage.get("cost_usd") + if cost: + self.total_cost_usd += float(cost) + src = usage.get("cost_source") + if src and self.cost_source is None: + self.cost_source = src + def to_dict(self) -> dict: return { "method": self.method,