Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/reviewer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,14 @@ def _build_paper_json(
from .evaluate import compute_cost
cost_usd = compute_cost(result)

cost_method = "openrouter_actual" if result.cost_source == "openrouter" else "estimated"
method_data = {
"label": label,
"model": result.model,
"overall_feedback": result.overall_feedback,
"comments": comments,
"cost_usd": round(cost_usd, 4),
"cost_method": cost_method,
"prompt_tokens": result.total_prompt_tokens,
"completion_tokens": result.total_completion_tokens,
}
Expand Down
18 changes: 17 additions & 1 deletion src/reviewer/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ def chat(
api_model = api_model[len(prefix_to_strip):]

current_max_tokens = max_tokens
total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "model": model}
total_usage = {"prompt_tokens": 0, "completion_tokens": 0, "model": model,
"cost_usd": 0.0, "cost_source": None}

for empty_attempt in range(EMPTY_RESPONSE_MAX_RETRIES):
for attempt in range(retries):
Expand All @@ -210,8 +211,20 @@ def chat(
kwargs["temperature"] = temperature
if reasoning_effort is not None and reasoning_effort != "none":
_apply_reasoning(kwargs, resolved_provider, reasoning_effort, current_max_tokens)
# Ask OpenRouter to include actual cost in the usage block.
if resolved_provider == "openrouter":
eb = kwargs.setdefault("extra_body", {})
eb.setdefault("usage", {"include": True})
resp = client.chat.completions.create(**kwargs)
print(f" [DEBUG] finish_reason={resp.choices[0].finish_reason}, completion_tokens={resp.usage.completion_tokens if resp.usage else 'N/A'}")
# Pull actual cost from OpenRouter's usage block when present.
resp_cost = None
if resp.usage is not None:
resp_cost = getattr(resp.usage, "cost", None)
if resp_cost is None:
# OpenAI SDK exposes unknown fields via model_extra
extra = getattr(resp.usage, "model_extra", None) or {}
resp_cost = extra.get("cost")
usage = {
"prompt_tokens": resp.usage.prompt_tokens if resp.usage else 0,
"completion_tokens": resp.usage.completion_tokens if resp.usage else 0,
Expand All @@ -222,6 +235,9 @@ def chat(
# Accumulate tokens across retries
total_usage["prompt_tokens"] += usage["prompt_tokens"]
total_usage["completion_tokens"] += usage["completion_tokens"]
if resp_cost is not None:
total_usage["cost_usd"] += float(resp_cost)
total_usage["cost_source"] = "openrouter"

if content.strip():
return content, total_usage
Expand Down
6 changes: 5 additions & 1 deletion src/reviewer/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@


def compute_cost(result: ReviewResult) -> float:
"""Estimate USD cost of a review."""
"""USD cost of a review. Prefers the actual value reported by the
provider (currently OpenRouter) when available; otherwise falls back
to a token-count × hardcoded-price-table estimate."""
if getattr(result, "cost_source", None) and getattr(result, "total_cost_usd", 0):
return float(result.total_cost_usd)
pricing = None
for key in COST_PER_1M:
if key in result.model:
Expand Down
6 changes: 2 additions & 4 deletions src/reviewer/method_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ def review_local(
reasoning_effort=reasoning_effort,
)
result.raw_responses.append(response)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)

if not response.strip():
print(f" WARNING: Empty response for chunk {chunk_idx+1}/{len(chunks)} "
Expand Down Expand Up @@ -126,7 +125,6 @@ def review_local(
reasoning_effort=reasoning_effort,
)
result.overall_feedback = feedback_response.strip()
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)

return result
15 changes: 5 additions & 10 deletions src/reviewer/method_progressive.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,7 @@ def update_running_summary(
max_tokens=3000,
reasoning_effort=reasoning_effort,
)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)

updated = response.strip()
if count_tokens(updated) > max_summary_tokens:
Expand All @@ -128,8 +127,7 @@ def is_substantial_passage(
max_tokens=8,
reasoning_effort=reasoning_effort,
)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)
return response.strip().lower().startswith("yes")


Expand All @@ -154,8 +152,7 @@ def consolidate_comments(
max_tokens=output_cap,
reasoning_effort=reasoning_effort,
)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)

arr_match = re.search(r"\[.*\]", response, re.DOTALL)
if arr_match:
Expand Down Expand Up @@ -259,8 +256,7 @@ def review_progressive(
reasoning_effort=reasoning_effort,
)
result.raw_responses.append(response)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)

# Parse comments
new_comments = []
Expand Down Expand Up @@ -308,8 +304,7 @@ def review_progressive(
reasoning_effort=reasoning_effort,
)
result.overall_feedback = feedback_response.strip()
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)

# Step 3: Consolidation pass
print(f" Consolidating {len(all_comments)} comments...")
Expand Down
6 changes: 2 additions & 4 deletions src/reviewer/method_zero_shot.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ def review_zero_shot(
reasoning_effort=reasoning_effort,
)
result.raw_responses.append(response)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)
overall, comments = parse_review_response(response)
result.overall_feedback = overall
result.comments = comments
Expand All @@ -58,8 +57,7 @@ def review_zero_shot(
reasoning_effort=reasoning_effort,
)
result.raw_responses.append(response)
result.total_prompt_tokens += usage["prompt_tokens"]
result.total_completion_tokens += usage["completion_tokens"]
result.add_usage(usage)
overall, comments = parse_review_response(response)
if overall:
overall_parts.append(overall)
Expand Down
13 changes: 13 additions & 0 deletions src/reviewer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ class ReviewResult:
overall_feedback: str = ""
total_prompt_tokens: int = 0
total_completion_tokens: int = 0
total_cost_usd: float = 0.0 # actual cost summed from API responses (OpenRouter only)
cost_source: str | None = None # "openrouter" if total_cost_usd is real; else None
model: str = ""
reasoning_effort: str | None = None
raw_responses: list[str] = field(default_factory=list)
Expand All @@ -41,6 +43,17 @@ class ReviewResult:
def num_comments(self) -> int:
return len(self.comments)

def add_usage(self, usage: dict) -> None:
    """Accumulate one chat() usage dict into this result.

    Adds the ``prompt_tokens`` and ``completion_tokens`` counts to the
    running totals, adds ``cost_usd`` to ``total_cost_usd`` when it is
    non-zero, and records ``cost_source`` the first time a usage dict
    carries one.

    Keys that are missing *or* explicitly ``None`` are treated as
    zero / absent, so a partially populated usage dict never raises.
    """
    # `or 0` (rather than a .get default) also covers keys present with None;
    # a .get default only applies when the key is missing entirely.
    self.total_prompt_tokens += usage.get("prompt_tokens") or 0
    self.total_completion_tokens += usage.get("completion_tokens") or 0

    # A falsy cost (None, 0, 0.0) contributes nothing to the running total.
    cost = usage.get("cost_usd")
    if cost:
        self.total_cost_usd += float(cost)

    # The first reported source wins; later usage dicts never overwrite it.
    source = usage.get("cost_source")
    if source and self.cost_source is None:
        self.cost_source = source

def to_dict(self) -> dict:
return {
"method": self.method,
Expand Down