diff --git a/.env.example b/.env.example index 56aac36..4e36a22 100644 --- a/.env.example +++ b/.env.example @@ -15,3 +15,9 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here # Optional: custom OpenAI base URL (e.g. EU endpoint, Azure) # OPENAI_BASE_URL=https://eu.api.openai.com/v1 + +# Reviewer 3 (closed-source HTTP API; benchmarks/perturbation only). +# Required when running the perturbation benchmark with system: reviewer3. +# REVIEWER3_API_KEY=sk_... +# REVIEWER3_USER_ID= +# REVIEWER3_BASE_URL=https://reviewer3.com # optional override diff --git a/.gitignore b/.gitignore index 034b5fd..4dd8729 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.pyc *.pyo .venv/ +.venv venv/ # Jupyter @@ -22,12 +23,18 @@ venv/ # Run outputs review_results/ -benchmarks/conference_study/results/ -benchmarks/perturbation/perturbation_results/ +benchmarks/conference_study/results +benchmarks/perturbation/perturbation_results +benchmarks/perturbation/results +benchmarks/perturbation/data # conference_study study artifacts (not code) -benchmarks/conference_study/manifests/ -benchmarks/conference_study/reports/ +benchmarks/conference_study/manifests +benchmarks/conference_study/reports # Symlink targets — trailing-slash patterns wouldn't match the symlinks benchmarks/conference_study/analyses/manifests benchmarks/conference_study/analyses/results +benchmarks/conference_study/papers + +# Moved out of repo (see commit 6373fad); ignore the leftover local dir. +benchmarks/experimental_perturbations/ diff --git a/benchmarks/conference_study/analyses/compute_auc.py b/benchmarks/conference_study/analyses/compute_auc.py index d32f1a9..0d87e23 100644 --- a/benchmarks/conference_study/analyses/compute_auc.py +++ b/benchmarks/conference_study/analyses/compute_auc.py @@ -51,17 +51,14 @@ REPO_ROOT = HERE.parent # benchmarks/conference_study/ RESULTS_ROOT = REPO_ROOT / "results" -COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"} -SEVERITY_TIERS = ("major", "moderate", "minor") - - -def normalize_severity(method: str, raw: str | None) -> str | None: - if not raw: - return None - raw = raw.lower() - if method == "coarse": - return COARSE_SEVERITY_MAP.get(raw) - return raw if raw in SEVERITY_TIERS else None +# Severity normalization lives in benchmarks/perturbation/_severity.py so the +# perturbation adapters and these analyses use one source of truth. +sys.path.insert(0, str(HERE.parents[1] / "perturbation")) +from _severity import ( # noqa: E402 + COARSE_SEVERITY_MAP, + TIERS as SEVERITY_TIERS, + normalize_severity, +) def load_manifest(path: Path) -> dict[str, list[dict]]: diff --git a/benchmarks/conference_study/analyses/report_scaleup.py b/benchmarks/conference_study/analyses/report_scaleup.py index 79dce76..8a43725 100644 --- a/benchmarks/conference_study/analyses/report_scaleup.py +++ b/benchmarks/conference_study/analyses/report_scaleup.py @@ -204,20 +204,14 @@ def comment_metrics_by_method( return out -# Coarse uses {minor, major, critical}; openaireview methods use -# {minor, moderate, major}. Normalize so a single set of tiers compares -# apples-to-apples (highest=major, mid=moderate, low=minor). -_COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"} -SEVERITY_TIERS = ("major", "moderate", "minor") - - -def normalize_severity(method: str, raw: str | None) -> str | None: - if not raw: - return None - raw = raw.lower() - if method == "coarse": - return _COARSE_SEVERITY_MAP.get(raw) - return raw if raw in SEVERITY_TIERS else None +# Severity normalization lives in benchmarks/perturbation/_severity.py so the +# perturbation adapters and these analyses use one source of truth. +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "perturbation")) +from _severity import ( # noqa: E402 + COARSE_SEVERITY_MAP as _COARSE_SEVERITY_MAP, + TIERS as SEVERITY_TIERS, + normalize_severity, +) def severity_counts_by_method( diff --git a/benchmarks/conference_study/competitors/registry.py b/benchmarks/conference_study/competitors/registry.py index be0a25a..7a64bd8 100644 --- a/benchmarks/conference_study/competitors/registry.py +++ b/benchmarks/conference_study/competitors/registry.py @@ -9,9 +9,11 @@ from .base import CompetitorAdapter from .coarse_adapter import CoarseAdapter +from .reviewer3_adapter import Reviewer3Adapter _REGISTRY: dict[str, type[CompetitorAdapter]] = { "coarse": CoarseAdapter, + "reviewer3": Reviewer3Adapter, } diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py new file mode 100644 index 0000000..82052c9 --- /dev/null +++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py @@ -0,0 +1,118 @@ +"""Adapter for Reviewer 3 (closed-source HTTP API). + +Submission flow is the same as the perturbation benchmark — POST a PDF to +`/api/internal/review`, poll the session until the `status` enum is terminal, +then map each comment via `_normalize_comment`. We reuse those helpers from +the perturbation adapter (`benchmarks/perturbation/systems/reviewer3_adapter.py`) +rather than duplicating the HTTP code; the only difference here is that +conference inputs arrive as PDFs already, so the LaTeX-as-md → PDF compile +step (`_ensure_pdf`) is unnecessary. + +Reviewer 3 has no model selector, so `method_key(...)` always returns +`"reviewer3__reviewer3"` regardless of the manifest `model` value. The +conference YAML should pin `models: [reviewer3]` to avoid duplicate +submissions across a phantom model loop. + +Required env: + REVIEWER3_API_KEY sk_... (sent as `x-api-key` header) + REVIEWER3_USER_ID UUID from the vendor's web UI session JSON (not an email) +""" +from __future__ import annotations + +import sys +from pathlib import Path + +from .base import CompetitorAdapter, NormalizedComment, NormalizedReview + +# Reuse the perturbation adapter's HTTP + normalization helpers. +_PERT = Path(__file__).resolve().parents[2] / "perturbation" / "systems" +sys.path.insert(0, str(_PERT)) +import reviewer3_adapter as _r3 # noqa: E402 + + +_METHOD_KEY = f"{_r3.REVIEWER3_SLUG}__{_r3.REVIEWER3_SLUG}" + + +class Reviewer3Adapter(CompetitorAdapter): + name = "reviewer3" + required_env = ("REVIEWER3_API_KEY", "REVIEWER3_USER_ID") + + def method_key(self, model: str) -> str: + # R3 has no model selector — fixed key regardless of `model`. + return _METHOD_KEY + + def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview: + opts = cfg.get("reviewer3_options", {}) or {} + rcfg = _r3.config_from_env() + for k in ("review_mode", "poll_interval_s", "poll_timeout_s", + "request_timeout_s", "base_url"): + if k in opts and opts[k] is not None: + setattr(rcfg, k, opts[k]) + + # Cap PDF size sent to R3. `max_pages` lives at the top level of the + # config (run_competitors.py uses it for parse_document); we honor the + # same value here so the bytes shipped to R3 match the paragraph window + # we already cap on our side. Untrimmed full PDFs were tripping R3's + # HTTP 413 limit and inflating per-paper wall time. + max_pages = cfg.get("max_pages") or opts.get("max_pages") + + # sid_file is injected by run_competitors.py just before invocation: + # `cfg["_sid_file"] = out_file.with_suffix(".sid")`. If present and + # the file already exists, we resume that R3 session instead of + # submitting fresh — avoids duplicate-session credit waste when a + # prior run was killed mid-poll. (See PR notes; ~34% of credits + # observed wasted on duplicates before this fix.) + sid_file = cfg.get("_sid_file") + if isinstance(sid_file, str): + sid_file = Path(sid_file) + session_id = "" + body = None + if sid_file and sid_file.exists(): + session_id = sid_file.read_text().strip() + try: + body = _r3._poll_until_done(rcfg, session_id, + tag=f"reviewer3/{pdf.stem} (resumed)") + except RuntimeError as e: + m = str(e) + if "fetch failed" in m and ("403" in m or "404" in m): + sid_file.unlink(missing_ok=True) + session_id, body = "", None + else: + raise + + if body is None: + session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages) + if sid_file: + sid_file.parent.mkdir(parents=True, exist_ok=True) + sid_file.write_text(session_id) + body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}") + + comments: list[NormalizedComment] = [] + for i, raw in enumerate(body.get("comments") or []): + if not isinstance(raw, dict): + raw = {"comment": str(raw)} + norm = _r3._normalize_comment(raw, i) + comments.append(NormalizedComment( + title=norm.get("title", ""), + quote=norm.get("quote", ""), + explanation=norm.get("explanation", ""), + comment_type=norm.get("comment_type", "technical"), + extra={ + "severity": norm.get("severity"), + "reviewerId": raw.get("reviewerId"), + "rank": raw.get("rank"), + "session_id": session_id, + }, + )) + + # R3 doesn't publish pricing and doesn't return overall_feedback or + # token counts in its response, so we leave those empty/None. + return NormalizedReview( + comments=comments, + overall_feedback="", + cost_usd=None, + cost_method="estimated", + prompt_tokens=None, + completion_tokens=None, + model=_r3.REVIEWER3_SLUG, + ) diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml new file mode 100644 index 0000000..1f6d19f --- /dev/null +++ b/benchmarks/conference_study/configs/reviewer3.yaml @@ -0,0 +1,32 @@ +# Reviewer 3 (closed-source HTTP API) run on the v2 conference cohort. +# Results -> benchmarks/conference_study/results/reviewer3_v2/ +# Log -> benchmarks/conference_study/results/reviewer3_v2/run_log.jsonl +# +# Prerequisites: +# - REVIEWER3_API_KEY and REVIEWER3_USER_ID set in .env +# - Manifest, papers, and results dirs reachable. In this worktree they are +# symlinks into the sibling OpenAIReview worktree (gitignored data lives +# only there). Set them up with: +# ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests +# ln -s ../../../OpenAIReview/benchmarks/conference_study/papers papers +# ln -s ../../../OpenAIReview/benchmarks/conference_study/results results + +name: reviewer3_v2 +competitor: reviewer3 + +manifest: manifests/v2/combined.json + +# R3 has no model selector. Pin to a single dummy entry so run_competitors.py +# loops once per paper rather than once per (paper × manifest model). +models: + - reviewer3 + +timeout_sec: 3600 # outer per-(paper, model) wall cap (R3 is 10-30 min typical) +max_per_model: 5 +max_pages: 20 # parse_document cap; matches coarse.yaml convention + +# Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...). +reviewer3_options: + review_mode: author # author | journal (R3 reviewMode enum) + poll_interval_s: 30 + poll_timeout_s: 3600 # 60 min/paper cap diff --git a/benchmarks/conference_study/run_competitors.py b/benchmarks/conference_study/run_competitors.py index 68214c6..3c5eb34 100644 --- a/benchmarks/conference_study/run_competitors.py +++ b/benchmarks/conference_study/run_competitors.py @@ -125,6 +125,14 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False) title, content, _was_ocr = parse_document(pdf, max_pages=MAX_PAGES) paragraphs = split_into_paragraphs(content) + # Inject sid_file location so the adapter can persist/resume the + # competitor-side session id (e.g. reviewer3). Adapters that don't + # care simply ignore the underscore-prefixed key. The file lives + # next to the merged paper JSON so it survives across runs. + out_file = RESULTS_DIR / f"{paper['slug']}.json" + sid_dir = RESULTS_DIR / ".sids" + cfg = {**cfg, "_sid_file": sid_dir / f"{paper['slug']}.{method_key}.sid"} + review = adapter.review(pdf, model, cfg) method_data = build_method_data( @@ -134,7 +142,6 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False) paragraphs=paragraphs, ) - out_file = RESULTS_DIR / f"{paper['slug']}.json" merge_into_paper_json( out_file=out_file, slug=paper["slug"], diff --git a/benchmarks/perturbation/.gitignore b/benchmarks/perturbation/.gitignore index bddf55d..b435026 100644 --- a/benchmarks/perturbation/.gitignore +++ b/benchmarks/perturbation/.gitignore @@ -8,8 +8,19 @@ papers/ # Run logs reports/*.log -# Temporary / ephemeral configs +# Temporary / ephemeral configs. +# Underscore-prefix are by-convention scratch. The other rules below cover the +# bulk per-domain configs that we generate locally (one per system × domain) +# but don't check in. The `!*_reviewer3.yaml` exception preserves the canonical +# reviewer3 configs that are tracked. configs/_* +configs/cs_*scaleup*.yaml +configs/full_*.yaml +configs/grok_*.yaml +configs/longtail_*.yaml +configs/subset_*.yaml +configs/r3_smoke*.yaml +!configs/full_*_reviewer3.yaml # Python __pycache__/ diff --git a/benchmarks/perturbation/_severity.py b/benchmarks/perturbation/_severity.py new file mode 100644 index 0000000..0e2db3c --- /dev/null +++ b/benchmarks/perturbation/_severity.py @@ -0,0 +1,85 @@ +"""Canonical severity tiers and per-system normalization. + +The perturbation benchmark, the conference study analyses, and the viz layer +all want to compare comment severities across review systems. Each system uses +its own native vocabulary, so before any cross-system comparison the raw value +must be mapped to the canonical 3-tier scale used by openaireview itself: + + major - Undermines a key claim/methodology; affects conclusions. + moderate - Real error or gap that is localized and fixable. + minor - Framing concern, mild overclaim, or resolvable ambiguity. + +Per-system maps: + + * openaireview: identity. Output is already in {major, moderate, minor}. + * coarse: {critical, major, minor} -> {major, moderate, minor} + (shift down one tier; same mapping that the conference-study + scripts in benchmarks/conference_study/analyses/ use). + * reviewer3: integer 1..4 per their OpenAPI spec, where + 1=Critical, 2=Major, 3=Minor, 4=Editorial. + Compressed to the 3-tier scale by collapsing R3 Minor and + Editorial into `minor`, since in practice R3 tags substantive + -but-lower-importance findings as Editorial rather than style + notes. Confirm with the vendor if the label is later clarified. + +The conference_study analyses currently inline `COARSE_SEVERITY_MAP` (see +`benchmarks/conference_study/analyses/compute_auc.py` and `report_scaleup.py`). +Once those analyses are co-resident with this module they should import +`COARSE_SEVERITY_MAP` and `normalize_severity` from here instead. +""" + +from __future__ import annotations + + +TIERS: tuple[str, ...] = ("major", "moderate", "minor") + + +# openaireview methods emit canonical tier strings directly. +OPENAIREVIEW_SEVERITY_MAP: dict[str, str] = {t: t for t in TIERS} + +# coarse uses {minor, major, critical}. Shift down one level. +COARSE_SEVERITY_MAP: dict[str, str] = { + "critical": "major", + "major": "moderate", + "minor": "minor", +} + +# Reviewer 3 spec: 1=Critical, 2=Major, 3=Minor, 4=Editorial. +# Compress to 3 tiers; Editorial collapses with Minor (see module docstring). +REVIEWER3_SEVERITY_MAP: dict[int, str] = { + 1: "major", + 2: "moderate", + 3: "minor", + 4: "minor", +} + + +def normalize_severity(system: str, raw: object) -> str | None: + """Map a system-native severity value to the canonical 3-tier scale. + + Returns None for unrecognized values so callers can decide whether to drop + the comment, default it, or warn. + + `system` is the registry key matching `benchmarks/perturbation/systems/`: + 'openaireview', 'coarse', or 'reviewer3'. + """ + if raw is None: + return None + sysn = system.lower() + if sysn == "reviewer3": + if isinstance(raw, int): + return REVIEWER3_SEVERITY_MAP.get(raw) + # tolerate the str-form for hand-written test fixtures + try: + return REVIEWER3_SEVERITY_MAP.get(int(raw)) + except (TypeError, ValueError): + return None + if not isinstance(raw, str): + return None + s = raw.lower() + if sysn == "coarse": + return COARSE_SEVERITY_MAP.get(s) + if sysn == "openaireview": + return OPENAIREVIEW_SEVERITY_MAP.get(s) + # unknown system -> pass through if already canonical + return s if s in TIERS else None diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml new file mode 100644 index 0000000..f60a078 --- /dev/null +++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_CC/all +max_tokens: 13000 # match full_cs_CC_coarse.yaml +min_perturbations: 5 # match full_cs_CC_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3 diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml new file mode 100644 index 0000000..3193fbd --- /dev/null +++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_LG/all +max_tokens: 13000 # match full_cs_LG_coarse.yaml +min_perturbations: 5 # match full_cs_LG_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3 diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml new file mode 100644 index 0000000..501ec02 --- /dev/null +++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/econ_EM/all +max_tokens: 13000 # match full_econ_EM_coarse.yaml +min_perturbations: 5 # match full_econ_EM_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3 diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml new file mode 100644 index 0000000..8e6cd7a --- /dev/null +++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/hep-ex/all +max_tokens: 13000 # match full_hep_ex_coarse.yaml +min_perturbations: 5 # match full_hep_ex_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3 diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml new file mode 100644 index 0000000..9b8e311 --- /dev/null +++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/math_all/all +max_tokens: 13000 # match full_math_all_coarse.yaml +min_perturbations: 5 # match full_math_all_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_math_all_reviewer3 diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml new file mode 100644 index 0000000..4800288 --- /dev/null +++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/physics_atm-clus/all +max_tokens: 13000 # match full_physics_atm_clus_coarse.yaml +min_perturbations: 5 # match full_physics_atm_clus_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3 diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml new file mode 100644 index 0000000..ca8669d --- /dev/null +++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/q-bio_GN/all +max_tokens: 13000 # match full_q_bio_GN_coarse.yaml +min_perturbations: 5 # match full_q_bio_GN_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3 diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml new file mode 100644 index 0000000..a06206e --- /dev/null +++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml @@ -0,0 +1,14 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/stat_AP/all +max_tokens: 13000 # match full_stat_AP_coarse.yaml +min_perturbations: 5 # match full_stat_AP_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) + +results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3 diff --git a/benchmarks/perturbation/rescue_sessions.py b/benchmarks/perturbation/rescue_sessions.py new file mode 100644 index 0000000..1070b29 --- /dev/null +++ b/benchmarks/perturbation/rescue_sessions.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +"""Pull down completed Reviewer 3 sessions whose local poll loop gave up. + +Reviewer 3 is async: a session keeps processing on their server even after +our adapter has timed out / been killed. The adapter persists the sessionId +to a `.sid` file next to each cell's output JSON; this script walks those +`.sid` files, fetches each session, and writes results to disk for the ones +that completed in the meantime. + +Walks both result trees: + * Perturbation: /full__reviewer3/.../review/*.sid + * Conference: /results/reviewer3_v2/.sids/*.sid + +Skips cells whose `out_json` already has comments. Safe to re-run. + +Usage: + python rescue_sessions.py # rescue both, write results + python rescue_sessions.py --dry-run # just report what's recoverable + python rescue_sessions.py --kind perturbation + python rescue_sessions.py --kind conference + +Required env: REVIEWER3_API_KEY (used for the `review:read` GET). + +Exit code: 0 always (no errors are fatal — script is meant to be safe to +re-run; failures are just logged). +""" +from __future__ import annotations + +import argparse +import json +import os +import sys +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from pathlib import Path + +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass + +import requests # noqa: E402 + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) +from systems.reviewer3_adapter import build_pipeline_json # noqa: E402 + +# Repo root + conference_study root +REPO = HERE.parent.parent +CONFERENCE = REPO / "benchmarks" / "conference_study" +sys.path.insert(0, str(REPO / "src")) +sys.path.insert(0, str(CONFERENCE)) + + +REVIEWER3_BASE_URL = os.environ.get("REVIEWER3_BASE_URL", "https://reviewer3.com").rstrip("/") + + +@dataclass +class Orphan: + """A `.sid` file pointing at a session we never wrote out_json for.""" + kind: str # "pert" or "conf" + sid_file: Path + out_json: Path # where we'd write the result if status == completed + extra: dict # kind-specific (e.g. conference slug) + + +def _has_content(out_json: Path, method_key: str | None = None) -> bool: + """`out_json` exists AND already has a comment-bearing method entry.""" + if not out_json.exists(): + return False + try: + d = json.loads(out_json.read_text()) + except Exception: + return False + methods = d.get("methods") or {} + if method_key is not None: + m = methods.get(method_key) + return bool(m and m.get("comments")) + return bool(methods) and any(m.get("comments") for m in methods.values()) + + +def find_orphans(kind: str = "both") -> list[Orphan]: + """Walk .sid files; return ones whose result JSON is missing or empty.""" + out: list[Orphan] = [] + + if kind in ("both", "perturbation"): + # Result paths are resolved via Config.results_dir, which the runner + # resolves to absolute via REPO/. The most robust thing + # is to walk every `.sid` under any `full_*_reviewer3` results dir + # that the runner might write to. We check both the local results + # tree (under this worktree) and any sibling worktrees the user may + # have symlinked in. + pert_roots = [REPO / "benchmarks" / "perturbation" / "results"] + # Follow the symlink target too in case results/ is a symlink (we use + # this pattern when running from a different worktree than the data). + resolved = (REPO / "benchmarks" / "perturbation" / "results").resolve() + if resolved not in pert_roots: + pert_roots.append(resolved) + seen = set() + for root in pert_roots: + if not root.exists(): + continue + for sid_file in root.rglob("*.sid"): + if sid_file in seen: + continue + seen.add(sid_file) + out_json = sid_file.with_suffix(".json") + if _has_content(out_json): + continue + out.append(Orphan(kind="pert", sid_file=sid_file, + out_json=out_json, extra={})) + + if kind in ("both", "conference"): + conf_root = CONFERENCE / "results" / "reviewer3_v2" + sids_dir = conf_root / ".sids" + if sids_dir.exists(): + for sid_file in sids_dir.glob("*.sid"): + # filename: ..sid + stem_parts = sid_file.stem.split(".") + slug = stem_parts[0] + method_key = ".".join(stem_parts[1:]) or "reviewer3__reviewer3" + out_json = conf_root / f"{slug}.json" + if _has_content(out_json, method_key=method_key): + continue + out.append(Orphan(kind="conf", sid_file=sid_file, + out_json=out_json, + extra={"slug": slug, "method_key": method_key})) + + return out + + +def fetch_session(sid: str, *, headers: dict, timeout: float = 30.0) -> dict | None: + """Return the full session body if status==completed, else None.""" + url = f"{REVIEWER3_BASE_URL}/api/internal/review/{sid}" + try: + r = requests.get(url, headers=headers, timeout=timeout) + except Exception as e: + print(f" fetch error for {sid}: {type(e).__name__}: {e}", + file=sys.stderr, flush=True) + return None + if r.status_code != 200: + print(f" {sid}: HTTP {r.status_code}", file=sys.stderr, flush=True) + return None + body = r.json() + if body.get("status") != "completed": + return None + return body + + +def write_perturbation(orphan: Orphan, body: dict) -> None: + """Write pipeline JSON + raw.json next to the .sid.""" + raw_path = orphan.sid_file.with_suffix(".raw.json") + raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False)) + # Synthesize a paper path from the sid_file stem (e.g. paper_001_corrupted.md) + pj = build_pipeline_json(Path(orphan.sid_file.stem + ".md"), + body, elapsed_s=0.0) + orphan.out_json.write_text(json.dumps(pj, indent=2, ensure_ascii=False)) + + +def write_conference(orphan: Orphan, body: dict, *, _cache: dict = {}) -> None: + """Build the merged paper JSON for the conference cohort.""" + # Lazy imports to keep --kind perturbation fast (avoids parse_document deps). + if "loaded" not in _cache: + from competitors import get_adapter # noqa: F401 + from competitors.helpers import build_method_data, merge_into_paper_json + from competitors.base import NormalizedComment, NormalizedReview + from reviewer.parsers import parse_document + from reviewer.utils import split_into_paragraphs + import systems.reviewer3_adapter as r3a + manifest = json.loads((CONFERENCE / "manifests/v2/combined.json").read_text()) + _cache.update( + build_method_data=build_method_data, + merge_into_paper_json=merge_into_paper_json, + NormalizedComment=NormalizedComment, + NormalizedReview=NormalizedReview, + parse_document=parse_document, + split_into_paragraphs=split_into_paragraphs, + r3a=r3a, + slug2paper={p["slug"]: p for p in manifest["papers"]}, + loaded=True, + ) + slug = orphan.extra["slug"] + method_key = orphan.extra["method_key"] + paper = _cache["slug2paper"].get(slug) + if not paper: + print(f" {slug}: no manifest entry, skipping", file=sys.stderr) + return + pdf = CONFERENCE / paper["pdf_path"] + title, content, _ = _cache["parse_document"](pdf, max_pages=20) + paragraphs = _cache["split_into_paragraphs"](content) + comments = [] + for i, c in enumerate(body.get("comments") or []): + if not isinstance(c, dict): + c = {"comment": str(c)} + n = _cache["r3a"]._normalize_comment(c, i) + comments.append(_cache["NormalizedComment"]( + title=n.get("title", ""), quote=n.get("quote", ""), + explanation=n.get("explanation", ""), + comment_type=n.get("comment_type", "technical"), + extra={"severity": n.get("severity"), + "reviewerId": c.get("reviewerId"), + "rank": c.get("rank"), + "session_id": (body.get("session") or {}).get("id")}, + )) + review = _cache["NormalizedReview"]( + comments=comments, overall_feedback="", cost_usd=None, + cost_method="estimated", model="reviewer3", + ) + md = _cache["build_method_data"]( + review=review, method_key=method_key, + method_label="Reviewer3 (reviewer3)", paragraphs=paragraphs, + ) + _cache["merge_into_paper_json"]( + out_file=orphan.out_json, slug=slug, + title=paper.get("title") or title, paragraphs=paragraphs, + method_key=method_key, method_data=md, + ) + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument("--kind", choices=["both", "perturbation", "conference"], + default="both") + ap.add_argument("--dry-run", action="store_true", + help="Just report what's recoverable; don't write anything.") + ap.add_argument("--parallel", type=int, default=10, + help="Concurrent API fetches (default 10).") + args = ap.parse_args() + + api_key = os.environ.get("REVIEWER3_API_KEY") + if not api_key: + print("REVIEWER3_API_KEY not set (check .env)", file=sys.stderr) + return 1 + headers = {"x-api-key": api_key} + + orphans = find_orphans(args.kind) + print(f"found {len(orphans)} orphan .sid file(s) " + f"(no result on disk for these cells yet)") + if not orphans: + return 0 + + # Fetch every sid in parallel; bucket by status. + def _fetch(orphan: Orphan): + sid = orphan.sid_file.read_text().strip() + if not sid: + return orphan, sid, None + return orphan, sid, fetch_session(sid, headers=headers) + + completed: list[tuple[Orphan, str, dict]] = [] + incomplete = 0 + with ThreadPoolExecutor(max_workers=args.parallel) as pool: + for orphan, sid, body in pool.map(_fetch, orphans): + if body is None: + incomplete += 1 + continue + completed.append((orphan, sid, body)) + + print(f" {len(completed)} completed on R3 (recoverable)") + print(f" {incomplete} still in-progress / failed / unreachable") + + if args.dry_run or not completed: + for orphan, sid, _ in completed: + print(f" would write: {orphan.out_json} (sid={sid})") + return 0 + + n_pert = n_conf = 0 + for orphan, sid, body in completed: + try: + if orphan.kind == "pert": + write_perturbation(orphan, body) + n_pert += 1 + else: + write_conference(orphan, body) + n_conf += 1 + except Exception as e: + print(f" write failed for {sid} ({orphan.out_json}): " + f"{type(e).__name__}: {e}", file=sys.stderr) + + print(f"\nrescued: perturbation={n_pert} conference={n_conf}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/perturbation/run_benchmark.py b/benchmarks/perturbation/run_benchmark.py index 9117911..b44112e 100644 --- a/benchmarks/perturbation/run_benchmark.py +++ b/benchmarks/perturbation/run_benchmark.py @@ -98,6 +98,11 @@ class Config: review_mode: str = field(default="author", metadata={"choices": ["author", "journal"]}) poll_interval_s: float = 5.0 poll_timeout_s: float = 1200.0 + # Cap pages of the rendered PDF sent to R3. None = no cap. Used because + # the token-truncated staged file often isn't valid LaTeX; we compile the + # FULL pre-truncation source and trim by pages instead (matches the + # max_pages: 20 convention in conference_study/configs/coarse.yaml). + max_pages: int | None = None # Legacy aliases (read on load, normalized into models/methods). review_models: list[str] = field(default_factory=list) review_methods: list[str] = field(default_factory=list) diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py index 66ab23e..503720f 100644 --- a/benchmarks/perturbation/systems/reviewer3.py +++ b/benchmarks/perturbation/systems/reviewer3.py @@ -30,6 +30,15 @@ class Reviewer3System(System): def build_jobs(self, units, cfg, results_dir): domain = results_dir.name + overrides = { + k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s") + if k in cfg and cfg[k] is not None + } + # We compile the FULL (pre-truncation) source for R3 — the token-cut + # staged file frequently isn't valid LaTeX (chops mid-environment). + # `max_pages` then trims the rendered PDF so R3 still sees roughly the + # same content window as coarse (which uses max_pages: 20). + max_pages = cfg.get("max_pages") out: list[tuple[CellKey, ReviewJob]] = [] for u in units: if not u.staged_corrupted.exists(): @@ -42,10 +51,21 @@ def build_jobs(self, units, cfg, results_dir): continue out_json.unlink() tag = f"{domain}/{u.paper_label}/{u.error_type}/{REVIEWER3_SLUG}" + # Persist the R3 sessionId next to the review JSON. If a prior + # run was killed mid-poll, the next run resumes that session + # instead of creating a duplicate (~34% credit waste observed + # without this). + sid_file = review_dir / f"{u.staged_corrupted.stem}.sid" job = ReviewJob( tag=tag, out_json=out_json, review_dir=review_dir, paper_label=f"{u.error_type}/{u.paper_label}", - payload={"paper": u.staged_corrupted}, + payload={ + "paper": u.staged_corrupted, + "source": u.src_corrupted, + "max_pages": max_pages, + "sid_file": sid_file, + "overrides": overrides, + }, ) out.append(((REVIEWER3_SLUG,), job)) return out @@ -54,12 +74,16 @@ def run_jobs(self, cell_key, jobs, parallel): if not jobs: return [] cfg = reviewer3_adapter.config_from_env() - # cfg overrides come from the caller via run_jobs_with_cfg; default cfg is fine here. + for k, v in jobs[0].payload.get("overrides", {}).items(): + setattr(cfg, k, v) adapter_jobs = [ reviewer3_adapter.Reviewer3Job( paper=j.payload["paper"], out_json=j.out_json, paper_label=j.tag, + source=j.payload.get("source"), + max_pages=j.payload.get("max_pages"), + sid_file=j.payload.get("sid_file"), ) for j in jobs ] diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py index c2f7408..552d911 100644 --- a/benchmarks/perturbation/systems/reviewer3_adapter.py +++ b/benchmarks/perturbation/systems/reviewer3_adapter.py @@ -90,10 +90,23 @@ def config_from_env() -> Reviewer3Config: @dataclass class Reviewer3Job: - paper: Path # path to *_corrupted.md - out_json: Path # where to write the pipeline-shaped JSON - paper_label: str # e.g. "/" + paper: Path # path to *_corrupted.md (staged, possibly token-truncated) + out_json: Path # where to write the pipeline-shaped JSON + paper_label: str # e.g. "/" title: str | None = None + # Optional: full pre-truncation source. When provided, _ensure_pdf + # compiles THIS instead of `paper` — the staged file is often invalid + # LaTeX because token truncation can chop mid-environment. + source: Path | None = None + # Optional: trim the rendered PDF to its first N pages so R3 still sees + # roughly the same window other systems do (coarse uses max_pages: 20). + max_pages: int | None = None + # Optional: where to persist the R3 sessionId. When set, _run_one writes + # the sid here right after submit succeeds and reads it back on subsequent + # runs to resume the same R3 session instead of creating a duplicate. + # This prevents the duplicate-session credit waste we observed when we + # killed runs mid-poll (R3 keeps the session live and we'd submit fresh). + sid_file: Path | None = None @dataclass @@ -114,8 +127,280 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]: return {"x-api-key": cfg.api_key} -def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str: - """POST /api/internal/review (multipart). Returns sessionId.""" +def _ensure_pdf(paper: Path, *, source: Path | None = None, + max_pages: int | None = None) -> Path: + """Reviewer 3 only accepts PDF. Return a compiled+possibly-trimmed PDF. + + Resolution order for the source bytes: + 1. If `paper` is already a `.pdf`, return it as-is (page trim still applies). + 2. If `source` was provided, compile that — preferred for LaTeX-as-md + since the staged `paper` is often invalid LaTeX (token truncation + chops mid-environment). + 3. Else compile `paper` directly. If it lacks `\\end{document}`, append + one as a best-effort close. + + When `max_pages` is set, the resulting PDF is trimmed to its first N + pages via pymupdf. This matches what coarse does (max_pages: 20) so R3 + sees roughly the same window other systems see. + """ + if paper.suffix.lower() == ".pdf": + return _maybe_trim_pages(paper, max_pages) + + src_for_compile = source if (source is not None and source.exists()) else paper + cached_suffix = ".trim.pdf" if max_pages else ".pdf" + cached = src_for_compile.with_suffix(cached_suffix) + src_mtime = src_for_compile.stat().st_mtime + if cached.exists() and cached.stat().st_mtime > src_mtime: + return cached + + head = src_for_compile.read_text(errors="replace")[:2000] + if "\\documentclass" not in head: + raise RuntimeError( + f"don't know how to convert {src_for_compile.name} to PDF " + "(no \\documentclass found; expected LaTeX-as-md or PDF)" + ) + text = src_for_compile.read_text(errors="replace") + if "\\end{document}" not in text: + # `source` should always close cleanly; this only fires if we fell back + # to compiling the token-truncated `paper`. + text = text.rstrip() + "\n\n\\end{document}\n" + # Rewrite \documentclass{} -> \documentclass{amsart} when the + # custom class isn't installed on the local TeX Live. Without this many + # journal-class papers (aq-amsart, sn-jnl, atlasdoc, aastex*, cas-sc, ...) + # bail at line 1 with "File `X.cls' not found". + text = _force_known_documentclass(text) + # Same idea for missing \usepackage{X.sty} (e.g. jinstpub on some hep-ex + # papers). Comment them out — pdflatex aborts hard on missing packages. + text = _strip_missing_packages(text) + # Strip orphan \input / \include — the perturbation corpus dumps each paper + # into a single .md but a few preserve `\input{mypreamble.tex}`-style + # directives that pdflatex can't resolve (fatal error, no PDF produced). + text = _strip_orphan_includes(text, src_for_compile.parent) + # Stripped-include papers (and some others) rely on author-defined shortcuts + # like \bbC, \calA, \vvirg, \ootimes from the missing preamble. Inject + # \providecommand fallbacks so the body compiles. + text = _inject_rescue_preamble(text) + + import shutil, subprocess, tempfile + with tempfile.TemporaryDirectory() as td: + tex = Path(td) / "source.tex" + tex.write_text(text) + # Run twice to resolve cross-refs; ignore exit code, accept partial PDF. + # Capture output as bytes — pdflatex's own log can contain non-UTF-8 + # accent bytes and we don't read this output anyway (the .log file is + # the source of truth on failure). + for _ in range(2): + subprocess.run( + ["pdflatex", "-interaction=nonstopmode", "source.tex"], + cwd=td, capture_output=True, timeout=300, + ) + out_pdf = Path(td) / "source.pdf" + if not out_pdf.exists() or out_pdf.stat().st_size < 1000: + log_path = Path(td) / "source.log" + log = log_path.read_text(errors="replace") if log_path.exists() else "" + raise RuntimeError( + f"pdflatex produced no usable PDF for {src_for_compile}: {log[-1500:]}" + ) + if max_pages: + _trim_pages_to(out_pdf, max_pages) + shutil.copy(out_pdf, cached) + return cached + + +_INPUT_RE = __import__("re").compile( + r"\\(?:input|include)\s*\{([^}]+)\}", flags=__import__("re").IGNORECASE, +) + + +def _strip_orphan_includes(text: str, base_dir: Path) -> str: + """Comment out `\\input{path}` / `\\include{path}` whose target file isn't + next to the source. pdflatex aborts hard on a missing \\input, which kills + the compile even when the rest of the document is fine.""" + def _replace(m): + target = m.group(1).strip() + # Common LaTeX convention: optional .tex extension. + for cand in (target, target + ".tex"): + if (base_dir / cand).exists(): + return m.group(0) + return "% [stripped missing include] " + m.group(0) + return _INPUT_RE.sub(_replace, text) + + +# Defensive preamble injected after \documentclass to provide fallbacks for +# common custom-command patterns that authors typically define in private +# preamble files (e.g. mypreamble.tex). \providecommand is a no-op when the +# command is already defined, so this is safe to inject blindly. +_RESCUE_PREAMBLE = r""" +% --- injected by reviewer3_adapter: providecommand fallbacks --- +% Run at \begin{document} so we don't fight with packages that define the +% same shortcuts (e.g. amsfonts -> \Bbbk). \providecommand is itself a no-op +% when the command already exists, but only at the moment it runs; AtBeginDoc +% defers the check past all \usepackage{...} loads. +\AtBeginDocument{% + \providecommand{\bb}[1]{\mathbb{#1}}% + \providecommand{\cal}[1]{\mathcal{#1}}% + \providecommand{\bff}[1]{\mathbf{#1}}% + \makeatletter + \@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}% + \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}% + }% + \@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}% + }% + \makeatother + \providecommand{\eps}{\epsilon}% + \providecommand{\veps}{\varepsilon}% + \providecommand{\vvirg}{,\,}% + \providecommand{\ootimes}{\otimes}% +} +% --- end injected preamble --- +""" + + +def _inject_rescue_preamble(text: str) -> str: + """Insert _RESCUE_PREAMBLE right after the first \\documentclass{...} line. + Idempotent: looks for our marker before injecting.""" + if "injected by reviewer3_adapter" in text: + return text + import re + m = re.search(r"\\documentclass(\[[^\]]*\])?\{[^}]+\}", text) + if not m: + return text + cut = m.end() + return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:] + + +# Bundled-with-TeX-Live classes are kept as-is. Anything else gets rewritten +# to `\documentclass{amsart}` so pdflatex doesn't bail at line 1 with +# "File `.cls' not found". Class-specific commands in the body then +# emit undefined-control-sequence warnings, but pdflatex in nonstopmode still +# produces a usable PDF. +_KNOWN_CLASSES_CACHE: dict[str, bool] = {} + + +def _class_is_installed(cls: str) -> bool: + if cls in _KNOWN_CLASSES_CACHE: + return _KNOWN_CLASSES_CACHE[cls] + import subprocess + try: + r = subprocess.run(["kpsewhich", f"{cls}.cls"], + capture_output=True, timeout=5) + ok = (r.returncode == 0 and r.stdout.strip() != b"") + except Exception: + ok = False + _KNOWN_CLASSES_CACHE[cls] = ok + return ok + + +def _force_known_documentclass(text: str) -> str: + """Rewrite `\\documentclass[opts]{}` to `\\documentclass{amsart}` + when isn't bundled with the local TeX Live. Drops options too — + they're class-specific and frequently invalid against amsart. amsart is + the math-friendly fallback (preserves theorem/lemma environments). + + Matches only the first **uncommented** \\documentclass — many papers carry + example/commented-out variants before the active one. + """ + import re + pat = re.compile(r"^(?P[^%\n]*?)\\documentclass(\[[^\]]*\])?\{([^}]+)\}", + re.MULTILINE) + for m in pat.finditer(text): + # If there's a `%` before the `\documentclass` on this line, it's commented. + if "%" in m.group("lead"): + continue + cls = m.group(3).strip() + if _class_is_installed(cls): + return text + # Replace just the `\documentclass...{...}` span, keeping any leading content + # (it's empty in practice but be safe). + start = m.start() + len(m.group("lead")) + return text[:start] + r"\documentclass{amsart}" + text[m.end():] + return text + + +# Same pattern for missing packages — pdflatex aborts hard on +# "File `X.sty' not found", so comment out \usepackage{X} (and the bracketed +# options line) when X isn't installed. Bundled-with-TeX packages stay. +_KNOWN_PACKAGES_CACHE: dict[str, bool] = {} + + +def _package_is_installed(pkg: str) -> bool: + if pkg in _KNOWN_PACKAGES_CACHE: + return _KNOWN_PACKAGES_CACHE[pkg] + import subprocess + try: + r = subprocess.run(["kpsewhich", f"{pkg}.sty"], + capture_output=True, timeout=5) + ok = (r.returncode == 0 and r.stdout.strip() != b"") + except Exception: + ok = False + _KNOWN_PACKAGES_CACHE[pkg] = ok + return ok + + +def _strip_missing_packages(text: str) -> str: + """Comment out `\\usepackage[opts]{X}` lines whose .sty isn't installed. + Multi-package forms like `\\usepackage{a,b,c}` are split: any missing + member causes the whole line to be commented out (rare in practice). + """ + import re + def _replace(m): + pkgs = [p.strip() for p in m.group(2).split(",")] + if all(_package_is_installed(p) for p in pkgs if p): + return m.group(0) + return "% [stripped missing package] " + m.group(0) + return re.sub(r"\\usepackage(\[[^\]]*\])?\{([^}]+)\}", _replace, text) + + +def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path: + """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more, + return a trimmed sibling cached next to it.""" + if not max_pages: + return pdf + import fitz + src = fitz.open(pdf) + try: + if src.page_count <= max_pages: + return pdf + trimmed = pdf.with_suffix(f".first{max_pages}p.pdf") + if trimmed.exists() and trimmed.stat().st_mtime > pdf.stat().st_mtime: + return trimmed + dst = fitz.open() + dst.insert_pdf(src, from_page=0, to_page=max_pages - 1) + dst.save(trimmed) + dst.close() + return trimmed + finally: + src.close() + + +def _trim_pages_to(pdf: Path, max_pages: int) -> None: + """In-place trim of `pdf` to its first `max_pages` pages.""" + import fitz + src = fitz.open(pdf) + try: + if src.page_count <= max_pages: + return + dst = fitz.open() + dst.insert_pdf(src, from_page=0, to_page=max_pages - 1) + tmp = pdf.with_suffix(".pdf.tmp") + dst.save(tmp) + dst.close() + finally: + src.close() + tmp.replace(pdf) + + +def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None, + source: Path | None = None, max_pages: int | None = None) -> str: + """POST /api/internal/review (multipart). Returns sessionId. + + `source` and `max_pages` are forwarded to `_ensure_pdf` so callers can opt + into compiling the full pre-truncation source and trimming the rendered + PDF — see _ensure_pdf docstring. + """ + paper = _ensure_pdf(paper, source=source, max_pages=max_pages) url = f"{cfg.base_url}/api/internal/review" data: dict[str, str] = { "userId": cfg.user_id, @@ -125,7 +410,7 @@ def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str: if title: data["title"] = title with paper.open("rb") as fh: - files = {"file": (paper.name, fh, "text/markdown")} + files = {"file": (paper.name, fh, "application/pdf")} resp = requests.post(url, headers=_headers(cfg), data=data, files=files, timeout=cfg.request_timeout_s) if resp.status_code >= 400: @@ -180,29 +465,36 @@ def _pick(d: dict, *keys: str, default: str = "") -> str: return default +# Canonical severity mapping lives in benchmarks/perturbation/_severity.py +# so every system (coarse, reviewer3, openaireview) and the downstream +# conference-study analyses share one source of truth. +import sys as _sys +from pathlib import Path as _Path +_sys.path.insert(0, str(_Path(__file__).resolve().parent.parent)) +from _severity import normalize_severity as _normalize_r3_severity # noqa: E402 + + def _normalize_comment(raw: dict, idx: int) -> dict: - """Best-effort mapping from Reviewer 3 comment shape to pipeline schema. + """Map a Reviewer 3 comment to the pipeline schema. - The OpenAPI spec doesn't pin field names. We try common synonyms; anything - we don't recognize is preserved on the side as `_raw` for later inspection. + Reviewer 3 comments carry: reviewerId, comment, title, citedText, severity, rank. + `citedText` is the verbatim excerpt from the paper — what our scorer uses as + `quote` for fuzzy/semantic matching. """ cid = _pick(raw, "id", "commentId", "uuid") or f"reviewer3_{idx}" - title = _pick(raw, "title", "subject", "heading", "summary") - quote = _pick(raw, "quote", "snippet", "excerpt", "passage", "text", "highlight") - explanation = _pick(raw, "explanation", "comment", "feedback", "body", "rationale", "message") + title = _pick(raw, "title") + quote = _pick(raw, "citedText", "quote", "snippet", "excerpt", "passage", "highlight") + explanation = _pick(raw, "comment", "explanation", "feedback", "body", "rationale", "message") + severity = _normalize_r3_severity("reviewer3", raw.get("severity")) if not explanation: - # last resort: serialize whatever we have so the comment isn't empty - explanation = json.dumps({k: v for k, v in raw.items() - if k not in ("id", "commentId", "uuid", "title", "subject", - "heading", "summary", "quote", "snippet", - "excerpt", "passage", "text", "highlight")}, - ensure_ascii=False) + explanation = json.dumps(raw, ensure_ascii=False) return { "id": cid, "title": title, "quote": quote, "explanation": explanation, "comment_type": "technical", + "severity": severity, "paragraph_index": None, "_raw": raw, } @@ -243,25 +535,67 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result: job.out_json.parent.mkdir(parents=True, exist_ok=True) raw_path = job.out_json.with_suffix(".raw.json") tag = f"reviewer3/{job.paper_label}" - print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True) start = time.time() sid = "" + sid_file = job.sid_file + + def _write_outputs(body: dict, elapsed: float) -> int: + raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False)) + pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed) + job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False)) + # Successful → sid_file no longer needed for resume, but leave it as + # an audit trail (cheap, sometimes useful for tracing back to R3 UI). + return len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"]) + try: - sid = _submit(cfg, job.paper, title=job.title) + # Resume path: if a sid_file exists, try to recover the prior session + # instead of submitting fresh. This is the dedup-credit-waste fix. + if sid_file and sid_file.exists(): + sid = sid_file.read_text().strip() + print(f"[{tag}] resuming sessionId={sid} (from {sid_file.name})", + file=sys.stderr, flush=True) + try: + body = _poll_until_done(cfg, sid, tag=tag) + elapsed = time.time() - start + n = _write_outputs(body, elapsed) + print(f"[{tag}] done in {elapsed:.0f}s ({n} comments, resumed)", + file=sys.stderr, flush=True) + return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed, + session_id=sid, raw_response=body) + except RuntimeError as e: + # Session-fetch hard fail (e.g., 403/404 — sid stale/invalid). + # Drop the sid_file and fall through to fresh submit. + msg = str(e) + if "fetch failed" in msg and ("403" in msg or "404" in msg): + print(f"[{tag}] sid {sid} unrecoverable ({msg[:80]}); " + "submitting fresh", file=sys.stderr, flush=True) + sid_file.unlink(missing_ok=True) + sid = "" + else: + raise # other RuntimeErrors (e.g., status=failed) propagate + + # Submit path: no usable sid_file, send a new submission. + print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True) + sid = _submit(cfg, job.paper, title=job.title, + source=job.source, max_pages=job.max_pages) print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True) + if sid_file: + sid_file.parent.mkdir(parents=True, exist_ok=True) + sid_file.write_text(sid) body = _poll_until_done(cfg, sid, tag=tag) elapsed = time.time() - start - raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False)) - pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed) - job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False)) - n = len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"]) - print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)", file=sys.stderr, flush=True) + n = _write_outputs(body, elapsed) + print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)", + file=sys.stderr, flush=True) return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed, session_id=sid, raw_response=body) except Exception as e: elapsed = time.time() - start msg = f"{type(e).__name__}: {e}" - print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}", file=sys.stderr, flush=True) + print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}", + file=sys.stderr, flush=True) + # Keep sid_file on failure — next run will retry the same session + # rather than incur a duplicate submission cost. return Reviewer3Result(job=job, ok=False, elapsed_s=elapsed, session_id=sid, error=msg) diff --git a/pyproject.toml b/pyproject.toml index 396295e..2500007 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,4 +34,4 @@ reviewer = ["viz/*.html", "skill/SKILL.md", "skill/scripts/*.py", "skill/referen mistral = ["mistral-ocr-cli>=1.2.0"] deepseek = ["deepseek-ocr-cli>=0.4.2"] dev = ["pytest>=8.0"] -benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0"] +benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0", "requests>=2.31"]