From bf6b3b6fa992e72e46190a47614218e14345f5f8 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 15:30:49 -0500 Subject: [PATCH 1/7] Add Reviewer 3 (closed-source HTTP API) as a benchmark system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires reviewer3 into both the perturbation benchmark (run_benchmark.py) and the conference outcomes study (run_competitors.py), so it can be benchmarked on the exact same paper sets as openaireview and coarse. Highlights - Perturbation: LaTeX-as-md sources are compiled to PDF with pdflatex before submission (cs_CC corpus is full LaTeX); other domains pass through. PDF MIME is set; cfg knobs (review_mode, poll_interval_s, poll_timeout_s) are threaded from YAML through Reviewer3System into the adapter via the job payload. - Comment normalization picks up the recently-added Reviewer 3 fields: citedText (-> quote), title, severity (1-4 -> canonical 3-tier scale). - Severity mapping is consolidated into benchmarks/perturbation/_severity.py so the perturbation adapter, compute_auc.py, and report_scaleup.py share one source of truth (collapses three inlined copies of COARSE_SEVERITY_MAP). - Conference adapter reuses the perturbation HTTP submit/poll/normalize helpers via sys.path import; no duplication. R3 has no model selector, so method_key is fixed at reviewer3__reviewer3. - 8 perturbation configs (full__reviewer3.yaml) mirror the full__coarse.yaml knobs for max_tokens / min_perturbations. - conference_study/configs/reviewer3.yaml pins models: [reviewer3] so the per-(paper, model) loop fires once per paper rather than once per manifest model. - requests>=2.31 added to [project.optional-dependencies] benchmarks. - .env.example documents REVIEWER3_API_KEY / REVIEWER3_USER_ID (UUID, not email — see neurips_2026 setup notes). Operational note (not in this diff) - conference_study/{manifests,papers,results} are gitignored data dirs that live only in the sibling worktree. 
To run, symlink them in: cd benchmarks/conference_study ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests ln -s ../../../OpenAIReview/benchmarks/conference_study/papers papers ln -s ../../../OpenAIReview/benchmarks/conference_study/results results Known gap - run_competitors.py does not load .env (only run_benchmark.py does); export REVIEWER3_* before launching, or add a dotenv.load_dotenv() at the top of run_competitors.py in a follow-up. Smoke-validated end-to-end on 1 cs_CC paper through the perturbation runner; recall 10/17 on the staged perturbations (LLM judge). Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 6 ++ .../conference_study/analyses/compute_auc.py | 19 ++--- .../analyses/report_scaleup.py | 22 ++--- .../conference_study/competitors/registry.py | 2 + .../competitors/reviewer3_adapter.py | 83 ++++++++++++++++++ .../conference_study/configs/reviewer3.yaml | 32 +++++++ benchmarks/perturbation/_severity.py | 85 +++++++++++++++++++ .../configs/full_cs_CC_reviewer3.yaml | 13 +++ .../configs/full_cs_LG_reviewer3.yaml | 13 +++ .../configs/full_econ_EM_reviewer3.yaml | 13 +++ .../configs/full_hep_ex_reviewer3.yaml | 13 +++ .../configs/full_math_all_reviewer3.yaml | 13 +++ .../full_physics_atm_clus_reviewer3.yaml | 13 +++ .../configs/full_q_bio_GN_reviewer3.yaml | 13 +++ .../configs/full_stat_AP_reviewer3.yaml | 13 +++ benchmarks/perturbation/systems/reviewer3.py | 9 +- .../perturbation/systems/reviewer3_adapter.py | 73 +++++++++++++--- pyproject.toml | 2 +- 18 files changed, 396 insertions(+), 41 deletions(-) create mode 100644 benchmarks/conference_study/competitors/reviewer3_adapter.py create mode 100644 benchmarks/conference_study/configs/reviewer3.yaml create mode 100644 benchmarks/perturbation/_severity.py create mode 100644 benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml create mode 100644 benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml create mode 100644 
benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml create mode 100644 benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml create mode 100644 benchmarks/perturbation/configs/full_math_all_reviewer3.yaml create mode 100644 benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml create mode 100644 benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml create mode 100644 benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml diff --git a/.env.example b/.env.example index 56aac36..4e36a22 100644 --- a/.env.example +++ b/.env.example @@ -15,3 +15,9 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here # Optional: custom OpenAI base URL (e.g. EU endpoint, Azure) # OPENAI_BASE_URL=https://eu.api.openai.com/v1 + +# Reviewer 3 (closed-source HTTP API; perturbation and conference_study benchmarks). +# Required for `system: reviewer3` / `competitor: reviewer3` runs; REVIEWER3_USER_ID is a UUID, not an email. +# REVIEWER3_API_KEY=sk_... +# REVIEWER3_USER_ID= +# REVIEWER3_BASE_URL=https://reviewer3.com # optional override diff --git a/benchmarks/conference_study/analyses/compute_auc.py b/benchmarks/conference_study/analyses/compute_auc.py index d32f1a9..0d87e23 100644 --- a/benchmarks/conference_study/analyses/compute_auc.py +++ b/benchmarks/conference_study/analyses/compute_auc.py @@ -51,17 +51,14 @@ REPO_ROOT = HERE.parent # benchmarks/conference_study/ RESULTS_ROOT = REPO_ROOT / "results" -COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"} -SEVERITY_TIERS = ("major", "moderate", "minor") - - -def normalize_severity(method: str, raw: str | None) -> str | None: - if not raw: - return None - raw = raw.lower() - if method == "coarse": - return COARSE_SEVERITY_MAP.get(raw) - return raw if raw in SEVERITY_TIERS else None +# Severity normalization lives in benchmarks/perturbation/_severity.py so the +# perturbation adapters and these analyses use one source of truth. 
+sys.path.insert(0, str(HERE.parents[1] / "perturbation")) +from _severity import ( # noqa: E402 + COARSE_SEVERITY_MAP, + TIERS as SEVERITY_TIERS, + normalize_severity, +) def load_manifest(path: Path) -> dict[str, list[dict]]: diff --git a/benchmarks/conference_study/analyses/report_scaleup.py b/benchmarks/conference_study/analyses/report_scaleup.py index 79dce76..8a43725 100644 --- a/benchmarks/conference_study/analyses/report_scaleup.py +++ b/benchmarks/conference_study/analyses/report_scaleup.py @@ -204,20 +204,14 @@ def comment_metrics_by_method( return out -# Coarse uses {minor, major, critical}; openaireview methods use -# {minor, moderate, major}. Normalize so a single set of tiers compares -# apples-to-apples (highest=major, mid=moderate, low=minor). -_COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"} -SEVERITY_TIERS = ("major", "moderate", "minor") - - -def normalize_severity(method: str, raw: str | None) -> str | None: - if not raw: - return None - raw = raw.lower() - if method == "coarse": - return _COARSE_SEVERITY_MAP.get(raw) - return raw if raw in SEVERITY_TIERS else None +# Severity normalization lives in benchmarks/perturbation/_severity.py so the +# perturbation adapters and these analyses use one source of truth. 
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "perturbation")) +from _severity import ( # noqa: E402 + COARSE_SEVERITY_MAP as _COARSE_SEVERITY_MAP, + TIERS as SEVERITY_TIERS, + normalize_severity, +) def severity_counts_by_method( diff --git a/benchmarks/conference_study/competitors/registry.py b/benchmarks/conference_study/competitors/registry.py index be0a25a..7a64bd8 100644 --- a/benchmarks/conference_study/competitors/registry.py +++ b/benchmarks/conference_study/competitors/registry.py @@ -9,9 +9,11 @@ from .base import CompetitorAdapter from .coarse_adapter import CoarseAdapter +from .reviewer3_adapter import Reviewer3Adapter _REGISTRY: dict[str, type[CompetitorAdapter]] = { "coarse": CoarseAdapter, + "reviewer3": Reviewer3Adapter, } diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py new file mode 100644 index 0000000..be4dbd2 --- /dev/null +++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py @@ -0,0 +1,83 @@ +"""Adapter for Reviewer 3 (closed-source HTTP API). + +Submission flow is the same as the perturbation benchmark — POST a PDF to +`/api/internal/review`, poll the session until the `status` enum is terminal, +then map each comment via `_normalize_comment`. We reuse those helpers from +the perturbation adapter (`benchmarks/perturbation/systems/reviewer3_adapter.py`) +rather than duplicating the HTTP code; the only difference here is that +conference inputs arrive as PDFs already, so the LaTeX-as-md → PDF compile +step (`_ensure_pdf`) is unnecessary. + +Reviewer 3 has no model selector, so `method_key(...)` always returns +`"reviewer3__reviewer3"` regardless of the manifest `model` value. The +conference YAML should pin `models: [reviewer3]` to avoid duplicate +submissions across a phantom model loop. + +Required env: + REVIEWER3_API_KEY sk_... 
(sent as `x-api-key` header) + REVIEWER3_USER_ID UUID from the vendor's web UI session JSON (not an email) +""" +from __future__ import annotations + +import sys +from pathlib import Path + +from .base import CompetitorAdapter, NormalizedComment, NormalizedReview + +# Reuse the perturbation adapter's HTTP + normalization helpers. +_PERT = Path(__file__).resolve().parents[2] / "perturbation" / "systems" +sys.path.insert(0, str(_PERT)) +import reviewer3_adapter as _r3 # noqa: E402 + + +_METHOD_KEY = f"{_r3.REVIEWER3_SLUG}__{_r3.REVIEWER3_SLUG}" + + +class Reviewer3Adapter(CompetitorAdapter): + name = "reviewer3" + required_env = ("REVIEWER3_API_KEY", "REVIEWER3_USER_ID") + + def method_key(self, model: str) -> str: + # R3 has no model selector — fixed key regardless of `model`. + return _METHOD_KEY + + def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview: + opts = cfg.get("reviewer3_options", {}) or {} + rcfg = _r3.config_from_env() + for k in ("review_mode", "poll_interval_s", "poll_timeout_s", + "request_timeout_s", "base_url"): + if k in opts and opts[k] is not None: + setattr(rcfg, k, opts[k]) + + session_id = _r3._submit(rcfg, pdf, title=pdf.stem) + body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}") + + comments: list[NormalizedComment] = [] + for i, raw in enumerate(body.get("comments") or []): + if not isinstance(raw, dict): + raw = {"comment": str(raw)} + norm = _r3._normalize_comment(raw, i) + comments.append(NormalizedComment( + title=norm.get("title", ""), + quote=norm.get("quote", ""), + explanation=norm.get("explanation", ""), + comment_type=norm.get("comment_type", "technical"), + extra={ + "severity": norm.get("severity"), + "reviewerId": raw.get("reviewerId"), + "rank": raw.get("rank"), + "session_id": session_id, + }, + )) + + # R3 doesn't publish pricing and doesn't return overall_feedback or + # token counts in its response, so we leave those empty/None. 
+ return NormalizedReview( + comments=comments, + overall_feedback="", + cost_usd=None, + cost_method="estimated", + prompt_tokens=None, + completion_tokens=None, + model=_r3.REVIEWER3_SLUG, + ) diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml new file mode 100644 index 0000000..f090d41 --- /dev/null +++ b/benchmarks/conference_study/configs/reviewer3.yaml @@ -0,0 +1,32 @@ +# Reviewer 3 (closed-source HTTP API) run on the v2 conference cohort. +# Results -> benchmarks/conference_study/results/reviewer3_v2/ +# Log -> benchmarks/conference_study/results/reviewer3_v2/run_log.jsonl +# +# Prerequisites: +# - REVIEWER3_API_KEY and REVIEWER3_USER_ID set in .env +# - Manifest, papers, and results dirs reachable. In this worktree they are +# symlinks into the sibling OpenAIReview worktree (gitignored data lives +# only there). Set them up with: +# ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests +# ln -s ../../../OpenAIReview/benchmarks/conference_study/papers papers +# ln -s ../../../OpenAIReview/benchmarks/conference_study/results results + +name: reviewer3_v2 +competitor: reviewer3 + +manifest: manifests/v2/combined.json + +# R3 has no model selector. Pin to a single dummy entry so run_competitors.py +# loops once per paper rather than once per (paper × manifest model). +models: + - reviewer3 + +timeout_sec: 3600 # outer per-(paper, model) wall cap (R3 is 10-30 min typical) +max_per_model: 5 # 5 concurrent submits/polls against R3 (matches plan) +max_pages: 20 # parse_document cap; matches coarse.yaml convention + +# Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...). 
+reviewer3_options: + review_mode: author # author | journal (R3 reviewMode enum) + poll_interval_s: 30 + poll_timeout_s: 1800 # 30 min/paper cap inside the poll loop diff --git a/benchmarks/perturbation/_severity.py b/benchmarks/perturbation/_severity.py new file mode 100644 index 0000000..0e2db3c --- /dev/null +++ b/benchmarks/perturbation/_severity.py @@ -0,0 +1,85 @@ +"""Canonical severity tiers and per-system normalization. + +The perturbation benchmark, the conference study analyses, and the viz layer +all want to compare comment severities across review systems. Each system uses +its own native vocabulary, so before any cross-system comparison the raw value +must be mapped to the canonical 3-tier scale used by openaireview itself: + + major - Undermines a key claim/methodology; affects conclusions. + moderate - Real error or gap that is localized and fixable. + minor - Framing concern, mild overclaim, or resolvable ambiguity. + +Per-system maps: + + * openaireview: identity. Output is already in {major, moderate, minor}. + * coarse: {critical, major, minor} -> {major, moderate, minor} + (shift down one tier; same mapping that the conference-study + scripts in benchmarks/conference_study/analyses/ use). + * reviewer3: integer 1..4 per their OpenAPI spec, where + 1=Critical, 2=Major, 3=Minor, 4=Editorial. + Compressed to the 3-tier scale by collapsing R3 Minor and + Editorial into `minor`, since in practice R3 tags substantive + -but-lower-importance findings as Editorial rather than style + notes. Confirm with the vendor if the label is later clarified. + +The conference_study analyses (see +`benchmarks/conference_study/analyses/compute_auc.py` and `report_scaleup.py`) +import `COARSE_SEVERITY_MAP`, `TIERS`, and `normalize_severity` from this +module rather than keeping their own inlined copies of the mapping. +""" + +from __future__ import annotations + + +TIERS: tuple[str, ...] 
= ("major", "moderate", "minor") + + +# openaireview methods emit canonical tier strings directly. +OPENAIREVIEW_SEVERITY_MAP: dict[str, str] = {t: t for t in TIERS} + +# coarse uses {minor, major, critical}. Shift down one level. +COARSE_SEVERITY_MAP: dict[str, str] = { + "critical": "major", + "major": "moderate", + "minor": "minor", +} + +# Reviewer 3 spec: 1=Critical, 2=Major, 3=Minor, 4=Editorial. +# Compress to 3 tiers; Editorial collapses with Minor (see module docstring). +REVIEWER3_SEVERITY_MAP: dict[int, str] = { + 1: "major", + 2: "moderate", + 3: "minor", + 4: "minor", +} + + +def normalize_severity(system: str, raw: object) -> str | None: + """Map a system-native severity value to the canonical 3-tier scale. + + Returns None for unrecognized values so callers can decide whether to drop + the comment, default it, or warn. + + `system` is the registry key matching `benchmarks/perturbation/systems/`: + 'openaireview', 'coarse', or 'reviewer3'. + """ + if raw is None: + return None + sysn = system.lower() + if sysn == "reviewer3": + if isinstance(raw, int): + return REVIEWER3_SEVERITY_MAP.get(raw) + # tolerate the str-form for hand-written test fixtures + try: + return REVIEWER3_SEVERITY_MAP.get(int(raw)) + except (TypeError, ValueError): + return None + if not isinstance(raw, str): + return None + s = raw.lower() + if sysn == "coarse": + return COARSE_SEVERITY_MAP.get(s) + if sysn == "openaireview": + return OPENAIREVIEW_SEVERITY_MAP.get(s) + # unknown system -> pass through if already canonical + return s if s in TIERS else None diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml new file mode 100644 index 0000000..3488e54 --- /dev/null +++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_CC/all +max_tokens: 13000 # match full_cs_CC_coarse.yaml +min_perturbations: 
5 # match full_cs_CC_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3 diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml new file mode 100644 index 0000000..bc3cbb3 --- /dev/null +++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_LG/all +max_tokens: 13000 # match full_cs_LG_coarse.yaml +min_perturbations: 5 # match full_cs_LG_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3 diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml new file mode 100644 index 0000000..f871179 --- /dev/null +++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/econ_EM/all +max_tokens: 13000 # match full_econ_EM_coarse.yaml +min_perturbations: 5 # match full_econ_EM_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3 diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml new file mode 100644 index 0000000..901fbdb --- /dev/null +++ 
b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/hep-ex/all +max_tokens: 13000 # match full_hep_ex_coarse.yaml +min_perturbations: 5 # match full_hep_ex_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3 diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml new file mode 100644 index 0000000..ef94db9 --- /dev/null +++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/math_all/all +max_tokens: 13000 # match full_math_all_coarse.yaml +min_perturbations: 5 # match full_math_all_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_math_all_reviewer3 diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml new file mode 100644 index 0000000..79d5851 --- /dev/null +++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/physics_atm-clus/all +max_tokens: 13000 # match full_physics_atm_clus_coarse.yaml +min_perturbations: 5 # match full_physics_atm_clus_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 
+poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3 diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml new file mode 100644 index 0000000..00d3bf4 --- /dev/null +++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/q-bio_GN/all +max_tokens: 13000 # match full_q_bio_GN_coarse.yaml +min_perturbations: 5 # match full_q_bio_GN_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3 diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml new file mode 100644 index 0000000..a375500 --- /dev/null +++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml @@ -0,0 +1,13 @@ +system: reviewer3 +input_dir: benchmarks/perturbation/data/perturbations_filtered/stat_AP/all +max_tokens: 13000 # match full_stat_AP_coarse.yaml +min_perturbations: 5 # match full_stat_AP_coarse.yaml + +score_method: llm +score_model: google/gemini-3-flash-preview + +review_mode: author # author | journal (Reviewer 3 reviewMode enum) +poll_interval_s: 30 +poll_timeout_s: 1800 # 30 min/paper cap + +results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3 diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py index 66ab23e..1f4b1b9 100644 --- a/benchmarks/perturbation/systems/reviewer3.py +++ b/benchmarks/perturbation/systems/reviewer3.py @@ -30,6 +30,10 @@ class Reviewer3System(System): def build_jobs(self, units, cfg, results_dir): domain = results_dir.name + overrides = { + 
k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s") + if k in cfg and cfg[k] is not None + } out: list[tuple[CellKey, ReviewJob]] = [] for u in units: if not u.staged_corrupted.exists(): @@ -45,7 +49,7 @@ def build_jobs(self, units, cfg, results_dir): job = ReviewJob( tag=tag, out_json=out_json, review_dir=review_dir, paper_label=f"{u.error_type}/{u.paper_label}", - payload={"paper": u.staged_corrupted}, + payload={"paper": u.staged_corrupted, "overrides": overrides}, ) out.append(((REVIEWER3_SLUG,), job)) return out @@ -54,7 +58,8 @@ def run_jobs(self, cell_key, jobs, parallel): if not jobs: return [] cfg = reviewer3_adapter.config_from_env() - # cfg overrides come from the caller via run_jobs_with_cfg; default cfg is fine here. + for k, v in jobs[0].payload.get("overrides", {}).items(): + setattr(cfg, k, v) adapter_jobs = [ reviewer3_adapter.Reviewer3Job( paper=j.payload["paper"], diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py index c2f7408..71c1196 100644 --- a/benchmarks/perturbation/systems/reviewer3_adapter.py +++ b/benchmarks/perturbation/systems/reviewer3_adapter.py @@ -114,8 +114,48 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]: return {"x-api-key": cfg.api_key} +def _ensure_pdf(paper: Path) -> Path: + """Reviewer 3 only accepts PDF. If `paper` is already PDF, return it. + If it's a LaTeX source (starts with `\\documentclass`) — true for the + `cs_CC` corpus where `.md` files are really `.tex` — compile via pdflatex + and cache the result next to the source. 
Otherwise raise.""" + if paper.suffix.lower() == ".pdf": + return paper + head = paper.read_text(errors="replace")[:2000] + if "\\documentclass" not in head: + raise RuntimeError( + f"don't know how to convert {paper.name} to PDF " + "(no \\documentclass found; expected LaTeX-as-md or PDF)" + ) + cached = paper.with_suffix(".pdf") + if cached.exists() and cached.stat().st_mtime > paper.stat().st_mtime: + return cached + import shutil, subprocess, tempfile + text = paper.read_text(errors="replace") + # Staging truncates by tokens, which can leave the LaTeX source missing + # \end{document} (or mid-environment). Best-effort: ensure document closes. + if "\\end{document}" not in text: + text = text.rstrip() + "\n\n\\end{document}\n" + with tempfile.TemporaryDirectory() as td: + tex = Path(td) / "source.tex" + tex.write_text(text) + # Run twice to resolve cross-refs; ignore exit code, accept partial PDF. + for _ in range(2): + subprocess.run( + ["pdflatex", "-interaction=nonstopmode", "source.tex"], + cwd=td, capture_output=True, text=True, timeout=180, + ) + out_pdf = Path(td) / "source.pdf" + if not out_pdf.exists() or out_pdf.stat().st_size < 1000: + log = (Path(td) / "source.log").read_text(errors="replace") if (Path(td) / "source.log").exists() else "" + raise RuntimeError(f"pdflatex produced no usable PDF for {paper}: {log[-1500:]}") + shutil.copy(out_pdf, cached) + return cached + + def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str: """POST /api/internal/review (multipart). 
Returns sessionId.""" + paper = _ensure_pdf(paper) url = f"{cfg.base_url}/api/internal/review" data: dict[str, str] = { "userId": cfg.user_id, @@ -125,7 +165,7 @@ def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str: if title: data["title"] = title with paper.open("rb") as fh: - files = {"file": (paper.name, fh, "text/markdown")} + files = {"file": (paper.name, fh, "application/pdf")} resp = requests.post(url, headers=_headers(cfg), data=data, files=files, timeout=cfg.request_timeout_s) if resp.status_code >= 400: @@ -180,29 +220,36 @@ def _pick(d: dict, *keys: str, default: str = "") -> str: return default +# Canonical severity mapping lives in benchmarks/perturbation/_severity.py +# so every system (coarse, reviewer3, openaireview) and the downstream +# conference-study analyses share one source of truth. +import sys as _sys +from pathlib import Path as _Path +_sys.path.insert(0, str(_Path(__file__).resolve().parent.parent)) +from _severity import normalize_severity as _normalize_r3_severity # noqa: E402 + + def _normalize_comment(raw: dict, idx: int) -> dict: - """Best-effort mapping from Reviewer 3 comment shape to pipeline schema. + """Map a Reviewer 3 comment to the pipeline schema. - The OpenAPI spec doesn't pin field names. We try common synonyms; anything - we don't recognize is preserved on the side as `_raw` for later inspection. + Reviewer 3 comments carry: reviewerId, comment, title, citedText, severity, rank. + `citedText` is the verbatim excerpt from the paper — what our scorer uses as + `quote` for fuzzy/semantic matching. 
""" cid = _pick(raw, "id", "commentId", "uuid") or f"reviewer3_{idx}" - title = _pick(raw, "title", "subject", "heading", "summary") - quote = _pick(raw, "quote", "snippet", "excerpt", "passage", "text", "highlight") - explanation = _pick(raw, "explanation", "comment", "feedback", "body", "rationale", "message") + title = _pick(raw, "title") + quote = _pick(raw, "citedText", "quote", "snippet", "excerpt", "passage", "highlight") + explanation = _pick(raw, "comment", "explanation", "feedback", "body", "rationale", "message") + severity = _normalize_r3_severity("reviewer3", raw.get("severity")) if not explanation: - # last resort: serialize whatever we have so the comment isn't empty - explanation = json.dumps({k: v for k, v in raw.items() - if k not in ("id", "commentId", "uuid", "title", "subject", - "heading", "summary", "quote", "snippet", - "excerpt", "passage", "text", "highlight")}, - ensure_ascii=False) + explanation = json.dumps(raw, ensure_ascii=False) return { "id": cid, "title": title, "quote": quote, "explanation": explanation, "comment_type": "technical", + "severity": severity, "paragraph_index": None, "_raw": raw, } diff --git a/pyproject.toml b/pyproject.toml index 396295e..2500007 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,4 +34,4 @@ reviewer = ["viz/*.html", "skill/SKILL.md", "skill/scripts/*.py", "skill/referen mistral = ["mistral-ocr-cli>=1.2.0"] deepseek = ["deepseek-ocr-cli>=0.4.2"] dev = ["pytest>=8.0"] -benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0"] +benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0", "requests>=2.31"] From 3453bf9cd3836597366c2f6127a8effec7075e79 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 15:43:02 -0500 Subject: [PATCH 2/7] gitignore: cover conference_study symlink, ephemeral perturbation configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two cleanups so 
`git status` reflects only real work: Root .gitignore - Add slash-less `benchmarks/conference_study/papers` (the existing `papers/` rule in conference_study/.gitignore uses a trailing slash and wouldn't match a symlink — same situation the file already handles for manifests/ and results/). - Ignore `benchmarks/experimental_perturbations/` (removed from the repo in 6373fad but the local dir lingers). benchmarks/perturbation/.gitignore - Extend the "ephemeral configs" rule beyond `configs/_*` to cover the per-domain configs we generate locally but don't check in: configs/cs_*scaleup*.yaml configs/full_*.yaml configs/grok_*.yaml configs/longtail_*.yaml configs/subset_*.yaml configs/r3_smoke*.yaml Add `!configs/full_*_reviewer3.yaml` exception so the canonical reviewer3 configs that ARE tracked don't get hidden by the bulk rule. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 12 ++++++++---- benchmarks/perturbation/.gitignore | 13 ++++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 034b5fd..fdb5bed 100644 --- a/.gitignore +++ b/.gitignore @@ -22,12 +22,16 @@ venv/ # Run outputs review_results/ -benchmarks/conference_study/results/ -benchmarks/perturbation/perturbation_results/ +benchmarks/conference_study/results +benchmarks/perturbation/perturbation_results # conference_study study artifacts (not code) -benchmarks/conference_study/manifests/ -benchmarks/conference_study/reports/ +benchmarks/conference_study/manifests +benchmarks/conference_study/reports # Symlink targets — trailing-slash patterns wouldn't match the symlinks benchmarks/conference_study/analyses/manifests benchmarks/conference_study/analyses/results +benchmarks/conference_study/papers + +# Moved out of repo (see commit 6373fad); ignore the leftover local dir. 
+benchmarks/experimental_perturbations/ diff --git a/benchmarks/perturbation/.gitignore b/benchmarks/perturbation/.gitignore index bddf55d..b435026 100644 --- a/benchmarks/perturbation/.gitignore +++ b/benchmarks/perturbation/.gitignore @@ -8,8 +8,19 @@ papers/ # Run logs reports/*.log -# Temporary / ephemeral configs +# Temporary / ephemeral configs. +# Underscore-prefixed files are scratch by convention. The other rules below cover the +# bulk per-domain configs that we generate locally (one per system × domain) +# but don't check in. The `!configs/full_*_reviewer3.yaml` exception preserves the canonical +# reviewer3 configs that are tracked. configs/_* +configs/cs_*scaleup*.yaml +configs/full_*.yaml +configs/grok_*.yaml +configs/longtail_*.yaml +configs/subset_*.yaml +configs/r3_smoke*.yaml +!configs/full_*_reviewer3.yaml # Python __pycache__/ From 8c6238f9ec87c62b7e0664a64f9c938db55d6253 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 16:05:46 -0500 Subject: [PATCH 3/7] reviewer3: compile FULL source + trim PDF by pages (fixes pdflatex failures) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The token-based truncation in `prepare_units` cuts the LaTeX-as-md staged file at a token boundary, which routinely leaves the document mid-environment. pdflatex on the staged file then "produces no usable PDF" for a fraction of papers, surfacing as a hard failure in the reviewer3 run. Switches the reviewer3 system to compile the FULL pre-truncation source (`u.src_corrupted`) and then trim the rendered PDF to its first N pages. This matches the `max_pages: 20` convention conference_study/configs/coarse.yaml already uses for coarse, so reviewer3 sees roughly the same content window the other systems see. Three pdflatex robustness fixes layered in: 1. Strip orphan `\input{...}` / `\include{...}` whose target file isn't bundled. 
pdflatex aborts hard on a missing \input, killing the compile for the whole paper even when the body is fine (paper_005 cs_CC had `\input{mypreamble.tex}`). 2. Inject a defensive preamble of `\providecommand` fallbacks for common author-defined shortcuts (\bbR, \calA, \bfx, \eps, \vvirg, \ootimes, etc.). Authors typically define these in private preamble files we don't have; \providecommand is a no-op when the command is already defined, so the injection is safe blanket coverage. 3. subprocess.run uses bytes (text=False) instead of text=True so the pdflatex log's non-UTF-8 accent bytes don't blow up Python's decoder (paper_009 cs_CC had byte 0xaa at offset ~57k). Changes - Reviewer3System.build_jobs threads `u.src_corrupted` (full path) and `cfg["max_pages"]` into the job payload. - Reviewer3Job adopts `source` + `max_pages` fields; `_submit` / `_ensure_pdf` forward them. - `_ensure_pdf` prefers `source` over `paper` for the compile when set; caches alongside the source with `.trim.pdf` suffix when trimmed. - `_trim_pages_to` (in-place) and `_maybe_trim_pages` (for already-PDF inputs) use pymupdf to cap pages. - `max_pages: 20` added to all 8 `full_*_reviewer3.yaml` configs. - run_benchmark.py Config gains `max_pages: int | None = None`. Smoke-validated on three previously-failing cs_CC papers (2604.19872v1 with missing \input + custom commands, 2604.24325v1 with same pattern, 2604.24879v1 with non-UTF8 bytes in pdflatex output) — all three now produce 20-page trimmed PDFs in 2–4s. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../configs/full_cs_CC_reviewer3.yaml | 1 + .../configs/full_cs_LG_reviewer3.yaml | 1 + .../configs/full_econ_EM_reviewer3.yaml | 1 + .../configs/full_hep_ex_reviewer3.yaml | 1 + .../configs/full_math_all_reviewer3.yaml | 1 + .../full_physics_atm_clus_reviewer3.yaml | 1 + .../configs/full_q_bio_GN_reviewer3.yaml | 1 + .../configs/full_stat_AP_reviewer3.yaml | 1 + benchmarks/perturbation/run_benchmark.py | 5 + benchmarks/perturbation/systems/reviewer3.py | 14 +- .../perturbation/systems/reviewer3_adapter.py | 198 +++++++++++++++--- 11 files changed, 199 insertions(+), 26 deletions(-) diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml index 3488e54..61e3c36 100644 --- a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3 diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml index bc3cbb3..d9aa287 100644 --- a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3 diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml 
b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml index f871179..efebb8b 100644 --- a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3 diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml index 901fbdb..711eae7 100644 --- a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3 diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml index ef94db9..69e061c 100644 --- a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_math_all_reviewer3 diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml index 
79d5851..48a09f6 100644 --- a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3 diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml index 00d3bf4..756cfc9 100644 --- a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3 diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml index a375500..fa4cb19 100644 --- a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3 diff --git a/benchmarks/perturbation/run_benchmark.py b/benchmarks/perturbation/run_benchmark.py index 9117911..b44112e 100644 --- a/benchmarks/perturbation/run_benchmark.py +++ 
b/benchmarks/perturbation/run_benchmark.py @@ -98,6 +98,11 @@ class Config: review_mode: str = field(default="author", metadata={"choices": ["author", "journal"]}) poll_interval_s: float = 5.0 poll_timeout_s: float = 1200.0 + # Cap pages of the rendered PDF sent to R3. None = no cap. Used because + # the token-truncated staged file often isn't valid LaTeX; we compile the + # FULL pre-truncation source and trim by pages instead (matches the + # max_pages: 20 convention in conference_study/configs/coarse.yaml). + max_pages: int | None = None # Legacy aliases (read on load, normalized into models/methods). review_models: list[str] = field(default_factory=list) review_methods: list[str] = field(default_factory=list) diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py index 1f4b1b9..8ccc9a7 100644 --- a/benchmarks/perturbation/systems/reviewer3.py +++ b/benchmarks/perturbation/systems/reviewer3.py @@ -34,6 +34,11 @@ def build_jobs(self, units, cfg, results_dir): k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s") if k in cfg and cfg[k] is not None } + # We compile the FULL (pre-truncation) source for R3 — the token-cut + # staged file frequently isn't valid LaTeX (chops mid-environment). + # `max_pages` then trims the rendered PDF so R3 still sees roughly the + # same content window as coarse (which uses max_pages: 20). 
+ max_pages = cfg.get("max_pages") out: list[tuple[CellKey, ReviewJob]] = [] for u in units: if not u.staged_corrupted.exists(): @@ -49,7 +54,12 @@ def build_jobs(self, units, cfg, results_dir): job = ReviewJob( tag=tag, out_json=out_json, review_dir=review_dir, paper_label=f"{u.error_type}/{u.paper_label}", - payload={"paper": u.staged_corrupted, "overrides": overrides}, + payload={ + "paper": u.staged_corrupted, + "source": u.src_corrupted, + "max_pages": max_pages, + "overrides": overrides, + }, ) out.append(((REVIEWER3_SLUG,), job)) return out @@ -65,6 +75,8 @@ def run_jobs(self, cell_key, jobs, parallel): paper=j.payload["paper"], out_json=j.out_json, paper_label=j.tag, + source=j.payload.get("source"), + max_pages=j.payload.get("max_pages"), ) for j in jobs ] diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py index 71c1196..570fad6 100644 --- a/benchmarks/perturbation/systems/reviewer3_adapter.py +++ b/benchmarks/perturbation/systems/reviewer3_adapter.py @@ -90,10 +90,17 @@ def config_from_env() -> Reviewer3Config: @dataclass class Reviewer3Job: - paper: Path # path to *_corrupted.md - out_json: Path # where to write the pipeline-shaped JSON - paper_label: str # e.g. "/" + paper: Path # path to *_corrupted.md (staged, possibly token-truncated) + out_json: Path # where to write the pipeline-shaped JSON + paper_label: str # e.g. "/" title: str | None = None + # Optional: full pre-truncation source. When provided, _ensure_pdf + # compiles THIS instead of `paper` — the staged file is often invalid + # LaTeX because token truncation can chop mid-environment. + source: Path | None = None + # Optional: trim the rendered PDF to its first N pages so R3 still sees + # roughly the same window other systems do (coarse uses max_pages: 20). 
+ max_pages: int | None = None @dataclass @@ -114,48 +121,188 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]: return {"x-api-key": cfg.api_key} -def _ensure_pdf(paper: Path) -> Path: - """Reviewer 3 only accepts PDF. If `paper` is already PDF, return it. - If it's a LaTeX source (starts with `\\documentclass`) — true for the - `cs_CC` corpus where `.md` files are really `.tex` — compile via pdflatex - and cache the result next to the source. Otherwise raise.""" +def _ensure_pdf(paper: Path, *, source: Path | None = None, + max_pages: int | None = None) -> Path: + """Reviewer 3 only accepts PDF. Return a compiled+possibly-trimmed PDF. + + Resolution order for the source bytes: + 1. If `paper` is already a `.pdf`, return it as-is (page trim still applies). + 2. If `source` was provided, compile that — preferred for LaTeX-as-md + since the staged `paper` is often invalid LaTeX (token truncation + chops mid-environment). + 3. Else compile `paper` directly. If it lacks `\\end{document}`, append + one as a best-effort close. + + When `max_pages` is set, the resulting PDF is trimmed to its first N + pages via pymupdf. This matches what coarse does (max_pages: 20) so R3 + sees roughly the same window other systems see. 
+ """ if paper.suffix.lower() == ".pdf": - return paper - head = paper.read_text(errors="replace")[:2000] + return _maybe_trim_pages(paper, max_pages) + + src_for_compile = source if (source is not None and source.exists()) else paper + cached_suffix = ".trim.pdf" if max_pages else ".pdf" + cached = src_for_compile.with_suffix(cached_suffix) + src_mtime = src_for_compile.stat().st_mtime + if cached.exists() and cached.stat().st_mtime > src_mtime: + return cached + + head = src_for_compile.read_text(errors="replace")[:2000] if "\\documentclass" not in head: raise RuntimeError( - f"don't know how to convert {paper.name} to PDF " + f"don't know how to convert {src_for_compile.name} to PDF " "(no \\documentclass found; expected LaTeX-as-md or PDF)" ) - cached = paper.with_suffix(".pdf") - if cached.exists() and cached.stat().st_mtime > paper.stat().st_mtime: - return cached - import shutil, subprocess, tempfile - text = paper.read_text(errors="replace") - # Staging truncates by tokens, which can leave the LaTeX source missing - # \end{document} (or mid-environment). Best-effort: ensure document closes. + text = src_for_compile.read_text(errors="replace") if "\\end{document}" not in text: + # `source` should always close cleanly; this only fires if we fell back + # to compiling the token-truncated `paper`. text = text.rstrip() + "\n\n\\end{document}\n" + # Strip orphan \input / \include — the perturbation corpus dumps each paper + # into a single .md but a few preserve `\input{mypreamble.tex}`-style + # directives that pdflatex can't resolve (fatal error, no PDF produced). + text = _strip_orphan_includes(text, src_for_compile.parent) + # Stripped-include papers (and some others) rely on author-defined shortcuts + # like \bbC, \calA, \vvirg, \ootimes from the missing preamble. Inject + # \providecommand fallbacks so the body compiles. 
+ text = _inject_rescue_preamble(text) + + import shutil, subprocess, tempfile with tempfile.TemporaryDirectory() as td: tex = Path(td) / "source.tex" tex.write_text(text) # Run twice to resolve cross-refs; ignore exit code, accept partial PDF. + # Capture output as bytes — pdflatex's own log can contain non-UTF-8 + # accent bytes and we don't read this output anyway (the .log file is + # the source of truth on failure). for _ in range(2): subprocess.run( ["pdflatex", "-interaction=nonstopmode", "source.tex"], - cwd=td, capture_output=True, text=True, timeout=180, + cwd=td, capture_output=True, timeout=300, ) out_pdf = Path(td) / "source.pdf" if not out_pdf.exists() or out_pdf.stat().st_size < 1000: - log = (Path(td) / "source.log").read_text(errors="replace") if (Path(td) / "source.log").exists() else "" - raise RuntimeError(f"pdflatex produced no usable PDF for {paper}: {log[-1500:]}") + log_path = Path(td) / "source.log" + log = log_path.read_text(errors="replace") if log_path.exists() else "" + raise RuntimeError( + f"pdflatex produced no usable PDF for {src_for_compile}: {log[-1500:]}" + ) + if max_pages: + _trim_pages_to(out_pdf, max_pages) shutil.copy(out_pdf, cached) return cached -def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str: - """POST /api/internal/review (multipart). Returns sessionId.""" - paper = _ensure_pdf(paper) +_INPUT_RE = __import__("re").compile( + r"\\(?:input|include)\s*\{([^}]+)\}", flags=__import__("re").IGNORECASE, +) + + +def _strip_orphan_includes(text: str, base_dir: Path) -> str: + """Comment out `\\input{path}` / `\\include{path}` whose target file isn't + next to the source. pdflatex aborts hard on a missing \\input, which kills + the compile even when the rest of the document is fine.""" + def _replace(m): + target = m.group(1).strip() + # Common LaTeX convention: optional .tex extension. 
+ for cand in (target, target + ".tex"): + if (base_dir / cand).exists(): + return m.group(0) + return "% [stripped missing include] " + m.group(0) + return _INPUT_RE.sub(_replace, text) + + +# Defensive preamble injected after \documentclass to provide fallbacks for +# common custom-command patterns that authors typically define in private +# preamble files (e.g. mypreamble.tex). \providecommand is a no-op when the +# command is already defined, so this is safe to inject blindly. +_RESCUE_PREAMBLE = r""" +% --- injected by reviewer3_adapter: providecommand fallbacks --- +% blackboard / cal / bf shortcuts authors commonly define per-paper +\providecommand{\bb}[1]{\mathbb{#1}} +\providecommand{\cal}[1]{\mathcal{#1}} +\providecommand{\bff}[1]{\mathbf{#1}} +% common single-letter shortcuts (\bbR, \calA, \bfx, etc.) +\makeatletter +\@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}% + \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}% +} +\@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}% +} +\makeatother +% other commonly-used shortcuts +\providecommand{\eps}{\epsilon} +\providecommand{\veps}{\varepsilon} +\providecommand{\vvirg}{,\,} +\providecommand{\ootimes}{\otimes} +\providecommand{\Bbbk}{\mathbb{k}} +% --- end injected preamble --- +""" + + +def _inject_rescue_preamble(text: str) -> str: + """Insert _RESCUE_PREAMBLE right after the first \\documentclass{...} line. 
+ Idempotent: looks for our marker before injecting.""" + if "injected by reviewer3_adapter" in text: + return text + import re + m = re.search(r"\\documentclass(\[[^\]]*\])?\{[^}]+\}", text) + if not m: + return text + cut = m.end() + return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:] + + +def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path: + """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more, + return a trimmed sibling cached next to it.""" + if not max_pages: + return pdf + import fitz + src = fitz.open(pdf) + try: + if src.page_count <= max_pages: + return pdf + trimmed = pdf.with_suffix(f".first{max_pages}p.pdf") + if trimmed.exists() and trimmed.stat().st_mtime > pdf.stat().st_mtime: + return trimmed + dst = fitz.open() + dst.insert_pdf(src, from_page=0, to_page=max_pages - 1) + dst.save(trimmed) + dst.close() + return trimmed + finally: + src.close() + + +def _trim_pages_to(pdf: Path, max_pages: int) -> None: + """In-place trim of `pdf` to its first `max_pages` pages.""" + import fitz + src = fitz.open(pdf) + try: + if src.page_count <= max_pages: + return + dst = fitz.open() + dst.insert_pdf(src, from_page=0, to_page=max_pages - 1) + tmp = pdf.with_suffix(".pdf.tmp") + dst.save(tmp) + dst.close() + finally: + src.close() + tmp.replace(pdf) + + +def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None, + source: Path | None = None, max_pages: int | None = None) -> str: + """POST /api/internal/review (multipart). Returns sessionId. + + `source` and `max_pages` are forwarded to `_ensure_pdf` so callers can opt + into compiling the full pre-truncation source and trimming the rendered + PDF — see _ensure_pdf docstring. 
+ """ + paper = _ensure_pdf(paper, source=source, max_pages=max_pages) url = f"{cfg.base_url}/api/internal/review" data: dict[str, str] = { "userId": cfg.user_id, @@ -294,7 +441,8 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result: start = time.time() sid = "" try: - sid = _submit(cfg, job.paper, title=job.title) + sid = _submit(cfg, job.paper, title=job.title, + source=job.source, max_pages=job.max_pages) print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True) body = _poll_until_done(cfg, sid, tag=tag) elapsed = time.time() - start From 40da53cb39833dd056dd646e98e364ee4288ab02 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 20:58:10 -0500 Subject: [PATCH 4/7] reviewer3: trim PDF page count for conference too; bump poll timeout to 1h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two operational fixes after observing the live runs: 1. Conference Reviewer3Adapter wasn't honoring max_pages — it called _submit(pdf) directly without forwarding max_pages, so R3 received the full PDF (sometimes 50+ pages, 5+ MB) and a chunk of those tripped R3's HTTP 413 limit. Adapter now reads `max_pages` from the top-level config (falling back to reviewer3_options.max_pages) and threads it through to _submit -> _ensure_pdf -> _maybe_trim_pages. 2. poll_timeout_s bumped from 1800 (30 min) to 3600 (60 min) in all 8 perturbation configs and conference reviewer3.yaml. Observed wall time per paper under 10-concurrent load was routinely 25-40 min, with a long tail past 30 — causing dozens of false-timeout failures even though R3 was still processing. The sessions remained live on R3's side regardless, but the adapter abandoned them.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../conference_study/competitors/reviewer3_adapter.py | 8 +++++++- benchmarks/conference_study/configs/reviewer3.yaml | 2 +- benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml | 2 +- benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml | 2 +- .../perturbation/configs/full_econ_EM_reviewer3.yaml | 2 +- .../perturbation/configs/full_hep_ex_reviewer3.yaml | 2 +- .../perturbation/configs/full_math_all_reviewer3.yaml | 2 +- .../configs/full_physics_atm_clus_reviewer3.yaml | 2 +- .../perturbation/configs/full_q_bio_GN_reviewer3.yaml | 2 +- .../perturbation/configs/full_stat_AP_reviewer3.yaml | 2 +- 10 files changed, 16 insertions(+), 10 deletions(-) diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py index be4dbd2..02caeed 100644 --- a/benchmarks/conference_study/competitors/reviewer3_adapter.py +++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py @@ -49,7 +49,13 @@ def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview: if k in opts and opts[k] is not None: setattr(rcfg, k, opts[k]) - session_id = _r3._submit(rcfg, pdf, title=pdf.stem) + # Cap PDF size sent to R3. `max_pages` lives at the top level of the + # config (run_competitors.py uses it for parse_document); we honor the + # same value here so the bytes shipped to R3 match the paragraph window + # we already cap on our side. Untrimmed full PDFs were tripping R3's + # HTTP 413 limit and inflating per-paper wall time. 
+ max_pages = cfg.get("max_pages") or opts.get("max_pages") + session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages) body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}") comments: list[NormalizedComment] = [] diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml index f090d41..fccf923 100644 --- a/benchmarks/conference_study/configs/reviewer3.yaml +++ b/benchmarks/conference_study/configs/reviewer3.yaml @@ -29,4 +29,4 @@ max_pages: 20 # parse_document cap; matches coarse.yaml convention reviewer3_options: review_mode: author # author | journal (R3 reviewMode enum) poll_interval_s: 30 - poll_timeout_s: 1800 # 30 min/paper cap inside the poll loop + poll_timeout_s: 3600 # 60 min/paper cap diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml index 61e3c36..f60a078 100644 --- a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3 diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml index d9aa287..3193fbd 100644 --- a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 
3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3 diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml index efebb8b..501ec02 100644 --- a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3 diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml index 711eae7..8e6cd7a 100644 --- a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3 diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml index 69e061c..9b8e311 100644 --- a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview 
review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_math_all_reviewer3 diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml index 48a09f6..4800288 100644 --- a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3 diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml index 756cfc9..ca8669d 100644 --- a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3 diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml index 
fa4cb19..a06206e 100644 --- a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml @@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 -poll_timeout_s: 1800 # 30 min/paper cap +poll_timeout_s: 3600 # 60 min/paper cap (R3 is slow under 10-concurrent load) max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3 From 5cf52adcfb0ca83f23ad8e5714dbb344970b221c Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 21:09:01 -0500 Subject: [PATCH 5/7] reviewer3 pdflatex: rewrite missing doc classes + strip missing packages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three layered fixes for the remaining pdflatex-aborts: 1. `_force_known_documentclass` — rewrite \documentclass[opts]{X} to \documentclass{amsart} when X.cls isn't installed (kpsewhich miss). Many papers use journal classes not in TeX Live (aq-amsart, sn-jnl, atlasdoc, aastex631/701, cas-sc, iopjournal, svjour3, ieeeconf, informs3, revtex4 without -1/-2). amsart is the math-friendly fallback, preserving theorem/lemma envs. Regex now matches only the **first uncommented** \documentclass so commented-out example lines (e.g. q-bio.GN papers carry `%%\documentclass[...]{sn-jnl}` followed by the active variant) don't short-circuit the lookup. 2. `_strip_missing_packages` — same idea for \usepackage{X} when X.sty isn't installed. pdflatex aborts hard on a missing package too; the common offender on hep-ex papers is `\usepackage{jinstpub}`. Comment them out; the body's references to missing-package commands degrade to undefined-control-sequence warnings (pdflatex in nonstopmode still produces a usable PDF). 3. 
Rescue preamble now runs inside \AtBeginDocument{...} so its \providecommand falls AFTER all \usepackage{...} loads. Previously, the rescue defined \Bbbk before amsfonts loaded, then amsfonts `\DeclareSymbolFont` errored with "Command \Bbbk already defined". Spot-test across 8 domains: 7 of 8 sample papers compile cleanly. The holdout is q-bio.GN paper_001 (uses sn-jnl class with extensive class- specific commands that don't degrade gracefully); 9 of 10 q-bio.GN papers use bundled classes and work. Expected pdflatex-failure cells dropped from 39 to ~3 (one paper × three error types). kpsewhich results are cached per process for the common-class / common-package set so repeated rewrites across the 222-cell run pay the subprocess cost once per (class|package). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../perturbation/systems/reviewer3_adapter.py | 130 +++++++++++++++--- 1 file changed, 111 insertions(+), 19 deletions(-) diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py index 570fad6..3bbd96a 100644 --- a/benchmarks/perturbation/systems/reviewer3_adapter.py +++ b/benchmarks/perturbation/systems/reviewer3_adapter.py @@ -158,6 +158,14 @@ def _ensure_pdf(paper: Path, *, source: Path | None = None, # `source` should always close cleanly; this only fires if we fell back # to compiling the token-truncated `paper`. text = text.rstrip() + "\n\n\\end{document}\n" + # Rewrite \documentclass{} -> \documentclass{amsart} when the + # custom class isn't installed on the local TeX Live. Without this many + # journal-class papers (aq-amsart, sn-jnl, atlasdoc, aastex*, cas-sc, ...) + # bail at line 1 with "File `X.cls' not found". + text = _force_known_documentclass(text) + # Same idea for missing \usepackage{X.sty} (e.g. jinstpub on some hep-ex + # papers). Comment them out — pdflatex aborts hard on missing packages. 
+ text = _strip_missing_packages(text) # Strip orphan \input / \include — the perturbation corpus dumps each paper # into a single .md but a few preserve `\input{mypreamble.tex}`-style # directives that pdflatex can't resolve (fatal error, no PDF produced). @@ -218,26 +226,28 @@ def _replace(m): # command is already defined, so this is safe to inject blindly. _RESCUE_PREAMBLE = r""" % --- injected by reviewer3_adapter: providecommand fallbacks --- -% blackboard / cal / bf shortcuts authors commonly define per-paper -\providecommand{\bb}[1]{\mathbb{#1}} -\providecommand{\cal}[1]{\mathcal{#1}} -\providecommand{\bff}[1]{\mathbf{#1}} -% common single-letter shortcuts (\bbR, \calA, \bfx, etc.) -\makeatletter -\@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% - \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}% - \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}% +% Run at \begin{document} so we don't fight with packages that define the +% same shortcuts (e.g. amsfonts -> \Bbbk). \providecommand is itself a no-op +% when the command already exists, but only at the moment it runs; AtBeginDoc +% defers the check past all \usepackage{...} loads. 
+\AtBeginDocument{% + \providecommand{\bb}[1]{\mathbb{#1}}% + \providecommand{\cal}[1]{\mathcal{#1}}% + \providecommand{\bff}[1]{\mathbf{#1}}% + \makeatletter + \@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}% + \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}% + }% + \@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}% + }% + \makeatother + \providecommand{\eps}{\epsilon}% + \providecommand{\veps}{\varepsilon}% + \providecommand{\vvirg}{,\,}% + \providecommand{\ootimes}{\otimes}% } -\@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% - \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}% -} -\makeatother -% other commonly-used shortcuts -\providecommand{\eps}{\epsilon} -\providecommand{\veps}{\varepsilon} -\providecommand{\vvirg}{,\,} -\providecommand{\ootimes}{\otimes} -\providecommand{\Bbbk}{\mathbb{k}} % --- end injected preamble --- """ @@ -255,6 +265,88 @@ def _inject_rescue_preamble(text: str) -> str: return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:] +# Bundled-with-TeX-Live classes are kept as-is. Anything else gets rewritten +# to `\documentclass{amsart}` so pdflatex doesn't bail at line 1 with +# "File `.cls' not found". Class-specific commands in the body then +# emit undefined-control-sequence warnings, but pdflatex in nonstopmode still +# produces a usable PDF. 
+_KNOWN_CLASSES_CACHE: dict[str, bool] = {} + + +def _class_is_installed(cls: str) -> bool: + if cls in _KNOWN_CLASSES_CACHE: + return _KNOWN_CLASSES_CACHE[cls] + import subprocess + try: + r = subprocess.run(["kpsewhich", f"{cls}.cls"], + capture_output=True, timeout=5) + ok = (r.returncode == 0 and r.stdout.strip() != b"") + except Exception: + ok = False + _KNOWN_CLASSES_CACHE[cls] = ok + return ok + + +def _force_known_documentclass(text: str) -> str: + """Rewrite `\\documentclass[opts]{}` to `\\documentclass{amsart}` + when isn't bundled with the local TeX Live. Drops options too — + they're class-specific and frequently invalid against amsart. amsart is + the math-friendly fallback (preserves theorem/lemma environments). + + Matches only the first **uncommented** \\documentclass — many papers carry + example/commented-out variants before the active one. + """ + import re + pat = re.compile(r"^(?P[^%\n]*?)\\documentclass(\[[^\]]*\])?\{([^}]+)\}", + re.MULTILINE) + for m in pat.finditer(text): + # If there's a `%` before the `\documentclass` on this line, it's commented. + if "%" in m.group("lead"): + continue + cls = m.group(3).strip() + if _class_is_installed(cls): + return text + # Replace just the `\documentclass...{...}` span, keeping any leading content + # (it's empty in practice but be safe). + start = m.start() + len(m.group("lead")) + return text[:start] + r"\documentclass{amsart}" + text[m.end():] + return text + + +# Same pattern for missing packages — pdflatex aborts hard on +# "File `X.sty' not found", so comment out \usepackage{X} (and the bracketed +# options line) when X isn't installed. Bundled-with-TeX packages stay. 
+_KNOWN_PACKAGES_CACHE: dict[str, bool] = {} + + +def _package_is_installed(pkg: str) -> bool: + if pkg in _KNOWN_PACKAGES_CACHE: + return _KNOWN_PACKAGES_CACHE[pkg] + import subprocess + try: + r = subprocess.run(["kpsewhich", f"{pkg}.sty"], + capture_output=True, timeout=5) + ok = (r.returncode == 0 and r.stdout.strip() != b"") + except Exception: + ok = False + _KNOWN_PACKAGES_CACHE[pkg] = ok + return ok + + +def _strip_missing_packages(text: str) -> str: + """Comment out `\\usepackage[opts]{X}` lines whose .sty isn't installed. + Multi-package forms like `\\usepackage{a,b,c}` are split: any missing + member causes the whole line to be commented out (rare in practice). + """ + import re + def _replace(m): + pkgs = [p.strip() for p in m.group(2).split(",")] + if all(_package_is_installed(p) for p in pkgs if p): + return m.group(0) + return "% [stripped missing package] " + m.group(0) + return re.sub(r"\\usepackage(\[[^\]]*\])?\{([^}]+)\}", _replace, text) + + def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path: """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more, return a trimmed sibling cached next to it.""" From fc84a6eb4d51f8f379f7d9d30e153b81a7d5c1f7 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 23:50:37 -0500 Subject: [PATCH 6/7] reviewer3: persist sessionId for resume; eliminates duplicate-submit waste MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `.sid` file alongside each review output. On run: - If `.sid` exists and points to a still-valid R3 session, we POLL that session instead of submitting a new one. Avoids the duplicate-session credit waste observed when runs were killed mid-poll (~34% of credits spent on duplicates across yesterday's runs, per the rescue audit). - If the sid fetch hard-fails (403 "not found" / 404), drop the stale file and submit fresh. 
- On submit success: write the sid IMMEDIATELY so a SIGKILL between submit and first poll-tick still leaves a recovery path. - On any failure: keep the sid file — next run resumes the same session. - On success: leave the sid file in place as an audit trail (cheap; the out_json's presence is the real "done" marker for skip-completed). Perturbation: - `Reviewer3Job` gains `sid_file: Path | None`. `Reviewer3System.build_jobs` computes `{review_dir}/{stem}.sid` per cell and threads it through. - `_run_one` handles the resume vs submit branch. Conference: - `run_competitors.py` injects `cfg["_sid_file"]` next to the merged paper JSON (under `{results_dir}/.sids/{slug}.{method_key}.sid`). - Conference `Reviewer3Adapter.review()` honors the underscore-prefixed key and persists/resumes the same way. Also raises the conference `max_per_model` from 1 back to 5 after confirming R3's throttle from yesterday has lifted (single-paper probe transitioned waiting -> processing). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../competitors/reviewer3_adapter.py | 33 +++++++++- .../conference_study/configs/reviewer3.yaml | 2 +- .../conference_study/run_competitors.py | 9 ++- benchmarks/perturbation/systems/reviewer3.py | 7 +++ .../perturbation/systems/reviewer3_adapter.py | 61 ++++++++++++++++--- 5 files changed, 101 insertions(+), 11 deletions(-) diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py index 02caeed..82052c9 100644 --- a/benchmarks/conference_study/competitors/reviewer3_adapter.py +++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py @@ -55,8 +55,37 @@ def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview: # we already cap on our side. Untrimmed full PDFs were tripping R3's # HTTP 413 limit and inflating per-paper wall time. 
max_pages = cfg.get("max_pages") or opts.get("max_pages") - session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages) - body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}") + + # sid_file is injected by run_competitors.py just before invocation: + # `cfg["_sid_file"] = out_file.with_suffix(".sid")`. If present and + # the file already exists, we resume that R3 session instead of + # submitting fresh — avoids duplicate-session credit waste when a + # prior run was killed mid-poll. (See PR notes; ~34% of credits + # observed wasted on duplicates before this fix.) + sid_file = cfg.get("_sid_file") + if isinstance(sid_file, str): + sid_file = Path(sid_file) + session_id = "" + body = None + if sid_file and sid_file.exists(): + session_id = sid_file.read_text().strip() + try: + body = _r3._poll_until_done(rcfg, session_id, + tag=f"reviewer3/{pdf.stem} (resumed)") + except RuntimeError as e: + m = str(e) + if "fetch failed" in m and ("403" in m or "404" in m): + sid_file.unlink(missing_ok=True) + session_id, body = "", None + else: + raise + + if body is None: + session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages) + if sid_file: + sid_file.parent.mkdir(parents=True, exist_ok=True) + sid_file.write_text(session_id) + body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}") comments: list[NormalizedComment] = [] for i, raw in enumerate(body.get("comments") or []): diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml index fccf923..1f6d19f 100644 --- a/benchmarks/conference_study/configs/reviewer3.yaml +++ b/benchmarks/conference_study/configs/reviewer3.yaml @@ -22,7 +22,7 @@ models: - reviewer3 timeout_sec: 3600 # outer per-(paper, model) wall cap (R3 is 10-30 min typical) -max_per_model: 5 # 5 concurrent submits/polls against R3 (matches plan) +max_per_model: 5 max_pages: 20 # parse_document cap; matches coarse.yaml convention # 
Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...). diff --git a/benchmarks/conference_study/run_competitors.py b/benchmarks/conference_study/run_competitors.py index 68214c6..3c5eb34 100644 --- a/benchmarks/conference_study/run_competitors.py +++ b/benchmarks/conference_study/run_competitors.py @@ -125,6 +125,14 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False) title, content, _was_ocr = parse_document(pdf, max_pages=MAX_PAGES) paragraphs = split_into_paragraphs(content) + # Inject sid_file location so the adapter can persist/resume the + # competitor-side session id (e.g. reviewer3). Adapters that don't + # care simply ignore the underscore-prefixed key. The file lives + # next to the merged paper JSON so it survives across runs. + out_file = RESULTS_DIR / f"{paper['slug']}.json" + sid_dir = RESULTS_DIR / ".sids" + cfg = {**cfg, "_sid_file": sid_dir / f"{paper['slug']}.{method_key}.sid"} + review = adapter.review(pdf, model, cfg) method_data = build_method_data( @@ -134,7 +142,6 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False) paragraphs=paragraphs, ) - out_file = RESULTS_DIR / f"{paper['slug']}.json" merge_into_paper_json( out_file=out_file, slug=paper["slug"], diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py index 8ccc9a7..503720f 100644 --- a/benchmarks/perturbation/systems/reviewer3.py +++ b/benchmarks/perturbation/systems/reviewer3.py @@ -51,6 +51,11 @@ def build_jobs(self, units, cfg, results_dir): continue out_json.unlink() tag = f"{domain}/{u.paper_label}/{u.error_type}/{REVIEWER3_SLUG}" + # Persist the R3 sessionId next to the review JSON. If a prior + # run was killed mid-poll, the next run resumes that session + # instead of creating a duplicate (~34% credit waste observed + # without this). 
+ sid_file = review_dir / f"{u.staged_corrupted.stem}.sid" job = ReviewJob( tag=tag, out_json=out_json, review_dir=review_dir, paper_label=f"{u.error_type}/{u.paper_label}", @@ -58,6 +63,7 @@ def build_jobs(self, units, cfg, results_dir): "paper": u.staged_corrupted, "source": u.src_corrupted, "max_pages": max_pages, + "sid_file": sid_file, "overrides": overrides, }, ) @@ -77,6 +83,7 @@ def run_jobs(self, cell_key, jobs, parallel): paper_label=j.tag, source=j.payload.get("source"), max_pages=j.payload.get("max_pages"), + sid_file=j.payload.get("sid_file"), ) for j in jobs ] diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py index 3bbd96a..552d911 100644 --- a/benchmarks/perturbation/systems/reviewer3_adapter.py +++ b/benchmarks/perturbation/systems/reviewer3_adapter.py @@ -101,6 +101,12 @@ class Reviewer3Job: # Optional: trim the rendered PDF to its first N pages so R3 still sees # roughly the same window other systems do (coarse uses max_pages: 20). max_pages: int | None = None + # Optional: where to persist the R3 sessionId. When set, _run_one writes + # the sid here right after submit succeeds and reads it back on subsequent + # runs to resume the same R3 session instead of creating a duplicate. + # This prevents the duplicate-session credit waste we observed when we + # killed runs mid-poll (R3 keeps the session live and we'd submit fresh). 
+ sid_file: Path | None = None @dataclass @@ -529,26 +535,67 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result: job.out_json.parent.mkdir(parents=True, exist_ok=True) raw_path = job.out_json.with_suffix(".raw.json") tag = f"reviewer3/{job.paper_label}" - print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True) start = time.time() sid = "" + sid_file = job.sid_file + + def _write_outputs(body: dict, elapsed: float) -> int: + raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False)) + pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed) + job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False)) + # Successful → sid_file no longer needed for resume, but leave it as + # an audit trail (cheap, sometimes useful for tracing back to R3 UI). + return len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"]) + try: + # Resume path: if a sid_file exists, try to recover the prior session + # instead of submitting fresh. This is the dedup-credit-waste fix. + if sid_file and sid_file.exists(): + sid = sid_file.read_text().strip() + print(f"[{tag}] resuming sessionId={sid} (from {sid_file.name})", + file=sys.stderr, flush=True) + try: + body = _poll_until_done(cfg, sid, tag=tag) + elapsed = time.time() - start + n = _write_outputs(body, elapsed) + print(f"[{tag}] done in {elapsed:.0f}s ({n} comments, resumed)", + file=sys.stderr, flush=True) + return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed, + session_id=sid, raw_response=body) + except RuntimeError as e: + # Session-fetch hard fail (e.g., 403/404 — sid stale/invalid). + # Drop the sid_file and fall through to fresh submit. 
+ msg = str(e) + if "fetch failed" in msg and ("403" in msg or "404" in msg): + print(f"[{tag}] sid {sid} unrecoverable ({msg[:80]}); " + "submitting fresh", file=sys.stderr, flush=True) + sid_file.unlink(missing_ok=True) + sid = "" + else: + raise # other RuntimeErrors (e.g., status=failed) propagate + + # Submit path: no usable sid_file, send a new submission. + print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True) sid = _submit(cfg, job.paper, title=job.title, source=job.source, max_pages=job.max_pages) print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True) + if sid_file: + sid_file.parent.mkdir(parents=True, exist_ok=True) + sid_file.write_text(sid) body = _poll_until_done(cfg, sid, tag=tag) elapsed = time.time() - start - raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False)) - pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed) - job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False)) - n = len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"]) - print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)", file=sys.stderr, flush=True) + n = _write_outputs(body, elapsed) + print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)", + file=sys.stderr, flush=True) return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed, session_id=sid, raw_response=body) except Exception as e: elapsed = time.time() - start msg = f"{type(e).__name__}: {e}" - print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}", file=sys.stderr, flush=True) + print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}", + file=sys.stderr, flush=True) + # Keep sid_file on failure — next run will retry the same session + # rather than incur a duplicate submission cost. 
return Reviewer3Result(job=job, ok=False, elapsed_s=elapsed, session_id=sid, error=msg) From b3a167585b4286920ca5ffacf0cd2239142f3cf6 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Sun, 17 May 2026 21:50:40 -0500 Subject: [PATCH 7/7] reviewer3: add rescue_sessions.py + tighten gitignore for symlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small additions: 1. `benchmarks/perturbation/rescue_sessions.py` — CLI that walks every `.sid` file under both result trees, fetches its R3 session via the API, and writes results to disk for any session that completed in the meantime (status=completed). Idempotent — `out_json` with content is skipped — so safe to re-run. The pattern emerged when ~30+ R3 sessions completed on the server after our local poll loop had timed out and abandoned them. Rescue recovered them without re-submitting (no duplicate-credit cost). Reviewer3 is the only system in the benchmark with async server-side state, so this stays reviewer3-specific by design. 2. `.gitignore` — add slash-less entries for the three symlinks we create when running R3 from a sibling worktree: - `.venv` (the existing `.venv/` rule misses symlinks) - `benchmarks/perturbation/data` - `benchmarks/perturbation/results` No files were tracked at these paths; this just cleans up `git status` noise when the symlinks are present. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 + benchmarks/perturbation/rescue_sessions.py | 285 +++++++++++++++++++++ 2 files changed, 288 insertions(+) create mode 100644 benchmarks/perturbation/rescue_sessions.py diff --git a/.gitignore b/.gitignore index fdb5bed..4dd8729 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.pyc *.pyo .venv/ +.venv venv/ # Jupyter @@ -24,6 +25,8 @@ venv/ review_results/ benchmarks/conference_study/results benchmarks/perturbation/perturbation_results +benchmarks/perturbation/results +benchmarks/perturbation/data # conference_study study artifacts (not code) benchmarks/conference_study/manifests diff --git a/benchmarks/perturbation/rescue_sessions.py b/benchmarks/perturbation/rescue_sessions.py new file mode 100644 index 0000000..1070b29 --- /dev/null +++ b/benchmarks/perturbation/rescue_sessions.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +"""Pull down completed Reviewer 3 sessions whose local poll loop gave up. + +Reviewer 3 is async: a session keeps processing on their server even after +our adapter has timed out / been killed. The adapter persists the sessionId +to a `.sid` file next to each cell's output JSON; this script walks those +`.sid` files, fetches each session, and writes results to disk for the ones +that completed in the meantime. + +Walks both result trees: + * Perturbation: {results_root}/full_{domain}_reviewer3/.../review/*.sid + * Conference: {conference_study}/results/reviewer3_v2/.sids/*.sid + +Skips cells whose `out_json` already has comments. Safe to re-run. + +Usage: + python rescue_sessions.py # rescue both, write results + python rescue_sessions.py --dry-run # just report what's recoverable + python rescue_sessions.py --kind perturbation + python rescue_sessions.py --kind conference + +Required env: REVIEWER3_API_KEY (used for the `review:read` GET). + +Exit code: 1 if REVIEWER3_API_KEY is unset; otherwise 0 (fetch and write +failures are logged, not fatal, so the script is safe to re-run). 
+""" +from __future__ import annotations + +import argparse +import json +import os +import sys +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from pathlib import Path + +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass + +import requests # noqa: E402 + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) +from systems.reviewer3_adapter import build_pipeline_json # noqa: E402 + +# Repo root + conference_study root +REPO = HERE.parent.parent +CONFERENCE = REPO / "benchmarks" / "conference_study" +sys.path.insert(0, str(REPO / "src")) +sys.path.insert(0, str(CONFERENCE)) + + +REVIEWER3_BASE_URL = os.environ.get("REVIEWER3_BASE_URL", "https://reviewer3.com").rstrip("/") + + +@dataclass +class Orphan: + """A `.sid` file pointing at a session we never wrote out_json for.""" + kind: str # "pert" or "conf" + sid_file: Path + out_json: Path # where we'd write the result if status == completed + extra: dict # kind-specific (e.g. conference slug) + + +def _has_content(out_json: Path, method_key: str | None = None) -> bool: + """`out_json` exists AND already has a comment-bearing method entry.""" + if not out_json.exists(): + return False + try: + d = json.loads(out_json.read_text()) + except Exception: + return False + methods = d.get("methods") or {} + if method_key is not None: + m = methods.get(method_key) + return bool(m and m.get("comments")) + return bool(methods) and any(m.get("comments") for m in methods.values()) + + +def find_orphans(kind: str = "both") -> list[Orphan]: + """Walk .sid files; return ones whose result JSON is missing or empty.""" + out: list[Orphan] = [] + + if kind in ("both", "perturbation"): + # Result paths are resolved via Config.results_dir, which the runner + # resolves to absolute via REPO/. The most robust thing + # is to walk every `.sid` under any `full_*_reviewer3` results dir + # that the runner might write to. 
We check both the local results + # tree (under this worktree) and any sibling worktrees the user may + # have symlinked in. + pert_roots = [REPO / "benchmarks" / "perturbation" / "results"] + # Follow the symlink target too in case results/ is a symlink (we use + # this pattern when running from a different worktree than the data). + resolved = (REPO / "benchmarks" / "perturbation" / "results").resolve() + if resolved not in pert_roots: + pert_roots.append(resolved) + seen = set() + for root in pert_roots: + if not root.exists(): + continue + for sid_file in root.rglob("*.sid"): + if sid_file in seen: + continue + seen.add(sid_file) + out_json = sid_file.with_suffix(".json") + if _has_content(out_json): + continue + out.append(Orphan(kind="pert", sid_file=sid_file, + out_json=out_json, extra={})) + + if kind in ("both", "conference"): + conf_root = CONFERENCE / "results" / "reviewer3_v2" + sids_dir = conf_root / ".sids" + if sids_dir.exists(): + for sid_file in sids_dir.glob("*.sid"): + # filename: ..sid + stem_parts = sid_file.stem.split(".") + slug = stem_parts[0] + method_key = ".".join(stem_parts[1:]) or "reviewer3__reviewer3" + out_json = conf_root / f"{slug}.json" + if _has_content(out_json, method_key=method_key): + continue + out.append(Orphan(kind="conf", sid_file=sid_file, + out_json=out_json, + extra={"slug": slug, "method_key": method_key})) + + return out + + +def fetch_session(sid: str, *, headers: dict, timeout: float = 30.0) -> dict | None: + """Return the full session body if status==completed, else None.""" + url = f"{REVIEWER3_BASE_URL}/api/internal/review/{sid}" + try: + r = requests.get(url, headers=headers, timeout=timeout) + except Exception as e: + print(f" fetch error for {sid}: {type(e).__name__}: {e}", + file=sys.stderr, flush=True) + return None + if r.status_code != 200: + print(f" {sid}: HTTP {r.status_code}", file=sys.stderr, flush=True) + return None + body = r.json() + if body.get("status") != "completed": + return None + return 
body + + +def write_perturbation(orphan: Orphan, body: dict) -> None: + """Write pipeline JSON + raw.json next to the .sid.""" + raw_path = orphan.sid_file.with_suffix(".raw.json") + raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False)) + # Synthesize a paper path from the sid_file stem (e.g. paper_001_corrupted.md) + pj = build_pipeline_json(Path(orphan.sid_file.stem + ".md"), + body, elapsed_s=0.0) + orphan.out_json.write_text(json.dumps(pj, indent=2, ensure_ascii=False)) + + +def write_conference(orphan: Orphan, body: dict, *, _cache: dict = {}) -> None: + """Build the merged paper JSON for the conference cohort.""" + # Lazy imports to keep --kind perturbation fast (avoids parse_document deps). + if "loaded" not in _cache: + from competitors import get_adapter # noqa: F401 + from competitors.helpers import build_method_data, merge_into_paper_json + from competitors.base import NormalizedComment, NormalizedReview + from reviewer.parsers import parse_document + from reviewer.utils import split_into_paragraphs + import systems.reviewer3_adapter as r3a + manifest = json.loads((CONFERENCE / "manifests/v2/combined.json").read_text()) + _cache.update( + build_method_data=build_method_data, + merge_into_paper_json=merge_into_paper_json, + NormalizedComment=NormalizedComment, + NormalizedReview=NormalizedReview, + parse_document=parse_document, + split_into_paragraphs=split_into_paragraphs, + r3a=r3a, + slug2paper={p["slug"]: p for p in manifest["papers"]}, + loaded=True, + ) + slug = orphan.extra["slug"] + method_key = orphan.extra["method_key"] + paper = _cache["slug2paper"].get(slug) + if not paper: + print(f" {slug}: no manifest entry, skipping", file=sys.stderr) + return + pdf = CONFERENCE / paper["pdf_path"] + title, content, _ = _cache["parse_document"](pdf, max_pages=20) + paragraphs = _cache["split_into_paragraphs"](content) + comments = [] + for i, c in enumerate(body.get("comments") or []): + if not isinstance(c, dict): + c = {"comment": str(c)} 
+ n = _cache["r3a"]._normalize_comment(c, i) + comments.append(_cache["NormalizedComment"]( + title=n.get("title", ""), quote=n.get("quote", ""), + explanation=n.get("explanation", ""), + comment_type=n.get("comment_type", "technical"), + extra={"severity": n.get("severity"), + "reviewerId": c.get("reviewerId"), + "rank": c.get("rank"), + "session_id": (body.get("session") or {}).get("id")}, + )) + review = _cache["NormalizedReview"]( + comments=comments, overall_feedback="", cost_usd=None, + cost_method="estimated", model="reviewer3", + ) + md = _cache["build_method_data"]( + review=review, method_key=method_key, + method_label="Reviewer3 (reviewer3)", paragraphs=paragraphs, + ) + _cache["merge_into_paper_json"]( + out_file=orphan.out_json, slug=slug, + title=paper.get("title") or title, paragraphs=paragraphs, + method_key=method_key, method_data=md, + ) + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument("--kind", choices=["both", "perturbation", "conference"], + default="both") + ap.add_argument("--dry-run", action="store_true", + help="Just report what's recoverable; don't write anything.") + ap.add_argument("--parallel", type=int, default=10, + help="Concurrent API fetches (default 10).") + args = ap.parse_args() + + api_key = os.environ.get("REVIEWER3_API_KEY") + if not api_key: + print("REVIEWER3_API_KEY not set (check .env)", file=sys.stderr) + return 1 + headers = {"x-api-key": api_key} + + orphans = find_orphans(args.kind) + print(f"found {len(orphans)} orphan .sid file(s) " + f"(no result on disk for these cells yet)") + if not orphans: + return 0 + + # Fetch every sid in parallel; bucket by status. 
+ def _fetch(orphan: Orphan): + sid = orphan.sid_file.read_text().strip() + if not sid: + return orphan, sid, None + return orphan, sid, fetch_session(sid, headers=headers) + + completed: list[tuple[Orphan, str, dict]] = [] + incomplete = 0 + with ThreadPoolExecutor(max_workers=args.parallel) as pool: + for orphan, sid, body in pool.map(_fetch, orphans): + if body is None: + incomplete += 1 + continue + completed.append((orphan, sid, body)) + + print(f" {len(completed)} completed on R3 (recoverable)") + print(f" {incomplete} still in-progress / failed / unreachable") + + if args.dry_run or not completed: + for orphan, sid, _ in completed: + print(f" would write: {orphan.out_json} (sid={sid})") + return 0 + + n_pert = n_conf = 0 + for orphan, sid, body in completed: + try: + if orphan.kind == "pert": + write_perturbation(orphan, body) + n_pert += 1 + else: + write_conference(orphan, body) + n_conf += 1 + except Exception as e: + print(f" write failed for {sid} ({orphan.out_json}): " + f"{type(e).__name__}: {e}", file=sys.stderr) + + print(f"\nrescued: perturbation={n_pert} conference={n_conf}") + return 0 + + +if __name__ == "__main__": + sys.exit(main())