From bf6b3b6fa992e72e46190a47614218e14345f5f8 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 15 May 2026 15:30:49 -0500
Subject: [PATCH 1/7] Add Reviewer 3 (closed-source HTTP API) as a benchmark
 system
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires reviewer3 into both the perturbation benchmark (run_benchmark.py) and
the conference outcomes study (run_competitors.py), so it can be benchmarked
on the exact same paper sets as openaireview and coarse.

Highlights
- Perturbation: LaTeX-as-md sources are compiled to PDF with pdflatex
  before submission (cs_CC corpus is full LaTeX); inputs that are already
  PDF pass through unchanged, and any other non-LaTeX input is rejected
  with a RuntimeError. The upload MIME type is now application/pdf; cfg
  knobs (review_mode, poll_interval_s, poll_timeout_s) are threaded from
  YAML through Reviewer3System into the adapter via the job payload.
- Comment normalization picks up the recently-added Reviewer 3 fields:
  citedText (-> quote), title, severity (1-4 -> canonical 3-tier scale).
- Severity mapping is consolidated into benchmarks/perturbation/_severity.py
  so the perturbation adapter, compute_auc.py, and report_scaleup.py share
  one source of truth (collapses three inlined copies of COARSE_SEVERITY_MAP).
- Conference adapter reuses the perturbation HTTP submit/poll/normalize
  helpers via sys.path import; no duplication. R3 has no model selector,
  so method_key is fixed at reviewer3__reviewer3.
- 8 perturbation configs (full_<domain>_reviewer3.yaml) mirror the
  full_<domain>_coarse.yaml knobs for max_tokens / min_perturbations.
- conference_study/configs/reviewer3.yaml pins models: [reviewer3] so
  the per-(paper, model) loop fires once per paper rather than once per
  manifest model.
- requests>=2.31 added to [project.optional-dependencies] benchmarks.
- .env.example documents REVIEWER3_API_KEY / REVIEWER3_USER_ID
  (UUID, not email — see neurips_2026 setup notes).

Operational note (not in this diff)
- conference_study/{manifests,papers,results} are gitignored data dirs
  that live only in the sibling worktree. To run, symlink them in:
    cd benchmarks/conference_study
    ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests
    ln -s ../../../OpenAIReview/benchmarks/conference_study/papers    papers
    ln -s ../../../OpenAIReview/benchmarks/conference_study/results   results

Known gap
- run_competitors.py does not load .env (only run_benchmark.py does);
  export REVIEWER3_* before launching, or add a dotenv.load_dotenv() at
  the top of run_competitors.py in a follow-up.

Smoke-validated end-to-end on 1 cs_CC paper through the perturbation
runner; recall 10/17 on the staged perturbations (LLM judge).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .env.example                                  |  6 ++
 .../conference_study/analyses/compute_auc.py  | 19 ++---
 .../analyses/report_scaleup.py                | 22 ++---
 .../conference_study/competitors/registry.py  |  2 +
 .../competitors/reviewer3_adapter.py          | 83 ++++++++++++++++++
 .../conference_study/configs/reviewer3.yaml   | 32 +++++++
 benchmarks/perturbation/_severity.py          | 85 +++++++++++++++++++
 .../configs/full_cs_CC_reviewer3.yaml         | 13 +++
 .../configs/full_cs_LG_reviewer3.yaml         | 13 +++
 .../configs/full_econ_EM_reviewer3.yaml       | 13 +++
 .../configs/full_hep_ex_reviewer3.yaml        | 13 +++
 .../configs/full_math_all_reviewer3.yaml      | 13 +++
 .../full_physics_atm_clus_reviewer3.yaml      | 13 +++
 .../configs/full_q_bio_GN_reviewer3.yaml      | 13 +++
 .../configs/full_stat_AP_reviewer3.yaml       | 13 +++
 benchmarks/perturbation/systems/reviewer3.py  |  9 +-
 .../perturbation/systems/reviewer3_adapter.py | 73 +++++++++++++---
 pyproject.toml                                |  2 +-
 18 files changed, 396 insertions(+), 41 deletions(-)
 create mode 100644 benchmarks/conference_study/competitors/reviewer3_adapter.py
 create mode 100644 benchmarks/conference_study/configs/reviewer3.yaml
 create mode 100644 benchmarks/perturbation/_severity.py
 create mode 100644 benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
 create mode 100644 benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
diff --git a/.env.example b/.env.example
index 56aac36..4e36a22 100644
--- a/.env.example
+++ b/.env.example
@@ -15,3 +15,9 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here
 
 # Optional: custom OpenAI base URL (e.g. EU endpoint, Azure)
 # OPENAI_BASE_URL=https://eu.api.openai.com/v1
+
+# Reviewer 3 (closed-source HTTP API; benchmarks/ only). Required for the perturbation
+# benchmark (system: reviewer3) and the conference study (competitor: reviewer3).
+# REVIEWER3_API_KEY=sk_...
+# REVIEWER3_USER_ID=<uuid from web UI session JSON; not an email>
+# REVIEWER3_BASE_URL=https://reviewer3.com  # optional override
diff --git a/benchmarks/conference_study/analyses/compute_auc.py b/benchmarks/conference_study/analyses/compute_auc.py
index d32f1a9..0d87e23 100644
--- a/benchmarks/conference_study/analyses/compute_auc.py
+++ b/benchmarks/conference_study/analyses/compute_auc.py
@@ -51,17 +51,14 @@
 REPO_ROOT = HERE.parent  # benchmarks/conference_study/
 RESULTS_ROOT = REPO_ROOT / "results"
 
-COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
-SEVERITY_TIERS = ("major", "moderate", "minor")
-
-
-def normalize_severity(method: str, raw: str | None) -> str | None:
-    if not raw:
-        return None
-    raw = raw.lower()
-    if method == "coarse":
-        return COARSE_SEVERITY_MAP.get(raw)
-    return raw if raw in SEVERITY_TIERS else None
+# Severity normalization lives in benchmarks/perturbation/_severity.py so the
+# perturbation adapters and these analyses use one source of truth.
+sys.path.insert(0, str(HERE.parents[1] / "perturbation"))
+from _severity import (  # noqa: E402
+    COARSE_SEVERITY_MAP,
+    TIERS as SEVERITY_TIERS,
+    normalize_severity,
+)
 
 
 def load_manifest(path: Path) -> dict[str, list[dict]]:
diff --git a/benchmarks/conference_study/analyses/report_scaleup.py b/benchmarks/conference_study/analyses/report_scaleup.py
index 79dce76..8a43725 100644
--- a/benchmarks/conference_study/analyses/report_scaleup.py
+++ b/benchmarks/conference_study/analyses/report_scaleup.py
@@ -204,20 +204,14 @@ def comment_metrics_by_method(
     return out
 
 
-# Coarse uses {minor, major, critical}; openaireview methods use
-# {minor, moderate, major}. Normalize so a single set of tiers compares
-# apples-to-apples (highest=major, mid=moderate, low=minor).
-_COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
-SEVERITY_TIERS = ("major", "moderate", "minor")
-
-
-def normalize_severity(method: str, raw: str | None) -> str | None:
-    if not raw:
-        return None
-    raw = raw.lower()
-    if method == "coarse":
-        return _COARSE_SEVERITY_MAP.get(raw)
-    return raw if raw in SEVERITY_TIERS else None
+# Severity normalization lives in benchmarks/perturbation/_severity.py so the
+# perturbation adapters and these analyses use one source of truth.
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "perturbation"))
+from _severity import (  # noqa: E402
+    COARSE_SEVERITY_MAP as _COARSE_SEVERITY_MAP,
+    TIERS as SEVERITY_TIERS,
+    normalize_severity,
+)
 
 
 def severity_counts_by_method(
diff --git a/benchmarks/conference_study/competitors/registry.py b/benchmarks/conference_study/competitors/registry.py
index be0a25a..7a64bd8 100644
--- a/benchmarks/conference_study/competitors/registry.py
+++ b/benchmarks/conference_study/competitors/registry.py
@@ -9,9 +9,11 @@
 
 from .base import CompetitorAdapter
 from .coarse_adapter import CoarseAdapter
+from .reviewer3_adapter import Reviewer3Adapter
 
 _REGISTRY: dict[str, type[CompetitorAdapter]] = {
     "coarse": CoarseAdapter,
+    "reviewer3": Reviewer3Adapter,
 }
 
 
diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py
new file mode 100644
index 0000000..be4dbd2
--- /dev/null
+++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py
@@ -0,0 +1,83 @@
+"""Adapter for Reviewer 3 (closed-source HTTP API).
+
+Submission flow is the same as the perturbation benchmark — POST a PDF to
+`/api/internal/review`, poll the session until the `status` enum is terminal,
+then map each comment via `_normalize_comment`. We reuse those helpers from
+the perturbation adapter (`benchmarks/perturbation/systems/reviewer3_adapter.py`)
+rather than duplicating the HTTP code; the only difference here is that
+conference inputs arrive as PDFs already, so the LaTeX-as-md → PDF compile
+step (`_ensure_pdf`, still called inside `_submit`) returns them unchanged.
+
+Reviewer 3 has no model selector, so `method_key(...)` always returns
+`"reviewer3__reviewer3"` regardless of the manifest `model` value. The
+conference YAML should pin `models: [reviewer3]` to avoid duplicate
+submissions across a phantom model loop.
+
+Required env:
+    REVIEWER3_API_KEY    sk_... (sent as `x-api-key` header)
+    REVIEWER3_USER_ID    UUID from the vendor's web UI session JSON (not an email)
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+from .base import CompetitorAdapter, NormalizedComment, NormalizedReview
+
+# Reuse the perturbation adapter's HTTP + normalization helpers.
+_PERT = Path(__file__).resolve().parents[2] / "perturbation" / "systems"
+sys.path.insert(0, str(_PERT))
+import reviewer3_adapter as _r3  # noqa: E402
+
+
+_METHOD_KEY = f"{_r3.REVIEWER3_SLUG}__{_r3.REVIEWER3_SLUG}"
+
+
+class Reviewer3Adapter(CompetitorAdapter):
+    name = "reviewer3"
+    required_env = ("REVIEWER3_API_KEY", "REVIEWER3_USER_ID")
+
+    def method_key(self, model: str) -> str:
+        # R3 has no model selector — fixed key regardless of `model`.
+        return _METHOD_KEY
+
+    def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview:
+        opts = cfg.get("reviewer3_options", {}) or {}
+        rcfg = _r3.config_from_env()
+        for k in ("review_mode", "poll_interval_s", "poll_timeout_s",
+                  "request_timeout_s", "base_url"):
+            if k in opts and opts[k] is not None:
+                setattr(rcfg, k, opts[k])
+
+        session_id = _r3._submit(rcfg, pdf, title=pdf.stem)
+        body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")
+
+        comments: list[NormalizedComment] = []
+        for i, raw in enumerate(body.get("comments") or []):
+            if not isinstance(raw, dict):
+                raw = {"comment": str(raw)}
+            norm = _r3._normalize_comment(raw, i)
+            comments.append(NormalizedComment(
+                title=norm.get("title", ""),
+                quote=norm.get("quote", ""),
+                explanation=norm.get("explanation", ""),
+                comment_type=norm.get("comment_type", "technical"),
+                extra={
+                    "severity": norm.get("severity"),
+                    "reviewerId": raw.get("reviewerId"),
+                    "rank": raw.get("rank"),
+                    "session_id": session_id,
+                },
+            ))
+
+        # R3 doesn't publish pricing and doesn't return overall_feedback or
+        # token counts in its response, so we leave those empty/None.
+        return NormalizedReview(
+            comments=comments,
+            overall_feedback="",
+            cost_usd=None,
+            cost_method="estimated",
+            prompt_tokens=None,
+            completion_tokens=None,
+            model=_r3.REVIEWER3_SLUG,
+        )
diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml
new file mode 100644
index 0000000..f090d41
--- /dev/null
+++ b/benchmarks/conference_study/configs/reviewer3.yaml
@@ -0,0 +1,32 @@
+# Reviewer 3 (closed-source HTTP API) run on the v2 conference cohort.
+# Results -> benchmarks/conference_study/results/reviewer3_v2/
+# Log    -> benchmarks/conference_study/results/reviewer3_v2/run_log.jsonl
+#
+# Prerequisites:
+#   - REVIEWER3_API_KEY and REVIEWER3_USER_ID exported in the shell (run_competitors.py does not load .env)
+#   - Manifest, papers, and results dirs reachable. In this worktree they are
+#     symlinks into the sibling OpenAIReview worktree (gitignored data lives
+#     only there). Set them up with:
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/papers    papers
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/results   results
+
+name: reviewer3_v2
+competitor: reviewer3
+
+manifest: manifests/v2/combined.json
+
+# R3 has no model selector. Pin to a single dummy entry so run_competitors.py
+# loops once per paper rather than once per (paper × manifest model).
+models:
+  - reviewer3
+
+timeout_sec: 3600       # outer per-(paper, model) wall cap (R3 is 10-30 min typical)
+max_per_model: 5        # 5 concurrent submits/polls against R3 (matches plan)
+max_pages: 20           # parse_document cap; matches coarse.yaml convention
+
+# Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...).
+reviewer3_options:
+  review_mode: author      # author | journal (R3 reviewMode enum)
+  poll_interval_s: 30
+  poll_timeout_s: 1800     # 30 min/paper cap inside the poll loop
diff --git a/benchmarks/perturbation/_severity.py b/benchmarks/perturbation/_severity.py
new file mode 100644
index 0000000..0e2db3c
--- /dev/null
+++ b/benchmarks/perturbation/_severity.py
@@ -0,0 +1,85 @@
+"""Canonical severity tiers and per-system normalization.
+
+The perturbation benchmark, the conference study analyses, and the viz layer
+all want to compare comment severities across review systems. Each system uses
+its own native vocabulary, so before any cross-system comparison the raw value
+must be mapped to the canonical 3-tier scale used by openaireview itself:
+
+    major     - Undermines a key claim/methodology; affects conclusions.
+    moderate  - Real error or gap that is localized and fixable.
+    minor     - Framing concern, mild overclaim, or resolvable ambiguity.
+
+Per-system maps:
+
+  * openaireview: identity. Output is already in {major, moderate, minor}.
+  * coarse:       {critical, major, minor} -> {major, moderate, minor}
+                  (shift down one tier; same mapping that the conference-study
+                  scripts in benchmarks/conference_study/analyses/ use).
+  * reviewer3:    integer 1..4 per their OpenAPI spec, where
+                  1=Critical, 2=Major, 3=Minor, 4=Editorial.
+                  Compressed to the 3-tier scale by collapsing R3 Minor and
+                  Editorial into `minor`, since in practice R3 tags substantive
+                  -but-lower-importance findings as Editorial rather than style
+                  notes. Confirm with the vendor if the label is later clarified.
+
+The conference_study analyses (`compute_auc.py` and `report_scaleup.py` under
+`benchmarks/conference_study/analyses/`) import `COARSE_SEVERITY_MAP`, `TIERS`,
+and `normalize_severity` from this module via a `sys.path` insert rather than
+keeping their own inlined copies.
+"""
+
+from __future__ import annotations
+
+
+TIERS: tuple[str, ...] = ("major", "moderate", "minor")
+
+
+# openaireview methods emit canonical tier strings directly.
+OPENAIREVIEW_SEVERITY_MAP: dict[str, str] = {t: t for t in TIERS}
+
+# coarse uses {minor, major, critical}. Shift down one level.
+COARSE_SEVERITY_MAP: dict[str, str] = {
+    "critical": "major",
+    "major": "moderate",
+    "minor": "minor",
+}
+
+# Reviewer 3 spec: 1=Critical, 2=Major, 3=Minor, 4=Editorial.
+# Compress to 3 tiers; Editorial collapses with Minor (see module docstring).
+REVIEWER3_SEVERITY_MAP: dict[int, str] = {
+    1: "major",
+    2: "moderate",
+    3: "minor",
+    4: "minor",
+}
+
+
+def normalize_severity(system: str, raw: object) -> str | None:
+    """Map a system-native severity value to the canonical 3-tier scale.
+
+    Returns None for unrecognized values so callers can decide whether to drop
+    the comment, default it, or warn.
+
+    `system` is the registry key matching `benchmarks/perturbation/systems/`:
+    'openaireview', 'coarse', or 'reviewer3'.
+    """
+    if raw is None:
+        return None
+    sysn = system.lower()
+    if sysn == "reviewer3":
+        if isinstance(raw, int):
+            return REVIEWER3_SEVERITY_MAP.get(raw)
+        # tolerate the str-form for hand-written test fixtures
+        try:
+            return REVIEWER3_SEVERITY_MAP.get(int(raw))
+        except (TypeError, ValueError):
+            return None
+    if not isinstance(raw, str):
+        return None
+    s = raw.lower()
+    if sysn == "coarse":
+        return COARSE_SEVERITY_MAP.get(s)
+    if sysn == "openaireview":
+        return OPENAIREVIEW_SEVERITY_MAP.get(s)
+    # unknown system -> pass through if already canonical
+    return s if s in TIERS else None
diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
new file mode 100644
index 0000000..3488e54
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_CC/all
+max_tokens: 13000          # match full_cs_CC_coarse.yaml
+min_perturbations: 5       # match full_cs_CC_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3
diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
new file mode 100644
index 0000000..bc3cbb3
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_LG/all
+max_tokens: 13000          # match full_cs_LG_coarse.yaml
+min_perturbations: 5       # match full_cs_LG_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3
diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
new file mode 100644
index 0000000..f871179
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/econ_EM/all
+max_tokens: 13000          # match full_econ_EM_coarse.yaml
+min_perturbations: 5       # match full_econ_EM_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3
diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
new file mode 100644
index 0000000..901fbdb
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/hep-ex/all
+max_tokens: 13000          # match full_hep_ex_coarse.yaml
+min_perturbations: 5       # match full_hep_ex_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3
diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
new file mode 100644
index 0000000..ef94db9
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/math_all/all
+max_tokens: 13000          # match full_math_all_coarse.yaml
+min_perturbations: 5       # match full_math_all_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_math_all_reviewer3
diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
new file mode 100644
index 0000000..79d5851
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/physics_atm-clus/all
+max_tokens: 13000          # match full_physics_atm_clus_coarse.yaml
+min_perturbations: 5       # match full_physics_atm_clus_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3
diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
new file mode 100644
index 0000000..00d3bf4
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/q-bio_GN/all
+max_tokens: 13000          # match full_q_bio_GN_coarse.yaml
+min_perturbations: 5       # match full_q_bio_GN_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3
diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
new file mode 100644
index 0000000..a375500
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
@@ -0,0 +1,13 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/stat_AP/all
+max_tokens: 13000          # match full_stat_AP_coarse.yaml
+min_perturbations: 5       # match full_stat_AP_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 1800       # 30 min/paper cap
+
+results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3
diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py
index 66ab23e..1f4b1b9 100644
--- a/benchmarks/perturbation/systems/reviewer3.py
+++ b/benchmarks/perturbation/systems/reviewer3.py
@@ -30,6 +30,10 @@ class Reviewer3System(System):
 
     def build_jobs(self, units, cfg, results_dir):
         domain = results_dir.name
+        overrides = {
+            k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s")
+            if k in cfg and cfg[k] is not None
+        }
         out: list[tuple[CellKey, ReviewJob]] = []
         for u in units:
             if not u.staged_corrupted.exists():
@@ -45,7 +49,7 @@ def build_jobs(self, units, cfg, results_dir):
             job = ReviewJob(
                 tag=tag, out_json=out_json, review_dir=review_dir,
                 paper_label=f"{u.error_type}/{u.paper_label}",
-                payload={"paper": u.staged_corrupted},
+                payload={"paper": u.staged_corrupted, "overrides": overrides},
             )
             out.append(((REVIEWER3_SLUG,), job))
         return out
@@ -54,7 +58,8 @@ def run_jobs(self, cell_key, jobs, parallel):
         if not jobs:
             return []
         cfg = reviewer3_adapter.config_from_env()
-        # cfg overrides come from the caller via run_jobs_with_cfg; default cfg is fine here.
+        for k, v in jobs[0].payload.get("overrides", {}).items():
+            setattr(cfg, k, v)
         adapter_jobs = [
             reviewer3_adapter.Reviewer3Job(
                 paper=j.payload["paper"],
diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py
index c2f7408..71c1196 100644
--- a/benchmarks/perturbation/systems/reviewer3_adapter.py
+++ b/benchmarks/perturbation/systems/reviewer3_adapter.py
@@ -114,8 +114,48 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]:
     return {"x-api-key": cfg.api_key}
 
 
+def _ensure_pdf(paper: Path) -> Path:
+    """Reviewer 3 only accepts PDF. If `paper` is already PDF, return it.
+    If it's a LaTeX source (`\\documentclass` within the first 2000 chars) — true
+    for the `cs_CC` corpus where `.md` files are really `.tex` — compile via pdflatex
+    and cache the result next to the source. Otherwise raise."""
+    if paper.suffix.lower() == ".pdf":
+        return paper
+    head = paper.read_text(errors="replace")[:2000]
+    if "\\documentclass" not in head:
+        raise RuntimeError(
+            f"don't know how to convert {paper.name} to PDF "
+            "(no \\documentclass found; expected LaTeX-as-md or PDF)"
+        )
+    cached = paper.with_suffix(".pdf")
+    if cached.exists() and cached.stat().st_mtime > paper.stat().st_mtime:
+        return cached
+    import shutil, subprocess, tempfile
+    text = paper.read_text(errors="replace")
+    # Staging truncates by tokens, which can leave the LaTeX source missing
+    # \end{document} (or mid-environment). Best-effort: ensure document closes.
+    if "\\end{document}" not in text:
+        text = text.rstrip() + "\n\n\\end{document}\n"
+    with tempfile.TemporaryDirectory() as td:
+        tex = Path(td) / "source.tex"
+        tex.write_text(text)
+        # Run twice to resolve cross-refs; ignore exit code, accept partial PDF.
+        for _ in range(2):
+            subprocess.run(
+                ["pdflatex", "-interaction=nonstopmode", "source.tex"],
+                cwd=td, capture_output=True, text=True, timeout=180,
+            )
+        out_pdf = Path(td) / "source.pdf"
+        if not out_pdf.exists() or out_pdf.stat().st_size < 1000:
+            log = (Path(td) / "source.log").read_text(errors="replace") if (Path(td) / "source.log").exists() else ""
+            raise RuntimeError(f"pdflatex produced no usable PDF for {paper}: {log[-1500:]}")
+        shutil.copy(out_pdf, cached)
+    return cached
+
+
 def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str:
     """POST /api/internal/review (multipart). Returns sessionId."""
+    paper = _ensure_pdf(paper)
     url = f"{cfg.base_url}/api/internal/review"
     data: dict[str, str] = {
         "userId": cfg.user_id,
@@ -125,7 +165,7 @@ def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str:
     if title:
         data["title"] = title
     with paper.open("rb") as fh:
-        files = {"file": (paper.name, fh, "text/markdown")}
+        files = {"file": (paper.name, fh, "application/pdf")}
         resp = requests.post(url, headers=_headers(cfg), data=data, files=files,
                              timeout=cfg.request_timeout_s)
     if resp.status_code >= 400:
@@ -180,29 +220,36 @@ def _pick(d: dict, *keys: str, default: str = "") -> str:
     return default
 
 
+# Canonical severity mapping lives in benchmarks/perturbation/_severity.py
+# so every system (coarse, reviewer3, openaireview) and the downstream
+# conference-study analyses share one source of truth.
+import sys as _sys
+from pathlib import Path as _Path
+_sys.path.insert(0, str(_Path(__file__).resolve().parent.parent))
+from _severity import normalize_severity as _normalize_r3_severity  # noqa: E402
+
+
 def _normalize_comment(raw: dict, idx: int) -> dict:
-    """Best-effort mapping from Reviewer 3 comment shape to pipeline schema.
+    """Map a Reviewer 3 comment to the pipeline schema.
 
-    The OpenAPI spec doesn't pin field names. We try common synonyms; anything
-    we don't recognize is preserved on the side as `_raw` for later inspection.
+    Reviewer 3 comments carry: reviewerId, comment, title, citedText, severity, rank.
+    `citedText` is the verbatim excerpt from the paper — what our scorer uses as
+    `quote` for fuzzy/semantic matching.
     """
     cid = _pick(raw, "id", "commentId", "uuid") or f"reviewer3_{idx}"
-    title = _pick(raw, "title", "subject", "heading", "summary")
-    quote = _pick(raw, "quote", "snippet", "excerpt", "passage", "text", "highlight")
-    explanation = _pick(raw, "explanation", "comment", "feedback", "body", "rationale", "message")
+    title = _pick(raw, "title")
+    quote = _pick(raw, "citedText", "quote", "snippet", "excerpt", "passage", "highlight")
+    explanation = _pick(raw, "comment", "explanation", "feedback", "body", "rationale", "message")
+    severity = _normalize_r3_severity("reviewer3", raw.get("severity"))
     if not explanation:
-        # last resort: serialize whatever we have so the comment isn't empty
-        explanation = json.dumps({k: v for k, v in raw.items()
-                                  if k not in ("id", "commentId", "uuid", "title", "subject",
-                                               "heading", "summary", "quote", "snippet",
-                                               "excerpt", "passage", "text", "highlight")},
-                                 ensure_ascii=False)
+        explanation = json.dumps(raw, ensure_ascii=False)
     return {
         "id": cid,
         "title": title,
         "quote": quote,
         "explanation": explanation,
         "comment_type": "technical",
+        "severity": severity,
         "paragraph_index": None,
         "_raw": raw,
     }
diff --git a/pyproject.toml b/pyproject.toml
index 396295e..2500007 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,4 +34,4 @@ reviewer = ["viz/*.html", "skill/SKILL.md", "skill/scripts/*.py", "skill/referen
 mistral = ["mistral-ocr-cli>=1.2.0"]
 deepseek = ["deepseek-ocr-cli>=0.4.2"]
 dev = ["pytest>=8.0"]
-benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0"]
+benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0", "requests>=2.31"]

From 3453bf9cd3836597366c2f6127a8effec7075e79 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 15 May 2026 15:43:02 -0500
Subject: [PATCH 2/7] gitignore: cover conference_study symlink, ephemeral
 perturbation configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two cleanups so `git status` reflects only real work:

Root .gitignore
- Add slash-less `benchmarks/conference_study/papers` (the existing
  `papers/` rule in conference_study/.gitignore uses a trailing slash and
  wouldn't match a symlink — same situation the file already handles for
  manifests/ and results/).
- Ignore `benchmarks/experimental_perturbations/` (removed from the repo
  in 6373fad but the local dir lingers).

benchmarks/perturbation/.gitignore
- Extend the "ephemeral configs" rule beyond `configs/_*` to cover the
  per-domain configs we generate locally but don't check in:
    configs/cs_*scaleup*.yaml
    configs/full_*.yaml
    configs/grok_*.yaml
    configs/longtail_*.yaml
    configs/subset_*.yaml
    configs/r3_smoke*.yaml
  Add `!configs/full_*_reviewer3.yaml` exception so the canonical
  reviewer3 configs that ARE tracked don't get hidden by the bulk rule.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore                         | 12 ++++++++----
 benchmarks/perturbation/.gitignore | 13 ++++++++++++-
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index 034b5fd..fdb5bed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,12 +22,16 @@ venv/
 
 # Run outputs
 review_results/
-benchmarks/conference_study/results/
-benchmarks/perturbation/perturbation_results/
+benchmarks/conference_study/results
+benchmarks/perturbation/perturbation_results
 
 # conference_study study artifacts (not code)
-benchmarks/conference_study/manifests/
-benchmarks/conference_study/reports/
+benchmarks/conference_study/manifests
+benchmarks/conference_study/reports
 # Symlink targets — trailing-slash patterns wouldn't match the symlinks
 benchmarks/conference_study/analyses/manifests
 benchmarks/conference_study/analyses/results
+benchmarks/conference_study/papers
+
+# Moved out of repo (see commit 6373fad); ignore the leftover local dir.
+benchmarks/experimental_perturbations/
diff --git a/benchmarks/perturbation/.gitignore b/benchmarks/perturbation/.gitignore
index bddf55d..b435026 100644
--- a/benchmarks/perturbation/.gitignore
+++ b/benchmarks/perturbation/.gitignore
@@ -8,8 +8,19 @@ papers/
 # Run logs
 reports/*.log
 
-# Temporary / ephemeral configs
+# Temporary / ephemeral configs.
+# Underscore-prefix are by-convention scratch. The other rules below cover the
+# bulk per-domain configs that we generate locally (one per system × domain)
+# but don't check in. The `!configs/full_*_reviewer3.yaml` exception preserves
+# the canonical reviewer3 configs that are tracked.
 configs/_*
+configs/cs_*scaleup*.yaml
+configs/full_*.yaml
+configs/grok_*.yaml
+configs/longtail_*.yaml
+configs/subset_*.yaml
+configs/r3_smoke*.yaml
+!configs/full_*_reviewer3.yaml
 
 # Python
 __pycache__/

From 8c6238f9ec87c62b7e0664a64f9c938db55d6253 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 15 May 2026 16:05:46 -0500
Subject: [PATCH 3/7] reviewer3: compile FULL source + trim PDF by pages (fixes
 pdflatex failures)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The token-based truncation in `prepare_units` cuts the LaTeX-as-md staged file
at a token boundary, which routinely leaves the document mid-environment.
pdflatex on the staged file then "produces no usable PDF" for a fraction of
papers, surfacing as a hard failure in the reviewer3 run.

Switches the reviewer3 system to compile the FULL pre-truncation source
(`u.src_corrupted`) and then trim the rendered PDF to its first N pages.
This matches the `max_pages: 20` convention conference_study/configs/coarse.yaml
already uses for coarse, so reviewer3 sees roughly the same content window
the other systems see.

Three pdflatex robustness fixes layered in:

1. Strip orphan `\input{...}` / `\include{...}` whose target file isn't
   bundled. pdflatex aborts hard on a missing \input, killing the compile
   for the whole paper even when the body is fine (paper_005 cs_CC had
   `\input{mypreamble.tex}`).

2. Inject a defensive preamble of `\providecommand` fallbacks for common
   author-defined shortcuts (\bbR, \calA, \bfx, \eps, \vvirg, \ootimes,
   etc.). Authors typically define these in private preamble files we
   don't have; \providecommand is a no-op when the command is already
   defined, so the injection is safe blanket coverage.

3. subprocess.run uses bytes (text=False) instead of text=True so the
   pdflatex log's non-UTF-8 accent bytes don't blow up Python's decoder
   (paper_009 cs_CC had byte 0xaa at offset ~57k).

Changes
- Reviewer3System.build_jobs threads `u.src_corrupted` (full path) and
  `cfg["max_pages"]` into the job payload.
- Reviewer3Job adopts `source` + `max_pages` fields; `_submit` / `_ensure_pdf`
  forward them.
- `_ensure_pdf` prefers `source` over `paper` for the compile when set;
  caches alongside the source with `.trim.pdf` suffix when trimmed.
- `_trim_pages_to` (in-place) and `_maybe_trim_pages` (for already-PDF inputs)
  use pymupdf to cap pages.
- `max_pages: 20` added to all 8 `full_*_reviewer3.yaml` configs.
- run_benchmark.py Config gains `max_pages: int | None = None`.

Smoke-validated on three previously-failing cs_CC papers (2604.19872v1 with
missing \input + custom commands, 2604.24325v1 with same pattern, 2604.24879v1
with non-UTF8 bytes in pdflatex output) — all three now produce 20-page
trimmed PDFs in 2–4s.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../configs/full_cs_CC_reviewer3.yaml         |   1 +
 .../configs/full_cs_LG_reviewer3.yaml         |   1 +
 .../configs/full_econ_EM_reviewer3.yaml       |   1 +
 .../configs/full_hep_ex_reviewer3.yaml        |   1 +
 .../configs/full_math_all_reviewer3.yaml      |   1 +
 .../full_physics_atm_clus_reviewer3.yaml      |   1 +
 .../configs/full_q_bio_GN_reviewer3.yaml      |   1 +
 .../configs/full_stat_AP_reviewer3.yaml       |   1 +
 benchmarks/perturbation/run_benchmark.py      |   5 +
 benchmarks/perturbation/systems/reviewer3.py  |  14 +-
 .../perturbation/systems/reviewer3_adapter.py | 198 +++++++++++++++---
 11 files changed, 199 insertions(+), 26 deletions(-)

diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
index 3488e54..61e3c36 100644
--- a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3
diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
index bc3cbb3..d9aa287 100644
--- a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3
diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
index f871179..efebb8b 100644
--- a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3
diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
index 901fbdb..711eae7 100644
--- a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3
diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
index ef94db9..69e061c 100644
--- a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_math_all_reviewer3
diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
index 79d5851..48a09f6 100644
--- a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3
diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
index 00d3bf4..756cfc9 100644
--- a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3
diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
index a375500..fa4cb19 100644
--- a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
@@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
 poll_timeout_s: 1800       # 30 min/paper cap
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3
diff --git a/benchmarks/perturbation/run_benchmark.py b/benchmarks/perturbation/run_benchmark.py
index 9117911..b44112e 100644
--- a/benchmarks/perturbation/run_benchmark.py
+++ b/benchmarks/perturbation/run_benchmark.py
@@ -98,6 +98,11 @@ class Config:
     review_mode: str = field(default="author", metadata={"choices": ["author", "journal"]})
     poll_interval_s: float = 5.0
     poll_timeout_s: float = 1200.0
+    # Cap pages of the rendered PDF sent to R3. None = no cap. Used because
+    # the token-truncated staged file often isn't valid LaTeX; we compile the
+    # FULL pre-truncation source and trim by pages instead (matches the
+    # max_pages: 20 convention in conference_study/configs/coarse.yaml).
+    max_pages: int | None = None
     # Legacy aliases (read on load, normalized into models/methods).
     review_models: list[str] = field(default_factory=list)
     review_methods: list[str] = field(default_factory=list)
diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py
index 1f4b1b9..8ccc9a7 100644
--- a/benchmarks/perturbation/systems/reviewer3.py
+++ b/benchmarks/perturbation/systems/reviewer3.py
@@ -34,6 +34,11 @@ def build_jobs(self, units, cfg, results_dir):
             k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s")
             if k in cfg and cfg[k] is not None
         }
+        # We compile the FULL (pre-truncation) source for R3 — the token-cut
+        # staged file frequently isn't valid LaTeX (chops mid-environment).
+        # `max_pages` then trims the rendered PDF so R3 still sees roughly the
+        # same content window as coarse (which uses max_pages: 20).
+        max_pages = cfg.get("max_pages")
         out: list[tuple[CellKey, ReviewJob]] = []
         for u in units:
             if not u.staged_corrupted.exists():
@@ -49,7 +54,12 @@ def build_jobs(self, units, cfg, results_dir):
             job = ReviewJob(
                 tag=tag, out_json=out_json, review_dir=review_dir,
                 paper_label=f"{u.error_type}/{u.paper_label}",
-                payload={"paper": u.staged_corrupted, "overrides": overrides},
+                payload={
+                    "paper": u.staged_corrupted,
+                    "source": u.src_corrupted,
+                    "max_pages": max_pages,
+                    "overrides": overrides,
+                },
             )
             out.append(((REVIEWER3_SLUG,), job))
         return out
@@ -65,6 +75,8 @@ def run_jobs(self, cell_key, jobs, parallel):
                 paper=j.payload["paper"],
                 out_json=j.out_json,
                 paper_label=j.tag,
+                source=j.payload.get("source"),
+                max_pages=j.payload.get("max_pages"),
             )
             for j in jobs
         ]
diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py
index 71c1196..570fad6 100644
--- a/benchmarks/perturbation/systems/reviewer3_adapter.py
+++ b/benchmarks/perturbation/systems/reviewer3_adapter.py
@@ -90,10 +90,17 @@ def config_from_env() -> Reviewer3Config:
 
 @dataclass
 class Reviewer3Job:
-    paper: Path           # path to *_corrupted.md
-    out_json: Path        # where to write the pipeline-shaped JSON
-    paper_label: str      # e.g. "<error_type>/<paper_label>"
+    paper: Path                 # path to *_corrupted.md (staged, possibly token-truncated)
+    out_json: Path              # where to write the pipeline-shaped JSON
+    paper_label: str            # e.g. "<error_type>/<paper_label>"
     title: str | None = None
+    # Optional: full pre-truncation source. When provided, _ensure_pdf
+    # compiles THIS instead of `paper` — the staged file is often invalid
+    # LaTeX because token truncation can chop mid-environment.
+    source: Path | None = None
+    # Optional: trim the rendered PDF to its first N pages so R3 still sees
+    # roughly the same window other systems do (coarse uses max_pages: 20).
+    max_pages: int | None = None
 
 
 @dataclass
@@ -114,48 +121,188 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]:
     return {"x-api-key": cfg.api_key}
 
 
-def _ensure_pdf(paper: Path) -> Path:
-    """Reviewer 3 only accepts PDF. If `paper` is already PDF, return it.
-    If it's a LaTeX source (starts with `\\documentclass`) — true for the
-    `cs_CC` corpus where `.md` files are really `.tex` — compile via pdflatex
-    and cache the result next to the source. Otherwise raise."""
+def _ensure_pdf(paper: Path, *, source: Path | None = None,
+                max_pages: int | None = None) -> Path:
+    """Reviewer 3 only accepts PDF. Return a compiled+possibly-trimmed PDF.
+
+    Resolution order for the source bytes:
+      1. If `paper` is already a `.pdf`, skip compilation (page trim still applies).
+      2. If `source` was provided, compile that — preferred for LaTeX-as-md
+         since the staged `paper` is often invalid LaTeX (token truncation
+         chops mid-environment).
+      3. Else compile `paper` directly. If it lacks `\\end{document}`, append
+         one as a best-effort close.
+
+    When `max_pages` is set, the resulting PDF is trimmed to its first N
+    pages via pymupdf. This matches what coarse does (max_pages: 20) so R3
+    sees roughly the same window other systems see.
+    """
     if paper.suffix.lower() == ".pdf":
-        return paper
-    head = paper.read_text(errors="replace")[:2000]
+        return _maybe_trim_pages(paper, max_pages)
+
+    src_for_compile = source if (source is not None and source.exists()) else paper
+    cached_suffix = ".trim.pdf" if max_pages else ".pdf"
+    cached = src_for_compile.with_suffix(cached_suffix)
+    src_mtime = src_for_compile.stat().st_mtime
+    if cached.exists() and cached.stat().st_mtime > src_mtime:
+        return cached
+
+    head = src_for_compile.read_text(errors="replace")[:2000]
     if "\\documentclass" not in head:
         raise RuntimeError(
-            f"don't know how to convert {paper.name} to PDF "
+            f"don't know how to convert {src_for_compile.name} to PDF "
             "(no \\documentclass found; expected LaTeX-as-md or PDF)"
         )
-    cached = paper.with_suffix(".pdf")
-    if cached.exists() and cached.stat().st_mtime > paper.stat().st_mtime:
-        return cached
-    import shutil, subprocess, tempfile
-    text = paper.read_text(errors="replace")
-    # Staging truncates by tokens, which can leave the LaTeX source missing
-    # \end{document} (or mid-environment). Best-effort: ensure document closes.
+    text = src_for_compile.read_text(errors="replace")
     if "\\end{document}" not in text:
+        # `source` should always close cleanly; this only fires if we fell back
+        # to compiling the token-truncated `paper`.
         text = text.rstrip() + "\n\n\\end{document}\n"
+    # Strip orphan \input / \include — the perturbation corpus dumps each paper
+    # into a single .md but a few preserve `\input{mypreamble.tex}`-style
+    # directives that pdflatex can't resolve (fatal error, no PDF produced).
+    text = _strip_orphan_includes(text, src_for_compile.parent)
+    # Stripped-include papers (and some others) rely on author-defined shortcuts
+    # like \bbC, \calA, \vvirg, \ootimes from the missing preamble. Inject
+    # \providecommand fallbacks so the body compiles.
+    text = _inject_rescue_preamble(text)
+
+    import shutil, subprocess, tempfile
     with tempfile.TemporaryDirectory() as td:
         tex = Path(td) / "source.tex"
         tex.write_text(text)
         # Run twice to resolve cross-refs; ignore exit code, accept partial PDF.
+        # Capture output as bytes — pdflatex's own log can contain non-UTF-8
+        # accent bytes and we don't read this output anyway (the .log file is
+        # the source of truth on failure).
         for _ in range(2):
             subprocess.run(
                 ["pdflatex", "-interaction=nonstopmode", "source.tex"],
-                cwd=td, capture_output=True, text=True, timeout=180,
+                cwd=td, capture_output=True, timeout=300,
             )
         out_pdf = Path(td) / "source.pdf"
         if not out_pdf.exists() or out_pdf.stat().st_size < 1000:
-            log = (Path(td) / "source.log").read_text(errors="replace") if (Path(td) / "source.log").exists() else ""
-            raise RuntimeError(f"pdflatex produced no usable PDF for {paper}: {log[-1500:]}")
+            log_path = Path(td) / "source.log"
+            log = log_path.read_text(errors="replace") if log_path.exists() else ""
+            raise RuntimeError(
+                f"pdflatex produced no usable PDF for {src_for_compile}: {log[-1500:]}"
+            )
+        if max_pages:
+            _trim_pages_to(out_pdf, max_pages)
         shutil.copy(out_pdf, cached)
     return cached
 
 
-def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str:
-    """POST /api/internal/review (multipart). Returns sessionId."""
-    paper = _ensure_pdf(paper)
+_INPUT_RE = __import__("re").compile(
+    r"\\(?:input|include)\s*\{([^}]+)\}", flags=__import__("re").IGNORECASE,
+)
+
+
+def _strip_orphan_includes(text: str, base_dir: Path) -> str:
+    """Comment out `\\input{path}` / `\\include{path}` whose target file isn't
+    next to the source. pdflatex aborts hard on a missing \\input, which kills
+    the compile even when the rest of the document is fine."""
+    def _replace(m):
+        target = m.group(1).strip()
+        # Common LaTeX convention: optional .tex extension.
+        for cand in (target, target + ".tex"):
+            if (base_dir / cand).exists():
+                return m.group(0)
+        return "% [stripped missing include] " + m.group(0)
+    return _INPUT_RE.sub(_replace, text)
+
+
+# Defensive preamble injected after \documentclass to provide fallbacks for
+# common custom-command patterns that authors typically define in private
+# preamble files (e.g. mypreamble.tex). \providecommand is a no-op when the
+# command is already defined, so this is safe to inject blindly.
+_RESCUE_PREAMBLE = r"""
+% --- injected by reviewer3_adapter: providecommand fallbacks ---
+% blackboard / cal / bf shortcuts authors commonly define per-paper
+\providecommand{\bb}[1]{\mathbb{#1}}
+\providecommand{\cal}[1]{\mathcal{#1}}
+\providecommand{\bff}[1]{\mathbf{#1}}
+% common single-letter shortcuts (\bbR, \calA, \bfx, etc.)
+\makeatletter
+\@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
+  \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}%
+  \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}%
+}
+\@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
+  \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}%
+}
+\makeatother
+% other commonly-used shortcuts
+\providecommand{\eps}{\epsilon}
+\providecommand{\veps}{\varepsilon}
+\providecommand{\vvirg}{,\,}
+\providecommand{\ootimes}{\otimes}
+\providecommand{\Bbbk}{\mathbb{k}}
+% --- end injected preamble ---
+"""
+
+
+def _inject_rescue_preamble(text: str) -> str:
+    """Insert _RESCUE_PREAMBLE right after the first \\documentclass{...} line.
+    Idempotent: looks for our marker before injecting."""
+    if "injected by reviewer3_adapter" in text:
+        return text
+    import re
+    m = re.search(r"\\documentclass(\[[^\]]*\])?\{[^}]+\}", text)
+    if not m:
+        return text
+    cut = m.end()
+    return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:]
+
+
+def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path:
+    """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more,
+    return a trimmed sibling cached next to it."""
+    if not max_pages:
+        return pdf
+    import fitz
+    src = fitz.open(pdf)
+    try:
+        if src.page_count <= max_pages:
+            return pdf
+        trimmed = pdf.with_suffix(f".first{max_pages}p.pdf")
+        if trimmed.exists() and trimmed.stat().st_mtime > pdf.stat().st_mtime:
+            return trimmed
+        dst = fitz.open()
+        dst.insert_pdf(src, from_page=0, to_page=max_pages - 1)
+        dst.save(trimmed)
+        dst.close()
+        return trimmed
+    finally:
+        src.close()
+
+
+def _trim_pages_to(pdf: Path, max_pages: int) -> None:
+    """In-place trim of `pdf` to its first `max_pages` pages."""
+    import fitz
+    src = fitz.open(pdf)
+    try:
+        if src.page_count <= max_pages:
+            return
+        dst = fitz.open()
+        dst.insert_pdf(src, from_page=0, to_page=max_pages - 1)
+        tmp = pdf.with_suffix(".pdf.tmp")
+        dst.save(tmp)
+        dst.close()
+    finally:
+        src.close()
+    tmp.replace(pdf)
+
+
+def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None,
+            source: Path | None = None, max_pages: int | None = None) -> str:
+    """POST /api/internal/review (multipart). Returns sessionId.
+
+    `source` and `max_pages` are forwarded to `_ensure_pdf` so callers can opt
+    into compiling the full pre-truncation source and trimming the rendered
+    PDF — see _ensure_pdf docstring.
+    """
+    paper = _ensure_pdf(paper, source=source, max_pages=max_pages)
     url = f"{cfg.base_url}/api/internal/review"
     data: dict[str, str] = {
         "userId": cfg.user_id,
@@ -294,7 +441,8 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result:
     start = time.time()
     sid = ""
     try:
-        sid = _submit(cfg, job.paper, title=job.title)
+        sid = _submit(cfg, job.paper, title=job.title,
+                      source=job.source, max_pages=job.max_pages)
         print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True)
         body = _poll_until_done(cfg, sid, tag=tag)
         elapsed = time.time() - start

From 40da53cb39833dd056dd646e98e364ee4288ab02 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 15 May 2026 20:58:10 -0500
Subject: [PATCH 4/7] reviewer3: trim PDF page count for conference too; bump
 poll timeout to 1h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two operational fixes after observing the live runs:

1. Conference Reviewer3Adapter wasn't honoring max_pages — it called
   _submit(pdf) directly without forwarding max_pages, so R3 received the
   full PDF (sometimes 50+ pages, 5+ MB) and a chunk of those tripped R3's
   HTTP 413 limit. Adapter now reads `max_pages` from the top-level config
   (falling back to reviewer3_options.max_pages) and threads it through to
   _submit -> _ensure_pdf -> _maybe_trim_pages.

2. poll_timeout_s bumped from 1800 (30 min) to 3600 (60 min) in all 8
   perturbation configs and conference reviewer3.yaml. Observed wall time
   per paper under 10-concurrent load was routinely 25-40 min, with a
   long tail past 30 — causing dozens of false-timeout failures even
   though R3 was still processing. The sessions remain live on R3's side
   regardless, but the adapter abandoned them.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../conference_study/competitors/reviewer3_adapter.py     | 8 +++++++-
 benchmarks/conference_study/configs/reviewer3.yaml        | 2 +-
 benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml | 2 +-
 benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml | 2 +-
 .../perturbation/configs/full_econ_EM_reviewer3.yaml      | 2 +-
 .../perturbation/configs/full_hep_ex_reviewer3.yaml       | 2 +-
 .../perturbation/configs/full_math_all_reviewer3.yaml     | 2 +-
 .../configs/full_physics_atm_clus_reviewer3.yaml          | 2 +-
 .../perturbation/configs/full_q_bio_GN_reviewer3.yaml     | 2 +-
 .../perturbation/configs/full_stat_AP_reviewer3.yaml      | 2 +-
 10 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py
index be4dbd2..02caeed 100644
--- a/benchmarks/conference_study/competitors/reviewer3_adapter.py
+++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py
@@ -49,7 +49,13 @@ def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview:
             if k in opts and opts[k] is not None:
                 setattr(rcfg, k, opts[k])
 
-        session_id = _r3._submit(rcfg, pdf, title=pdf.stem)
+        # Cap PDF size sent to R3. `max_pages` lives at the top level of the
+        # config (run_competitors.py uses it for parse_document); we honor the
+        # same value here so the bytes shipped to R3 match the paragraph window
+        # we already cap on our side. Untrimmed full PDFs were tripping R3's
+        # HTTP 413 limit and inflating per-paper wall time.
+        max_pages = cfg.get("max_pages") or opts.get("max_pages")
+        session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages)
         body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")
 
         comments: list[NormalizedComment] = []
diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml
index f090d41..fccf923 100644
--- a/benchmarks/conference_study/configs/reviewer3.yaml
+++ b/benchmarks/conference_study/configs/reviewer3.yaml
@@ -29,4 +29,4 @@ max_pages: 20           # parse_document cap; matches coarse.yaml convention
 reviewer3_options:
   review_mode: author      # author | journal (R3 reviewMode enum)
   poll_interval_s: 30
-  poll_timeout_s: 1800     # 30 min/paper cap inside the poll loop
+  poll_timeout_s: 3600     # 60 min/paper cap
diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
index 61e3c36..f60a078 100644
--- a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3
diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
index d9aa287..3193fbd 100644
--- a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3
diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
index efebb8b..501ec02 100644
--- a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3
diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
index 711eae7..8e6cd7a 100644
--- a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3
diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
index 69e061c..9b8e311 100644
--- a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_math_all_reviewer3
diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
index 48a09f6..4800288 100644
--- a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3
diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
index 756cfc9..ca8669d 100644
--- a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3
diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
index fa4cb19..a06206e 100644
--- a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
+++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
@@ -8,7 +8,7 @@ score_model: google/gemini-3-flash-preview
 
 review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
 poll_interval_s: 30
-poll_timeout_s: 1800       # 30 min/paper cap
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
 max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
 
 results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3

From 5cf52adcfb0ca83f23ad8e5714dbb344970b221c Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 15 May 2026 21:09:01 -0500
Subject: [PATCH 5/7] reviewer3 pdflatex: rewrite missing doc classes + strip
 missing packages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three layered fixes for the remaining pdflatex-aborts:

1. `_force_known_documentclass` — rewrite \documentclass[opts]{X} to
   \documentclass{amsart} when X.cls isn't installed (kpsewhich miss).
   Many papers use journal classes not in TeX Live (aq-amsart, sn-jnl,
   atlasdoc, aastex631/701, cas-sc, iopjournal, svjour3, ieeeconf,
   informs3, revtex4 without -1/-2). amsart is the math-friendly fallback,
   preserving theorem/lemma envs.

   Regex now matches only the **first uncommented** \documentclass so
   commented-out example lines (e.g. q-bio.GN papers carry
   `%%\documentclass[...]{sn-jnl}` followed by the active variant) don't
   short-circuit the lookup.

2. `_strip_missing_packages` — same idea for \usepackage{X} when X.sty
   isn't installed. pdflatex aborts hard on a missing package too; the
   common offender on hep-ex papers is `\usepackage{jinstpub}`. Comment
   them out; the body's references to missing-package commands degrade
   to undefined-control-sequence warnings (pdflatex in nonstopmode still
   produces a usable PDF).

3. Rescue preamble now runs inside \AtBeginDocument{...} so its
   \providecommand falls AFTER all \usepackage{...} loads. Previously,
   the rescue defined \Bbbk before amsfonts loaded, then amsfonts
   `\DeclareSymbolFont` errored with "Command \Bbbk already defined".

Spot-test across 8 domains: 7 of 8 sample papers compile cleanly. The
holdout is q-bio.GN paper_001 (uses sn-jnl class with extensive class-
specific commands that don't degrade gracefully); 9 of 10 q-bio.GN papers
use bundled classes and work. Expected pdflatex-failure cells dropped
from 39 to ~3 (one paper × three error types).

kpsewhich results are cached per process for the common-class /
common-package set so repeated rewrites across the 222-cell run pay the
subprocess cost once per (class|package).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../perturbation/systems/reviewer3_adapter.py | 130 +++++++++++++++---
 1 file changed, 111 insertions(+), 19 deletions(-)

diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py
index 570fad6..3bbd96a 100644
--- a/benchmarks/perturbation/systems/reviewer3_adapter.py
+++ b/benchmarks/perturbation/systems/reviewer3_adapter.py
@@ -158,6 +158,14 @@ def _ensure_pdf(paper: Path, *, source: Path | None = None,
         # `source` should always close cleanly; this only fires if we fell back
         # to compiling the token-truncated `paper`.
         text = text.rstrip() + "\n\n\\end{document}\n"
+    # Rewrite \documentclass{<custom>} -> \documentclass{amsart} when the
+    # custom class isn't installed on the local TeX Live. Without this many
+    # journal-class papers (aq-amsart, sn-jnl, atlasdoc, aastex*, cas-sc, ...)
+    # bail at line 1 with "File `X.cls' not found".
+    text = _force_known_documentclass(text)
+    # Same idea for missing \usepackage{X.sty} (e.g. jinstpub on some hep-ex
+    # papers). Comment them out — pdflatex aborts hard on missing packages.
+    text = _strip_missing_packages(text)
     # Strip orphan \input / \include — the perturbation corpus dumps each paper
     # into a single .md but a few preserve `\input{mypreamble.tex}`-style
     # directives that pdflatex can't resolve (fatal error, no PDF produced).
@@ -218,26 +226,28 @@ def _replace(m):
 # command is already defined, so this is safe to inject blindly.
 _RESCUE_PREAMBLE = r"""
 % --- injected by reviewer3_adapter: providecommand fallbacks ---
-% blackboard / cal / bf shortcuts authors commonly define per-paper
-\providecommand{\bb}[1]{\mathbb{#1}}
-\providecommand{\cal}[1]{\mathcal{#1}}
-\providecommand{\bff}[1]{\mathbf{#1}}
-% common single-letter shortcuts (\bbR, \calA, \bfx, etc.)
-\makeatletter
-\@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
-  \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}%
-  \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}%
+% Run at \begin{document} so we don't fight with packages that define the
+% same shortcuts (e.g. amsfonts -> \Bbbk): \providecommand only checks at the
+% moment it runs. \makeatletter stays OUTSIDE \AtBeginDocument: catcodes are
+% frozen when the argument is read, so inside it \@for would not tokenize.
+\makeatletter
+\AtBeginDocument{%
+  \providecommand{\bb}[1]{\mathbb{#1}}%
+  \providecommand{\cal}[1]{\mathcal{#1}}%
+  \providecommand{\bff}[1]{\mathbf{#1}}%
+  \@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
+    \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}%
+    \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}%
+  }%
+  \@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
+    \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}%
+  }%
+  \providecommand{\eps}{\epsilon}%
+  \providecommand{\veps}{\varepsilon}%
+  \providecommand{\vvirg}{,\,}%
+  \providecommand{\ootimes}{\otimes}%
 }
-\@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
-  \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}%
-}
-\makeatother
-% other commonly-used shortcuts
-\providecommand{\eps}{\epsilon}
-\providecommand{\veps}{\varepsilon}
-\providecommand{\vvirg}{,\,}
-\providecommand{\ootimes}{\otimes}
-\providecommand{\Bbbk}{\mathbb{k}}
+\makeatother
 % --- end injected preamble ---
 """
 
@@ -255,6 +265,88 @@ def _inject_rescue_preamble(text: str) -> str:
     return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:]
 
 
+# Bundled-with-TeX-Live classes are kept as-is. Anything else gets rewritten
+# to `\documentclass{amsart}` so pdflatex doesn't bail at line 1 with
+# "File `<custom>.cls' not found". Class-specific commands in the body then
+# emit undefined-control-sequence warnings, but pdflatex in nonstopmode still
+# produces a usable PDF.
+_KNOWN_CLASSES_CACHE: dict[str, bool] = {}
+
+
+def _class_is_installed(cls: str) -> bool:
+    if cls in _KNOWN_CLASSES_CACHE:
+        return _KNOWN_CLASSES_CACHE[cls]
+    import subprocess
+    try:
+        r = subprocess.run(["kpsewhich", f"{cls}.cls"],
+                           capture_output=True, timeout=5)
+        ok = (r.returncode == 0 and r.stdout.strip() != b"")
+    except Exception:
+        ok = False
+    _KNOWN_CLASSES_CACHE[cls] = ok
+    return ok
+
+
+def _force_known_documentclass(text: str) -> str:
+    """Rewrite `\\documentclass[opts]{<custom>}` to `\\documentclass{amsart}`
+    when <custom> isn't bundled with the local TeX Live. Drops options too —
+    they're class-specific and frequently invalid against amsart. amsart is
+    the math-friendly fallback (preserves theorem/lemma environments).
+
+    Matches only the first **uncommented** \\documentclass — many papers carry
+    example/commented-out variants before the active one.
+    """
+    import re
+    pat = re.compile(r"^(?P<lead>[^%\n]*?)\\documentclass(\[[^\]]*\])?\{([^}]+)\}",
+                     re.MULTILINE)
+    for m in pat.finditer(text):
+        # Defensive only: `lead` is matched by `[^%\n]*?`, so it can never contain `%`.
+        if "%" in m.group("lead"):
+            continue
+        cls = m.group(3).strip()
+        if _class_is_installed(cls):
+            return text
+        # Replace just the `\documentclass...{...}` span, keeping any leading content
+        # (it's empty in practice but be safe).
+        start = m.start() + len(m.group("lead"))
+        return text[:start] + r"\documentclass{amsart}" + text[m.end():]
+    return text
+
+
+# Same pattern for missing packages — pdflatex aborts hard on
+# "File `X.sty' not found", so comment out \usepackage{X} (and the bracketed
+# options line) when X isn't installed. Bundled-with-TeX packages stay.
+_KNOWN_PACKAGES_CACHE: dict[str, bool] = {}
+
+
+def _package_is_installed(pkg: str) -> bool:
+    if pkg in _KNOWN_PACKAGES_CACHE:
+        return _KNOWN_PACKAGES_CACHE[pkg]
+    import subprocess
+    try:
+        r = subprocess.run(["kpsewhich", f"{pkg}.sty"],
+                           capture_output=True, timeout=5)
+        ok = (r.returncode == 0 and r.stdout.strip() != b"")
+    except Exception:
+        ok = False
+    _KNOWN_PACKAGES_CACHE[pkg] = ok
+    return ok
+
+
+def _strip_missing_packages(text: str) -> str:
+    """Comment out `\\usepackage[opts]{X}` statements whose .sty isn't installed.
+    For `\\usepackage{a,b,c}` any missing member comments out the whole
+    statement; every line of a multi-line statement gets its own `%` prefix.
+    """
+    import re
+    def _replace(m):
+        pkgs = [p.strip() for p in m.group(2).split(",")]
+        if all(_package_is_installed(p) for p in pkgs if p):
+            return m.group(0)
+        return "% [stripped missing package] " + m.group(0).replace("\n", "\n% ")
+    return re.sub(r"\\usepackage(\[[^\]]*\])?\{([^}]+)\}", _replace, text)
+
+
 def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path:
     """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more,
     return a trimmed sibling cached next to it."""

From fc84a6eb4d51f8f379f7d9d30e153b81a7d5c1f7 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Fri, 15 May 2026 23:50:37 -0500
Subject: [PATCH 6/7] reviewer3: persist sessionId for resume; eliminates
 duplicate-submit waste
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `.sid` file alongside each review output. On run:
- If `.sid` exists and points to a still-valid R3 session, we POLL that
  session instead of submitting a new one. Avoids the duplicate-session
  credit waste observed when runs were killed mid-poll (~34% of credits
  spent on duplicates across yesterday's runs, per the rescue audit).
- If the sid fetch hard-fails (403 "not found" / 404), drop the stale
  file and submit fresh.
- On submit success: write the sid IMMEDIATELY so a SIGKILL between
  submit and first poll-tick still leaves a recovery path.
- On any failure: keep the sid file — next run resumes the same session.
- On success: leave the sid file in place as an audit trail (cheap; the
  out_json's presence is the real "done" marker for skip-completed).

Perturbation:
- `Reviewer3Job` gains `sid_file: Path | None`. `Reviewer3System.build_jobs`
  computes `<review_dir>/<stem>.sid` per cell and threads it through.
- `_run_one` handles the resume vs submit branch.

Conference:
- `run_competitors.py` injects `cfg["_sid_file"]` next to the merged
  paper JSON (under `<results>/.sids/<slug>.<method_key>.sid`).
- Conference `Reviewer3Adapter.review()` honors the underscore-prefixed
  key and persists/resumes the same way.

Also drops the conference `max_per_model: 1` back to 5 after confirming
R3's throttle from yesterday has lifted (single-paper probe transitioned
waiting -> processing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../competitors/reviewer3_adapter.py          | 33 +++++++++-
 .../conference_study/configs/reviewer3.yaml   |  2 +-
 .../conference_study/run_competitors.py       |  9 ++-
 benchmarks/perturbation/systems/reviewer3.py  |  7 +++
 .../perturbation/systems/reviewer3_adapter.py | 61 ++++++++++++++++---
 5 files changed, 101 insertions(+), 11 deletions(-)

diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py
index 02caeed..82052c9 100644
--- a/benchmarks/conference_study/competitors/reviewer3_adapter.py
+++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py
@@ -55,8 +55,37 @@ def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview:
         # we already cap on our side. Untrimmed full PDFs were tripping R3's
         # HTTP 413 limit and inflating per-paper wall time.
         max_pages = cfg.get("max_pages") or opts.get("max_pages")
-        session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages)
-        body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")
+
+        # sid_file is injected by run_competitors.py just before invocation
+        # as `cfg["_sid_file"]` (a path under `<results>/.sids/`). If present
+        # and the file already exists, we resume that R3 session instead of
+        # submitting fresh — avoids duplicate-session credit waste when a
+        # prior run was killed mid-poll. (See PR notes; ~34% of credits
+        # observed wasted on duplicates before this fix.)
+        sid_file = cfg.get("_sid_file")
+        if isinstance(sid_file, str):
+            sid_file = Path(sid_file)
+        session_id = ""
+        body = None
+        if sid_file and sid_file.exists():
+            session_id = sid_file.read_text().strip()
+            try:
+                body = _r3._poll_until_done(rcfg, session_id,
+                                            tag=f"reviewer3/{pdf.stem} (resumed)")
+            except RuntimeError as e:
+                m = str(e)
+                if "fetch failed" in m and ("403" in m or "404" in m):
+                    sid_file.unlink(missing_ok=True)
+                    session_id, body = "", None
+                else:
+                    raise
+
+        if body is None:
+            session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages)
+            if sid_file:
+                sid_file.parent.mkdir(parents=True, exist_ok=True)
+                sid_file.write_text(session_id)
+            body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")
 
         comments: list[NormalizedComment] = []
         for i, raw in enumerate(body.get("comments") or []):
diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml
index fccf923..1f6d19f 100644
--- a/benchmarks/conference_study/configs/reviewer3.yaml
+++ b/benchmarks/conference_study/configs/reviewer3.yaml
@@ -22,7 +22,7 @@ models:
   - reviewer3
 
 timeout_sec: 3600       # outer per-(paper, model) wall cap (R3 is 10-30 min typical)
-max_per_model: 5        # 5 concurrent submits/polls against R3 (matches plan)
+max_per_model: 5        
 max_pages: 20           # parse_document cap; matches coarse.yaml convention
 
 # Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...).
diff --git a/benchmarks/conference_study/run_competitors.py b/benchmarks/conference_study/run_competitors.py
index 68214c6..3c5eb34 100644
--- a/benchmarks/conference_study/run_competitors.py
+++ b/benchmarks/conference_study/run_competitors.py
@@ -125,6 +125,14 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
         title, content, _was_ocr = parse_document(pdf, max_pages=MAX_PAGES)
         paragraphs = split_into_paragraphs(content)
 
+        # Inject sid_file location so the adapter can persist/resume the
+        # competitor-side session id (e.g. reviewer3). Adapters that don't
+        # care simply ignore the underscore-prefixed key. The file lives in
+        # `.sids/` under the results dir so it survives across runs.
+        out_file = RESULTS_DIR / f"{paper['slug']}.json"
+        sid_dir = RESULTS_DIR / ".sids"
+        cfg = {**cfg, "_sid_file": sid_dir / f"{paper['slug']}.{method_key}.sid"}
+
         review = adapter.review(pdf, model, cfg)
 
         method_data = build_method_data(
@@ -134,7 +142,6 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
             paragraphs=paragraphs,
         )
 
-        out_file = RESULTS_DIR / f"{paper['slug']}.json"
         merge_into_paper_json(
             out_file=out_file,
             slug=paper["slug"],
diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py
index 8ccc9a7..503720f 100644
--- a/benchmarks/perturbation/systems/reviewer3.py
+++ b/benchmarks/perturbation/systems/reviewer3.py
@@ -51,6 +51,11 @@ def build_jobs(self, units, cfg, results_dir):
                     continue
                 out_json.unlink()
             tag = f"{domain}/{u.paper_label}/{u.error_type}/{REVIEWER3_SLUG}"
+            # Persist the R3 sessionId next to the review JSON. If a prior
+            # run was killed mid-poll, the next run resumes that session
+            # instead of creating a duplicate (~34% credit waste observed
+            # without this).
+            sid_file = review_dir / f"{u.staged_corrupted.stem}.sid"
             job = ReviewJob(
                 tag=tag, out_json=out_json, review_dir=review_dir,
                 paper_label=f"{u.error_type}/{u.paper_label}",
@@ -58,6 +63,7 @@ def build_jobs(self, units, cfg, results_dir):
                     "paper": u.staged_corrupted,
                     "source": u.src_corrupted,
                     "max_pages": max_pages,
+                    "sid_file": sid_file,
                     "overrides": overrides,
                 },
             )
@@ -77,6 +83,7 @@ def run_jobs(self, cell_key, jobs, parallel):
                 paper_label=j.tag,
                 source=j.payload.get("source"),
                 max_pages=j.payload.get("max_pages"),
+                sid_file=j.payload.get("sid_file"),
             )
             for j in jobs
         ]
diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py
index 3bbd96a..552d911 100644
--- a/benchmarks/perturbation/systems/reviewer3_adapter.py
+++ b/benchmarks/perturbation/systems/reviewer3_adapter.py
@@ -101,6 +101,12 @@ class Reviewer3Job:
     # Optional: trim the rendered PDF to its first N pages so R3 still sees
     # roughly the same window other systems do (coarse uses max_pages: 20).
     max_pages: int | None = None
+    # Optional: where to persist the R3 sessionId. When set, _run_one writes
+    # the sid here right after submit succeeds and reads it back on subsequent
+    # runs to resume the same R3 session instead of creating a duplicate.
+    # This prevents the duplicate-session credit waste we observed when we
+    # killed runs mid-poll (R3 keeps the session live and we'd submit fresh).
+    sid_file: Path | None = None
 
 
 @dataclass
@@ -529,26 +535,67 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result:
     job.out_json.parent.mkdir(parents=True, exist_ok=True)
     raw_path = job.out_json.with_suffix(".raw.json")
     tag = f"reviewer3/{job.paper_label}"
-    print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True)
     start = time.time()
     sid = ""
+    sid_file = job.sid_file
+
+    def _write_outputs(body: dict, elapsed: float) -> int:
+        raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False))
+        pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed)
+        job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False))
+        # Successful → sid_file no longer needed for resume, but leave it as
+        # an audit trail (cheap, sometimes useful for tracing back to R3 UI).
+        return len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"])
+
     try:
+        # Resume path: if a sid_file exists, try to recover the prior session
+        # instead of submitting fresh. This is the dedup-credit-waste fix.
+        if sid_file and sid_file.exists():
+            sid = sid_file.read_text().strip()
+            print(f"[{tag}] resuming sessionId={sid} (from {sid_file.name})",
+                  file=sys.stderr, flush=True)
+            try:
+                body = _poll_until_done(cfg, sid, tag=tag)
+                elapsed = time.time() - start
+                n = _write_outputs(body, elapsed)
+                print(f"[{tag}] done in {elapsed:.0f}s ({n} comments, resumed)",
+                      file=sys.stderr, flush=True)
+                return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed,
+                                       session_id=sid, raw_response=body)
+            except RuntimeError as e:
+                # Session-fetch hard fail (e.g., 403/404 — sid stale/invalid).
+                # Drop the sid_file and fall through to fresh submit.
+                msg = str(e)
+                if "fetch failed" in msg and ("403" in msg or "404" in msg):
+                    print(f"[{tag}] sid {sid} unrecoverable ({msg[:80]}); "
+                          "submitting fresh", file=sys.stderr, flush=True)
+                    sid_file.unlink(missing_ok=True)
+                    sid = ""
+                else:
+                    raise  # other RuntimeErrors (e.g., status=failed) propagate
+
+        # Submit path: no usable sid_file, send a new submission.
+        print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True)
         sid = _submit(cfg, job.paper, title=job.title,
                       source=job.source, max_pages=job.max_pages)
         print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True)
+        if sid_file:
+            sid_file.parent.mkdir(parents=True, exist_ok=True)
+            sid_file.write_text(sid)
         body = _poll_until_done(cfg, sid, tag=tag)
         elapsed = time.time() - start
-        raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False))
-        pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed)
-        job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False))
-        n = len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"])
-        print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)", file=sys.stderr, flush=True)
+        n = _write_outputs(body, elapsed)
+        print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)",
+              file=sys.stderr, flush=True)
         return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed,
                                session_id=sid, raw_response=body)
     except Exception as e:
         elapsed = time.time() - start
         msg = f"{type(e).__name__}: {e}"
-        print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}", file=sys.stderr, flush=True)
+        print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}",
+              file=sys.stderr, flush=True)
+        # Keep sid_file on failure — next run resumes the same session rather
+        # than resubmit. NOTE(review): status=failed sessions get re-polled too.
         return Reviewer3Result(job=job, ok=False, elapsed_s=elapsed,
                                session_id=sid, error=msg)
 

From b3a167585b4286920ca5ffacf0cd2239142f3cf6 Mon Sep 17 00:00:00 2001
From: Dang Nguyen <nmd.ptnk@gmail.com>
Date: Sun, 17 May 2026 21:50:40 -0500
Subject: [PATCH 7/7] reviewer3: add rescue_sessions.py + tighten gitignore for
 symlinks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two small additions:

1. `benchmarks/perturbation/rescue_sessions.py` — CLI that walks every
   `.sid` file under both result trees, fetches its R3 session via the
   API, and writes results to disk for any session that completed in
   the meantime (status=completed). Idempotent — `out_json` with content
   is skipped — so safe to re-run.

   The pattern emerged when ~30+ R3 sessions completed on the server
   after our local poll loop had timed out and abandoned them.
   Rescue recovered them without re-submitting (no duplicate-credit
   cost). Reviewer3 is the only system in the benchmark with async
   server-side state, so this stays reviewer3-specific by design.

2. `.gitignore` — add slash-less entries for the three symlinks we
   create when running R3 from a sibling worktree:
     - `.venv`           (the existing `.venv/` rule misses symlinks)
     - `benchmarks/perturbation/data`
     - `benchmarks/perturbation/results`
   No files were tracked at these paths; this just cleans up
   `git status` noise when the symlinks are present.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore                                 |   3 +
 benchmarks/perturbation/rescue_sessions.py | 285 +++++++++++++++++++++
 2 files changed, 288 insertions(+)
 create mode 100644 benchmarks/perturbation/rescue_sessions.py

diff --git a/.gitignore b/.gitignore
index fdb5bed..4dd8729 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.pyc
 *.pyo
 .venv/
+.venv
 venv/
 
 # Jupyter
@@ -24,6 +25,8 @@ venv/
 review_results/
 benchmarks/conference_study/results
 benchmarks/perturbation/perturbation_results
+benchmarks/perturbation/results
+benchmarks/perturbation/data
 
 # conference_study study artifacts (not code)
 benchmarks/conference_study/manifests
diff --git a/benchmarks/perturbation/rescue_sessions.py b/benchmarks/perturbation/rescue_sessions.py
new file mode 100644
index 0000000..1070b29
--- /dev/null
+++ b/benchmarks/perturbation/rescue_sessions.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""Pull down completed Reviewer 3 sessions whose local poll loop gave up.
+
+Reviewer 3 is async: a session keeps processing on their server even after
+our adapter has timed out / been killed. The adapter persists the sessionId
+to a `.sid` file next to each cell's output JSON; this script walks those
+`.sid` files, fetches each session, and writes results to disk for the ones
+that completed in the meantime.
+
+Walks both result trees:
+  * Perturbation:  <results>/full_<domain>_reviewer3/.../review/*.sid
+  * Conference:    <conference_study>/results/reviewer3_v2/.sids/*.sid
+
+Skips cells whose `out_json` already has comments. Safe to re-run.
+
+Usage:
+  python rescue_sessions.py                  # rescue both, write results
+  python rescue_sessions.py --dry-run        # just report what's recoverable
+  python rescue_sessions.py --kind perturbation
+  python rescue_sessions.py --kind conference
+
+Required env: REVIEWER3_API_KEY (used for the `review:read` GET).
+
+Exit code: 1 if REVIEWER3_API_KEY is unset, else 0 (per-session failures
+are never fatal — the script is safe to re-run; they are just logged).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    pass
+
+import requests  # noqa: E402
+
+# Directory containing this script. It is put at the front of sys.path so
+# the `systems` package imported on the next line can be resolved.
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+from systems.reviewer3_adapter import build_pipeline_json  # noqa: E402
+
+# Repo root (two levels above this script's directory) and the
+# conference_study root. Both <repo>/src and the conference_study directory
+# are prepended to sys.path for the lazy imports done in write_conference
+# (presumably `reviewer.*` and `competitors.*` respectively — confirm).
+REPO = HERE.parent.parent
+CONFERENCE = REPO / "benchmarks" / "conference_study"
+sys.path.insert(0, str(REPO / "src"))
+sys.path.insert(0, str(CONFERENCE))
+
+
+# API origin; overridable through the REVIEWER3_BASE_URL env var. Any trailing
+# "/" is stripped so request URLs can be built as f"{REVIEWER3_BASE_URL}/api/...".
+REVIEWER3_BASE_URL = os.environ.get("REVIEWER3_BASE_URL", "https://reviewer3.com").rstrip("/")
+
+
+@dataclass
+class Orphan:
+    """A `.sid` file pointing at a session we never wrote out_json for.
+
+    Instances are produced by `find_orphans`, one per `.sid` file whose
+    result JSON is missing or does not contain comments yet.
+    """
+    kind: str               # "pert" (perturbation tree) or "conf" (conference tree)
+    sid_file: Path          # file whose text content is the Reviewer 3 session id
+    out_json: Path          # where we'd write the result if status == completed
+    extra: dict             # kind-specific: {} for "pert"; slug + method_key for "conf"
+
+
+def _has_content(out_json: Path, method_key: str | None = None) -> bool:
+    """Report whether `out_json` already holds at least one review comment.
+
+    Args:
+        out_json: Result file to inspect.
+        method_key: If given, only the `methods[method_key]` entry is
+            checked; otherwise any entry under `methods` counts.
+
+    Returns:
+        True only when the file can be read, parses as a JSON object, and
+        the selected method entry (or, without `method_key`, any entry) has
+        a non-empty `comments` value. A missing, unreadable, malformed or
+        unexpectedly-shaped file yields False, so the caller treats the
+        cell as still needing a result instead of crashing on it.
+    """
+    try:
+        d = json.loads(out_json.read_text())
+    except (OSError, ValueError):
+        # OSError: missing / unreadable file. ValueError: undecodable bytes
+        # (UnicodeDecodeError) or invalid JSON (JSONDecodeError).
+        return False
+    # A half-written or foreign file may hold a list, a string, or a
+    # `methods` value that is not a mapping; none of those carry comments.
+    methods = d.get("methods") if isinstance(d, dict) else None
+    if not isinstance(methods, dict):
+        return False
+    if method_key is not None:
+        entries = [methods.get(method_key)]
+    else:
+        entries = list(methods.values())
+    return any(isinstance(m, dict) and bool(m.get("comments")) for m in entries)
+
+
+def find_orphans(kind: str = "both") -> list[Orphan]:
+    """Walk `.sid` files; return ones whose result JSON is missing or empty.
+
+    Args:
+        kind: "perturbation", "conference", or "both" (default). Any other
+            value selects neither tree and yields an empty list.
+
+    Returns:
+        One `Orphan` per `.sid` file whose result JSON does not yet contain
+        comments (as judged by `_has_content`). Each physical perturbation
+        `.sid` file is reported at most once, even when the results
+        directory is reachable both through a symlink and through its
+        target.
+    """
+    out: list[Orphan] = []
+
+    if kind in ("both", "perturbation"):
+        # Walk every `.sid` under the perturbation results tree. results/
+        # may be a symlink (we use this pattern when running from a
+        # different worktree than the data), so its resolved target is
+        # walked as well.
+        results_dir = REPO / "benchmarks" / "perturbation" / "results"
+        pert_roots = [results_dir]
+        resolved = results_dir.resolve()
+        if resolved not in pert_roots:
+            pert_roots.append(resolved)
+        # Dedupe on the *resolved* path: the same file is reachable as
+        # <symlink>/x.sid and <target>/x.sid, which compare unequal as plain
+        # paths and would otherwise be fetched and written twice.
+        seen: set[Path] = set()
+        for root in pert_roots:
+            if not root.exists():
+                continue
+            for sid_file in root.rglob("*.sid"):
+                real = sid_file.resolve()
+                if real in seen:
+                    continue
+                seen.add(real)
+                out_json = sid_file.with_suffix(".json")
+                if _has_content(out_json):
+                    continue
+                out.append(Orphan(kind="pert", sid_file=sid_file,
+                                  out_json=out_json, extra={}))
+
+    if kind in ("both", "conference"):
+        conf_root = CONFERENCE / "results" / "reviewer3_v2"
+        sids_dir = conf_root / ".sids"
+        if sids_dir.exists():
+            for sid_file in sids_dir.glob("*.sid"):
+                # filename: <slug>.<method_key>.sid — everything after the
+                # first "." of the stem is the method key.
+                slug, _, method_key = sid_file.stem.partition(".")
+                method_key = method_key or "reviewer3__reviewer3"
+                out_json = conf_root / f"{slug}.json"
+                if _has_content(out_json, method_key=method_key):
+                    continue
+                out.append(Orphan(kind="conf", sid_file=sid_file,
+                                  out_json=out_json,
+                                  extra={"slug": slug, "method_key": method_key}))
+
+    return out
+
+
+def fetch_session(sid: str, *, headers: dict, timeout: float = 30.0) -> dict | None:
+    """Return the full session body if status==completed, else None.
+
+    Args:
+        sid: Reviewer 3 session id.
+        headers: HTTP headers sent with the GET (carries the API key).
+        timeout: Timeout in seconds handed to `requests.get`.
+
+    Returns:
+        The decoded JSON object when the request succeeds with HTTP 200 and
+        its `status` field equals "completed". None in every other case:
+        transport error, non-200 response, a body that is not valid JSON or
+        not a JSON object, or a session that has not completed. Errors are
+        logged to stderr and never raised, so one bad session cannot abort
+        a batch of fetches.
+    """
+    url = f"{REVIEWER3_BASE_URL}/api/internal/review/{sid}"
+    try:
+        r = requests.get(url, headers=headers, timeout=timeout)
+    except Exception as e:
+        print(f"  fetch error for {sid}: {type(e).__name__}: {e}",
+              file=sys.stderr, flush=True)
+        return None
+    if r.status_code != 200:
+        print(f"  {sid}: HTTP {r.status_code}", file=sys.stderr, flush=True)
+        return None
+    try:
+        body = r.json()
+    except ValueError as e:
+        # A 200 with a non-JSON payload (e.g. an HTML error page from a
+        # proxy). The JSON decode errors raised by requests are ValueError
+        # subclasses.
+        print(f"  {sid}: invalid JSON in response: {e}",
+              file=sys.stderr, flush=True)
+        return None
+    if not isinstance(body, dict) or body.get("status") != "completed":
+        return None
+    return body
+
+
+def write_perturbation(orphan: Orphan, body: dict) -> None:
+    """Write the raw session body and the pipeline JSON next to the `.sid`.
+
+    Args:
+        orphan: Perturbation cell to fill in; `sid_file` determines where
+            the `.raw.json` goes and `out_json` receives the pipeline JSON.
+        body: Completed session body as returned by the API.
+
+    Both files are written as UTF-8 explicitly: the JSON is dumped with
+    `ensure_ascii=False`, so relying on the locale's default encoding could
+    fail or produce non-UTF-8 JSON on systems whose default is not UTF-8.
+    """
+    raw_path = orphan.sid_file.with_suffix(".raw.json")
+    raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False),
+                        encoding="utf-8")
+    # Synthesize a paper path from the sid_file stem (e.g. paper_001_corrupted.md).
+    # elapsed_s is passed as 0.0 — presumably because the original run time
+    # is not known when rescuing; confirm against build_pipeline_json.
+    pj = build_pipeline_json(Path(orphan.sid_file.stem + ".md"),
+                             body, elapsed_s=0.0)
+    orphan.out_json.write_text(json.dumps(pj, indent=2, ensure_ascii=False),
+                               encoding="utf-8")
+
+
+# Lazily-loaded conference_study helpers plus the slug -> manifest-entry map.
+# Shared across `write_conference` calls so the imports and the manifest
+# parse happen at most once per process.
+_CONFERENCE_DEPS: dict = {}
+
+
+def write_conference(orphan: Orphan, body: dict, *, _cache: dict | None = None) -> None:
+    """Build the merged paper JSON for the conference cohort.
+
+    Args:
+        orphan: Conference cell to fill in; `extra` must carry "slug" and
+            "method_key", and `out_json` is the file merged into.
+        body: Completed session body as returned by the API; its `comments`
+            list is normalized into the method entry.
+        _cache: Internal. Dict used to memoize the lazily-imported helpers
+            and the manifest lookup; defaults to a module-level dict.
+
+    Raises:
+        LookupError: if the slug has no entry in the combined manifest.
+            Raising (rather than returning quietly) lets the caller tell a
+            skipped cell apart from a written one.
+    """
+    deps = _CONFERENCE_DEPS if _cache is None else _cache
+    # Lazy imports to keep --kind perturbation fast (avoids parse_document deps).
+    if "loaded" not in deps:
+        from competitors import get_adapter  # noqa: F401
+        from competitors.helpers import build_method_data, merge_into_paper_json
+        from competitors.base import NormalizedComment, NormalizedReview
+        from reviewer.parsers import parse_document
+        from reviewer.utils import split_into_paragraphs
+        import systems.reviewer3_adapter as r3a
+        manifest = json.loads((CONFERENCE / "manifests/v2/combined.json").read_text())
+        deps.update(
+            build_method_data=build_method_data,
+            merge_into_paper_json=merge_into_paper_json,
+            NormalizedComment=NormalizedComment,
+            NormalizedReview=NormalizedReview,
+            parse_document=parse_document,
+            split_into_paragraphs=split_into_paragraphs,
+            r3a=r3a,
+            slug2paper={p["slug"]: p for p in manifest["papers"]},
+            loaded=True,
+        )
+    slug = orphan.extra["slug"]
+    method_key = orphan.extra["method_key"]
+    paper = deps["slug2paper"].get(slug)
+    if not paper:
+        raise LookupError(f"{slug}: no manifest entry, skipping")
+    pdf = CONFERENCE / paper["pdf_path"]
+    title, content, _ = deps["parse_document"](pdf, max_pages=20)
+    paragraphs = deps["split_into_paragraphs"](content)
+    session_id = (body.get("session") or {}).get("id")
+    comments = []
+    for i, c in enumerate(body.get("comments") or []):
+        # Tolerate bare (non-object) comment items by wrapping them.
+        if not isinstance(c, dict):
+            c = {"comment": str(c)}
+        n = deps["r3a"]._normalize_comment(c, i)
+        comments.append(deps["NormalizedComment"](
+            title=n.get("title", ""), quote=n.get("quote", ""),
+            explanation=n.get("explanation", ""),
+            comment_type=n.get("comment_type", "technical"),
+            extra={"severity": n.get("severity"),
+                   "reviewerId": c.get("reviewerId"),
+                   "rank": c.get("rank"),
+                   "session_id": session_id},
+        ))
+    review = deps["NormalizedReview"](
+        comments=comments, overall_feedback="", cost_usd=None,
+        cost_method="estimated", model="reviewer3",
+    )
+    md = deps["build_method_data"](
+        review=review, method_key=method_key,
+        method_label="Reviewer3 (reviewer3)", paragraphs=paragraphs,
+    )
+    # The manifest title wins; the title parsed from the PDF is the fallback.
+    deps["merge_into_paper_json"](
+        out_file=orphan.out_json, slug=slug,
+        title=paper.get("title") or title, paragraphs=paragraphs,
+        method_key=method_key, method_data=md,
+    )
+
+
+def main() -> int:
+    """Find orphaned `.sid` files, fetch their sessions, and write results.
+
+    Returns:
+        Process exit code: 1 when `REVIEWER3_API_KEY` is not set, else 0.
+        Per-session read, fetch and write failures are logged to stderr and
+        do not change the exit code.
+    """
+    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    ap.add_argument("--kind", choices=["both", "perturbation", "conference"],
+                    default="both")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="Just report what's recoverable; don't write anything.")
+    ap.add_argument("--parallel", type=int, default=10,
+                    help="Concurrent API fetches (default 10).")
+    args = ap.parse_args()
+
+    api_key = os.environ.get("REVIEWER3_API_KEY")
+    if not api_key:
+        print("REVIEWER3_API_KEY not set (check .env)", file=sys.stderr)
+        return 1
+    headers = {"x-api-key": api_key}
+
+    orphans = find_orphans(args.kind)
+    print(f"found {len(orphans)} orphan .sid file(s) "
+          f"(no result on disk for these cells yet)")
+    if not orphans:
+        return 0
+
+    # Fetch every sid in parallel; bucket by status.
+    def _fetch(orphan: Orphan):
+        try:
+            sid = orphan.sid_file.read_text().strip()
+        except (OSError, ValueError) as e:
+            # e.g. the .sid vanished or is not decodable text. An exception
+            # here would propagate out of pool.map and abort the whole run,
+            # so log it and count the cell as not recoverable instead.
+            print(f"  cannot read {orphan.sid_file}: {type(e).__name__}: {e}",
+                  file=sys.stderr, flush=True)
+            return orphan, "", None
+        if not sid:
+            return orphan, sid, None
+        return orphan, sid, fetch_session(sid, headers=headers)
+
+    completed: list[tuple[Orphan, str, dict]] = []
+    incomplete = 0
+    # ThreadPoolExecutor rejects max_workers <= 0; clamp so `--parallel 0`
+    # degrades to sequential fetching rather than crashing.
+    with ThreadPoolExecutor(max_workers=max(1, args.parallel)) as pool:
+        for orphan, sid, body in pool.map(_fetch, orphans):
+            if body is None:
+                incomplete += 1
+                continue
+            completed.append((orphan, sid, body))
+
+    print(f"  {len(completed)} completed on R3 (recoverable)")
+    print(f"  {incomplete} still in-progress / failed / unreachable")
+
+    if args.dry_run or not completed:
+        # With nothing completed this loop is a no-op; in dry-run mode it
+        # lists what a real run would write.
+        for orphan, sid, _ in completed:
+            print(f"  would write: {orphan.out_json}  (sid={sid})")
+        return 0
+
+    n_pert = n_conf = 0
+    for orphan, sid, body in completed:
+        # Best-effort per cell: a failed write is logged and the loop goes on.
+        try:
+            if orphan.kind == "pert":
+                write_perturbation(orphan, body)
+                n_pert += 1
+            else:
+                write_conference(orphan, body)
+                n_conf += 1
+        except Exception as e:
+            print(f"  write failed for {sid} ({orphan.out_json}): "
+                  f"{type(e).__name__}: {e}", file=sys.stderr)
+
+    print(f"\nrescued: perturbation={n_pert}  conference={n_conf}")
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main())