diff --git a/.env.example b/.env.example
index 56aac36..4e36a22 100644
--- a/.env.example
+++ b/.env.example
@@ -15,3 +15,9 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here
 
 # Optional: custom OpenAI base URL (e.g. EU endpoint, Azure)
 # OPENAI_BASE_URL=https://eu.api.openai.com/v1
+
+# Reviewer 3 (closed-source HTTP API; benchmarks/perturbation only).
+# Required when running the perturbation benchmark with system: reviewer3.
+# REVIEWER3_API_KEY=sk_...
+# REVIEWER3_USER_ID=<uuid from web UI session JSON; not an email>
+# REVIEWER3_BASE_URL=https://reviewer3.com  # optional override
diff --git a/.gitignore b/.gitignore
index 034b5fd..4dd8729 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.pyc
 *.pyo
 .venv/
+.venv
 venv/
 
 # Jupyter
@@ -22,12 +23,18 @@ venv/
 
 # Run outputs
 review_results/
-benchmarks/conference_study/results/
-benchmarks/perturbation/perturbation_results/
+benchmarks/conference_study/results
+benchmarks/perturbation/perturbation_results
+benchmarks/perturbation/results
+benchmarks/perturbation/data
 
 # conference_study study artifacts (not code)
-benchmarks/conference_study/manifests/
-benchmarks/conference_study/reports/
+benchmarks/conference_study/manifests
+benchmarks/conference_study/reports
 # Symlink targets — trailing-slash patterns wouldn't match the symlinks
 benchmarks/conference_study/analyses/manifests
 benchmarks/conference_study/analyses/results
+benchmarks/conference_study/papers
+
+# Moved out of repo (see commit 6373fad); ignore the leftover local dir.
+benchmarks/experimental_perturbations/
diff --git a/benchmarks/conference_study/analyses/compute_auc.py b/benchmarks/conference_study/analyses/compute_auc.py
index d32f1a9..0d87e23 100644
--- a/benchmarks/conference_study/analyses/compute_auc.py
+++ b/benchmarks/conference_study/analyses/compute_auc.py
@@ -51,17 +51,14 @@
 REPO_ROOT = HERE.parent  # benchmarks/conference_study/
 RESULTS_ROOT = REPO_ROOT / "results"
 
-COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
-SEVERITY_TIERS = ("major", "moderate", "minor")
-
-
-def normalize_severity(method: str, raw: str | None) -> str | None:
-    if not raw:
-        return None
-    raw = raw.lower()
-    if method == "coarse":
-        return COARSE_SEVERITY_MAP.get(raw)
-    return raw if raw in SEVERITY_TIERS else None
+# Severity normalization lives in benchmarks/perturbation/_severity.py so the
+# perturbation adapters and these analyses use one source of truth.
+sys.path.insert(0, str(HERE.parents[1] / "perturbation"))
+from _severity import (  # noqa: E402
+    COARSE_SEVERITY_MAP,
+    TIERS as SEVERITY_TIERS,
+    normalize_severity,
+)
 
 
 def load_manifest(path: Path) -> dict[str, list[dict]]:
diff --git a/benchmarks/conference_study/analyses/report_scaleup.py b/benchmarks/conference_study/analyses/report_scaleup.py
index 79dce76..8a43725 100644
--- a/benchmarks/conference_study/analyses/report_scaleup.py
+++ b/benchmarks/conference_study/analyses/report_scaleup.py
@@ -204,20 +204,14 @@ def comment_metrics_by_method(
     return out
 
 
-# Coarse uses {minor, major, critical}; openaireview methods use
-# {minor, moderate, major}. Normalize so a single set of tiers compares
-# apples-to-apples (highest=major, mid=moderate, low=minor).
-_COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
-SEVERITY_TIERS = ("major", "moderate", "minor")
-
-
-def normalize_severity(method: str, raw: str | None) -> str | None:
-    if not raw:
-        return None
-    raw = raw.lower()
-    if method == "coarse":
-        return _COARSE_SEVERITY_MAP.get(raw)
-    return raw if raw in SEVERITY_TIERS else None
+# Severity normalization lives in benchmarks/perturbation/_severity.py so the
+# perturbation adapters and these analyses use one source of truth.
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "perturbation"))
+from _severity import (  # noqa: E402
+    COARSE_SEVERITY_MAP as _COARSE_SEVERITY_MAP,
+    TIERS as SEVERITY_TIERS,
+    normalize_severity,
+)
 
 
 def severity_counts_by_method(
diff --git a/benchmarks/conference_study/competitors/registry.py b/benchmarks/conference_study/competitors/registry.py
index be0a25a..7a64bd8 100644
--- a/benchmarks/conference_study/competitors/registry.py
+++ b/benchmarks/conference_study/competitors/registry.py
@@ -9,9 +9,11 @@
 
 from .base import CompetitorAdapter
 from .coarse_adapter import CoarseAdapter
+from .reviewer3_adapter import Reviewer3Adapter
 
 _REGISTRY: dict[str, type[CompetitorAdapter]] = {
     "coarse": CoarseAdapter,
+    "reviewer3": Reviewer3Adapter,
 }
 
 
diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py
new file mode 100644
index 0000000..82052c9
--- /dev/null
+++ b/benchmarks/conference_study/competitors/reviewer3_adapter.py
@@ -0,0 +1,118 @@
+"""Adapter for Reviewer 3 (closed-source HTTP API).
+
+Submission flow is the same as the perturbation benchmark — POST a PDF to
+`/api/internal/review`, poll the session until the `status` enum is terminal,
+then map each comment via `_normalize_comment`. We reuse those helpers from
+the perturbation adapter (`benchmarks/perturbation/systems/reviewer3_adapter.py`)
+rather than duplicating the HTTP code; the only difference here is that
+conference inputs arrive as PDFs already, so the LaTeX-as-md → PDF compile
+step (`_ensure_pdf`) is unnecessary.
+
+Reviewer 3 has no model selector, so `method_key(...)` always returns
+`"reviewer3__reviewer3"` regardless of the manifest `model` value. The
+conference YAML should pin `models: [reviewer3]` to avoid duplicate
+submissions across a phantom model loop.
+
+Required env:
+    REVIEWER3_API_KEY    sk_... (sent as `x-api-key` header)
+    REVIEWER3_USER_ID    UUID from the vendor's web UI session JSON (not an email)
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+from .base import CompetitorAdapter, NormalizedComment, NormalizedReview
+
+# Reuse the perturbation adapter's HTTP + normalization helpers.
+_PERT = Path(__file__).resolve().parents[2] / "perturbation" / "systems"
+sys.path.insert(0, str(_PERT))
+import reviewer3_adapter as _r3  # noqa: E402
+
+
+_METHOD_KEY = f"{_r3.REVIEWER3_SLUG}__{_r3.REVIEWER3_SLUG}"
+
+
+class Reviewer3Adapter(CompetitorAdapter):
+    name = "reviewer3"
+    required_env = ("REVIEWER3_API_KEY", "REVIEWER3_USER_ID")
+
+    def method_key(self, model: str) -> str:
+        # R3 has no model selector — fixed key regardless of `model`.
+        return _METHOD_KEY
+
+    def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview:
+        opts = cfg.get("reviewer3_options", {}) or {}
+        rcfg = _r3.config_from_env()
+        for k in ("review_mode", "poll_interval_s", "poll_timeout_s",
+                  "request_timeout_s", "base_url"):
+            if k in opts and opts[k] is not None:
+                setattr(rcfg, k, opts[k])
+
+        # Cap PDF size sent to R3. `max_pages` lives at the top level of the
+        # config (run_competitors.py uses it for parse_document); we honor the
+        # same value here so the bytes shipped to R3 match the paragraph window
+        # we already cap on our side. Untrimmed full PDFs were tripping R3's
+        # HTTP 413 limit and inflating per-paper wall time.
+        max_pages = cfg.get("max_pages") or opts.get("max_pages")
+
+        # sid_file is injected by run_competitors.py just before invocation:
+        # `cfg["_sid_file"] = out_file.with_suffix(".sid")`. If present and
+        # the file already exists, we resume that R3 session instead of
+        # submitting fresh — avoids duplicate-session credit waste when a
+        # prior run was killed mid-poll. (See PR notes; ~34% of credits
+        # observed wasted on duplicates before this fix.)
+        sid_file = cfg.get("_sid_file")
+        if isinstance(sid_file, str):
+            sid_file = Path(sid_file)
+        session_id = ""
+        body = None
+        if sid_file and sid_file.exists():
+            session_id = sid_file.read_text().strip()
+            try:
+                body = _r3._poll_until_done(rcfg, session_id,
+                                            tag=f"reviewer3/{pdf.stem} (resumed)")
+            except RuntimeError as e:
+                m = str(e)
+                if "fetch failed" in m and ("403" in m or "404" in m):
+                    sid_file.unlink(missing_ok=True)
+                    session_id, body = "", None
+                else:
+                    raise
+
+        if body is None:
+            session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages)
+            if sid_file:
+                sid_file.parent.mkdir(parents=True, exist_ok=True)
+                sid_file.write_text(session_id)
+            body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")
+
+        comments: list[NormalizedComment] = []
+        for i, raw in enumerate(body.get("comments") or []):
+            if not isinstance(raw, dict):
+                raw = {"comment": str(raw)}
+            norm = _r3._normalize_comment(raw, i)
+            comments.append(NormalizedComment(
+                title=norm.get("title", ""),
+                quote=norm.get("quote", ""),
+                explanation=norm.get("explanation", ""),
+                comment_type=norm.get("comment_type", "technical"),
+                extra={
+                    "severity": norm.get("severity"),
+                    "reviewerId": raw.get("reviewerId"),
+                    "rank": raw.get("rank"),
+                    "session_id": session_id,
+                },
+            ))
+
+        # R3 doesn't publish pricing and doesn't return overall_feedback or
+        # token counts in its response, so we leave those empty/None.
+        return NormalizedReview(
+            comments=comments,
+            overall_feedback="",
+            cost_usd=None,
+            cost_method="estimated",
+            prompt_tokens=None,
+            completion_tokens=None,
+            model=_r3.REVIEWER3_SLUG,
+        )
diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml
new file mode 100644
index 0000000..1f6d19f
--- /dev/null
+++ b/benchmarks/conference_study/configs/reviewer3.yaml
@@ -0,0 +1,32 @@
+# Reviewer 3 (closed-source HTTP API) run on the v2 conference cohort.
+# Results -> benchmarks/conference_study/results/reviewer3_v2/
+# Log    -> benchmarks/conference_study/results/reviewer3_v2/run_log.jsonl
+#
+# Prerequisites:
+#   - REVIEWER3_API_KEY  and  REVIEWER3_USER_ID  set in .env
+#   - Manifest, papers, and results dirs reachable. In this worktree they are
+#     symlinks into the sibling OpenAIReview worktree (gitignored data lives
+#     only there). Set them up with:
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/papers    papers
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/results   results
+
+name: reviewer3_v2
+competitor: reviewer3
+
+manifest: manifests/v2/combined.json
+
+# R3 has no model selector. Pin to a single dummy entry so run_competitors.py
+# loops once per paper rather than once per (paper × manifest model).
+models:
+  - reviewer3
+
+timeout_sec: 3600       # outer per-(paper, model) wall cap (R3 is 10-30 min typical)
+max_per_model: 5        
+max_pages: 20           # parse_document cap; matches coarse.yaml convention
+
+# Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...).
+reviewer3_options:
+  review_mode: author      # author | journal (R3 reviewMode enum)
+  poll_interval_s: 30
+  poll_timeout_s: 3600     # 60 min/paper cap
diff --git a/benchmarks/conference_study/run_competitors.py b/benchmarks/conference_study/run_competitors.py
index 68214c6..3c5eb34 100644
--- a/benchmarks/conference_study/run_competitors.py
+++ b/benchmarks/conference_study/run_competitors.py
@@ -125,6 +125,14 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
         title, content, _was_ocr = parse_document(pdf, max_pages=MAX_PAGES)
         paragraphs = split_into_paragraphs(content)
 
+        # Inject sid_file location so the adapter can persist/resume the
+        # competitor-side session id (e.g. reviewer3). Adapters that don't
+        # care simply ignore the underscore-prefixed key. The file lives
+        # next to the merged paper JSON so it survives across runs.
+        out_file = RESULTS_DIR / f"{paper['slug']}.json"
+        sid_dir = RESULTS_DIR / ".sids"
+        cfg = {**cfg, "_sid_file": sid_dir / f"{paper['slug']}.{method_key}.sid"}
+
         review = adapter.review(pdf, model, cfg)
 
         method_data = build_method_data(
@@ -134,7 +142,6 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
             paragraphs=paragraphs,
         )
 
-        out_file = RESULTS_DIR / f"{paper['slug']}.json"
         merge_into_paper_json(
             out_file=out_file,
             slug=paper["slug"],
diff --git a/benchmarks/perturbation/.gitignore b/benchmarks/perturbation/.gitignore
index bddf55d..b435026 100644
--- a/benchmarks/perturbation/.gitignore
+++ b/benchmarks/perturbation/.gitignore
@@ -8,8 +8,19 @@ papers/
 # Run logs
 reports/*.log
 
-# Temporary / ephemeral configs
+# Temporary / ephemeral configs.
+# Underscore-prefix are by-convention scratch. The other rules below cover the
+# bulk per-domain configs that we generate locally (one per system × domain)
+# but don't check in. The `!*_reviewer3.yaml` exception preserves the canonical
+# reviewer3 configs that are tracked.
 configs/_*
+configs/cs_*scaleup*.yaml
+configs/full_*.yaml
+configs/grok_*.yaml
+configs/longtail_*.yaml
+configs/subset_*.yaml
+configs/r3_smoke*.yaml
+!configs/full_*_reviewer3.yaml
 
 # Python
 __pycache__/
diff --git a/benchmarks/perturbation/_severity.py b/benchmarks/perturbation/_severity.py
new file mode 100644
index 0000000..0e2db3c
--- /dev/null
+++ b/benchmarks/perturbation/_severity.py
@@ -0,0 +1,85 @@
+"""Canonical severity tiers and per-system normalization.
+
+The perturbation benchmark, the conference study analyses, and the viz layer
+all want to compare comment severities across review systems. Each system uses
+its own native vocabulary, so before any cross-system comparison the raw value
+must be mapped to the canonical 3-tier scale used by openaireview itself:
+
+    major     - Undermines a key claim/methodology; affects conclusions.
+    moderate  - Real error or gap that is localized and fixable.
+    minor     - Framing concern, mild overclaim, or resolvable ambiguity.
+
+Per-system maps:
+
+  * openaireview: identity. Output is already in {major, moderate, minor}.
+  * coarse:       {critical, major, minor} -> {major, moderate, minor}
+                  (shift down one tier; same mapping that the conference-study
+                  scripts in benchmarks/conference_study/analyses/ use).
+  * reviewer3:    integer 1..4 per their OpenAPI spec, where
+                  1=Critical, 2=Major, 3=Minor, 4=Editorial.
+                  Compressed to the 3-tier scale by collapsing R3 Minor and
+                  Editorial into `minor`, since in practice R3 tags substantive
+                  -but-lower-importance findings as Editorial rather than style
+                  notes. Confirm with the vendor if the label is later clarified.
+
+The conference_study analyses currently inline `COARSE_SEVERITY_MAP` (see
+`benchmarks/conference_study/analyses/compute_auc.py` and `report_scaleup.py`).
+Once those analyses are co-resident with this module they should import
+`COARSE_SEVERITY_MAP` and `normalize_severity` from here instead.
+"""
+
+from __future__ import annotations
+
+
+TIERS: tuple[str, ...] = ("major", "moderate", "minor")
+
+
+# openaireview methods emit canonical tier strings directly.
+OPENAIREVIEW_SEVERITY_MAP: dict[str, str] = {t: t for t in TIERS}
+
+# coarse uses {minor, major, critical}. Shift down one level.
+COARSE_SEVERITY_MAP: dict[str, str] = {
+    "critical": "major",
+    "major": "moderate",
+    "minor": "minor",
+}
+
+# Reviewer 3 spec: 1=Critical, 2=Major, 3=Minor, 4=Editorial.
+# Compress to 3 tiers; Editorial collapses with Minor (see module docstring).
+REVIEWER3_SEVERITY_MAP: dict[int, str] = {
+    1: "major",
+    2: "moderate",
+    3: "minor",
+    4: "minor",
+}
+
+
+def normalize_severity(system: str, raw: object) -> str | None:
+    """Map a system-native severity value to the canonical 3-tier scale.
+
+    Returns None for unrecognized values so callers can decide whether to drop
+    the comment, default it, or warn.
+
+    `system` is the registry key matching `benchmarks/perturbation/systems/`:
+    'openaireview', 'coarse', or 'reviewer3'.
+    """
+    if raw is None:
+        return None
+    sysn = system.lower()
+    if sysn == "reviewer3":
+        if isinstance(raw, int):
+            return REVIEWER3_SEVERITY_MAP.get(raw)
+        # tolerate the str-form for hand-written test fixtures
+        try:
+            return REVIEWER3_SEVERITY_MAP.get(int(raw))
+        except (TypeError, ValueError):
+            return None
+    if not isinstance(raw, str):
+        return None
+    s = raw.lower()
+    if sysn == "coarse":
+        return COARSE_SEVERITY_MAP.get(s)
+    if sysn == "openaireview":
+        return OPENAIREVIEW_SEVERITY_MAP.get(s)
+    # unknown system -> pass through if already canonical
+    return s if s in TIERS else None
diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
new file mode 100644
index 0000000..f60a078
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_CC/all
+max_tokens: 13000          # match full_cs_CC_coarse.yaml
+min_perturbations: 5       # match full_cs_CC_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3
diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
new file mode 100644
index 0000000..3193fbd
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/cs_LG/all
+max_tokens: 13000          # match full_cs_LG_coarse.yaml
+min_perturbations: 5       # match full_cs_LG_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3
diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
new file mode 100644
index 0000000..501ec02
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/econ_EM/all
+max_tokens: 13000          # match full_econ_EM_coarse.yaml
+min_perturbations: 5       # match full_econ_EM_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3
diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
new file mode 100644
index 0000000..8e6cd7a
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/hep-ex/all
+max_tokens: 13000          # match full_hep_ex_coarse.yaml
+min_perturbations: 5       # match full_hep_ex_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3
diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
new file mode 100644
index 0000000..9b8e311
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/math_all/all
+max_tokens: 13000          # match full_math_all_coarse.yaml
+min_perturbations: 5       # match full_math_all_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_math_all_reviewer3
diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
new file mode 100644
index 0000000..4800288
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/physics_atm-clus/all
+max_tokens: 13000          # match full_physics_atm_clus_coarse.yaml
+min_perturbations: 5       # match full_physics_atm_clus_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3
diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
new file mode 100644
index 0000000..ca8669d
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/q-bio_GN/all
+max_tokens: 13000          # match full_q_bio_GN_coarse.yaml
+min_perturbations: 5       # match full_q_bio_GN_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3
diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
new file mode 100644
index 0000000..a06206e
--- /dev/null
+++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml
@@ -0,0 +1,14 @@
+system: reviewer3
+input_dir: benchmarks/perturbation/data/perturbations_filtered/stat_AP/all
+max_tokens: 13000          # match full_stat_AP_coarse.yaml
+min_perturbations: 5       # match full_stat_AP_coarse.yaml
+
+score_method: llm
+score_model: google/gemini-3-flash-preview
+
+review_mode: author        # author | journal (Reviewer 3 reviewMode enum)
+poll_interval_s: 30
+poll_timeout_s: 3600       # 60 min/paper cap (R3 is slow under 10-concurrent load)
+max_pages: 20            # trim rendered PDF to first N pages (matches coarse.yaml convention)
+
+results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3
diff --git a/benchmarks/perturbation/rescue_sessions.py b/benchmarks/perturbation/rescue_sessions.py
new file mode 100644
index 0000000..1070b29
--- /dev/null
+++ b/benchmarks/perturbation/rescue_sessions.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""Pull down completed Reviewer 3 sessions whose local poll loop gave up.
+
+Reviewer 3 is async: a session keeps processing on their server even after
+our adapter has timed out / been killed. The adapter persists the sessionId
+to a `.sid` file next to each cell's output JSON; this script walks those
+`.sid` files, fetches each session, and writes results to disk for the ones
+that completed in the meantime.
+
+Walks both result trees:
+  * Perturbation:  <results>/full_<domain>_reviewer3/.../review/*.sid
+  * Conference:    <conference_study>/results/reviewer3_v2/.sids/*.sid
+
+Skips cells whose `out_json` already has comments. Safe to re-run.
+
+Usage:
+  python rescue_sessions.py                  # rescue both, write results
+  python rescue_sessions.py --dry-run        # just report what's recoverable
+  python rescue_sessions.py --kind perturbation
+  python rescue_sessions.py --kind conference
+
+Required env: REVIEWER3_API_KEY (used for the `review:read` GET).
+
+Exit code: 0 always (no errors are fatal — script is meant to be safe to
+re-run; failures are just logged).
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    pass
+
+import requests  # noqa: E402
+
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+from systems.reviewer3_adapter import build_pipeline_json  # noqa: E402
+
+# Repo root + conference_study root
+REPO = HERE.parent.parent
+CONFERENCE = REPO / "benchmarks" / "conference_study"
+sys.path.insert(0, str(REPO / "src"))
+sys.path.insert(0, str(CONFERENCE))
+
+
+REVIEWER3_BASE_URL = os.environ.get("REVIEWER3_BASE_URL", "https://reviewer3.com").rstrip("/")
+
+
+@dataclass
+class Orphan:
+    """A `.sid` file pointing at a session we never wrote out_json for."""
+    kind: str               # "pert" or "conf"
+    sid_file: Path
+    out_json: Path          # where we'd write the result if status == completed
+    extra: dict             # kind-specific (e.g. conference slug)
+
+
+def _has_content(out_json: Path, method_key: str | None = None) -> bool:
+    """`out_json` exists AND already has a comment-bearing method entry."""
+    if not out_json.exists():
+        return False
+    try:
+        d = json.loads(out_json.read_text())
+    except Exception:
+        return False
+    methods = d.get("methods") or {}
+    if method_key is not None:
+        m = methods.get(method_key)
+        return bool(m and m.get("comments"))
+    return bool(methods) and any(m.get("comments") for m in methods.values())
+
+
+def find_orphans(kind: str = "both") -> list[Orphan]:
+    """Walk .sid files; return ones whose result JSON is missing or empty."""
+    out: list[Orphan] = []
+
+    if kind in ("both", "perturbation"):
+        # Result paths are resolved via Config.results_dir, which the runner
+        # resolves to absolute via REPO/<results_dir>. The most robust thing
+        # is to walk every `.sid` under any `full_*_reviewer3` results dir
+        # that the runner might write to. We check both the local results
+        # tree (under this worktree) and any sibling worktrees the user may
+        # have symlinked in.
+        pert_roots = [REPO / "benchmarks" / "perturbation" / "results"]
+        # Follow the symlink target too in case results/ is a symlink (we use
+        # this pattern when running from a different worktree than the data).
+        resolved = (REPO / "benchmarks" / "perturbation" / "results").resolve()
+        if resolved not in pert_roots:
+            pert_roots.append(resolved)
+        seen = set()
+        for root in pert_roots:
+            if not root.exists():
+                continue
+            for sid_file in root.rglob("*.sid"):
+                if sid_file in seen:
+                    continue
+                seen.add(sid_file)
+                out_json = sid_file.with_suffix(".json")
+                if _has_content(out_json):
+                    continue
+                out.append(Orphan(kind="pert", sid_file=sid_file,
+                                  out_json=out_json, extra={}))
+
+    if kind in ("both", "conference"):
+        conf_root = CONFERENCE / "results" / "reviewer3_v2"
+        sids_dir = conf_root / ".sids"
+        if sids_dir.exists():
+            for sid_file in sids_dir.glob("*.sid"):
+                # filename: <slug>.<method_key>.sid
+                stem_parts = sid_file.stem.split(".")
+                slug = stem_parts[0]
+                method_key = ".".join(stem_parts[1:]) or "reviewer3__reviewer3"
+                out_json = conf_root / f"{slug}.json"
+                if _has_content(out_json, method_key=method_key):
+                    continue
+                out.append(Orphan(kind="conf", sid_file=sid_file,
+                                  out_json=out_json,
+                                  extra={"slug": slug, "method_key": method_key}))
+
+    return out
+
+
+def fetch_session(sid: str, *, headers: dict, timeout: float = 30.0) -> dict | None:
+    """Return the full session body if status==completed, else None."""
+    url = f"{REVIEWER3_BASE_URL}/api/internal/review/{sid}"
+    try:
+        r = requests.get(url, headers=headers, timeout=timeout)
+    except Exception as e:
+        print(f"  fetch error for {sid}: {type(e).__name__}: {e}",
+              file=sys.stderr, flush=True)
+        return None
+    if r.status_code != 200:
+        print(f"  {sid}: HTTP {r.status_code}", file=sys.stderr, flush=True)
+        return None
+    body = r.json()
+    if body.get("status") != "completed":
+        return None
+    return body
+
+
+def write_perturbation(orphan: Orphan, body: dict) -> None:
+    """Write pipeline JSON + raw.json next to the .sid."""
+    raw_path = orphan.sid_file.with_suffix(".raw.json")
+    raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False))
+    # Synthesize a paper path from the sid_file stem (e.g. paper_001_corrupted.md)
+    pj = build_pipeline_json(Path(orphan.sid_file.stem + ".md"),
+                             body, elapsed_s=0.0)
+    orphan.out_json.write_text(json.dumps(pj, indent=2, ensure_ascii=False))
+
+
+def write_conference(orphan: Orphan, body: dict, *, _cache: dict = {}) -> None:
+    """Build the merged paper JSON for the conference cohort."""
+    # Lazy imports to keep --kind perturbation fast (avoids parse_document deps).
+    if "loaded" not in _cache:
+        from competitors import get_adapter  # noqa: F401
+        from competitors.helpers import build_method_data, merge_into_paper_json
+        from competitors.base import NormalizedComment, NormalizedReview
+        from reviewer.parsers import parse_document
+        from reviewer.utils import split_into_paragraphs
+        import systems.reviewer3_adapter as r3a
+        manifest = json.loads((CONFERENCE / "manifests/v2/combined.json").read_text())
+        _cache.update(
+            build_method_data=build_method_data,
+            merge_into_paper_json=merge_into_paper_json,
+            NormalizedComment=NormalizedComment,
+            NormalizedReview=NormalizedReview,
+            parse_document=parse_document,
+            split_into_paragraphs=split_into_paragraphs,
+            r3a=r3a,
+            slug2paper={p["slug"]: p for p in manifest["papers"]},
+            loaded=True,
+        )
+    slug = orphan.extra["slug"]
+    method_key = orphan.extra["method_key"]
+    paper = _cache["slug2paper"].get(slug)
+    if not paper:
+        print(f"  {slug}: no manifest entry, skipping", file=sys.stderr)
+        return
+    pdf = CONFERENCE / paper["pdf_path"]
+    title, content, _ = _cache["parse_document"](pdf, max_pages=20)
+    paragraphs = _cache["split_into_paragraphs"](content)
+    comments = []
+    for i, c in enumerate(body.get("comments") or []):
+        if not isinstance(c, dict):
+            c = {"comment": str(c)}
+        n = _cache["r3a"]._normalize_comment(c, i)
+        comments.append(_cache["NormalizedComment"](
+            title=n.get("title", ""), quote=n.get("quote", ""),
+            explanation=n.get("explanation", ""),
+            comment_type=n.get("comment_type", "technical"),
+            extra={"severity": n.get("severity"),
+                   "reviewerId": c.get("reviewerId"),
+                   "rank": c.get("rank"),
+                   "session_id": (body.get("session") or {}).get("id")},
+        ))
+    review = _cache["NormalizedReview"](
+        comments=comments, overall_feedback="", cost_usd=None,
+        cost_method="estimated", model="reviewer3",
+    )
+    md = _cache["build_method_data"](
+        review=review, method_key=method_key,
+        method_label="Reviewer3 (reviewer3)", paragraphs=paragraphs,
+    )
+    _cache["merge_into_paper_json"](
+        out_file=orphan.out_json, slug=slug,
+        title=paper.get("title") or title, paragraphs=paragraphs,
+        method_key=method_key, method_data=md,
+    )
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    ap.add_argument("--kind", choices=["both", "perturbation", "conference"],
+                    default="both")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="Just report what's recoverable; don't write anything.")
+    ap.add_argument("--parallel", type=int, default=10,
+                    help="Concurrent API fetches (default 10).")
+    args = ap.parse_args()
+
+    api_key = os.environ.get("REVIEWER3_API_KEY")
+    if not api_key:
+        print("REVIEWER3_API_KEY not set (check .env)", file=sys.stderr)
+        return 1
+    headers = {"x-api-key": api_key}
+
+    orphans = find_orphans(args.kind)
+    print(f"found {len(orphans)} orphan .sid file(s) "
+          f"(no result on disk for these cells yet)")
+    if not orphans:
+        return 0
+
+    # Fetch every sid in parallel; bucket by status.
+    def _fetch(orphan: Orphan):
+        sid = orphan.sid_file.read_text().strip()
+        if not sid:
+            return orphan, sid, None
+        return orphan, sid, fetch_session(sid, headers=headers)
+
+    completed: list[tuple[Orphan, str, dict]] = []
+    incomplete = 0
+    with ThreadPoolExecutor(max_workers=args.parallel) as pool:
+        for orphan, sid, body in pool.map(_fetch, orphans):
+            if body is None:
+                incomplete += 1
+                continue
+            completed.append((orphan, sid, body))
+
+    print(f"  {len(completed)} completed on R3 (recoverable)")
+    print(f"  {incomplete} still in-progress / failed / unreachable")
+
+    if args.dry_run or not completed:
+        for orphan, sid, _ in completed:
+            print(f"  would write: {orphan.out_json}  (sid={sid})")
+        return 0
+
+    n_pert = n_conf = 0
+    for orphan, sid, body in completed:
+        try:
+            if orphan.kind == "pert":
+                write_perturbation(orphan, body)
+                n_pert += 1
+            else:
+                write_conference(orphan, body)
+                n_conf += 1
+        except Exception as e:
+            print(f"  write failed for {sid} ({orphan.out_json}): "
+                  f"{type(e).__name__}: {e}", file=sys.stderr)
+
+    print(f"\nrescued: perturbation={n_pert}  conference={n_conf}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/perturbation/run_benchmark.py b/benchmarks/perturbation/run_benchmark.py
index 9117911..b44112e 100644
--- a/benchmarks/perturbation/run_benchmark.py
+++ b/benchmarks/perturbation/run_benchmark.py
@@ -98,6 +98,11 @@ class Config:
     review_mode: str = field(default="author", metadata={"choices": ["author", "journal"]})
     poll_interval_s: float = 5.0
     poll_timeout_s: float = 1200.0
+    # Cap pages of the rendered PDF sent to R3. None = no cap. Used because
+    # the token-truncated staged file often isn't valid LaTeX; we compile the
+    # FULL pre-truncation source and trim by pages instead (matches the
+    # max_pages: 20 convention in conference_study/configs/coarse.yaml).
+    max_pages: int | None = None
     # Legacy aliases (read on load, normalized into models/methods).
     review_models: list[str] = field(default_factory=list)
     review_methods: list[str] = field(default_factory=list)
diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py
index 66ab23e..503720f 100644
--- a/benchmarks/perturbation/systems/reviewer3.py
+++ b/benchmarks/perturbation/systems/reviewer3.py
@@ -30,6 +30,15 @@ class Reviewer3System(System):
 
     def build_jobs(self, units, cfg, results_dir):
         domain = results_dir.name
+        overrides = {
+            k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s")
+            if k in cfg and cfg[k] is not None
+        }
+        # We compile the FULL (pre-truncation) source for R3 — the token-cut
+        # staged file frequently isn't valid LaTeX (chops mid-environment).
+        # `max_pages` then trims the rendered PDF so R3 still sees roughly the
+        # same content window as coarse (which uses max_pages: 20).
+        max_pages = cfg.get("max_pages")
         out: list[tuple[CellKey, ReviewJob]] = []
         for u in units:
             if not u.staged_corrupted.exists():
@@ -42,10 +51,21 @@ def build_jobs(self, units, cfg, results_dir):
                     continue
                 out_json.unlink()
             tag = f"{domain}/{u.paper_label}/{u.error_type}/{REVIEWER3_SLUG}"
+            # Persist the R3 sessionId next to the review JSON. If a prior
+            # run was killed mid-poll, the next run resumes that session
+            # instead of creating a duplicate (~34% credit waste observed
+            # without this).
+            sid_file = review_dir / f"{u.staged_corrupted.stem}.sid"
             job = ReviewJob(
                 tag=tag, out_json=out_json, review_dir=review_dir,
                 paper_label=f"{u.error_type}/{u.paper_label}",
-                payload={"paper": u.staged_corrupted},
+                payload={
+                    "paper": u.staged_corrupted,
+                    "source": u.src_corrupted,
+                    "max_pages": max_pages,
+                    "sid_file": sid_file,
+                    "overrides": overrides,
+                },
             )
             out.append(((REVIEWER3_SLUG,), job))
         return out
@@ -54,12 +74,16 @@ def run_jobs(self, cell_key, jobs, parallel):
         if not jobs:
             return []
         cfg = reviewer3_adapter.config_from_env()
-        # cfg overrides come from the caller via run_jobs_with_cfg; default cfg is fine here.
+        for k, v in jobs[0].payload.get("overrides", {}).items():
+            setattr(cfg, k, v)
         adapter_jobs = [
             reviewer3_adapter.Reviewer3Job(
                 paper=j.payload["paper"],
                 out_json=j.out_json,
                 paper_label=j.tag,
+                source=j.payload.get("source"),
+                max_pages=j.payload.get("max_pages"),
+                sid_file=j.payload.get("sid_file"),
             )
             for j in jobs
         ]
diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py
index c2f7408..552d911 100644
--- a/benchmarks/perturbation/systems/reviewer3_adapter.py
+++ b/benchmarks/perturbation/systems/reviewer3_adapter.py
@@ -90,10 +90,23 @@ def config_from_env() -> Reviewer3Config:
 
 @dataclass
 class Reviewer3Job:
-    paper: Path           # path to *_corrupted.md
-    out_json: Path        # where to write the pipeline-shaped JSON
-    paper_label: str      # e.g. "<error_type>/<paper_label>"
+    paper: Path                 # path to *_corrupted.md (staged, possibly token-truncated)
+    out_json: Path              # where to write the pipeline-shaped JSON
+    paper_label: str            # e.g. "<error_type>/<paper_label>"
     title: str | None = None
+    # Optional: full pre-truncation source. When provided, _ensure_pdf
+    # compiles THIS instead of `paper` — the staged file is often invalid
+    # LaTeX because token truncation can chop mid-environment.
+    source: Path | None = None
+    # Optional: trim the rendered PDF to its first N pages so R3 still sees
+    # roughly the same window other systems do (coarse uses max_pages: 20).
+    max_pages: int | None = None
+    # Optional: where to persist the R3 sessionId. When set, _run_one writes
+    # the sid here right after submit succeeds and reads it back on subsequent
+    # runs to resume the same R3 session instead of creating a duplicate.
+    # This prevents the duplicate-session credit waste we observed when we
+    # killed runs mid-poll (R3 keeps the session live and we'd submit fresh).
+    sid_file: Path | None = None
 
 
 @dataclass
@@ -114,8 +127,280 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]:
     return {"x-api-key": cfg.api_key}
 
 
-def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str:
-    """POST /api/internal/review (multipart). Returns sessionId."""
+def _ensure_pdf(paper: Path, *, source: Path | None = None,
+                max_pages: int | None = None) -> Path:
+    """Reviewer 3 only accepts PDF. Return a compiled+possibly-trimmed PDF.
+
+    Resolution order for the source bytes:
+      1. If `paper` is already a `.pdf`, return it as-is (page trim still applies).
+      2. If `source` was provided, compile that — preferred for LaTeX-as-md
+         since the staged `paper` is often invalid LaTeX (token truncation
+         chops mid-environment).
+      3. Else compile `paper` directly. If it lacks `\\end{document}`, append
+         one as a best-effort close.
+
+    When `max_pages` is set, the resulting PDF is trimmed to its first N
+    pages via pymupdf. This matches what coarse does (max_pages: 20) so R3
+    sees roughly the same window other systems see.
+    """
+    if paper.suffix.lower() == ".pdf":
+        return _maybe_trim_pages(paper, max_pages)
+
+    src_for_compile = source if (source is not None and source.exists()) else paper
+    cached_suffix = ".trim.pdf" if max_pages else ".pdf"
+    cached = src_for_compile.with_suffix(cached_suffix)
+    src_mtime = src_for_compile.stat().st_mtime
+    if cached.exists() and cached.stat().st_mtime > src_mtime:
+        return cached
+
+    head = src_for_compile.read_text(errors="replace")[:2000]
+    if "\\documentclass" not in head:
+        raise RuntimeError(
+            f"don't know how to convert {src_for_compile.name} to PDF "
+            "(no \\documentclass found; expected LaTeX-as-md or PDF)"
+        )
+    text = src_for_compile.read_text(errors="replace")
+    if "\\end{document}" not in text:
+        # `source` should always close cleanly; this only fires if we fell back
+        # to compiling the token-truncated `paper`.
+        text = text.rstrip() + "\n\n\\end{document}\n"
+    # Rewrite \documentclass{<custom>} -> \documentclass{amsart} when the
+    # custom class isn't installed on the local TeX Live. Without this many
+    # journal-class papers (aq-amsart, sn-jnl, atlasdoc, aastex*, cas-sc, ...)
+    # bail at line 1 with "File `X.cls' not found".
+    text = _force_known_documentclass(text)
+    # Same idea for missing \usepackage{X.sty} (e.g. jinstpub on some hep-ex
+    # papers). Comment them out — pdflatex aborts hard on missing packages.
+    text = _strip_missing_packages(text)
+    # Strip orphan \input / \include — the perturbation corpus dumps each paper
+    # into a single .md but a few preserve `\input{mypreamble.tex}`-style
+    # directives that pdflatex can't resolve (fatal error, no PDF produced).
+    text = _strip_orphan_includes(text, src_for_compile.parent)
+    # Stripped-include papers (and some others) rely on author-defined shortcuts
+    # like \bbC, \calA, \vvirg, \ootimes from the missing preamble. Inject
+    # \providecommand fallbacks so the body compiles.
+    text = _inject_rescue_preamble(text)
+
+    import shutil, subprocess, tempfile
+    with tempfile.TemporaryDirectory() as td:
+        tex = Path(td) / "source.tex"
+        tex.write_text(text)
+        # Run twice to resolve cross-refs; ignore exit code, accept partial PDF.
+        # Capture output as bytes — pdflatex's own log can contain non-UTF-8
+        # accent bytes and we don't read this output anyway (the .log file is
+        # the source of truth on failure).
+        for _ in range(2):
+            subprocess.run(
+                ["pdflatex", "-interaction=nonstopmode", "source.tex"],
+                cwd=td, capture_output=True, timeout=300,
+            )
+        out_pdf = Path(td) / "source.pdf"
+        if not out_pdf.exists() or out_pdf.stat().st_size < 1000:
+            log_path = Path(td) / "source.log"
+            log = log_path.read_text(errors="replace") if log_path.exists() else ""
+            raise RuntimeError(
+                f"pdflatex produced no usable PDF for {src_for_compile}: {log[-1500:]}"
+            )
+        if max_pages:
+            _trim_pages_to(out_pdf, max_pages)
+        shutil.copy(out_pdf, cached)
+    return cached
+
+
+_INPUT_RE = __import__("re").compile(
+    r"\\(?:input|include)\s*\{([^}]+)\}", flags=__import__("re").IGNORECASE,
+)
+
+
+def _strip_orphan_includes(text: str, base_dir: Path) -> str:
+    """Comment out `\\input{path}` / `\\include{path}` whose target file isn't
+    next to the source. pdflatex aborts hard on a missing \\input, which kills
+    the compile even when the rest of the document is fine."""
+    def _replace(m):
+        target = m.group(1).strip()
+        # Common LaTeX convention: optional .tex extension.
+        for cand in (target, target + ".tex"):
+            if (base_dir / cand).exists():
+                return m.group(0)
+        return "% [stripped missing include] " + m.group(0)
+    return _INPUT_RE.sub(_replace, text)
+
+
+# Defensive preamble injected after \documentclass to provide fallbacks for
+# common custom-command patterns that authors typically define in private
+# preamble files (e.g. mypreamble.tex). \providecommand is a no-op when the
+# command is already defined, so this is safe to inject blindly.
+_RESCUE_PREAMBLE = r"""
+% --- injected by reviewer3_adapter: providecommand fallbacks ---
+% Run at \begin{document} so we don't fight with packages that define the
+% same shortcuts (e.g. amsfonts -> \Bbbk). \providecommand is itself a no-op
+% when the command already exists, but only at the moment it runs; AtBeginDoc
+% defers the check past all \usepackage{...} loads.
+\AtBeginDocument{%
+  \providecommand{\bb}[1]{\mathbb{#1}}%
+  \providecommand{\cal}[1]{\mathcal{#1}}%
+  \providecommand{\bff}[1]{\mathbf{#1}}%
+  \makeatletter
+  \@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
+    \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}%
+    \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}%
+  }%
+  \@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
+    \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}%
+  }%
+  \makeatother
+  \providecommand{\eps}{\epsilon}%
+  \providecommand{\veps}{\varepsilon}%
+  \providecommand{\vvirg}{,\,}%
+  \providecommand{\ootimes}{\otimes}%
+}
+% --- end injected preamble ---
+"""
+
+
+def _inject_rescue_preamble(text: str) -> str:
+    """Insert _RESCUE_PREAMBLE right after the first \\documentclass{...} line.
+    Idempotent: looks for our marker before injecting."""
+    if "injected by reviewer3_adapter" in text:
+        return text
+    import re
+    m = re.search(r"\\documentclass(\[[^\]]*\])?\{[^}]+\}", text)
+    if not m:
+        return text
+    cut = m.end()
+    return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:]
+
+
+# Bundled-with-TeX-Live classes are kept as-is. Anything else gets rewritten
+# to `\documentclass{amsart}` so pdflatex doesn't bail at line 1 with
+# "File `<custom>.cls' not found". Class-specific commands in the body then
+# emit undefined-control-sequence warnings, but pdflatex in nonstopmode still
+# produces a usable PDF.
+_KNOWN_CLASSES_CACHE: dict[str, bool] = {}
+
+
+def _class_is_installed(cls: str) -> bool:
+    if cls in _KNOWN_CLASSES_CACHE:
+        return _KNOWN_CLASSES_CACHE[cls]
+    import subprocess
+    try:
+        r = subprocess.run(["kpsewhich", f"{cls}.cls"],
+                           capture_output=True, timeout=5)
+        ok = (r.returncode == 0 and r.stdout.strip() != b"")
+    except Exception:
+        ok = False
+    _KNOWN_CLASSES_CACHE[cls] = ok
+    return ok
+
+
+def _force_known_documentclass(text: str) -> str:
+    """Rewrite `\\documentclass[opts]{<custom>}` to `\\documentclass{amsart}`
+    when <custom> isn't bundled with the local TeX Live. Drops options too —
+    they're class-specific and frequently invalid against amsart. amsart is
+    the math-friendly fallback (preserves theorem/lemma environments).
+
+    Matches only the first **uncommented** \\documentclass — many papers carry
+    example/commented-out variants before the active one.
+    """
+    import re
+    pat = re.compile(r"^(?P<lead>[^%\n]*?)\\documentclass(\[[^\]]*\])?\{([^}]+)\}",
+                     re.MULTILINE)
+    for m in pat.finditer(text):
+        # If there's a `%` before the `\documentclass` on this line, it's commented.
+        if "%" in m.group("lead"):
+            continue
+        cls = m.group(3).strip()
+        if _class_is_installed(cls):
+            return text
+        # Replace just the `\documentclass...{...}` span, keeping any leading content
+        # (it's empty in practice but be safe).
+        start = m.start() + len(m.group("lead"))
+        return text[:start] + r"\documentclass{amsart}" + text[m.end():]
+    return text
+
+
+# Same pattern for missing packages — pdflatex aborts hard on
+# "File `X.sty' not found", so comment out \usepackage{X} (and the bracketed
+# options line) when X isn't installed. Bundled-with-TeX packages stay.
+_KNOWN_PACKAGES_CACHE: dict[str, bool] = {}
+
+
+def _package_is_installed(pkg: str) -> bool:
+    if pkg in _KNOWN_PACKAGES_CACHE:
+        return _KNOWN_PACKAGES_CACHE[pkg]
+    import subprocess
+    try:
+        r = subprocess.run(["kpsewhich", f"{pkg}.sty"],
+                           capture_output=True, timeout=5)
+        ok = (r.returncode == 0 and r.stdout.strip() != b"")
+    except Exception:
+        ok = False
+    _KNOWN_PACKAGES_CACHE[pkg] = ok
+    return ok
+
+
+def _strip_missing_packages(text: str) -> str:
+    """Comment out `\\usepackage[opts]{X}` lines whose .sty isn't installed.
+    Multi-package forms like `\\usepackage{a,b,c}` are split: any missing
+    member causes the whole line to be commented out (rare in practice).
+    """
+    import re
+    def _replace(m):
+        pkgs = [p.strip() for p in m.group(2).split(",")]
+        if all(_package_is_installed(p) for p in pkgs if p):
+            return m.group(0)
+        return "% [stripped missing package] " + m.group(0)
+    return re.sub(r"\\usepackage(\[[^\]]*\])?\{([^}]+)\}", _replace, text)
+
+
+def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path:
+    """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more,
+    return a trimmed sibling cached next to it."""
+    if not max_pages:
+        return pdf
+    import fitz
+    src = fitz.open(pdf)
+    try:
+        if src.page_count <= max_pages:
+            return pdf
+        trimmed = pdf.with_suffix(f".first{max_pages}p.pdf")
+        if trimmed.exists() and trimmed.stat().st_mtime > pdf.stat().st_mtime:
+            return trimmed
+        dst = fitz.open()
+        dst.insert_pdf(src, from_page=0, to_page=max_pages - 1)
+        dst.save(trimmed)
+        dst.close()
+        return trimmed
+    finally:
+        src.close()
+
+
+def _trim_pages_to(pdf: Path, max_pages: int) -> None:
+    """In-place trim of `pdf` to its first `max_pages` pages."""
+    import fitz
+    src = fitz.open(pdf)
+    try:
+        if src.page_count <= max_pages:
+            return
+        dst = fitz.open()
+        dst.insert_pdf(src, from_page=0, to_page=max_pages - 1)
+        tmp = pdf.with_suffix(".pdf.tmp")
+        dst.save(tmp)
+        dst.close()
+    finally:
+        src.close()
+    tmp.replace(pdf)
+
+
+def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None,
+            source: Path | None = None, max_pages: int | None = None) -> str:
+    """POST /api/internal/review (multipart). Returns sessionId.
+
+    `source` and `max_pages` are forwarded to `_ensure_pdf` so callers can opt
+    into compiling the full pre-truncation source and trimming the rendered
+    PDF — see _ensure_pdf docstring.
+    """
+    paper = _ensure_pdf(paper, source=source, max_pages=max_pages)
     url = f"{cfg.base_url}/api/internal/review"
     data: dict[str, str] = {
         "userId": cfg.user_id,
@@ -125,7 +410,7 @@ def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str:
     if title:
         data["title"] = title
     with paper.open("rb") as fh:
-        files = {"file": (paper.name, fh, "text/markdown")}
+        files = {"file": (paper.name, fh, "application/pdf")}
         resp = requests.post(url, headers=_headers(cfg), data=data, files=files,
                              timeout=cfg.request_timeout_s)
     if resp.status_code >= 400:
@@ -180,29 +465,36 @@ def _pick(d: dict, *keys: str, default: str = "") -> str:
     return default
 
 
+# Canonical severity mapping lives in benchmarks/perturbation/_severity.py
+# so every system (coarse, reviewer3, openaireview) and the downstream
+# conference-study analyses share one source of truth.
+import sys as _sys
+from pathlib import Path as _Path
+_sys.path.insert(0, str(_Path(__file__).resolve().parent.parent))
+from _severity import normalize_severity as _normalize_r3_severity  # noqa: E402
+
+
 def _normalize_comment(raw: dict, idx: int) -> dict:
-    """Best-effort mapping from Reviewer 3 comment shape to pipeline schema.
+    """Map a Reviewer 3 comment to the pipeline schema.
 
-    The OpenAPI spec doesn't pin field names. We try common synonyms; anything
-    we don't recognize is preserved on the side as `_raw` for later inspection.
+    Reviewer 3 comments carry: reviewerId, comment, title, citedText, severity, rank.
+    `citedText` is the verbatim excerpt from the paper — what our scorer uses as
+    `quote` for fuzzy/semantic matching.
     """
     cid = _pick(raw, "id", "commentId", "uuid") or f"reviewer3_{idx}"
-    title = _pick(raw, "title", "subject", "heading", "summary")
-    quote = _pick(raw, "quote", "snippet", "excerpt", "passage", "text", "highlight")
-    explanation = _pick(raw, "explanation", "comment", "feedback", "body", "rationale", "message")
+    title = _pick(raw, "title")
+    quote = _pick(raw, "citedText", "quote", "snippet", "excerpt", "passage", "highlight")
+    explanation = _pick(raw, "comment", "explanation", "feedback", "body", "rationale", "message")
+    severity = _normalize_r3_severity("reviewer3", raw.get("severity"))
     if not explanation:
-        # last resort: serialize whatever we have so the comment isn't empty
-        explanation = json.dumps({k: v for k, v in raw.items()
-                                  if k not in ("id", "commentId", "uuid", "title", "subject",
-                                               "heading", "summary", "quote", "snippet",
-                                               "excerpt", "passage", "text", "highlight")},
-                                 ensure_ascii=False)
+        explanation = json.dumps(raw, ensure_ascii=False)
     return {
         "id": cid,
         "title": title,
         "quote": quote,
         "explanation": explanation,
         "comment_type": "technical",
+        "severity": severity,
         "paragraph_index": None,
         "_raw": raw,
     }
@@ -243,25 +535,67 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result:
     job.out_json.parent.mkdir(parents=True, exist_ok=True)
     raw_path = job.out_json.with_suffix(".raw.json")
     tag = f"reviewer3/{job.paper_label}"
-    print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True)
     start = time.time()
     sid = ""
+    sid_file = job.sid_file
+
+    def _write_outputs(body: dict, elapsed: float) -> int:
+        raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False))
+        pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed)
+        job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False))
+        # Successful → sid_file no longer needed for resume, but leave it as
+        # an audit trail (cheap, sometimes useful for tracing back to R3 UI).
+        return len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"])
+
     try:
-        sid = _submit(cfg, job.paper, title=job.title)
+        # Resume path: if a sid_file exists, try to recover the prior session
+        # instead of submitting fresh. This is the dedup-credit-waste fix.
+        if sid_file and sid_file.exists():
+            sid = sid_file.read_text().strip()
+            print(f"[{tag}] resuming sessionId={sid} (from {sid_file.name})",
+                  file=sys.stderr, flush=True)
+            try:
+                body = _poll_until_done(cfg, sid, tag=tag)
+                elapsed = time.time() - start
+                n = _write_outputs(body, elapsed)
+                print(f"[{tag}] done in {elapsed:.0f}s ({n} comments, resumed)",
+                      file=sys.stderr, flush=True)
+                return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed,
+                                       session_id=sid, raw_response=body)
+            except RuntimeError as e:
+                # Session-fetch hard fail (e.g., 403/404 — sid stale/invalid).
+                # Drop the sid_file and fall through to fresh submit.
+                msg = str(e)
+                if "fetch failed" in msg and ("403" in msg or "404" in msg):
+                    print(f"[{tag}] sid {sid} unrecoverable ({msg[:80]}); "
+                          "submitting fresh", file=sys.stderr, flush=True)
+                    sid_file.unlink(missing_ok=True)
+                    sid = ""
+                else:
+                    raise  # other RuntimeErrors (e.g., status=failed) propagate
+
+        # Submit path: no usable sid_file, send a new submission.
+        print(f"[{tag}] starting: {job.paper.name}", file=sys.stderr, flush=True)
+        sid = _submit(cfg, job.paper, title=job.title,
+                      source=job.source, max_pages=job.max_pages)
         print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True)
+        if sid_file:
+            sid_file.parent.mkdir(parents=True, exist_ok=True)
+            sid_file.write_text(sid)
         body = _poll_until_done(cfg, sid, tag=tag)
         elapsed = time.time() - start
-        raw_path.write_text(json.dumps(body, indent=2, ensure_ascii=False))
-        pipeline = build_pipeline_json(job.paper, body, elapsed_s=elapsed)
-        job.out_json.write_text(json.dumps(pipeline, indent=2, ensure_ascii=False))
-        n = len(pipeline["methods"][next(iter(pipeline["methods"]))]["comments"])
-        print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)", file=sys.stderr, flush=True)
+        n = _write_outputs(body, elapsed)
+        print(f"[{tag}] done in {elapsed:.0f}s ({n} comments)",
+              file=sys.stderr, flush=True)
         return Reviewer3Result(job=job, ok=True, elapsed_s=elapsed,
                                session_id=sid, raw_response=body)
     except Exception as e:
         elapsed = time.time() - start
         msg = f"{type(e).__name__}: {e}"
-        print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}", file=sys.stderr, flush=True)
+        print(f"[{tag}] FAILED in {elapsed:.0f}s: {msg}",
+              file=sys.stderr, flush=True)
+        # Keep sid_file on failure — next run will retry the same session
+        # rather than incur a duplicate submission cost.
         return Reviewer3Result(job=job, ok=False, elapsed_s=elapsed,
                                session_id=sid, error=msg)
 
diff --git a/pyproject.toml b/pyproject.toml
index 396295e..2500007 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,4 +34,4 @@ reviewer = ["viz/*.html", "skill/SKILL.md", "skill/scripts/*.py", "skill/referen
 mistral = ["mistral-ocr-cli>=1.2.0"]
 deepseek = ["deepseek-ocr-cli>=0.4.2"]
 dev = ["pytest>=8.0"]
-benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0"]
+benchmarks = ["pyyaml>=6.0", "datasets>=2.0", "rapidfuzz>=3.0", "sentence-transformers>=3.0", "requests>=2.31"]