ChicagoHAI · dangng2004 · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 16, 2026
diff --git a/.env.example b/.env.example
@@ -15,3 +15,9 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here
 
 # Optional: custom OpenAI base URL (e.g. EU endpoint, Azure)
 # OPENAI_BASE_URL=https://eu.api.openai.com/v1
+
+# Reviewer 3 (closed-source HTTP API; used by the benchmarks/ runners only).
+# Required for perturbation (system: reviewer3) and conference_study (competitor: reviewer3).
+# REVIEWER3_API_KEY=sk_...
+# REVIEWER3_USER_ID=<uuid from web UI session JSON; not an email>
+# REVIEWER3_BASE_URL=https://reviewer3.com  # optional override
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.pyc
 *.pyo
 .venv/
+.venv
 venv/
 
 # Jupyter
@@ -22,12 +23,18 @@ venv/
 
 # Run outputs
 review_results/
-benchmarks/conference_study/results/
-benchmarks/perturbation/perturbation_results/
+benchmarks/conference_study/results
+benchmarks/perturbation/perturbation_results
+benchmarks/perturbation/results
+benchmarks/perturbation/data
 
 # conference_study study artifacts (not code)
-benchmarks/conference_study/manifests/
-benchmarks/conference_study/reports/
+benchmarks/conference_study/manifests
+benchmarks/conference_study/reports
 # Symlink targets — trailing-slash patterns wouldn't match the symlinks
 benchmarks/conference_study/analyses/manifests
 benchmarks/conference_study/analyses/results
+benchmarks/conference_study/papers
+
+# Moved out of repo (see commit 6373fad); ignore the leftover local dir.
+benchmarks/experimental_perturbations/
diff --git a/benchmarks/conference_study/analyses/compute_auc.py b/benchmarks/conference_study/analyses/compute_auc.py
@@ -51,17 +51,14 @@
 REPO_ROOT = HERE.parent  # benchmarks/conference_study/
 RESULTS_ROOT = REPO_ROOT / "results"
 
-COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
-SEVERITY_TIERS = ("major", "moderate", "minor")
-
-
-def normalize_severity(method: str, raw: str | None) -> str | None:
-    if not raw:
-        return None
-    raw = raw.lower()
-    if method == "coarse":
-        return COARSE_SEVERITY_MAP.get(raw)
-    return raw if raw in SEVERITY_TIERS else None
+# Severity normalization lives in benchmarks/perturbation/_severity.py so the
+# perturbation adapters and these analyses use one source of truth.
+sys.path.insert(0, str(HERE.parents[1] / "perturbation"))
+from _severity import (  # noqa: E402
+    COARSE_SEVERITY_MAP,
+    TIERS as SEVERITY_TIERS,
+    normalize_severity,
+)
 
 
 def load_manifest(path: Path) -> dict[str, list[dict]]:

diff --git a/benchmarks/conference_study/analyses/report_scaleup.py b/benchmarks/conference_study/analyses/report_scaleup.py
@@ -204,20 +204,14 @@ def comment_metrics_by_method(
     return out
 
 
-# Coarse uses {minor, major, critical}; openaireview methods use
-# {minor, moderate, major}. Normalize so a single set of tiers compares
-# apples-to-apples (highest=major, mid=moderate, low=minor).
-_COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
-SEVERITY_TIERS = ("major", "moderate", "minor")
-
-
-def normalize_severity(method: str, raw: str | None) -> str | None:
-    if not raw:
-        return None
-    raw = raw.lower()
-    if method == "coarse":
-        return _COARSE_SEVERITY_MAP.get(raw)
-    return raw if raw in SEVERITY_TIERS else None
+# Severity normalization lives in benchmarks/perturbation/_severity.py so the
+# perturbation adapters and these analyses use one source of truth.
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "perturbation"))
+from _severity import (  # noqa: E402
+    COARSE_SEVERITY_MAP as _COARSE_SEVERITY_MAP,
+    TIERS as SEVERITY_TIERS,
+    normalize_severity,
+)
 
 
 def severity_counts_by_method(

diff --git a/benchmarks/conference_study/competitors/registry.py b/benchmarks/conference_study/competitors/registry.py
@@ -9,9 +9,11 @@
 
 from .base import CompetitorAdapter
 from .coarse_adapter import CoarseAdapter
+from .reviewer3_adapter import Reviewer3Adapter
 
 _REGISTRY: dict[str, type[CompetitorAdapter]] = {
     "coarse": CoarseAdapter,
+    "reviewer3": Reviewer3Adapter,
 }
 
 

diff --git a/benchmarks/conference_study/competitors/reviewer3_adapter.py b/benchmarks/conference_study/competitors/reviewer3_adapter.py
@@ -0,0 +1,118 @@
+"""Adapter for Reviewer 3 (closed-source HTTP API).
+
+Submission flow is the same as the perturbation benchmark — POST a PDF to
+`/api/internal/review`, poll the session until the `status` enum is terminal,
+then map each comment via `_normalize_comment`. We reuse those helpers from
+the perturbation adapter (`benchmarks/perturbation/systems/reviewer3_adapter.py`)
+rather than duplicating the HTTP code; the only difference here is that
+conference inputs arrive as PDFs already, so the LaTeX-as-md → PDF compile
+step (`_ensure_pdf`) is unnecessary.
+
+Reviewer 3 has no model selector, so `method_key(...)` always returns
+`"reviewer3__reviewer3"` regardless of the manifest `model` value. The
+conference YAML should pin `models: [reviewer3]` to avoid duplicate
+submissions across a phantom model loop.
+
+Required env:
+    REVIEWER3_API_KEY    sk_... (sent as `x-api-key` header)
+    REVIEWER3_USER_ID    UUID from the vendor's web UI session JSON (not an email)
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+from .base import CompetitorAdapter, NormalizedComment, NormalizedReview
+
+# Reuse the perturbation adapter's HTTP + normalization helpers.
+_PERT = Path(__file__).resolve().parents[2] / "perturbation" / "systems"
+sys.path.insert(0, str(_PERT))
+import reviewer3_adapter as _r3  # noqa: E402
+
+
+_METHOD_KEY = f"{_r3.REVIEWER3_SLUG}__{_r3.REVIEWER3_SLUG}"
+
+
+class Reviewer3Adapter(CompetitorAdapter):
+    name = "reviewer3"
+    required_env = ("REVIEWER3_API_KEY", "REVIEWER3_USER_ID")
+
+    def method_key(self, model: str) -> str:
+        # R3 has no model selector — fixed key regardless of `model`.
+        return _METHOD_KEY
+
+    def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview:
+        opts = cfg.get("reviewer3_options", {}) or {}  # `or {}` tolerates a null section
+        rcfg = _r3.config_from_env()
+        for k in ("review_mode", "poll_interval_s", "poll_timeout_s",
+                  "request_timeout_s", "base_url"):
+            if k in opts and opts[k] is not None:  # absent/null keeps rcfg's value
+                setattr(rcfg, k, opts[k])
+
+        # Cap PDF size sent to R3. `max_pages` lives at the top level of the
+        # config (run_competitors.py uses it for parse_document); we honor the
+        # same value here so the bytes shipped to R3 match the paragraph window
+        # we already cap on our side. Untrimmed full PDFs were tripping R3's
+        # HTTP 413 limit and inflating per-paper wall time.
+        max_pages = cfg.get("max_pages") or opts.get("max_pages")
+
+        # sid_file is injected by run_competitors.py just before invocation:
+        # `cfg["_sid_file"] = RESULTS_DIR / ".sids" / "<slug>.<method_key>.sid"`.
+        # If present and the file already exists, we resume that R3 session
+        # instead of submitting fresh — avoids duplicate-session credit waste
+        # when a prior run was killed mid-poll. (See PR notes; ~34% of credits
+        # observed wasted on duplicates before this fix.)
+        sid_file = cfg.get("_sid_file")
+        if isinstance(sid_file, str):
+            sid_file = Path(sid_file)
+        session_id = ""
+        body = None
+        if sid_file and sid_file.exists():
+            session_id = sid_file.read_text().strip()
+            try:
+                body = _r3._poll_until_done(rcfg, session_id,
+                                            tag=f"reviewer3/{pdf.stem} (resumed)")
+            except RuntimeError as e:
+                m = str(e)
+                if "fetch failed" in m and ("403" in m or "404" in m):
+                    sid_file.unlink(missing_ok=True)  # saved sid unusable: resubmit below
+                    session_id, body = "", None
+                else:
+                    raise
+
+        if body is None:  # nothing resumed -> fresh submission, then persist its sid
+            session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages)
+            if sid_file:
+                sid_file.parent.mkdir(parents=True, exist_ok=True)
+                sid_file.write_text(session_id)
+            body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")
+
+        comments: list[NormalizedComment] = []
+        for i, raw in enumerate(body.get("comments") or []):
+            if not isinstance(raw, dict):
+                raw = {"comment": str(raw)}  # wrap non-dict items so .get() below works
+            norm = _r3._normalize_comment(raw, i)
+            comments.append(NormalizedComment(
+                title=norm.get("title", ""),
+                quote=norm.get("quote", ""),
+                explanation=norm.get("explanation", ""),
+                comment_type=norm.get("comment_type", "technical"),
+                extra={
+                    "severity": norm.get("severity"),
+                    "reviewerId": raw.get("reviewerId"),
+                    "rank": raw.get("rank"),
+                    "session_id": session_id,
+                },
+            ))
+
+        # R3 doesn't publish pricing and doesn't return overall_feedback or
+        # token counts in its response, so we leave those empty/None.
+        return NormalizedReview(
+            comments=comments,
+            overall_feedback="",
+            cost_usd=None,
+            cost_method="estimated",
+            prompt_tokens=None,
+            completion_tokens=None,
+            model=_r3.REVIEWER3_SLUG,
+        )
diff --git a/benchmarks/conference_study/configs/reviewer3.yaml b/benchmarks/conference_study/configs/reviewer3.yaml
@@ -0,0 +1,32 @@
+# Reviewer 3 (closed-source HTTP API) run on the v2 conference cohort.
+# Results -> benchmarks/conference_study/results/reviewer3_v2/
+# Log    -> benchmarks/conference_study/results/reviewer3_v2/run_log.jsonl
+#
+# Prerequisites:
+#   - REVIEWER3_API_KEY  and  REVIEWER3_USER_ID  set in .env
+#   - Manifest, papers, and results dirs reachable. In this worktree they are
+#     symlinks into the sibling OpenAIReview worktree (gitignored data lives
+#     only there). Set them up with:
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/papers    papers
+#       ln -s ../../../OpenAIReview/benchmarks/conference_study/results   results
+
+name: reviewer3_v2
+competitor: reviewer3
+
+manifest: manifests/v2/combined.json
+
+# R3 has no model selector. Pin to a single dummy entry so run_competitors.py
+# loops once per paper rather than once per (paper × manifest model).
+models:
+  - reviewer3
+
+timeout_sec: 3600       # outer per-(paper, model) wall cap (R3 is 10-30 min typical)
+max_per_model: 5        
+max_pages: 20           # parse_document cap; matches coarse.yaml convention
+
+# Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...).
+reviewer3_options:
+  review_mode: author      # author | journal (R3 reviewMode enum)
+  poll_interval_s: 30
+  poll_timeout_s: 3600     # 60 min/paper cap
diff --git a/benchmarks/conference_study/run_competitors.py b/benchmarks/conference_study/run_competitors.py
@@ -125,6 +125,14 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
         title, content, _was_ocr = parse_document(pdf, max_pages=MAX_PAGES)
         paragraphs = split_into_paragraphs(content)
 
+        # Inject sid_file location so the adapter can persist/resume the
+        # competitor-side session id (e.g. reviewer3). Adapters that don't
+        # care simply ignore the underscore-prefixed key. The file lives in
+        # RESULTS_DIR/.sids/, keyed by slug and method_key, so it survives across runs.
+        out_file = RESULTS_DIR / f"{paper['slug']}.json"
+        sid_dir = RESULTS_DIR / ".sids"
+        cfg = {**cfg, "_sid_file": sid_dir / f"{paper['slug']}.{method_key}.sid"}
+
         review = adapter.review(pdf, model, cfg)
 
         method_data = build_method_data(
@@ -134,7 +142,6 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
             paragraphs=paragraphs,
         )
 
-        out_file = RESULTS_DIR / f"{paper['slug']}.json"
         merge_into_paper_json(
             out_file=out_file,
             slug=paper["slug"],

diff --git a/benchmarks/perturbation/.gitignore b/benchmarks/perturbation/.gitignore
@@ -8,8 +8,19 @@ papers/
 # Run logs
 reports/*.log
 
-# Temporary / ephemeral configs
+# Temporary / ephemeral configs.
+# Underscore-prefixed files are scratch by convention. The other rules below
+# cover the bulk per-domain configs that we generate locally (one per system ×
+# domain) but don't check in. The `!configs/full_*_reviewer3.yaml` exception
+# keeps the canonical reviewer3 configs tracked.
 configs/_*
+configs/cs_*scaleup*.yaml
+configs/full_*.yaml
+configs/grok_*.yaml
+configs/longtail_*.yaml
+configs/subset_*.yaml
+configs/r3_smoke*.yaml
+!configs/full_*_reviewer3.yaml
 
 # Python
 __pycache__/

diff --git a/benchmarks/perturbation/_severity.py b/benchmarks/perturbation/_severity.py
@@ -0,0 +1,85 @@
+"""Canonical severity tiers and per-system normalization.
+
+The perturbation benchmark, the conference study analyses, and the viz layer
+all want to compare comment severities across review systems. Each system uses
+its own native vocabulary, so before any cross-system comparison the raw value
+must be mapped to the canonical 3-tier scale used by openaireview itself:
+
+    major     - Undermines a key claim/methodology; affects conclusions.
+    moderate  - Real error or gap that is localized and fixable.
+    minor     - Framing concern, mild overclaim, or resolvable ambiguity.
+
+Per-system maps:
+
+  * openaireview: identity. Output is already in {major, moderate, minor}.
+  * coarse:       {critical, major, minor} -> {major, moderate, minor}
+                  (shift down one tier; same mapping that the conference-study
+                  scripts in benchmarks/conference_study/analyses/ use).
+  * reviewer3:    integer 1..4 per their OpenAPI spec, where
+                  1=Critical, 2=Major, 3=Minor, 4=Editorial.
+                  Compressed to the 3-tier scale by collapsing R3 Minor and
+                  Editorial into `minor`, since in practice R3 tags substantive
+                  -but-lower-importance findings as Editorial rather than style
+                  notes. Confirm with the vendor if the label is later clarified.
+
+The conference_study analyses (see
+`benchmarks/conference_study/analyses/compute_auc.py` and `report_scaleup.py`)
+import `COARSE_SEVERITY_MAP`, `TIERS`, and `normalize_severity` from this
+module instead of keeping their own copies; change the mappings only here.
+"""
+
+from __future__ import annotations
+
+
+TIERS: tuple[str, ...] = ("major", "moderate", "minor")
+
+
+# openaireview methods emit canonical tier strings directly.
+OPENAIREVIEW_SEVERITY_MAP: dict[str, str] = {t: t for t in TIERS}
+
+# coarse uses {minor, major, critical}. Shift down one level.
+COARSE_SEVERITY_MAP: dict[str, str] = {
+    "critical": "major",
+    "major": "moderate",
+    "minor": "minor",
+}
+
+# Reviewer 3 spec: 1=Critical, 2=Major, 3=Minor, 4=Editorial.
+# Compress to 3 tiers; Editorial collapses with Minor (see module docstring).
+REVIEWER3_SEVERITY_MAP: dict[int, str] = {
+    1: "major",
+    2: "moderate",
+    3: "minor",
+    4: "minor",
+}
+
+
+def normalize_severity(system: str, raw: object) -> str | None:
+    """Map a system-native severity value to the canonical 3-tier scale.
+
+    Returns None for unrecognized values (including bools and non-integral or
+    non-finite numbers) so callers can drop the comment, default it, or warn.
+    `system` is the registry key matching `benchmarks/perturbation/systems/`:
+    'openaireview', 'coarse', or 'reviewer3'.
+    """
+    # bool subclasses int, so True would otherwise hit key 1 and become "major".
+    if raw is None or isinstance(raw, bool):
+        return None
+    sysn = system.lower()
+    if sysn == "reviewer3":
+        # Don't truncate 2.7 into a tier or let int(inf) raise OverflowError.
+        if isinstance(raw, float) and not raw.is_integer():
+            return None
+        try:  # native form is int 1..4; str form tolerated for test fixtures
+            return REVIEWER3_SEVERITY_MAP.get(int(raw))
+        except (TypeError, ValueError, OverflowError):
+            return None
+    if not isinstance(raw, str):
+        return None
+    s = raw.lower()
+    if sysn == "coarse":
+        return COARSE_SEVERITY_MAP.get(s)
+    if sysn == "openaireview":
+        return OPENAIREVIEW_SEVERITY_MAP.get(s)
+    # unknown system -> pass through if already canonical
+    return s if s in TIERS else None