Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@ OPENROUTER_API_KEY=your_openrouter_api_key_here

# Optional: custom OpenAI base URL (e.g. EU endpoint, Azure)
# OPENAI_BASE_URL=https://eu.api.openai.com/v1

# Reviewer 3 (closed-source HTTP API; benchmarks/perturbation only).
# Required when running the perturbation benchmark with system: reviewer3.
# REVIEWER3_API_KEY=sk_...
# REVIEWER3_USER_ID=<uuid from web UI session JSON; not an email>
# REVIEWER3_BASE_URL=https://reviewer3.com # optional override
15 changes: 11 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ __pycache__/
*.pyc
*.pyo
.venv/
.venv
venv/

# Jupyter
Expand All @@ -22,12 +23,18 @@ venv/

# Run outputs
review_results/
benchmarks/conference_study/results/
benchmarks/perturbation/perturbation_results/
benchmarks/conference_study/results
benchmarks/perturbation/perturbation_results
benchmarks/perturbation/results
benchmarks/perturbation/data

# conference_study study artifacts (not code)
benchmarks/conference_study/manifests/
benchmarks/conference_study/reports/
benchmarks/conference_study/manifests
benchmarks/conference_study/reports
# Symlink targets — trailing-slash patterns wouldn't match the symlinks
benchmarks/conference_study/analyses/manifests
benchmarks/conference_study/analyses/results
benchmarks/conference_study/papers

# Moved out of repo (see commit 6373fad); ignore the leftover local dir.
benchmarks/experimental_perturbations/
19 changes: 8 additions & 11 deletions benchmarks/conference_study/analyses/compute_auc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,14 @@
REPO_ROOT = HERE.parent # benchmarks/conference_study/
RESULTS_ROOT = REPO_ROOT / "results"

# coarse reports {critical, major, minor}; every other method already reports
# the canonical {major, moderate, minor}. Coarse is shifted down one tier so
# all methods share one scale.
COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
SEVERITY_TIERS = ("major", "moderate", "minor")


def normalize_severity(method: str, raw: str | None) -> str | None:
    """Map a method-native severity label onto the canonical tiers.

    Args:
        method: Review method name. Only the exact value "coarse" triggers the
            coarse remapping; any other value is treated as already canonical.
        raw: Severity label as emitted by the method, in any letter case.

    Returns:
        One of ``SEVERITY_TIERS``, or None when `raw` is missing, empty, not a
        string, or not a recognized label for the method.
    """
    # Non-string values (e.g. an integer severity) have no label to look up;
    # return None instead of failing on `.lower()`.
    if not raw or not isinstance(raw, str):
        return None
    label = raw.lower()
    if method == "coarse":
        return COARSE_SEVERITY_MAP.get(label)
    return label if label in SEVERITY_TIERS else None
# Severity normalization lives in benchmarks/perturbation/_severity.py so the
# perturbation adapters and these analyses use one source of truth.
sys.path.insert(0, str(HERE.parents[1] / "perturbation"))
from _severity import ( # noqa: E402
COARSE_SEVERITY_MAP,
TIERS as SEVERITY_TIERS,
normalize_severity,
)


def load_manifest(path: Path) -> dict[str, list[dict]]:
Expand Down
22 changes: 8 additions & 14 deletions benchmarks/conference_study/analyses/report_scaleup.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,20 +204,14 @@ def comment_metrics_by_method(
return out


# Coarse uses {minor, major, critical}; openaireview methods use
# {minor, moderate, major}. Normalize so a single set of tiers compares
# apples-to-apples (highest=major, mid=moderate, low=minor).
_COARSE_SEVERITY_MAP = {"critical": "major", "major": "moderate", "minor": "minor"}
SEVERITY_TIERS = ("major", "moderate", "minor")


def normalize_severity(method: str, raw: str | None) -> str | None:
    """Map a method-native severity label onto the canonical tiers.

    Args:
        method: Review method name. Only the exact value "coarse" triggers the
            coarse remapping; any other value is treated as already canonical.
        raw: Severity label as emitted by the method, in any letter case.

    Returns:
        One of ``SEVERITY_TIERS``, or None when `raw` is missing, empty, not a
        string, or not a recognized label for the method.
    """
    # Non-string values (e.g. an integer severity) have no label to look up;
    # return None instead of failing on `.lower()`.
    if not raw or not isinstance(raw, str):
        return None
    label = raw.lower()
    if method == "coarse":
        return _COARSE_SEVERITY_MAP.get(label)
    return label if label in SEVERITY_TIERS else None
# Severity normalization lives in benchmarks/perturbation/_severity.py so the
# perturbation adapters and these analyses use one source of truth.
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "perturbation"))
from _severity import ( # noqa: E402
COARSE_SEVERITY_MAP as _COARSE_SEVERITY_MAP,
TIERS as SEVERITY_TIERS,
normalize_severity,
)


def severity_counts_by_method(
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/conference_study/competitors/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@

from .base import CompetitorAdapter
from .coarse_adapter import CoarseAdapter
from .reviewer3_adapter import Reviewer3Adapter

# Lookup table from competitor name to the adapter class that runs it.
# NOTE(review): keys presumably match the `competitor:` value in the study
# YAML configs — confirm against the lookup code.
_REGISTRY: dict[str, type[CompetitorAdapter]] = {
"coarse": CoarseAdapter,
"reviewer3": Reviewer3Adapter,
}


Expand Down
118 changes: 118 additions & 0 deletions benchmarks/conference_study/competitors/reviewer3_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""Adapter for Reviewer 3 (closed-source HTTP API).

Submission flow is the same as the perturbation benchmark — POST a PDF to
`/api/internal/review`, poll the session until the `status` enum is terminal,
then map each comment via `_normalize_comment`. We reuse those helpers from
the perturbation adapter (`benchmarks/perturbation/systems/reviewer3_adapter.py`)
rather than duplicating the HTTP code; the only difference here is that
conference inputs arrive as PDFs already, so the LaTeX-as-md → PDF compile
step (`_ensure_pdf`) is unnecessary.

Reviewer 3 has no model selector, so `method_key(...)` always returns
`"reviewer3__reviewer3"` regardless of the manifest `model` value. The
conference YAML should pin `models: [reviewer3]` to avoid duplicate
submissions across a phantom model loop.

Required env:
REVIEWER3_API_KEY sk_... (sent as `x-api-key` header)
REVIEWER3_USER_ID UUID from the vendor's web UI session JSON (not an email)
"""
from __future__ import annotations

import sys
from pathlib import Path

from .base import CompetitorAdapter, NormalizedComment, NormalizedReview

# Reuse the perturbation adapter's HTTP + normalization helpers.
_PERT = Path(__file__).resolve().parents[2] / "perturbation" / "systems"
sys.path.insert(0, str(_PERT))
import reviewer3_adapter as _r3 # noqa: E402


_METHOD_KEY = f"{_r3.REVIEWER3_SLUG}__{_r3.REVIEWER3_SLUG}"


class Reviewer3Adapter(CompetitorAdapter):
    """Conference-study adapter that reviews a PDF through Reviewer 3.

    Submission, polling, and comment normalization are delegated to the
    perturbation adapter module imported as `_r3`; this class applies config
    overrides, resumes a persisted session when one exists, and maps the
    response onto `NormalizedReview`.
    """

    name = "reviewer3"
    required_env = ("REVIEWER3_API_KEY", "REVIEWER3_USER_ID")

    def method_key(self, model: str) -> str:
        """Return the fixed method key; `model` is ignored."""
        # R3 has no model selector — fixed key regardless of `model`.
        return _METHOD_KEY

    def review(self, pdf: Path, model: str, cfg: dict) -> NormalizedReview:
        """Run, or resume, one Reviewer 3 review of `pdf`.

        Args:
            pdf: Path of the PDF to submit; its stem is used as the title
                and in the polling tag.
            model: Unused (R3 has no model selector).
            cfg: Run config. Reads `reviewer3_options` (per-adapter
                overrides), `max_pages`, and the optional `_sid_file` path
                used to persist the R3 session id between runs.

        Returns:
            A `NormalizedReview` with one `NormalizedComment` per R3 comment.
            Cost, token counts, and overall feedback are left empty/None.

        Raises:
            RuntimeError: Re-raised from polling a resumed session unless
                the message marks a 403/404 "fetch failed", in which case the
                stale session file is removed and a fresh review is submitted.
        """
        opts = cfg.get("reviewer3_options", {}) or {}
        rcfg = _r3.config_from_env()
        for k in ("review_mode", "poll_interval_s", "poll_timeout_s",
                  "request_timeout_s", "base_url"):
            if k in opts and opts[k] is not None:
                setattr(rcfg, k, opts[k])

        # Cap PDF size sent to R3. `max_pages` lives at the top level of the
        # config (run_competitors.py uses it for parse_document); we honor the
        # same value here so the bytes shipped to R3 match the paragraph window
        # we already cap on our side. Untrimmed full PDFs were tripping R3's
        # HTTP 413 limit and inflating per-paper wall time.
        max_pages = cfg.get("max_pages") or opts.get("max_pages")

        # sid_file is injected by run_competitors.py just before invocation.
        # If present and the file already exists, we resume that R3 session
        # instead of submitting fresh — avoids duplicate-session credit waste
        # when a prior run was killed mid-poll. (See PR notes; ~34% of credits
        # observed wasted on duplicates before this fix.)
        sid_file = cfg.get("_sid_file")
        if isinstance(sid_file, str):
            sid_file = Path(sid_file)
        session_id = ""
        body = None
        if sid_file and sid_file.exists():
            session_id = sid_file.read_text().strip()
        # An empty or whitespace-only sid file carries no session to resume;
        # skip straight to a fresh submission rather than polling "".
        if session_id:
            try:
                body = _r3._poll_until_done(
                    rcfg, session_id, tag=f"reviewer3/{pdf.stem} (resumed)")
            except RuntimeError as e:
                m = str(e)
                if "fetch failed" in m and ("403" in m or "404" in m):
                    # The stored session is gone or forbidden: forget it.
                    sid_file.unlink(missing_ok=True)
                    session_id, body = "", None
                else:
                    raise

        if body is None:
            session_id = _r3._submit(rcfg, pdf, title=pdf.stem, max_pages=max_pages)
            if sid_file:
                # Persist the id before polling so a killed run can resume.
                sid_file.parent.mkdir(parents=True, exist_ok=True)
                sid_file.write_text(session_id)
            body = _r3._poll_until_done(rcfg, session_id, tag=f"reviewer3/{pdf.stem}")

        comments: list[NormalizedComment] = []
        for i, raw in enumerate(body.get("comments") or []):
            if not isinstance(raw, dict):
                # Wrap non-dict entries so the normalizer always gets a dict.
                raw = {"comment": str(raw)}
            norm = _r3._normalize_comment(raw, i)
            comments.append(NormalizedComment(
                title=norm.get("title", ""),
                quote=norm.get("quote", ""),
                explanation=norm.get("explanation", ""),
                comment_type=norm.get("comment_type", "technical"),
                extra={
                    "severity": norm.get("severity"),
                    "reviewerId": raw.get("reviewerId"),
                    "rank": raw.get("rank"),
                    "session_id": session_id,
                },
            ))

        # R3 doesn't publish pricing and doesn't return overall_feedback or
        # token counts in its response, so we leave those empty/None.
        return NormalizedReview(
            comments=comments,
            overall_feedback="",
            cost_usd=None,
            cost_method="estimated",
            prompt_tokens=None,
            completion_tokens=None,
            model=_r3.REVIEWER3_SLUG,
        )
32 changes: 32 additions & 0 deletions benchmarks/conference_study/configs/reviewer3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Reviewer 3 (closed-source HTTP API) run on the v2 conference cohort.
# Results -> benchmarks/conference_study/results/reviewer3_v2/
# Log -> benchmarks/conference_study/results/reviewer3_v2/run_log.jsonl
#
# Prerequisites:
# - REVIEWER3_API_KEY and REVIEWER3_USER_ID set in .env
# - Manifest, papers, and results dirs reachable. In this worktree they are
# symlinks into the sibling OpenAIReview worktree (gitignored data lives
# only there). Set them up with:
# ln -s ../../../OpenAIReview/benchmarks/conference_study/manifests manifests
# ln -s ../../../OpenAIReview/benchmarks/conference_study/papers papers
# ln -s ../../../OpenAIReview/benchmarks/conference_study/results results

name: reviewer3_v2
competitor: reviewer3

manifest: manifests/v2/combined.json

# R3 has no model selector. Pin to a single dummy entry so run_competitors.py
# loops once per paper rather than once per (paper × manifest model).
models:
- reviewer3

timeout_sec: 3600 # outer per-(paper, model) wall cap (R3 is 10-30 min typical)
max_per_model: 5
max_pages: 20 # parse_document cap; matches coarse.yaml convention

# Adapter-specific options forwarded to Reviewer3Adapter.review(cfg=...).
reviewer3_options:
review_mode: author # author | journal (R3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 3600 # 60 min/paper cap
9 changes: 8 additions & 1 deletion benchmarks/conference_study/run_competitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
title, content, _was_ocr = parse_document(pdf, max_pages=MAX_PAGES)
paragraphs = split_into_paragraphs(content)

# Inject sid_file location so the adapter can persist/resume the
# competitor-side session id (e.g. reviewer3). Adapters that don't
# care simply ignore the underscore-prefixed key. The file lives
# next to the merged paper JSON so it survives across runs.
out_file = RESULTS_DIR / f"{paper['slug']}.json"
sid_dir = RESULTS_DIR / ".sids"
cfg = {**cfg, "_sid_file": sid_dir / f"{paper['slug']}.{method_key}.sid"}

review = adapter.review(pdf, model, cfg)

method_data = build_method_data(
Expand All @@ -134,7 +142,6 @@ def run_one(paper: dict, model: str, adapter, cfg: dict, dry_run: bool = False)
paragraphs=paragraphs,
)

out_file = RESULTS_DIR / f"{paper['slug']}.json"
merge_into_paper_json(
out_file=out_file,
slug=paper["slug"],
Expand Down
13 changes: 12 additions & 1 deletion benchmarks/perturbation/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,19 @@ papers/
# Run logs
reports/*.log

# Temporary / ephemeral configs
# Temporary / ephemeral configs.
# Underscore-prefixed files are scratch by convention. The other rules below
# cover the bulk per-domain configs that we generate locally (one per
# system × domain) but don't check in. The `!configs/full_*_reviewer3.yaml`
# exception keeps the canonical reviewer3 configs tracked.
configs/_*
configs/cs_*scaleup*.yaml
configs/full_*.yaml
configs/grok_*.yaml
configs/longtail_*.yaml
configs/subset_*.yaml
configs/r3_smoke*.yaml
!configs/full_*_reviewer3.yaml

# Python
__pycache__/
Expand Down
85 changes: 85 additions & 0 deletions benchmarks/perturbation/_severity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Canonical severity tiers and per-system normalization.

The perturbation benchmark, the conference study analyses, and the viz layer
all want to compare comment severities across review systems. Each system uses
its own native vocabulary, so before any cross-system comparison the raw value
must be mapped to the canonical 3-tier scale used by openaireview itself:

major - Undermines a key claim/methodology; affects conclusions.
moderate - Real error or gap that is localized and fixable.
minor - Framing concern, mild overclaim, or resolvable ambiguity.

Per-system maps:

* openaireview: identity. Output is already in {major, moderate, minor}.
* coarse: {critical, major, minor} -> {major, moderate, minor}
(shift down one tier; same mapping that the conference-study
scripts in benchmarks/conference_study/analyses/ use).
* reviewer3: integer 1..4 per their OpenAPI spec, where
1=Critical, 2=Major, 3=Minor, 4=Editorial.
Compressed to the 3-tier scale by collapsing R3 Minor and
Editorial into `minor`, since in practice R3 uses Editorial for
substantive but lower-importance findings, not just style
notes. Confirm with the vendor if the label is later clarified.

The conference_study analyses
(`benchmarks/conference_study/analyses/compute_auc.py` and `report_scaleup.py`)
import `COARSE_SEVERITY_MAP`, `TIERS`, and `normalize_severity` from this
module instead of inlining their own copies.
"""

from __future__ import annotations


# Canonical tiers, ordered from most to least severe.
TIERS: tuple[str, ...] = ("major", "moderate", "minor")


# openaireview methods emit canonical tier strings directly.
OPENAIREVIEW_SEVERITY_MAP: dict[str, str] = {t: t for t in TIERS}

# coarse uses {minor, major, critical}. Shift down one level.
COARSE_SEVERITY_MAP: dict[str, str] = {
    "critical": "major",
    "major": "moderate",
    "minor": "minor",
}

# Reviewer 3 spec: 1=Critical, 2=Major, 3=Minor, 4=Editorial.
# Compress to 3 tiers; Editorial collapses with Minor (see module docstring).
REVIEWER3_SEVERITY_MAP: dict[int, str] = {
    1: "major",
    2: "moderate",
    3: "minor",
    4: "minor",
}


def _reviewer3_level(raw: object) -> int | None:
    """Coerce a Reviewer 3 severity value to its integer level, or None."""
    # bool is an int subclass; True/False are flags, not severity levels.
    if isinstance(raw, bool):
        return None
    if isinstance(raw, int):
        return raw
    if isinstance(raw, float):
        # Accept 2.0 (JSON numbers may arrive as floats) but reject values
        # that would be silently truncated, as well as nan/inf.
        return int(raw) if raw.is_integer() else None
    # tolerate the str-form for hand-written test fixtures
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def normalize_severity(system: str, raw: object) -> str | None:
    """Map a system-native severity value to the canonical 3-tier scale.

    Returns None for unrecognized values so callers can decide whether to drop
    the comment, default it, or warn.

    Args:
        system: Registry key matching `benchmarks/perturbation/systems/`:
            'openaireview', 'coarse', or 'reviewer3' (case-insensitive). Any
            other value is treated as an unknown system whose labels pass
            through only when they are already canonical.
        raw: The severity as emitted by the system: an integer level (or its
            string form) for reviewer3, a label string for everything else.

    Returns:
        One of ``TIERS``, or None when `raw` is None or not recognized.
    """
    if raw is None:
        return None
    sysn = system.lower()
    if sysn == "reviewer3":
        # NOTE(review): already-canonical strings ("major", ...) are NOT
        # passed through here; confirm callers never re-normalize R3 output.
        return REVIEWER3_SEVERITY_MAP.get(_reviewer3_level(raw))
    if not isinstance(raw, str):
        return None
    s = raw.lower()
    if sysn == "coarse":
        return COARSE_SEVERITY_MAP.get(s)
    if sysn == "openaireview":
        return OPENAIREVIEW_SEVERITY_MAP.get(s)
    # unknown system -> pass through if already canonical
    return s if s in TIERS else None
Loading