Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3
1 change: 1 addition & 0 deletions benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3
1 change: 1 addition & 0 deletions benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_math_all_reviewer3
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview
review_mode: author # author | journal (Reviewer 3 reviewMode enum)
poll_interval_s: 30
poll_timeout_s: 1800 # 30 min/paper cap
max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention)

results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3
5 changes: 5 additions & 0 deletions benchmarks/perturbation/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ class Config:
review_mode: str = field(default="author", metadata={"choices": ["author", "journal"]})
poll_interval_s: float = 5.0
poll_timeout_s: float = 1200.0
# Cap pages of the rendered PDF sent to R3. None = no cap. Used because
# the token-truncated staged file often isn't valid LaTeX; we compile the
# FULL pre-truncation source and trim by pages instead (matches the
# max_pages: 20 convention in conference_study/configs/coarse.yaml).
max_pages: int | None = None
# Legacy aliases (read on load, normalized into models/methods).
review_models: list[str] = field(default_factory=list)
review_methods: list[str] = field(default_factory=list)
Expand Down
14 changes: 13 additions & 1 deletion benchmarks/perturbation/systems/reviewer3.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def build_jobs(self, units, cfg, results_dir):
k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s")
if k in cfg and cfg[k] is not None
}
# We compile the FULL (pre-truncation) source for R3 — the token-cut
# staged file frequently isn't valid LaTeX (chops mid-environment).
# `max_pages` then trims the rendered PDF so R3 still sees roughly the
# same content window as coarse (which uses max_pages: 20).
max_pages = cfg.get("max_pages")
out: list[tuple[CellKey, ReviewJob]] = []
for u in units:
if not u.staged_corrupted.exists():
Expand All @@ -49,7 +54,12 @@ def build_jobs(self, units, cfg, results_dir):
job = ReviewJob(
tag=tag, out_json=out_json, review_dir=review_dir,
paper_label=f"{u.error_type}/{u.paper_label}",
payload={"paper": u.staged_corrupted, "overrides": overrides},
payload={
"paper": u.staged_corrupted,
"source": u.src_corrupted,
"max_pages": max_pages,
"overrides": overrides,
},
)
out.append(((REVIEWER3_SLUG,), job))
return out
Expand All @@ -65,6 +75,8 @@ def run_jobs(self, cell_key, jobs, parallel):
paper=j.payload["paper"],
out_json=j.out_json,
paper_label=j.tag,
source=j.payload.get("source"),
max_pages=j.payload.get("max_pages"),
)
for j in jobs
]
Expand Down
198 changes: 173 additions & 25 deletions benchmarks/perturbation/systems/reviewer3_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,17 @@ def config_from_env() -> Reviewer3Config:

@dataclass
class Reviewer3Job:
paper: Path # path to *_corrupted.md
out_json: Path # where to write the pipeline-shaped JSON
paper_label: str # e.g. "<error_type>/<paper_label>"
paper: Path # path to *_corrupted.md (staged, possibly token-truncated)
out_json: Path # where to write the pipeline-shaped JSON
paper_label: str # e.g. "<error_type>/<paper_label>"
title: str | None = None
# Optional: full pre-truncation source. When provided, _ensure_pdf
# compiles THIS instead of `paper` — the staged file is often invalid
# LaTeX because token truncation can chop mid-environment.
source: Path | None = None
# Optional: trim the rendered PDF to its first N pages so R3 still sees
# roughly the same window other systems do (coarse uses max_pages: 20).
max_pages: int | None = None


@dataclass
Expand All @@ -114,48 +121,188 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]:
return {"x-api-key": cfg.api_key}


def _ensure_pdf(paper: Path) -> Path:
"""Reviewer 3 only accepts PDF. If `paper` is already PDF, return it.
If it's a LaTeX source (starts with `\\documentclass`) — true for the
`cs_CC` corpus where `.md` files are really `.tex` — compile via pdflatex
and cache the result next to the source. Otherwise raise."""
def _ensure_pdf(paper: Path, *, source: Path | None = None,
max_pages: int | None = None) -> Path:
"""Reviewer 3 only accepts PDF. Return a compiled+possibly-trimmed PDF.

Resolution order for the source bytes:
1. If `paper` is already a `.pdf`, return it as-is (page trim still applies).
2. If `source` was provided, compile that — preferred for LaTeX-as-md
since the staged `paper` is often invalid LaTeX (token truncation
chops mid-environment).
3. Else compile `paper` directly. If it lacks `\\end{document}`, append
one as a best-effort close.

When `max_pages` is set, the resulting PDF is trimmed to its first N
pages via pymupdf. This matches what coarse does (max_pages: 20) so R3
sees roughly the same window other systems see.
"""
if paper.suffix.lower() == ".pdf":
return paper
head = paper.read_text(errors="replace")[:2000]
return _maybe_trim_pages(paper, max_pages)

src_for_compile = source if (source is not None and source.exists()) else paper
cached_suffix = ".trim.pdf" if max_pages else ".pdf"
cached = src_for_compile.with_suffix(cached_suffix)
src_mtime = src_for_compile.stat().st_mtime
if cached.exists() and cached.stat().st_mtime > src_mtime:
return cached

head = src_for_compile.read_text(errors="replace")[:2000]
if "\\documentclass" not in head:
raise RuntimeError(
f"don't know how to convert {paper.name} to PDF "
f"don't know how to convert {src_for_compile.name} to PDF "
"(no \\documentclass found; expected LaTeX-as-md or PDF)"
)
cached = paper.with_suffix(".pdf")
if cached.exists() and cached.stat().st_mtime > paper.stat().st_mtime:
return cached
import shutil, subprocess, tempfile
text = paper.read_text(errors="replace")
# Staging truncates by tokens, which can leave the LaTeX source missing
# \end{document} (or mid-environment). Best-effort: ensure document closes.
text = src_for_compile.read_text(errors="replace")
if "\\end{document}" not in text:
# `source` should always close cleanly; this only fires if we fell back
# to compiling the token-truncated `paper`.
text = text.rstrip() + "\n\n\\end{document}\n"
# Strip orphan \input / \include — the perturbation corpus dumps each paper
# into a single .md but a few preserve `\input{mypreamble.tex}`-style
# directives that pdflatex can't resolve (fatal error, no PDF produced).
text = _strip_orphan_includes(text, src_for_compile.parent)
# Stripped-include papers (and some others) rely on author-defined shortcuts
# like \bbC, \calA, \vvirg, \ootimes from the missing preamble. Inject
# \providecommand fallbacks so the body compiles.
text = _inject_rescue_preamble(text)

import shutil, subprocess, tempfile
with tempfile.TemporaryDirectory() as td:
tex = Path(td) / "source.tex"
tex.write_text(text)
# Run twice to resolve cross-refs; ignore exit code, accept partial PDF.
# Capture output as bytes — pdflatex's own log can contain non-UTF-8
# accent bytes and we don't read this output anyway (the .log file is
# the source of truth on failure).
for _ in range(2):
subprocess.run(
["pdflatex", "-interaction=nonstopmode", "source.tex"],
cwd=td, capture_output=True, text=True, timeout=180,
cwd=td, capture_output=True, timeout=300,
)
out_pdf = Path(td) / "source.pdf"
if not out_pdf.exists() or out_pdf.stat().st_size < 1000:
log = (Path(td) / "source.log").read_text(errors="replace") if (Path(td) / "source.log").exists() else ""
raise RuntimeError(f"pdflatex produced no usable PDF for {paper}: {log[-1500:]}")
log_path = Path(td) / "source.log"
log = log_path.read_text(errors="replace") if log_path.exists() else ""
raise RuntimeError(
f"pdflatex produced no usable PDF for {src_for_compile}: {log[-1500:]}"
)
if max_pages:
_trim_pages_to(out_pdf, max_pages)
shutil.copy(out_pdf, cached)
return cached


def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str:
"""POST /api/internal/review (multipart). Returns sessionId."""
paper = _ensure_pdf(paper)
_INPUT_RE = __import__("re").compile(
r"\\(?:input|include)\s*\{([^}]+)\}", flags=__import__("re").IGNORECASE,
)


def _strip_orphan_includes(text: str, base_dir: Path) -> str:
"""Comment out `\\input{path}` / `\\include{path}` whose target file isn't
next to the source. pdflatex aborts hard on a missing \\input, which kills
the compile even when the rest of the document is fine."""
def _replace(m):
target = m.group(1).strip()
# Common LaTeX convention: optional .tex extension.
for cand in (target, target + ".tex"):
if (base_dir / cand).exists():
return m.group(0)
return "% [stripped missing include] " + m.group(0)
return _INPUT_RE.sub(_replace, text)


# Defensive preamble injected after \documentclass to provide fallbacks for
# common custom-command patterns that authors typically define in private
# preamble files (e.g. mypreamble.tex). \providecommand is a no-op when the
# command is already defined, so this is safe to inject blindly.
_RESCUE_PREAMBLE = r"""
% --- injected by reviewer3_adapter: providecommand fallbacks ---
% blackboard / cal / bf shortcuts authors commonly define per-paper
\providecommand{\bb}[1]{\mathbb{#1}}
\providecommand{\cal}[1]{\mathcal{#1}}
\providecommand{\bff}[1]{\mathbf{#1}}
% common single-letter shortcuts (\bbR, \calA, \bfx, etc.)
\makeatletter
\@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
\expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}%
\expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}%
}
\@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{%
\expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}%
}
\makeatother
% other commonly-used shortcuts
\providecommand{\eps}{\epsilon}
\providecommand{\veps}{\varepsilon}
\providecommand{\vvirg}{,\,}
\providecommand{\ootimes}{\otimes}
\providecommand{\Bbbk}{\mathbb{k}}
% --- end injected preamble ---
"""


def _inject_rescue_preamble(text: str) -> str:
"""Insert _RESCUE_PREAMBLE right after the first \\documentclass{...} line.
Idempotent: looks for our marker before injecting."""
if "injected by reviewer3_adapter" in text:
return text
import re
m = re.search(r"\\documentclass(\[[^\]]*\])?\{[^}]+\}", text)
if not m:
return text
cut = m.end()
return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:]


def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path:
"""Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more,
return a trimmed sibling cached next to it."""
if not max_pages:
return pdf
import fitz
src = fitz.open(pdf)
try:
if src.page_count <= max_pages:
return pdf
trimmed = pdf.with_suffix(f".first{max_pages}p.pdf")
if trimmed.exists() and trimmed.stat().st_mtime > pdf.stat().st_mtime:
return trimmed
dst = fitz.open()
dst.insert_pdf(src, from_page=0, to_page=max_pages - 1)
dst.save(trimmed)
dst.close()
return trimmed
finally:
src.close()


def _trim_pages_to(pdf: Path, max_pages: int) -> None:
    """Trim `pdf` in place to its first `max_pages` pages.

    Does nothing when the document already has `max_pages` pages or fewer.
    The trimmed copy is written to a `.pdf.tmp` sibling and then renamed
    over `pdf`, so `pdf` is only replaced after a successful save.
    """
    import fitz
    tmp = pdf.with_suffix(".pdf.tmp")
    src = fitz.open(pdf)
    try:
        if src.page_count <= max_pages:
            return
        dst = fitz.open()
        try:
            dst.insert_pdf(src, from_page=0, to_page=max_pages - 1)
            dst.save(tmp)
        finally:
            # Release the output document even when insert/save raises.
            dst.close()
    finally:
        src.close()
    tmp.replace(pdf)


def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None,
source: Path | None = None, max_pages: int | None = None) -> str:
"""POST /api/internal/review (multipart). Returns sessionId.

`source` and `max_pages` are forwarded to `_ensure_pdf` so callers can opt
into compiling the full pre-truncation source and trimming the rendered
PDF — see _ensure_pdf docstring.
"""
paper = _ensure_pdf(paper, source=source, max_pages=max_pages)
url = f"{cfg.base_url}/api/internal/review"
data: dict[str, str] = {
"userId": cfg.user_id,
Expand Down Expand Up @@ -294,7 +441,8 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result:
start = time.time()
sid = ""
try:
sid = _submit(cfg, job.paper, title=job.title)
sid = _submit(cfg, job.paper, title=job.title,
source=job.source, max_pages=job.max_pages)
print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True)
body = _poll_until_done(cfg, sid, tag=tag)
elapsed = time.time() - start
Expand Down