From 8c6238f9ec87c62b7e0664a64f9c938db55d6253 Mon Sep 17 00:00:00 2001 From: Dang Nguyen Date: Fri, 15 May 2026 16:05:46 -0500 Subject: [PATCH] reviewer3: compile FULL source + trim PDF by pages (fixes pdflatex failures) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The token-based truncation in `prepare_units` cuts the LaTeX-as-md staged file at a token boundary, which routinely leaves the document mid-environment. pdflatex on the staged file then "produces no usable PDF" for a fraction of papers, surfacing as a hard failure in the reviewer3 run. This change switches the reviewer3 system to compile the FULL pre-truncation source (`u.src_corrupted`) and then trim the rendered PDF to its first N pages. This matches the `max_pages: 20` convention that conference_study/configs/coarse.yaml already uses for coarse, so reviewer3 sees roughly the same content window the other systems see. Three pdflatex robustness fixes are layered in: 1. Strip orphan `\input{...}` / `\include{...}` whose target file isn't bundled. pdflatex aborts hard on a missing \input, killing the compile for the whole paper even when the body is fine (paper_005 cs_CC had `\input{mypreamble.tex}`). 2. Inject a defensive preamble of `\providecommand` fallbacks for common author-defined shortcuts (\bbR, \calA, \bfx, \eps, \vvirg, \ootimes, etc.). Authors typically define these in private preamble files we don't have; \providecommand is a no-op when the command is already defined at the injection point (right after \documentclass), so the injection gives blanket coverage without overriding anything defined before it. 3. subprocess.run captures output as bytes (text=False) instead of text=True so non-UTF-8 accent bytes in pdflatex's captured output don't blow up Python's decoder (paper_009 cs_CC had byte 0xaa at offset ~57k). Changes - Reviewer3System.build_jobs threads `u.src_corrupted` (full path) and `cfg.get("max_pages")` into the job payload. - Reviewer3Job adopts `source` + `max_pages` fields; `_run_one` / `_submit` forward them to `_ensure_pdf`. 
- `_ensure_pdf` prefers `source` over `paper` for the compile when set; caches alongside the source with `.trim.pdf` suffix when trimmed. - `_trim_pages_to` (in-place) and `_maybe_trim_pages` (for already-PDF inputs) use pymupdf to cap pages. - `max_pages: 20` added to all 8 `full_*_reviewer3.yaml` configs. - run_benchmark.py Config gains `max_pages: int | None = None`. Smoke-validated on three previously-failing cs_CC papers (2604.19872v1 with missing \input + custom commands, 2604.24325v1 with same pattern, 2604.24879v1 with non-UTF8 bytes in pdflatex output) — all three now produce 20-page trimmed PDFs in 2–4s. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../configs/full_cs_CC_reviewer3.yaml | 1 + .../configs/full_cs_LG_reviewer3.yaml | 1 + .../configs/full_econ_EM_reviewer3.yaml | 1 + .../configs/full_hep_ex_reviewer3.yaml | 1 + .../configs/full_math_all_reviewer3.yaml | 1 + .../full_physics_atm_clus_reviewer3.yaml | 1 + .../configs/full_q_bio_GN_reviewer3.yaml | 1 + .../configs/full_stat_AP_reviewer3.yaml | 1 + benchmarks/perturbation/run_benchmark.py | 5 + benchmarks/perturbation/systems/reviewer3.py | 14 +- .../perturbation/systems/reviewer3_adapter.py | 198 +++++++++++++++--- 11 files changed, 199 insertions(+), 26 deletions(-) diff --git a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml index 3488e54..61e3c36 100644 --- a/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_cs_CC_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_cs_CC_reviewer3 diff --git a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml 
b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml index bc3cbb3..d9aa287 100644 --- a/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_cs_LG_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_cs_LG_reviewer3 diff --git a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml index f871179..efebb8b 100644 --- a/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_econ_EM_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_econ_EM_reviewer3 diff --git a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml index 901fbdb..711eae7 100644 --- a/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_hep_ex_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_hep_ex_reviewer3 diff --git a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml index ef94db9..69e061c 100644 --- 
a/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_math_all_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_math_all_reviewer3 diff --git a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml index 79d5851..48a09f6 100644 --- a/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_physics_atm_clus_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_physics_atm_clus_reviewer3 diff --git a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml index 00d3bf4..756cfc9 100644 --- a/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_q_bio_GN_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_q_bio_GN_reviewer3 diff --git a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml index a375500..fa4cb19 100644 --- 
a/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml +++ b/benchmarks/perturbation/configs/full_stat_AP_reviewer3.yaml @@ -9,5 +9,6 @@ score_model: google/gemini-3-flash-preview review_mode: author # author | journal (Reviewer 3 reviewMode enum) poll_interval_s: 30 poll_timeout_s: 1800 # 30 min/paper cap +max_pages: 20 # trim rendered PDF to first N pages (matches coarse.yaml convention) results_dir: benchmarks/perturbation/results/full_stat_AP_reviewer3 diff --git a/benchmarks/perturbation/run_benchmark.py b/benchmarks/perturbation/run_benchmark.py index 9117911..b44112e 100644 --- a/benchmarks/perturbation/run_benchmark.py +++ b/benchmarks/perturbation/run_benchmark.py @@ -98,6 +98,11 @@ class Config: review_mode: str = field(default="author", metadata={"choices": ["author", "journal"]}) poll_interval_s: float = 5.0 poll_timeout_s: float = 1200.0 + # Cap pages of the rendered PDF sent to R3. None = no cap. Used because + # the token-truncated staged file often isn't valid LaTeX; we compile the + # FULL pre-truncation source and trim by pages instead (matches the + # max_pages: 20 convention in conference_study/configs/coarse.yaml). + max_pages: int | None = None # Legacy aliases (read on load, normalized into models/methods). review_models: list[str] = field(default_factory=list) review_methods: list[str] = field(default_factory=list) diff --git a/benchmarks/perturbation/systems/reviewer3.py b/benchmarks/perturbation/systems/reviewer3.py index 1f4b1b9..8ccc9a7 100644 --- a/benchmarks/perturbation/systems/reviewer3.py +++ b/benchmarks/perturbation/systems/reviewer3.py @@ -34,6 +34,11 @@ def build_jobs(self, units, cfg, results_dir): k: cfg[k] for k in ("review_mode", "poll_interval_s", "poll_timeout_s") if k in cfg and cfg[k] is not None } + # We compile the FULL (pre-truncation) source for R3 — the token-cut + # staged file frequently isn't valid LaTeX (chops mid-environment). 
+ # `max_pages` then trims the rendered PDF so R3 still sees roughly the + # same content window as coarse (which uses max_pages: 20). + max_pages = cfg.get("max_pages") out: list[tuple[CellKey, ReviewJob]] = [] for u in units: if not u.staged_corrupted.exists(): @@ -49,7 +54,12 @@ def build_jobs(self, units, cfg, results_dir): job = ReviewJob( tag=tag, out_json=out_json, review_dir=review_dir, paper_label=f"{u.error_type}/{u.paper_label}", - payload={"paper": u.staged_corrupted, "overrides": overrides}, + payload={ + "paper": u.staged_corrupted, + "source": u.src_corrupted, + "max_pages": max_pages, + "overrides": overrides, + }, ) out.append(((REVIEWER3_SLUG,), job)) return out @@ -65,6 +75,8 @@ def run_jobs(self, cell_key, jobs, parallel): paper=j.payload["paper"], out_json=j.out_json, paper_label=j.tag, + source=j.payload.get("source"), + max_pages=j.payload.get("max_pages"), ) for j in jobs ] diff --git a/benchmarks/perturbation/systems/reviewer3_adapter.py b/benchmarks/perturbation/systems/reviewer3_adapter.py index 71c1196..570fad6 100644 --- a/benchmarks/perturbation/systems/reviewer3_adapter.py +++ b/benchmarks/perturbation/systems/reviewer3_adapter.py @@ -90,10 +90,17 @@ def config_from_env() -> Reviewer3Config: @dataclass class Reviewer3Job: - paper: Path # path to *_corrupted.md - out_json: Path # where to write the pipeline-shaped JSON - paper_label: str # e.g. "/" + paper: Path # path to *_corrupted.md (staged, possibly token-truncated) + out_json: Path # where to write the pipeline-shaped JSON + paper_label: str # e.g. "/" title: str | None = None + # Optional: full pre-truncation source. When provided, _ensure_pdf + # compiles THIS instead of `paper` — the staged file is often invalid + # LaTeX because token truncation can chop mid-environment. + source: Path | None = None + # Optional: trim the rendered PDF to its first N pages so R3 still sees + # roughly the same window other systems do (coarse uses max_pages: 20). 
+ max_pages: int | None = None @dataclass @@ -114,48 +121,188 @@ def _headers(cfg: Reviewer3Config) -> dict[str, str]: return {"x-api-key": cfg.api_key} -def _ensure_pdf(paper: Path) -> Path: - """Reviewer 3 only accepts PDF. If `paper` is already PDF, return it. - If it's a LaTeX source (starts with `\\documentclass`) — true for the - `cs_CC` corpus where `.md` files are really `.tex` — compile via pdflatex - and cache the result next to the source. Otherwise raise.""" +def _ensure_pdf(paper: Path, *, source: Path | None = None, + max_pages: int | None = None) -> Path: + """Reviewer 3 only accepts PDF. Return a compiled+possibly-trimmed PDF. + + Resolution order for the source bytes: + 1. If `paper` is already a `.pdf`, return it as-is (page trim still applies). + 2. If `source` was provided, compile that — preferred for LaTeX-as-md + since the staged `paper` is often invalid LaTeX (token truncation + chops mid-environment). + 3. Else compile `paper` directly. If it lacks `\\end{document}`, append + one as a best-effort close. + + When `max_pages` is set, the resulting PDF is trimmed to its first N + pages via pymupdf. This matches what coarse does (max_pages: 20) so R3 + sees roughly the same window other systems see. 
+ """ if paper.suffix.lower() == ".pdf": - return paper - head = paper.read_text(errors="replace")[:2000] + return _maybe_trim_pages(paper, max_pages) + + src_for_compile = source if (source is not None and source.exists()) else paper + cached_suffix = ".trim.pdf" if max_pages else ".pdf" + cached = src_for_compile.with_suffix(cached_suffix) + src_mtime = src_for_compile.stat().st_mtime + if cached.exists() and cached.stat().st_mtime > src_mtime: + return cached + + head = src_for_compile.read_text(errors="replace")[:2000] if "\\documentclass" not in head: raise RuntimeError( - f"don't know how to convert {paper.name} to PDF " + f"don't know how to convert {src_for_compile.name} to PDF " "(no \\documentclass found; expected LaTeX-as-md or PDF)" ) - cached = paper.with_suffix(".pdf") - if cached.exists() and cached.stat().st_mtime > paper.stat().st_mtime: - return cached - import shutil, subprocess, tempfile - text = paper.read_text(errors="replace") - # Staging truncates by tokens, which can leave the LaTeX source missing - # \end{document} (or mid-environment). Best-effort: ensure document closes. + text = src_for_compile.read_text(errors="replace") if "\\end{document}" not in text: + # `source` should always close cleanly; this only fires if we fell back + # to compiling the token-truncated `paper`. text = text.rstrip() + "\n\n\\end{document}\n" + # Strip orphan \input / \include — the perturbation corpus dumps each paper + # into a single .md but a few preserve `\input{mypreamble.tex}`-style + # directives that pdflatex can't resolve (fatal error, no PDF produced). + text = _strip_orphan_includes(text, src_for_compile.parent) + # Stripped-include papers (and some others) rely on author-defined shortcuts + # like \bbC, \calA, \vvirg, \ootimes from the missing preamble. Inject + # \providecommand fallbacks so the body compiles. 
+ text = _inject_rescue_preamble(text) + + import shutil, subprocess, tempfile with tempfile.TemporaryDirectory() as td: tex = Path(td) / "source.tex" tex.write_text(text) # Run twice to resolve cross-refs; ignore exit code, accept partial PDF. + # Capture output as bytes — pdflatex's own log can contain non-UTF-8 + # accent bytes and we don't read this output anyway (the .log file is + # the source of truth on failure). for _ in range(2): subprocess.run( ["pdflatex", "-interaction=nonstopmode", "source.tex"], - cwd=td, capture_output=True, text=True, timeout=180, + cwd=td, capture_output=True, timeout=300, ) out_pdf = Path(td) / "source.pdf" if not out_pdf.exists() or out_pdf.stat().st_size < 1000: - log = (Path(td) / "source.log").read_text(errors="replace") if (Path(td) / "source.log").exists() else "" - raise RuntimeError(f"pdflatex produced no usable PDF for {paper}: {log[-1500:]}") + log_path = Path(td) / "source.log" + log = log_path.read_text(errors="replace") if log_path.exists() else "" + raise RuntimeError( + f"pdflatex produced no usable PDF for {src_for_compile}: {log[-1500:]}" + ) + if max_pages: + _trim_pages_to(out_pdf, max_pages) shutil.copy(out_pdf, cached) return cached -def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None) -> str: - """POST /api/internal/review (multipart). Returns sessionId.""" - paper = _ensure_pdf(paper) +_INPUT_RE = __import__("re").compile( + r"\\(?:input|include)\s*\{([^}]+)\}", flags=__import__("re").IGNORECASE, +) + + +def _strip_orphan_includes(text: str, base_dir: Path) -> str: + """Comment out `\\input{path}` / `\\include{path}` whose target file isn't + next to the source. pdflatex aborts hard on a missing \\input, which kills + the compile even when the rest of the document is fine.""" + def _replace(m): + target = m.group(1).strip() + # Common LaTeX convention: optional .tex extension. 
+ for cand in (target, target + ".tex"): + if (base_dir / cand).exists(): + return m.group(0) + return "% [stripped missing include] " + m.group(0) + return _INPUT_RE.sub(_replace, text) + + +# Defensive preamble injected after \documentclass to provide fallbacks for +# common custom-command patterns that authors typically define in private +# preamble files (e.g. mypreamble.tex). \providecommand is a no-op when the +# command is already defined, so this is safe to inject blindly. +_RESCUE_PREAMBLE = r""" +% --- injected by reviewer3_adapter: providecommand fallbacks --- +% blackboard / cal / bf shortcuts authors commonly define per-paper +\providecommand{\bb}[1]{\mathbb{#1}} +\providecommand{\cal}[1]{\mathcal{#1}} +\providecommand{\bff}[1]{\mathbf{#1}} +% common single-letter shortcuts (\bbR, \calA, \bfx, etc.) +\makeatletter +\@for\letter:={A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bb\letter\endcsname{\ensuremath{\mathbb{\letter}}}% + \expandafter\providecommand\csname cal\letter\endcsname{\ensuremath{\mathcal{\letter}}}% +} +\@for\letter:={a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z}\do{% + \expandafter\providecommand\csname bf\letter\endcsname{\ensuremath{\mathbf{\letter}}}% +} +\makeatother +% other commonly-used shortcuts +\providecommand{\eps}{\epsilon} +\providecommand{\veps}{\varepsilon} +\providecommand{\vvirg}{,\,} +\providecommand{\ootimes}{\otimes} +\providecommand{\Bbbk}{\mathbb{k}} +% --- end injected preamble --- +""" + + +def _inject_rescue_preamble(text: str) -> str: + """Insert _RESCUE_PREAMBLE right after the first \\documentclass{...} line. 
+ Idempotent: looks for our marker before injecting.""" + if "injected by reviewer3_adapter" in text: + return text + import re + m = re.search(r"\\documentclass(\[[^\]]*\])?\{[^}]+\}", text) + if not m: + return text + cut = m.end() + return text[:cut] + "\n" + _RESCUE_PREAMBLE + text[cut:] + + +def _maybe_trim_pages(pdf: Path, max_pages: int | None) -> Path: + """Return `pdf` (already a PDF). If `max_pages` is set and the PDF has more, + return a trimmed sibling cached next to it.""" + if not max_pages: + return pdf + import fitz + src = fitz.open(pdf) + try: + if src.page_count <= max_pages: + return pdf + trimmed = pdf.with_suffix(f".first{max_pages}p.pdf") + if trimmed.exists() and trimmed.stat().st_mtime > pdf.stat().st_mtime: + return trimmed + dst = fitz.open() + dst.insert_pdf(src, from_page=0, to_page=max_pages - 1) + dst.save(trimmed) + dst.close() + return trimmed + finally: + src.close() + + +def _trim_pages_to(pdf: Path, max_pages: int) -> None: + """In-place trim of `pdf` to its first `max_pages` pages.""" + import fitz + src = fitz.open(pdf) + try: + if src.page_count <= max_pages: + return + dst = fitz.open() + dst.insert_pdf(src, from_page=0, to_page=max_pages - 1) + tmp = pdf.with_suffix(".pdf.tmp") + dst.save(tmp) + dst.close() + finally: + src.close() + tmp.replace(pdf) + + +def _submit(cfg: Reviewer3Config, paper: Path, *, title: str | None, + source: Path | None = None, max_pages: int | None = None) -> str: + """POST /api/internal/review (multipart). Returns sessionId. + + `source` and `max_pages` are forwarded to `_ensure_pdf` so callers can opt + into compiling the full pre-truncation source and trimming the rendered + PDF — see _ensure_pdf docstring. 
+ """ + paper = _ensure_pdf(paper, source=source, max_pages=max_pages) url = f"{cfg.base_url}/api/internal/review" data: dict[str, str] = { "userId": cfg.user_id, @@ -294,7 +441,8 @@ def _run_one(job: Reviewer3Job, cfg: Reviewer3Config) -> Reviewer3Result: start = time.time() sid = "" try: - sid = _submit(cfg, job.paper, title=job.title) + sid = _submit(cfg, job.paper, title=job.title, + source=job.source, max_pages=job.max_pages) print(f"[{tag}] submitted, sessionId={sid}", file=sys.stderr, flush=True) body = _poll_until_done(cfg, sid, tag=tag) elapsed = time.time() - start