ChicagoHAI · jingxuangu · May 15, 2026
diff --git a/benchmarks/perturbation/README.md b/benchmarks/perturbation/README.md
@@ -53,29 +53,36 @@ of the third-party systems.
 
 ## Configuration
 
-Configs are YAML files in `configs/`. Copy `default.yaml` and edit to create experiment variants. Committed configs serve as the experiment log.
+`run_benchmark.py` now uses the unified runner schema below. It rejects unknown
+keys at load time, so older experiment configs in `configs/` that contain
+generation-era fields such as `max_papers`, `length`, `error_type`, and
+`perturb_model` are retained as historical experiment logs and are not directly
+loadable by the current unified runner.
 
 ```yaml
-max_papers: 5
-length: short              # short (2k-7k words) | medium (7k-17k) | long (>17k)
-error_type: surface         # surface | formal | all
+system: openaireview        # openaireview | coarse | reviewer3
+input_dir: benchmarks/perturbation/results/perturbations
+results_dir: benchmarks/perturbation/results
+max_tokens: 13000
+min_perturbations: 0
 score_method: llm           # llm | fuzzy | semantic
+score_model: google/gemini-3-flash-preview
 
-perturb_model: google/gemini-3-flash-preview
-score_model:   google/gemini-3-flash-preview
-
-review_models:
+models:
   - google/gemini-3-flash-preview
   - z-ai/glm-4.6
 
-review_methods:
+methods:                    # required for system: openaireview
   - zero_shot
   - progressive
-
-results_dir: benchmarks/perturbation/results
 ```
 
-Papers are streamed from the [proof-pile](https://huggingface.co/datasets/hoskinson-center/proof-pile) dataset and binned by word count.
+For `--stages score,report`, the configured `input_dir` must already contain
+prepared upstream perturbation artifacts named `*_recorrupted.md` and
+`*_kept_perturbations.json`, and `results_dir` must already contain matching
+review JSONs under the layout shown below. This repository does not currently
+check in those prepared/reviewed artifacts, so score/report smoke tests require
+local benchmark outputs from an earlier prepare/review run.
 
 ## Results Layout
 

diff --git a/benchmarks/perturbation/generate_report.py b/benchmarks/perturbation/generate_report.py
@@ -79,6 +79,17 @@ class CellResult:
     detected: list = field(default_factory=list)
     missed: list = field(default_factory=list)
     n_total_comments: int = 0
+    has_comment_efficiency_metrics: bool = False
+    n_detected_at_1: int = 0
+    n_detected_at_3: int = 0
+    n_detected_at_5: int = 0
+    n_detected_at_10: int = 0
+    recall_at_1: float = 0.0
+    recall_at_3: float = 0.0
+    recall_at_5: float = 0.0
+    recall_at_10: float = 0.0
+    comments_per_detected_error: float | None = None
+    detected_per_comment: float = 0.0
     # from review JSON
     prompt_tokens: int = 0
     completion_tokens: int = 0
@@ -161,6 +172,17 @@ def load_results(results_dir: Path, length: str, gt: dict[str, dict[str, str]])
             detected=score_data.get("detected", []),
             missed=score_data.get("missed", []),
             n_total_comments=score_data.get("n_total_comments", 0),
+            has_comment_efficiency_metrics="n_detected_at_1" in score_data,
+            n_detected_at_1=score_data.get("n_detected_at_1", 0),
+            n_detected_at_3=score_data.get("n_detected_at_3", 0),
+            n_detected_at_5=score_data.get("n_detected_at_5", 0),
+            n_detected_at_10=score_data.get("n_detected_at_10", 0),
+            recall_at_1=score_data.get("recall_at_1", 0.0),
+            recall_at_3=score_data.get("recall_at_3", 0.0),
+            recall_at_5=score_data.get("recall_at_5", 0.0),
+            recall_at_10=score_data.get("recall_at_10", 0.0),
+            comments_per_detected_error=score_data.get("comments_per_detected_error"),
+            detected_per_comment=score_data.get("detected_per_comment", 0.0),
         )
 
         # Per-error breakdown from manifest
@@ -271,6 +293,49 @@ def print_recall_by_model_method(cells: list[CellResult]) -> None:
     print()
 
 
+def _ratio(num: int, den: int) -> str:
+    return f"{num / den:.2f}" if den else "—"
+
+
+def print_comment_efficiency_metrics_by_model_method(cells: list[CellResult]) -> None:
+    metric_cells = [c for c in cells if c.has_comment_efficiency_metrics]
+    if not metric_cells:
+        return
+
+    print("## Comment-Efficiency Metrics — per model × method\n")
+    print("| model | method | comments | detected | R@1 | R@3 | R@5 | R@10 | comments/detected | detected/comment |")
+    print("|-------|--------:|---------:|---------:|----:|----:|----:|-----:|------------------:|-----------------:|")
+
+    groups: dict[tuple[str, str], dict[str, int]] = defaultdict(lambda: {
+        "comments": 0,
+        "inj": 0,
+        "det": 0,
+        "at1": 0,
+        "at3": 0,
+        "at5": 0,
+        "at10": 0,
+    })
+    for c in metric_cells:
+        g = groups[(c.model_slug, c.method)]
+        g["comments"] += c.n_total_comments
+        g["inj"] += c.n_injected
+        g["det"] += c.n_detected
+        g["at1"] += c.n_detected_at_1
+        g["at3"] += c.n_detected_at_3
+        g["at5"] += c.n_detected_at_5
+        g["at10"] += c.n_detected_at_10
+
+    for model, method in sorted(groups):
+        g = groups[(model, method)]
+        print(
+            f"| {model} | {method} | {g['comments']} | {g['det']} | "
+            f"{_pct(g['at1'], g['inj'])} | {_pct(g['at3'], g['inj'])} | "
+            f"{_pct(g['at5'], g['inj'])} | {_pct(g['at10'], g['inj'])} | "
+            f"{_ratio(g['comments'], g['det'])} | {_ratio(g['det'], g['comments'])} |"
+        )
+    print()
+
+
 def print_recall_by_length_method(cells: list[CellResult]) -> None:
     lengths = sorted({c.length for c in cells})
     if len(lengths) <= 1:
@@ -474,6 +539,7 @@ def _render_report(results_dirs: list[Path]) -> None:
     print_overall_by_method(all_cells)
     print_recall_by_length_method(all_cells)
     print_recall_by_model_method(all_cells)
+    print_comment_efficiency_metrics_by_model_method(all_cells)
     print_recall_by_length_model_method(all_cells)
     print_recall_by_error_type_x_method(all_cells)
     print_recall_by_error_type(all_cells)

diff --git a/benchmarks/perturbation/models.py b/benchmarks/perturbation/models.py
@@ -92,3 +92,14 @@ class PerturbationResult:
     n_total_comments: int
     detected: list[str]                # perturbation_ids where step 1 + step 2 passed
     missed: list[str]                  # perturbation_ids where detection failed
+    first_matching_comment_index: dict[str, int] = field(default_factory=dict)
+    n_detected_at_1: int = 0
+    n_detected_at_3: int = 0
+    n_detected_at_5: int = 0
+    n_detected_at_10: int = 0
+    recall_at_1: float = 0.0
+    recall_at_3: float = 0.0
+    recall_at_5: float = 0.0
+    recall_at_10: float = 0.0
+    comments_per_detected_error: float | None = None
+    detected_per_comment: float = 0.0
diff --git a/benchmarks/perturbation/score.py b/benchmarks/perturbation/score.py
@@ -9,6 +9,7 @@
 # (normalized) comment quote. Same coverage notion used by
 # reviewer.utils.locate_comment_in_document.
 _FUZZY_QUOTE_THRESHOLD = 0.75
+_COMMENT_EFFICIENCY_CUTOFFS = (1, 3, 5, 10)
 
 def score_review(perturbations: list[Perturbation], 
                  review_comments: list[dict], 
@@ -19,9 +20,10 @@ def score_review(perturbations: list[Perturbation],
 
     n_detected = 0
     detected = []
+    first_matching_comment_index: dict[str, int] = {}
 
     for p in perturbations:
-        for comment in review_comments:
+        for comment_idx, comment in enumerate(review_comments):
             if not _substring_match(comment.get('quote', ''), p.perturbed):
                 continue 
 
@@ -35,6 +37,7 @@ def score_review(perturbations: list[Perturbation],
             if explanation_match:
                 n_detected += 1
                 detected.append(p.perturbation_id)
+                first_matching_comment_index[p.perturbation_id] = comment_idx
                 break 
 
     missed = []
@@ -43,8 +46,44 @@ def score_review(perturbations: list[Perturbation],
             missed.append(p.perturbation_id)
 
     recall = n_detected / n_injected if n_injected > 0 else 0.0
+    comment_efficiency_metrics = _comment_efficiency_metrics(
+        first_matching_comment_index,
+        n_injected=n_injected,
+        n_detected=n_detected,
+        n_total_comments=n_total_comments,
+    )
+
+    return PerturbationResult(
+        n_injected=n_injected,
+        n_detected=n_detected,
+        recall=recall,
+        n_total_comments=n_total_comments,
+        detected=detected,
+        missed=missed,
+        first_matching_comment_index=first_matching_comment_index,
+        **comment_efficiency_metrics,
+    )
 
-    return PerturbationResult(n_injected=n_injected, n_detected=n_detected, recall=recall, n_total_comments=n_total_comments, detected=detected, missed=missed)
+
+def _comment_efficiency_metrics(
+    first_matching_comment_index: dict[str, int],
+    n_injected: int,
+    n_detected: int,
+    n_total_comments: int,
+) -> dict:
+    metrics: dict[str, float | int | None] = {}
+    for k in _COMMENT_EFFICIENCY_CUTOFFS:
+        n_at_k = sum(1 for idx in first_matching_comment_index.values() if idx < k)
+        metrics[f"n_detected_at_{k}"] = n_at_k
+        metrics[f"recall_at_{k}"] = n_at_k / n_injected if n_injected > 0 else 0.0
+
+    metrics["comments_per_detected_error"] = (
+        n_total_comments / n_detected if n_detected > 0 else None
+    )
+    metrics["detected_per_comment"] = (
+        n_detected / n_total_comments if n_total_comments > 0 else 0.0
+    )
+    return metrics
 
 
 def _substring_match(quote, perturbed) -> bool:
@@ -107,4 +146,4 @@ def _explanation_match_semantic(explanation, why_wrong) -> bool:
 
     sim = util.cos_sim(emb1, emb2)
 
-    return float(sim) >= 0.60
+    return float(sim) >= 0.60
diff --git a/src/reviewer/cli.py b/src/reviewer/cli.py
@@ -441,6 +441,17 @@ def cmd_score(args: argparse.Namespace) -> None:
         "n_total_comments": result.n_total_comments,
         "detected": result.detected,
         "missed": result.missed,
+        "first_matching_comment_index": result.first_matching_comment_index,
+        "n_detected_at_1": result.n_detected_at_1,
+        "n_detected_at_3": result.n_detected_at_3,
+        "n_detected_at_5": result.n_detected_at_5,
+        "n_detected_at_10": result.n_detected_at_10,
+        "recall_at_1": result.recall_at_1,
+        "recall_at_3": result.recall_at_3,
+        "recall_at_5": result.recall_at_5,
+        "recall_at_10": result.recall_at_10,
+        "comments_per_detected_error": result.comments_per_detected_error,
+        "detected_per_comment": result.detected_per_comment,
     }
     output_path.write_text(json.dumps(score_data, indent=2))
     print(f"\nResults saved to: {output_path}")

diff --git a/tests/test_perturbation_score.py b/tests/test_perturbation_score.py
@@ -0,0 +1,117 @@
+"""Unit tests for perturbation benchmark scoring metrics."""
+
+import sys
+import types
+from pathlib import Path
+
+_REPO = Path(__file__).resolve().parents[1]
+_SRC = _REPO / "src"
+_BENCHMARKS = _REPO / "benchmarks"
+if str(_SRC) not in sys.path:
+    sys.path.insert(0, str(_SRC))
+if str(_BENCHMARKS) not in sys.path:
+    sys.path.insert(0, str(_BENCHMARKS))
+
+if "tiktoken" not in sys.modules:
+    sys.modules["tiktoken"] = types.SimpleNamespace(
+        get_encoding=lambda _name: types.SimpleNamespace(
+            encode=lambda text: text.split(),
+            decode=lambda tokens: " ".join(tokens),
+        )
+    )
+if "rapidfuzz" not in sys.modules:
+    sys.modules["rapidfuzz"] = types.SimpleNamespace(
+        fuzz=types.SimpleNamespace(
+            token_set_ratio=lambda a, b: 100 if set(a.split()) == set(b.split()) else 0
+        )
+    )
+if "sentence_transformers" not in sys.modules:
+    sys.modules["sentence_transformers"] = types.SimpleNamespace(
+        SentenceTransformer=lambda _name: None,
+        util=types.SimpleNamespace(cos_sim=lambda _a, _b: 0.0),
+    )
+
+from perturbation.models import Error, Perturbation
+from perturbation.score import score_review
+
+
+def _perturbation(pid: str, perturbed: str) -> Perturbation:
+    return Perturbation(
+        perturbation_id=pid,
+        span_id=f"S_{pid}",
+        error=Error.OPERATOR_OR_SIGN,
+        original="x + y",
+        offset=0,
+        perturbed=perturbed,
+        why_wrong="same explanation",
+    )
+
+
+def _comment(quote: str, explanation: str = "same explanation") -> dict:
+    return {
+        "title": "issue",
+        "quote": quote,
+        "explanation": explanation,
+        "comment_type": "technical",
+    }
+
+
+def test_comment_efficiency_metrics_count_detection_at_comment_zero_for_all_cutoffs():
+    result = score_review(
+        [_perturbation("P0", "x - y")],
+        [_comment("The paper states x - y.")],
+        model="unused",
+        method="fuzzy",
+    )
+
+    assert result.first_matching_comment_index == {"P0": 0}
+    assert result.n_detected_at_1 == 1
+    assert result.n_detected_at_3 == 1
+    assert result.n_detected_at_5 == 1
+    assert result.n_detected_at_10 == 1
+    assert result.recall_at_1 == 1.0
+    assert result.recall_at_3 == 1.0
+    assert result.recall_at_5 == 1.0
+    assert result.recall_at_10 == 1.0
+
+
+def test_comment_efficiency_metrics_count_detection_at_comment_four_for_larger_cutoffs_only():
+    comments = [
+        _comment("unrelated 0"),
+        _comment("unrelated 1"),
+        _comment("unrelated 2"),
+        _comment("unrelated 3"),
+        _comment("The paper states a >= b."),
+    ]
+
+    result = score_review(
+        [_perturbation("P0", "a >= b")],
+        comments,
+        model="unused",
+        method="fuzzy",
+    )
+
+    assert result.first_matching_comment_index == {"P0": 4}
+    assert result.n_detected_at_1 == 0
+    assert result.n_detected_at_3 == 0
+    assert result.n_detected_at_5 == 1
+    assert result.n_detected_at_10 == 1
+    assert result.recall_at_1 == 0.0
+    assert result.recall_at_3 == 0.0
+    assert result.recall_at_5 == 1.0
+    assert result.recall_at_10 == 1.0
+
+
+def test_comment_efficiency_metrics_handle_no_detections_safely():
+    result = score_review(
+        [_perturbation("P0", "x - y")],
+        [_comment("unrelated"), _comment("also unrelated")],
+        model="unused",
+        method="fuzzy",
+    )
+
+    assert result.n_detected == 0
+    assert result.n_detected_at_1 == 0
+    assert result.recall_at_10 == 0.0
+    assert result.comments_per_detected_error is None
+    assert result.detected_per_comment == 0.0