Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions benchmarks/perturbation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,29 +53,36 @@ of the third-party systems.

## Configuration

Configs are YAML files in `configs/`. Copy `default.yaml` and edit to create experiment variants. Committed configs serve as the experiment log.
`run_benchmark.py` now uses the unified runner schema below and rejects unknown
keys at load time. Older experiment configs in `configs/` that contain
generation-era fields such as `max_papers`, `length`, `error_type`, and
`perturb_model` are therefore not directly loadable by the current unified
runner; they are retained only as historical experiment logs.

```yaml
max_papers: 5
length: short # short (2k-7k words) | medium (7k-17k) | long (>17k)
error_type: surface # surface | formal | all
system: openaireview # openaireview | coarse | reviewer3
input_dir: benchmarks/perturbation/results/perturbations
results_dir: benchmarks/perturbation/results
max_tokens: 13000
min_perturbations: 0
score_method: llm # llm | fuzzy | semantic
score_model: google/gemini-3-flash-preview

perturb_model: google/gemini-3-flash-preview
score_model: google/gemini-3-flash-preview

review_models:
models:
- google/gemini-3-flash-preview
- z-ai/glm-4.6

review_methods:
methods: # required for system: openaireview
- zero_shot
- progressive

results_dir: benchmarks/perturbation/results
```

Papers are streamed from the [proof-pile](https://huggingface.co/datasets/hoskinson-center/proof-pile) dataset and binned by word count.
For `--stages score,report`, the configured `input_dir` must already contain
prepared upstream perturbation artifacts named `*_recorrupted.md` and
`*_kept_perturbations.json`, and `results_dir` must already contain matching
review JSONs under the layout shown below. This repository does not currently
check in those prepared/reviewed artifacts, so score/report smoke tests require
local benchmark outputs from an earlier prepare/review run.

## Results Layout

Expand Down
66 changes: 66 additions & 0 deletions benchmarks/perturbation/generate_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,17 @@ class CellResult:
detected: list = field(default_factory=list)
missed: list = field(default_factory=list)
n_total_comments: int = 0
has_comment_efficiency_metrics: bool = False
n_detected_at_1: int = 0
n_detected_at_3: int = 0
n_detected_at_5: int = 0
n_detected_at_10: int = 0
recall_at_1: float = 0.0
recall_at_3: float = 0.0
recall_at_5: float = 0.0
recall_at_10: float = 0.0
comments_per_detected_error: float | None = None
detected_per_comment: float = 0.0
# from review JSON
prompt_tokens: int = 0
completion_tokens: int = 0
Expand Down Expand Up @@ -161,6 +172,17 @@ def load_results(results_dir: Path, length: str, gt: dict[str, dict[str, str]])
detected=score_data.get("detected", []),
missed=score_data.get("missed", []),
n_total_comments=score_data.get("n_total_comments", 0),
has_comment_efficiency_metrics="n_detected_at_1" in score_data,
n_detected_at_1=score_data.get("n_detected_at_1", 0),
n_detected_at_3=score_data.get("n_detected_at_3", 0),
n_detected_at_5=score_data.get("n_detected_at_5", 0),
n_detected_at_10=score_data.get("n_detected_at_10", 0),
recall_at_1=score_data.get("recall_at_1", 0.0),
recall_at_3=score_data.get("recall_at_3", 0.0),
recall_at_5=score_data.get("recall_at_5", 0.0),
recall_at_10=score_data.get("recall_at_10", 0.0),
comments_per_detected_error=score_data.get("comments_per_detected_error"),
detected_per_comment=score_data.get("detected_per_comment", 0.0),
)

# Per-error breakdown from manifest
Expand Down Expand Up @@ -271,6 +293,49 @@ def print_recall_by_model_method(cells: list[CellResult]) -> None:
print()


def _ratio(num: int, den: int) -> str:
    """Format ``num / den`` with two decimals, or ``"—"`` when ``den`` is falsy."""
    if not den:
        # No denominator: show a placeholder rather than dividing by zero.
        return "—"
    quotient = num / den
    return f"{quotient:.2f}"


def print_comment_efficiency_metrics_by_model_method(cells: list[CellResult]) -> None:
    """Print a Markdown table of comment-efficiency metrics per (model, method).

    Only cells whose ``has_comment_efficiency_metrics`` flag is truthy take
    part; when none do, the function prints nothing at all.  Counts are summed
    over every cell sharing the same ``(model_slug, method)`` pair, and the
    R@k columns are rendered by ``_pct`` from the summed detected-at-k count
    and the summed injected count.

    Args:
        cells: Scored results to aggregate.
    """
    scored = [cell for cell in cells if cell.has_comment_efficiency_metrics]
    if not scored:
        return

    print("## Comment-Efficiency Metrics — per model × method\n")
    print("| model | method | comments | detected | R@1 | R@3 | R@5 | R@10 | comments/detected | detected/comment |")
    print("|-------|--------:|---------:|---------:|----:|----:|----:|-----:|------------------:|-----------------:|")

    # Running-total key -> name of the cell attribute accumulated under it.
    counter_sources = {
        "comments": "n_total_comments",
        "inj": "n_injected",
        "det": "n_detected",
        "at1": "n_detected_at_1",
        "at3": "n_detected_at_3",
        "at5": "n_detected_at_5",
        "at10": "n_detected_at_10",
    }
    totals: dict[tuple[str, str], dict[str, int]] = defaultdict(
        lambda: dict.fromkeys(counter_sources, 0)
    )
    for cell in scored:
        bucket = totals[(cell.model_slug, cell.method)]
        for counter, attr in counter_sources.items():
            bucket[counter] += getattr(cell, attr)

    # One row per (model, method), in sorted order for stable output.
    for group_key in sorted(totals):
        model, method = group_key
        t = totals[group_key]
        recall_cells = " | ".join(
            _pct(t[counter], t["inj"]) for counter in ("at1", "at3", "at5", "at10")
        )
        comments_per_detected = _ratio(t["comments"], t["det"])
        detected_per_comment = _ratio(t["det"], t["comments"])
        print(
            f"| {model} | {method} | {t['comments']} | {t['det']} | "
            f"{recall_cells} | {comments_per_detected} | {detected_per_comment} |"
        )
    print()


def print_recall_by_length_method(cells: list[CellResult]) -> None:
lengths = sorted({c.length for c in cells})
if len(lengths) <= 1:
Expand Down Expand Up @@ -474,6 +539,7 @@ def _render_report(results_dirs: list[Path]) -> None:
print_overall_by_method(all_cells)
print_recall_by_length_method(all_cells)
print_recall_by_model_method(all_cells)
print_comment_efficiency_metrics_by_model_method(all_cells)
print_recall_by_length_model_method(all_cells)
print_recall_by_error_type_x_method(all_cells)
print_recall_by_error_type(all_cells)
Expand Down
11 changes: 11 additions & 0 deletions benchmarks/perturbation/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,14 @@ class PerturbationResult:
n_total_comments: int
detected: list[str] # perturbation_ids where step 1 + step 2 passed
missed: list[str] # perturbation_ids where detection failed
first_matching_comment_index: dict[str, int] = field(default_factory=dict)
n_detected_at_1: int = 0
n_detected_at_3: int = 0
n_detected_at_5: int = 0
n_detected_at_10: int = 0
recall_at_1: float = 0.0
recall_at_3: float = 0.0
recall_at_5: float = 0.0
recall_at_10: float = 0.0
comments_per_detected_error: float | None = None
detected_per_comment: float = 0.0
45 changes: 42 additions & 3 deletions benchmarks/perturbation/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# (normalized) comment quote. Same coverage notion used by
# reviewer.utils.locate_comment_in_document.
_FUZZY_QUOTE_THRESHOLD = 0.75
_COMMENT_EFFICIENCY_CUTOFFS = (1, 3, 5, 10)

def score_review(perturbations: list[Perturbation],
review_comments: list[dict],
Expand All @@ -19,9 +20,10 @@ def score_review(perturbations: list[Perturbation],

n_detected = 0
detected = []
first_matching_comment_index: dict[str, int] = {}

for p in perturbations:
for comment in review_comments:
for comment_idx, comment in enumerate(review_comments):
if not _substring_match(comment.get('quote', ''), p.perturbed):
continue

Expand All @@ -35,6 +37,7 @@ def score_review(perturbations: list[Perturbation],
if explanation_match:
n_detected += 1
detected.append(p.perturbation_id)
first_matching_comment_index[p.perturbation_id] = comment_idx
break

missed = []
Expand All @@ -43,8 +46,44 @@ def score_review(perturbations: list[Perturbation],
missed.append(p.perturbation_id)

recall = n_detected / n_injected if n_injected > 0 else 0.0
comment_efficiency_metrics = _comment_efficiency_metrics(
first_matching_comment_index,
n_injected=n_injected,
n_detected=n_detected,
n_total_comments=n_total_comments,
)

return PerturbationResult(
n_injected=n_injected,
n_detected=n_detected,
recall=recall,
n_total_comments=n_total_comments,
detected=detected,
missed=missed,
first_matching_comment_index=first_matching_comment_index,
**comment_efficiency_metrics,
)

return PerturbationResult(n_injected=n_injected, n_detected=n_detected, recall=recall, n_total_comments=n_total_comments, detected=detected, missed=missed)

def _comment_efficiency_metrics(
    first_matching_comment_index: dict[str, int],
    n_injected: int,
    n_detected: int,
    n_total_comments: int,
) -> dict:
    """Derive the comment-efficiency metrics for one scored review.

    For every cutoff ``k`` in ``_COMMENT_EFFICIENCY_CUTOFFS`` the result holds
    ``n_detected_at_{k}`` (entries whose first matching comment index is below
    ``k``) and ``recall_at_{k}`` (that count divided by ``n_injected``, or
    ``0.0`` when nothing was injected).

    Args:
        first_matching_comment_index: Perturbation id -> index of the first
            comment that detected it.
        n_injected: Number of injected perturbations.
        n_detected: Number of detected perturbations.
        n_total_comments: Number of review comments.

    Returns:
        A dict with the per-cutoff entries followed by
        ``comments_per_detected_error`` (``None`` when nothing was detected)
        and ``detected_per_comment`` (``0.0`` when there are no comments).
    """
    first_hits = list(first_matching_comment_index.values())
    metrics: dict[str, float | int | None] = {}

    for cutoff in _COMMENT_EFFICIENCY_CUTOFFS:
        # Strict "<": an index of 0 already counts at cutoff 1.
        hits = len([idx for idx in first_hits if idx < cutoff])
        metrics[f"n_detected_at_{cutoff}"] = hits
        if n_injected > 0:
            metrics[f"recall_at_{cutoff}"] = hits / n_injected
        else:
            metrics[f"recall_at_{cutoff}"] = 0.0

    if n_detected > 0:
        metrics["comments_per_detected_error"] = n_total_comments / n_detected
    else:
        # Undefined rather than zero: no detection means no per-detection cost.
        metrics["comments_per_detected_error"] = None

    if n_total_comments > 0:
        metrics["detected_per_comment"] = n_detected / n_total_comments
    else:
        metrics["detected_per_comment"] = 0.0

    return metrics


def _substring_match(quote, perturbed) -> bool:
Expand Down Expand Up @@ -107,4 +146,4 @@ def _explanation_match_semantic(explanation, why_wrong) -> bool:

sim = util.cos_sim(emb1, emb2)

return float(sim) >= 0.60
return float(sim) >= 0.60
11 changes: 11 additions & 0 deletions src/reviewer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,17 @@ def cmd_score(args: argparse.Namespace) -> None:
"n_total_comments": result.n_total_comments,
"detected": result.detected,
"missed": result.missed,
"first_matching_comment_index": result.first_matching_comment_index,
"n_detected_at_1": result.n_detected_at_1,
"n_detected_at_3": result.n_detected_at_3,
"n_detected_at_5": result.n_detected_at_5,
"n_detected_at_10": result.n_detected_at_10,
"recall_at_1": result.recall_at_1,
"recall_at_3": result.recall_at_3,
"recall_at_5": result.recall_at_5,
"recall_at_10": result.recall_at_10,
"comments_per_detected_error": result.comments_per_detected_error,
"detected_per_comment": result.detected_per_comment,
}
output_path.write_text(json.dumps(score_data, indent=2))
print(f"\nResults saved to: {output_path}")
Expand Down
117 changes: 117 additions & 0 deletions tests/test_perturbation_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Unit tests for perturbation benchmark scoring metrics."""

import sys
import types
from pathlib import Path

# Put the in-repo ``src`` and ``benchmarks`` trees on the import path.  Each
# root is inserted at the front, so ``benchmarks`` ends up ahead of ``src``.
_REPO = Path(__file__).resolve().parents[1]
_SRC = _REPO / "src"
_BENCHMARKS = _REPO / "benchmarks"
for _import_root in (str(_SRC), str(_BENCHMARKS)):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)

# Lightweight stand-ins registered under third-party module names, presumably
# so the perturbation modules imported below load without those packages.
# NOTE(review): a stub is installed whenever the name is absent from
# ``sys.modules``, so it also shadows a real package that is installed but not
# yet imported, for the rest of the test session — confirm this is intended.
_STUB_MODULES = {
    "tiktoken": types.SimpleNamespace(
        get_encoding=lambda _name: types.SimpleNamespace(
            encode=lambda text: text.split(),
            decode=lambda tokens: " ".join(tokens),
        )
    ),
    "rapidfuzz": types.SimpleNamespace(
        fuzz=types.SimpleNamespace(
            token_set_ratio=lambda a, b: 100 if set(a.split()) == set(b.split()) else 0
        )
    ),
    "sentence_transformers": types.SimpleNamespace(
        SentenceTransformer=lambda _name: None,
        util=types.SimpleNamespace(cos_sim=lambda _a, _b: 0.0),
    ),
}
for _module_name, _stub in _STUB_MODULES.items():
    # setdefault leaves an already-registered module untouched.
    sys.modules.setdefault(_module_name, _stub)

from perturbation.models import Error, Perturbation
from perturbation.score import score_review


def _perturbation(pid: str, perturbed: str) -> Perturbation:
    """Build a fixture ``Perturbation`` that differs only in id and perturbed text.

    Every other field is a fixed placeholder; ``why_wrong`` equals the default
    explanation produced by ``_comment``.
    """
    fixture_fields = dict(
        perturbation_id=pid,
        span_id=f"S_{pid}",
        error=Error.OPERATOR_OR_SIGN,
        original="x + y",
        offset=0,
        perturbed=perturbed,
        why_wrong="same explanation",
    )
    return Perturbation(**fixture_fields)


def _comment(quote: str, explanation: str = "same explanation") -> dict:
    """Return a review-comment dict with the given quote and explanation.

    ``title`` and ``comment_type`` are fixed placeholders.
    """
    return dict(
        title="issue",
        quote=quote,
        explanation=explanation,
        comment_type="technical",
    )


def test_comment_efficiency_metrics_count_detection_at_comment_zero_for_all_cutoffs():
    """A detection by the first comment (index 0) counts toward every cutoff."""
    perturbations = [_perturbation("P0", "x - y")]
    comments = [_comment("The paper states x - y.")]

    result = score_review(perturbations, comments, model="unused", method="fuzzy")

    assert result.first_matching_comment_index == {"P0": 0}
    cutoffs = (1, 3, 5, 10)
    for cutoff in cutoffs:
        assert getattr(result, f"n_detected_at_{cutoff}") == 1
    for cutoff in cutoffs:
        assert getattr(result, f"recall_at_{cutoff}") == 1.0


def test_comment_efficiency_metrics_count_detection_at_comment_four_for_larger_cutoffs_only():
    """A first match at comment index 4 counts only for cutoffs 5 and 10."""
    # Four non-matching comments push the matching one to index 4.
    comments = [_comment(f"unrelated {i}") for i in range(4)]
    comments.append(_comment("The paper states a >= b."))

    result = score_review(
        [_perturbation("P0", "a >= b")],
        comments,
        model="unused",
        method="fuzzy",
    )

    assert result.first_matching_comment_index == {"P0": 4}
    expected_hits = {1: 0, 3: 0, 5: 1, 10: 1}
    for cutoff, hits in expected_hits.items():
        assert getattr(result, f"n_detected_at_{cutoff}") == hits
    for cutoff, hits in expected_hits.items():
        # Single injected perturbation, so recall equals the hit count.
        assert getattr(result, f"recall_at_{cutoff}") == float(hits)


def test_comment_efficiency_metrics_handle_no_detections_safely():
    """With no matching comment, counts are zero and the per-detection ratio is None."""
    perturbations = [_perturbation("P0", "x - y")]
    unrelated_comments = [_comment("unrelated"), _comment("also unrelated")]

    result = score_review(
        perturbations,
        unrelated_comments,
        model="unused",
        method="fuzzy",
    )

    assert result.n_detected == 0
    assert result.n_detected_at_1 == 0
    assert result.recall_at_10 == 0.0
    assert result.comments_per_detected_error is None
    assert result.detected_per_comment == 0.0