diff --git a/benchmarks/perturbation/README.md b/benchmarks/perturbation/README.md index a9521f3..33d4bb4 100644 --- a/benchmarks/perturbation/README.md +++ b/benchmarks/perturbation/README.md @@ -53,29 +53,36 @@ of the third-party systems. ## Configuration -Configs are YAML files in `configs/`. Copy `default.yaml` and edit to create experiment variants. Committed configs serve as the experiment log. +`run_benchmark.py` now uses the unified runner schema below. It rejects unknown +keys at load time, so older experiment configs in `configs/` that contain +generation-era fields such as `max_papers`, `length`, `error_type`, and +`perturb_model` are retained as historical experiment logs and are not directly +loadable by the current unified runner. ```yaml -max_papers: 5 -length: short # short (2k-7k words) | medium (7k-17k) | long (>17k) -error_type: surface # surface | formal | all +system: openaireview # openaireview | coarse | reviewer3 +input_dir: benchmarks/perturbation/results/perturbations +results_dir: benchmarks/perturbation/results +max_tokens: 13000 +min_perturbations: 0 score_method: llm # llm | fuzzy | semantic +score_model: google/gemini-3-flash-preview -perturb_model: google/gemini-3-flash-preview -score_model: google/gemini-3-flash-preview - -review_models: +models: - google/gemini-3-flash-preview - z-ai/glm-4.6 -review_methods: +methods: # required for system: openaireview - zero_shot - progressive - -results_dir: benchmarks/perturbation/results ``` -Papers are streamed from the [proof-pile](https://huggingface.co/datasets/hoskinson-center/proof-pile) dataset and binned by word count. +For `--stages score,report`, the configured `input_dir` must already contain +prepared upstream perturbation artifacts named `*_recorrupted.md` and +`*_kept_perturbations.json`, and `results_dir` must already contain matching +review JSONs under the layout shown below. This repository does not currently +check in those prepared/reviewed artifacts, so score/report smoke tests require +local benchmark outputs from an earlier prepare/review run. ## Results Layout diff --git a/benchmarks/perturbation/generate_report.py b/benchmarks/perturbation/generate_report.py index 986887b..ec5b538 100644 --- a/benchmarks/perturbation/generate_report.py +++ b/benchmarks/perturbation/generate_report.py @@ -79,6 +79,17 @@ class CellResult: detected: list = field(default_factory=list) missed: list = field(default_factory=list) n_total_comments: int = 0 + has_comment_efficiency_metrics: bool = False + n_detected_at_1: int = 0 + n_detected_at_3: int = 0 + n_detected_at_5: int = 0 + n_detected_at_10: int = 0 + recall_at_1: float = 0.0 + recall_at_3: float = 0.0 + recall_at_5: float = 0.0 + recall_at_10: float = 0.0 + comments_per_detected_error: float | None = None + detected_per_comment: float = 0.0 # from review JSON prompt_tokens: int = 0 completion_tokens: int = 0 @@ -161,6 +172,17 @@ def load_results(results_dir: Path, length: str, gt: dict[str, dict[str, str]]) detected=score_data.get("detected", []), missed=score_data.get("missed", []), n_total_comments=score_data.get("n_total_comments", 0), + has_comment_efficiency_metrics="n_detected_at_1" in score_data, + n_detected_at_1=score_data.get("n_detected_at_1", 0), + n_detected_at_3=score_data.get("n_detected_at_3", 0), + n_detected_at_5=score_data.get("n_detected_at_5", 0), + n_detected_at_10=score_data.get("n_detected_at_10", 0), + recall_at_1=score_data.get("recall_at_1", 0.0), + recall_at_3=score_data.get("recall_at_3", 0.0), + recall_at_5=score_data.get("recall_at_5", 0.0), + recall_at_10=score_data.get("recall_at_10", 0.0), + comments_per_detected_error=score_data.get("comments_per_detected_error"), + detected_per_comment=score_data.get("detected_per_comment", 0.0), ) # Per-error breakdown from manifest @@ -271,6 +293,49 @@ def print_recall_by_model_method(cells: list[CellResult]) -> None: print() +def _ratio(num: int, den: int) -> str: + return f"{num / den:.2f}" if den else "—" + + +def print_comment_efficiency_metrics_by_model_method(cells: list[CellResult]) -> None: + metric_cells = [c for c in cells if c.has_comment_efficiency_metrics] + if not metric_cells: + return + + print("## Comment-Efficiency Metrics — per model × method\n") + print("| model | method | comments | detected | R@1 | R@3 | R@5 | R@10 | comments/detected | detected/comment |") + print("|-------|--------:|---------:|---------:|----:|----:|----:|-----:|------------------:|-----------------:|") + + groups: dict[tuple[str, str], dict[str, int]] = defaultdict(lambda: { + "comments": 0, + "inj": 0, + "det": 0, + "at1": 0, + "at3": 0, + "at5": 0, + "at10": 0, + }) + for c in metric_cells: + g = groups[(c.model_slug, c.method)] + g["comments"] += c.n_total_comments + g["inj"] += c.n_injected + g["det"] += c.n_detected + g["at1"] += c.n_detected_at_1 + g["at3"] += c.n_detected_at_3 + g["at5"] += c.n_detected_at_5 + g["at10"] += c.n_detected_at_10 + + for model, method in sorted(groups): + g = groups[(model, method)] + print( + f"| {model} | {method} | {g['comments']} | {g['det']} | " + f"{_pct(g['at1'], g['inj'])} | {_pct(g['at3'], g['inj'])} | " + f"{_pct(g['at5'], g['inj'])} | {_pct(g['at10'], g['inj'])} | " + f"{_ratio(g['comments'], g['det'])} | {_ratio(g['det'], g['comments'])} |" + ) + print() + + def print_recall_by_length_method(cells: list[CellResult]) -> None: lengths = sorted({c.length for c in cells}) if len(lengths) <= 1: @@ -474,6 +539,7 @@ def _render_report(results_dirs: list[Path]) -> None: print_overall_by_method(all_cells) print_recall_by_length_method(all_cells) print_recall_by_model_method(all_cells) + print_comment_efficiency_metrics_by_model_method(all_cells) print_recall_by_length_model_method(all_cells) print_recall_by_error_type_x_method(all_cells) print_recall_by_error_type(all_cells) diff --git a/benchmarks/perturbation/models.py b/benchmarks/perturbation/models.py index f178970..d9621df 100644 --- a/benchmarks/perturbation/models.py +++ b/benchmarks/perturbation/models.py @@ -92,3 +92,14 @@ class PerturbationResult: n_total_comments: int detected: list[str] # perturbation_ids where step 1 + step 2 passed missed: list[str] # perturbation_ids where detection failed + first_matching_comment_index: dict[str, int] = field(default_factory=dict) + n_detected_at_1: int = 0 + n_detected_at_3: int = 0 + n_detected_at_5: int = 0 + n_detected_at_10: int = 0 + recall_at_1: float = 0.0 + recall_at_3: float = 0.0 + recall_at_5: float = 0.0 + recall_at_10: float = 0.0 + comments_per_detected_error: float | None = None + detected_per_comment: float = 0.0 diff --git a/benchmarks/perturbation/score.py b/benchmarks/perturbation/score.py index a388a4e..b924da5 100644 --- a/benchmarks/perturbation/score.py +++ b/benchmarks/perturbation/score.py @@ -9,6 +9,7 @@ # (normalized) comment quote. Same coverage notion used by # reviewer.utils.locate_comment_in_document. _FUZZY_QUOTE_THRESHOLD = 0.75 +_COMMENT_EFFICIENCY_CUTOFFS = (1, 3, 5, 10) def score_review(perturbations: list[Perturbation], review_comments: list[dict], @@ -19,9 +20,10 @@ def score_review(perturbations: list[Perturbation], n_detected = 0 detected = [] + first_matching_comment_index: dict[str, int] = {} for p in perturbations: - for comment in review_comments: + for comment_idx, comment in enumerate(review_comments): if not _substring_match(comment.get('quote', ''), p.perturbed): continue @@ -35,6 +37,7 @@ def score_review(perturbations: list[Perturbation], if explanation_match: n_detected += 1 detected.append(p.perturbation_id) + first_matching_comment_index[p.perturbation_id] = comment_idx break missed = [] @@ -43,8 +46,44 @@ def score_review(perturbations: list[Perturbation], missed.append(p.perturbation_id) recall = n_detected / n_injected if n_injected > 0 else 0.0 + comment_efficiency_metrics = _comment_efficiency_metrics( + first_matching_comment_index, + n_injected=n_injected, + n_detected=n_detected, + n_total_comments=n_total_comments, + ) + + return PerturbationResult( + n_injected=n_injected, + n_detected=n_detected, + recall=recall, + n_total_comments=n_total_comments, + detected=detected, + missed=missed, + first_matching_comment_index=first_matching_comment_index, + **comment_efficiency_metrics, + ) - return PerturbationResult(n_injected=n_injected, n_detected=n_detected, recall=recall, n_total_comments=n_total_comments, detected=detected, missed=missed) + +def _comment_efficiency_metrics( + first_matching_comment_index: dict[str, int], + n_injected: int, + n_detected: int, + n_total_comments: int, +) -> dict: + metrics: dict[str, float | int | None] = {} + for k in _COMMENT_EFFICIENCY_CUTOFFS: + n_at_k = sum(1 for idx in first_matching_comment_index.values() if idx < k) + metrics[f"n_detected_at_{k}"] = n_at_k + metrics[f"recall_at_{k}"] = n_at_k / n_injected if n_injected > 0 else 0.0 + + metrics["comments_per_detected_error"] = ( + n_total_comments / n_detected if n_detected > 0 else None + ) + metrics["detected_per_comment"] = ( + n_detected / n_total_comments if n_total_comments > 0 else 0.0 + ) + return metrics def _substring_match(quote, perturbed) -> bool: @@ -107,4 +146,4 @@ def _explanation_match_semantic(explanation, why_wrong) -> bool: sim = util.cos_sim(emb1, emb2) - return float(sim) >= 0.60 \ No newline at end of file + return float(sim) >= 0.60 diff --git a/src/reviewer/cli.py b/src/reviewer/cli.py index c5828d7..95176f9 100644 --- a/src/reviewer/cli.py +++ b/src/reviewer/cli.py @@ -441,6 +441,17 @@ def cmd_score(args: argparse.Namespace) -> None: "n_total_comments": result.n_total_comments, "detected": result.detected, "missed": result.missed, + "first_matching_comment_index": result.first_matching_comment_index, + "n_detected_at_1": result.n_detected_at_1, + "n_detected_at_3": result.n_detected_at_3, + "n_detected_at_5": result.n_detected_at_5, + "n_detected_at_10": result.n_detected_at_10, + "recall_at_1": result.recall_at_1, + "recall_at_3": result.recall_at_3, + "recall_at_5": result.recall_at_5, + "recall_at_10": result.recall_at_10, + "comments_per_detected_error": result.comments_per_detected_error, + "detected_per_comment": result.detected_per_comment, } output_path.write_text(json.dumps(score_data, indent=2)) print(f"\nResults saved to: {output_path}") diff --git a/tests/test_perturbation_score.py b/tests/test_perturbation_score.py new file mode 100644 index 0000000..7b4afe3 --- /dev/null +++ b/tests/test_perturbation_score.py @@ -0,0 +1,117 @@ +"""Unit tests for perturbation benchmark scoring metrics.""" + +import sys +import types +from pathlib import Path + +_REPO = Path(__file__).resolve().parents[1] +_SRC = _REPO / "src" +_BENCHMARKS = _REPO / "benchmarks" +if str(_SRC) not in sys.path: + sys.path.insert(0, str(_SRC)) +if str(_BENCHMARKS) not in sys.path: + sys.path.insert(0, str(_BENCHMARKS)) + +if "tiktoken" not in sys.modules: + sys.modules["tiktoken"] = types.SimpleNamespace( + get_encoding=lambda _name: types.SimpleNamespace( + encode=lambda text: text.split(), + decode=lambda tokens: " ".join(tokens), + ) + ) +if "rapidfuzz" not in sys.modules: + sys.modules["rapidfuzz"] = types.SimpleNamespace( + fuzz=types.SimpleNamespace( + token_set_ratio=lambda a, b: 100 if set(a.split()) == set(b.split()) else 0 + ) + ) +if "sentence_transformers" not in sys.modules: + sys.modules["sentence_transformers"] = types.SimpleNamespace( + SentenceTransformer=lambda _name: None, + util=types.SimpleNamespace(cos_sim=lambda _a, _b: 0.0), + ) + +from perturbation.models import Error, Perturbation +from perturbation.score import score_review + + +def _perturbation(pid: str, perturbed: str) -> Perturbation: + return Perturbation( + perturbation_id=pid, + span_id=f"S_{pid}", + error=Error.OPERATOR_OR_SIGN, + original="x + y", + offset=0, + perturbed=perturbed, + why_wrong="same explanation", + ) + + +def _comment(quote: str, explanation: str = "same explanation") -> dict: + return { + "title": "issue", + "quote": quote, + "explanation": explanation, + "comment_type": "technical", + } + + +def test_comment_efficiency_metrics_count_detection_at_comment_zero_for_all_cutoffs(): + result = score_review( + [_perturbation("P0", "x - y")], + [_comment("The paper states x - y.")], + model="unused", + method="fuzzy", + ) + + assert result.first_matching_comment_index == {"P0": 0} + assert result.n_detected_at_1 == 1 + assert result.n_detected_at_3 == 1 + assert result.n_detected_at_5 == 1 + assert result.n_detected_at_10 == 1 + assert result.recall_at_1 == 1.0 + assert result.recall_at_3 == 1.0 + assert result.recall_at_5 == 1.0 + assert result.recall_at_10 == 1.0 + + +def test_comment_efficiency_metrics_count_detection_at_comment_four_for_larger_cutoffs_only(): + comments = [ + _comment("unrelated 0"), + _comment("unrelated 1"), + _comment("unrelated 2"), + _comment("unrelated 3"), + _comment("The paper states a >= b."), + ] + + result = score_review( + [_perturbation("P0", "a >= b")], + comments, + model="unused", + method="fuzzy", + ) + + assert result.first_matching_comment_index == {"P0": 4} + assert result.n_detected_at_1 == 0 + assert result.n_detected_at_3 == 0 + assert result.n_detected_at_5 == 1 + assert result.n_detected_at_10 == 1 + assert result.recall_at_1 == 0.0 + assert result.recall_at_3 == 0.0 + assert result.recall_at_5 == 1.0 + assert result.recall_at_10 == 1.0 + + +def test_comment_efficiency_metrics_handle_no_detections_safely(): + result = score_review( + [_perturbation("P0", "x - y")], + [_comment("unrelated"), _comment("also unrelated")], + model="unused", + method="fuzzy", + ) + + assert result.n_detected == 0 + assert result.n_detected_at_1 == 0 + assert result.recall_at_10 == 0.0 + assert result.comments_per_detected_error is None + assert result.detected_per_comment == 0.0