ChicagoHAI · dangng2004 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -31,3 +31,9 @@ benchmarks/conference_study/reports/
 # Symlink targets — trailing-slash patterns wouldn't match the symlinks
 benchmarks/conference_study/analyses/manifests
 benchmarks/conference_study/analyses/results
+benchmarks/conference_study/manifests
+benchmarks/conference_study/papers
+benchmarks/conference_study/results
+
+# Exploratory perturbation experiments (not committed; large)
+benchmarks/experimental_perturbations/
diff --git a/benchmarks/perturbation/.gitignore b/benchmarks/perturbation/.gitignore
@@ -4,12 +4,19 @@ _temp_results/
 
 # Sample paper sources (data, not code; users supply their own)
 papers/
+data/
 
-# Run logs
-reports/*.log
+# Reports (regenerable from results/)
+reports/
 
-# Temporary / ephemeral configs
-configs/_*
+# Configs (mostly locally generated, per-domain, ephemeral files; the tracked
+# canonical ones are preserved by the `!` exceptions below).
+configs/*
+!configs/default.yaml
+!configs/coarse_medium.yaml
+!configs/coarse_short.yaml
+!configs/surface_errors.yaml
+!configs/surface_errors_medium.yaml
 
 # Python
 __pycache__/

diff --git a/benchmarks/perturbation/generate_report.py b/benchmarks/perturbation/generate_report.py
@@ -135,15 +135,22 @@ def _extract_tokens_from_review(review_dir: Path, method: str, model_slug: str):
     return prompt, comp, cost
 
 
-def load_results(results_dir: Path, length: str, gt: dict[str, dict[str, str]]) -> list[CellResult]:
-    """Walk score directories and build CellResult list."""
+def load_results(results_dir: Path, length: str, gt: dict[str, dict[str, str]],
+                 score_subdir: str = "llm") -> list[CellResult]:
+    """Walk score directories and build CellResult list.
+
+    `score_subdir` filters which scoring-mode subdir to aggregate (e.g. "llm",
+    "llm_t4_grounded"). Defaults to "llm" for backward compatibility.
+    """
     cells: list[CellResult] = []
 
     # Score JSONs live at: <model>/<error_type>/<method>/paper_NNN/score/<score_method>/*.json
     for score_path in sorted(results_dir.glob("*/*/*/paper_*/score/*/*.json")):
         parts = score_path.relative_to(results_dir).parts
         if len(parts) < 7 or parts[4] != "score":
             continue
+        if parts[5] != score_subdir:
+            continue
         model_slug, error_type, method, paper_label = parts[0], parts[1], parts[2], parts[3]
         if model_slug == "perturb":
             continue
@@ -431,7 +438,7 @@ def _infer_length(results_dir: Path, cfg: dict) -> str:
     return name
 
 
-def _render_report(results_dirs: list[Path]) -> None:
+def _render_report(results_dirs: list[Path], score_subdir: str = "llm") -> None:
     """Print the report to stdout. Helpers all use `print()`, so callers can
     capture this with `contextlib.redirect_stdout`."""
     all_cells: list[CellResult] = []
@@ -449,7 +456,7 @@ def _render_report(results_dirs: list[Path]) -> None:
                 cfg = yaml.safe_load(f) or {}
         length = _infer_length(rd, cfg)
         gt = load_ground_truth(rd)
-        cells = load_results(rd, length, gt)
+        cells = load_results(rd, length, gt, score_subdir=score_subdir)
         all_cells.extend(cells)
         for paper_label, perts in gt.items():
             all_gt[f"{length}:{paper_label}"] = perts
@@ -480,13 +487,13 @@ def _render_report(results_dirs: list[Path]) -> None:
     print_token_usage(all_cells)
 
 
-def generate_report(results_dirs: list[Path]) -> str:
+def generate_report(results_dirs: list[Path], score_subdir: str = "llm") -> str:
     """Return the markdown report as a string. Importable from `run_benchmark.py`."""
     import contextlib
     import io
     buf = io.StringIO()
     with contextlib.redirect_stdout(buf):
-        _render_report(results_dirs)
+        _render_report(results_dirs, score_subdir=score_subdir)
     return buf.getvalue()
 
 
@@ -498,8 +505,11 @@ def main() -> None:
                         help="One or more results directories.")
     parser.add_argument("--out", type=Path, default=None,
                         help="Write to this path (default: stdout).")
+    parser.add_argument("--score-subdir", default="llm",
+                        help="Which scoring-mode subdir to aggregate "
+                             "(e.g. llm, llm_t4_grounded). Default: llm.")
     args = parser.parse_args()
-    md = generate_report(args.results_dirs)
+    md = generate_report(args.results_dirs, score_subdir=args.score_subdir)
     if args.out is None:
         sys.stdout.write(md)
     else:

diff --git a/benchmarks/perturbation/reports/README.md b/benchmarks/perturbation/reports/README.md
diff --git a/benchmarks/perturbation/reports/combined.md b/benchmarks/perturbation/reports/combined.md