88 changes: 42 additions & 46 deletions src/pruna/evaluation/benchmarks.py
@@ -18,6 +18,9 @@
from pruna.data.utils import get_literal_values_from_param
from pruna.evaluation.metrics import MetricRegistry

TASK_TYPE_TEXT_IMAGE = "text_to_image"
TASK_TYPE_TEXT_PLUS_IMAGE_IMAGE = "text+image_image"


@dataclass
class Benchmark:
@@ -31,9 +34,11 @@ class Benchmark:
description : str
Description of what the benchmark evaluates.
metrics : list[str]
List of metric names used for evaluation.
Metric names from ``MetricRegistry`` that the ``reference`` paper
explicitly names for that benchmark.
task_type : str
Type of task the benchmark evaluates (e.g., 'text_to_image').
Type of task the benchmark evaluates (e.g., ``text_to_image``,
``text+image_image``, ``text_to_video``).
reference : str | None
URL to the canonical paper (e.g., arXiv) for this benchmark.
"""
@@ -62,24 +67,11 @@ class BenchmarkRegistry:
"""
Registry for benchmarks.

Metrics per benchmark are set to those explicitly used in the reference
paper (see reference URL). All entries verified from paper evaluation
sections (ar5iv/HTML or PDF) as of verification pass:

- Parti Prompts (2206.10789 §5.2, §5.4): human side-by-side only on P222.
- DrawBench (2205.11487 §4.3): human raters only; COCO uses FID + CLIP.
- GenAI Bench (2406.13743): VQAScore only (web/PWC; ar5iv failed).
- VBench (2311.17982): 16 dimension-specific methods; no single Pruna metric.
- COCO (2205.11487 §4.1): FID and CLIP score for fidelity and alignment.
- ImageNet (1409.0575 §4): top-1/top-5 classification accuracy.
- WikiText (1609.07843 §5): perplexity on validation/test.
- GenEval (2310.11513 §3.2): Mask2Former + CLIP color pipeline, binary score.
- HPS (2306.09341): HPS v2 scoring model (CLIP fine-tuned on HPD v2).
- ImgEdit (2505.20275 §4.2): GPT-4o 1–5 ratings and ImgEdit-Judge.
- Long Text Bench (2507.22058 §4): Text Accuracy (OCR, Qwen2.5-VL-7B).
- GEditBench (2504.17761 §4.2): VIEScore (SQ, PQ, O via GPT-4.1/Qwen2.5-VL).
- OneIG (2506.07977 §4.1): per-dimension metrics (semantic alignment, ED, etc.).
- DPG (2403.05135): DSG-style graph score, mPLUG-large adjudicator.
Each entry's ``metrics`` lists only ``MetricRegistry`` names that have a
directly named counterpart in the benchmark reference paper (e.g.
CLIPScore -> ``clip_score``, VQAScore -> ``vqa``, FID -> ``fid``).
If the paper uses a method with no matching registered metric, the list is
kept empty and callers should pass explicit metrics to ``Task``.
"""

_registry: dict[str, Benchmark] = {}
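A hedged sketch of the caller-side pattern the docstring describes. ``BenchmarkRegistry.list`` and ``Task.from_benchmark`` appear elsewhere in this file; the keyword arguments accepted by ``Task`` itself are an assumption here, not confirmed by this diff.

# List registered text-to-image benchmarks.
t2i_names = BenchmarkRegistry.list(task_type=TASK_TYPE_TEXT_IMAGE)

# DrawBench registers no metrics (its paper uses human raters only), so a
# caller supplies explicit metric names; the ``metrics`` and ``benchmark``
# keywords below are assumptions about Task's signature.
task = Task(benchmark="DrawBench", metrics=["clip_score", "fid"])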
@@ -154,7 +146,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"perspectives, and symbol rendering from basic to complex compositions."
),
metrics=[], # Paper uses human evaluation only; pass explicit metrics if needed
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2206.10789",
),
Benchmark(
@@ -164,7 +156,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Enables side-by-side comparison on sample quality and image-text alignment with human raters."
),
metrics=[], # Paper uses human evaluation only; pass explicit metrics if needed
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2205.11487",
),
Benchmark(
@@ -174,8 +166,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Covers basic skills (scene, attributes, spatial relationships) to advanced reasoning "
"(counting, comparison, logic/negation) with over 24k human ratings."
),
metrics=["vqa", "clip_score"],
task_type="text_to_image",
metrics=["vqa", "clip_score"], # VQAScore + CLIPScore both named (arXiv:2406.13743)
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2406.13743",
),
Benchmark(
@@ -195,8 +187,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
"MS-COCO for text-to-image evaluation (Imagen, 2205.11487). Paper reports "
"FID for fidelity and CLIP score for image-text alignment."
),
metrics=["fid", "clip_score"], # §4.1: FID + CLIP score
task_type="text_to_image",
metrics=["fid", "clip_score"],
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2205.11487",
),
Benchmark(
@@ -223,11 +215,13 @@ def list(cls, task_type: str | None = None) -> list[str]:
name="GenEval",
description=(
"Compositional text-to-image benchmark with 6 categories: single object, two object, "
"counting, colors, position, color attributes. Evaluates fine-grained alignment "
"between prompts and generated images via VQA-style questions."
"counting, colors, position, color attributes. Uses atomic yes/no questions per prompt; "
"``Task.from_benchmark`` wires ``qa_accuracy`` with strict per-image aggregation "
"(all questions must pass) plus ``clip_score``. For holistic VQAScore-style scoring "
"use GenAI Bench with ``vqa``."
),
metrics=["qa_accuracy", "clip_score"], # strict QA + CLIP score
task_type="text_to_image",
metrics=["qa_accuracy", "clip_score"],
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2310.11513",
),
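Since the GenEval description above points callers at ``Task.from_benchmark``, a one-line usage sketch (assuming the registered name is passed verbatim):

geneval_task = Task.from_benchmark("GenEval")
# Expected wiring per the description: qa_accuracy with strict per-image
# aggregation (all atomic questions must pass) plus clip_score.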
Benchmark(
@@ -237,7 +231,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Covers anime, concept-art, paintings, and photo styles with human preference data."
),
metrics=[], # Paper uses HPS scoring model; not in Pruna
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2306.09341",
),
Benchmark(
@@ -246,71 +240,73 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Image editing benchmark with 8 edit types: replace, add, remove, adjust, extract, "
"style, background, compose. Evaluates instruction-following for inpainting and editing."
),
metrics=[], # Paper uses GPT-4o/ImgEdit-Judge; not in Pruna
task_type="text_to_image",
metrics=["img_edit_score"], # Paper uses GPT-4o rubric scores + FakeShield judge.
task_type=TASK_TYPE_TEXT_PLUS_IMAGE_IMAGE,
reference="https://arxiv.org/abs/2505.20275",
),
Benchmark(
name="Long Text Bench",
description=(
"Text-to-image benchmark for long, detailed prompts. Evaluates model ability to "
"handle complex multi-clause descriptions and maintain coherence across long instructions."
"Text rendering benchmark evaluating whether T2I models correctly render specific text strings "
"provided in prompts. Uses ``text_score`` (normalized character accuracy in [0, 1]). "
"This is OCR correctness, not long-prompt semantic alignment."
),
metrics=["text_score"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2507.22058",
),
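As a reading aid for ``text_score`` (normalized character accuracy in [0, 1]), a rough illustration of the idea; this is not the metric's actual implementation, which may normalize case or whitespace or use an edit-distance formulation instead.

import difflib

def normalized_char_accuracy(rendered: str, expected: str) -> float:
    # Character-level similarity ratio, already bounded to [0, 1].
    if not expected:
        return 1.0 if not rendered else 0.0
    return difflib.SequenceMatcher(None, rendered, expected).ratio()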
Benchmark(
name="GEditBench",
description=(
"General image editing benchmark with 11 task types: background change, color alter, "
"material alter, motion change, style change, subject add/remove/replace, text change, "
"tone transfer, and human retouching."
"tone transfer, and human retouching. "
"Evaluated with VIEScore in text-image-edit (TIE) mode when source image bytes are available."
),
metrics=["vie_score"],
task_type="text+image_image",
metrics=["vie_score"], # VIEScore is explicitly named in GEditBench.
task_type=TASK_TYPE_TEXT_PLUS_IMAGE_IMAGE,
reference="https://arxiv.org/abs/2504.17761",
),
Benchmark(
name="OneIG Anime Stylization",
description="OneIG subset: anime and stylized imagery.",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG General Object",
description="OneIG subset: everyday objects and scenes.",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Knowledge Reasoning",
description="OneIG subset: knowledge- and reasoning-heavy prompts.",
metrics=["oneig_reasoning"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Multilingualism",
description="OneIG subset: multilingual prompts (incl. Chinese splits).",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Portrait",
description="OneIG subset: people and portraits.",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Text Rendering",
description="OneIG subset: text and graphics painted into the image.",
metrics=["oneig_text_score"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
@@ -320,7 +316,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"global, and other descriptive aspects with natural-language questions for alignment."
),
metrics=[], # Paper uses custom evaluation; not in Pruna
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2403.05135",
),
]: