88 changes: 42 additions & 46 deletions src/pruna/evaluation/benchmarks.py
@@ -18,6 +18,9 @@
from pruna.data.utils import get_literal_values_from_param
from pruna.evaluation.metrics import MetricRegistry

TASK_TYPE_TEXT_IMAGE = "text_to_image"
TASK_TYPE_TEXT_PLUS_IMAGE_IMAGE = "text+image_image"


@dataclass
class Benchmark:
@@ -31,9 +34,11 @@ class Benchmark:
description : str
Description of what the benchmark evaluates.
metrics : list[str]
List of metric names used for evaluation.
Metric names from ``MetricRegistry`` that the ``reference`` paper
explicitly names for that benchmark.
task_type : str
Type of task the benchmark evaluates (e.g., 'text_to_image').
Type of task the benchmark evaluates (e.g., ``text_to_image``,
``text+image_image``, ``text_to_video``).
reference : str | None
URL to the canonical paper (e.g., arXiv) for this benchmark.
"""
@@ -62,24 +67,11 @@ class BenchmarkRegistry:
"""
Registry for benchmarks.

Metrics per benchmark are set to those explicitly used in the reference
paper (see reference URL). All entries verified from paper evaluation
sections (ar5iv/HTML or PDF) as of verification pass:

- Parti Prompts (2206.10789 §5.2, §5.4): human side-by-side only on P222.
- DrawBench (2205.11487 §4.3): human raters only; COCO uses FID + CLIP.
- GenAI Bench (2406.13743): VQAScore only (web/PWC; ar5iv failed).
- VBench (2311.17982): 16 dimension-specific methods; no single Pruna metric.
- COCO (2205.11487 §4.1): FID and CLIP score for fidelity and alignment.
- ImageNet (1409.0575 §4): top-1/top-5 classification accuracy.
- WikiText (1609.07843 §5): perplexity on validation/test.
- GenEval (2310.11513 §3.2): Mask2Former + CLIP color pipeline, binary score.
- HPS (2306.09341): HPS v2 scoring model (CLIP fine-tuned on HPD v2).
- ImgEdit (2505.20275 §4.2): GPT-4o 1–5 ratings and ImgEdit-Judge.
- Long Text Bench (2507.22058 §4): Text Accuracy (OCR, Qwen2.5-VL-7B).
- GEditBench (2504.17761 §4.2): VIEScore (SQ, PQ, O via GPT-4.1/Qwen2.5-VL).
- OneIG (2506.07977 §4.1): per-dimension metrics (semantic alignment, ED, etc.).
- DPG (2403.05135): DSG-style graph score, mPLUG-large adjudicator.
Each entry's ``metrics`` lists only ``MetricRegistry`` names that have a
directly named counterpart in the benchmark reference paper (e.g.
CLIPScore -> ``clip_score``, VQAScore -> ``vqa``, FID -> ``fid``).
If the paper uses a method with no matching registered metric, the list is
kept empty and callers should pass explicit metrics to ``Task``.
"""

_registry: dict[str, Benchmark] = {}
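A hedged sketch of the caller-side pattern the docstring describes. ``BenchmarkRegistry.list`` and ``Task.from_benchmark`` appear elsewhere in this file; the keyword arguments accepted by ``Task`` itself are an assumption here, not confirmed by this diff.

# List registered text-to-image benchmarks.
t2i_names = BenchmarkRegistry.list(task_type=TASK_TYPE_TEXT_IMAGE)

# DrawBench registers no metrics (its paper uses human raters only), so a
# caller supplies explicit metric names; the ``metrics`` and ``benchmark``
# keywords below are assumptions about Task's signature.
task = Task(benchmark="DrawBench", metrics=["clip_score", "fid"])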
@@ -154,7 +146,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"perspectives, and symbol rendering from basic to complex compositions."
),
metrics=[], # Paper uses human evaluation only; pass explicit metrics if needed
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2206.10789",
),
Benchmark(
@@ -164,7 +156,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Enables side-by-side comparison on sample quality and image-text alignment with human raters."
),
metrics=[], # Paper uses human evaluation only; pass explicit metrics if needed
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2205.11487",
),
Benchmark(
@@ -174,8 +166,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Covers basic skills (scene, attributes, spatial relationships) to advanced reasoning "
"(counting, comparison, logic/negation) with over 24k human ratings."
),
metrics=["vqa", "clip_score"],
task_type="text_to_image",
metrics=["vqa", "clip_score"], # VQAScore + CLIPScore both named (arXiv:2406.13743)
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2406.13743",
),
Benchmark(
@@ -195,8 +187,8 @@ def list(cls, task_type: str | None = None) -> list[str]:
"MS-COCO for text-to-image evaluation (Imagen, 2205.11487). Paper reports "
"FID for fidelity and CLIP score for image-text alignment."
),
metrics=["fid", "clip_score"], # §4.1: FID + CLIP score
task_type="text_to_image",
metrics=["fid", "clip_score"],
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2205.11487",
),
Benchmark(
@@ -223,11 +215,13 @@ def list(cls, task_type: str | None = None) -> list[str]:
name="GenEval",
description=(
"Compositional text-to-image benchmark with 6 categories: single object, two object, "
"counting, colors, position, color attributes. Evaluates fine-grained alignment "
"between prompts and generated images via VQA-style questions."
"counting, colors, position, color attributes. Uses atomic yes/no questions per prompt; "
"``Task.from_benchmark`` wires ``qa_accuracy`` with strict per-image aggregation "
"(all questions must pass) plus ``clip_score``. For holistic VQAScore-style scoring "
"use GenAI Bench with ``vqa``."
),
metrics=["qa_accuracy", "clip_score"], # strict QA + CLIP score
task_type="text_to_image",
metrics=["qa_accuracy", "clip_score"],
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2310.11513",
),
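Since the GenEval description above points callers at ``Task.from_benchmark``, a one-line usage sketch (assuming the registered name is passed verbatim):

geneval_task = Task.from_benchmark("GenEval")
# Expected wiring per the description: qa_accuracy with strict per-image
# aggregation (all atomic questions must pass) plus clip_score.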
Benchmark(
@@ -237,7 +231,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Covers anime, concept-art, paintings, and photo styles with human preference data."
),
metrics=[], # Paper uses HPS scoring model; not in Pruna
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2306.09341",
),
Benchmark(
@@ -246,71 +240,73 @@ def list(cls, task_type: str | None = None) -> list[str]:
"Image editing benchmark with 8 edit types: replace, add, remove, adjust, extract, "
"style, background, compose. Evaluates instruction-following for inpainting and editing."
),
metrics=[], # Paper uses GPT-4o/ImgEdit-Judge; not in Pruna
task_type="text_to_image",
metrics=["img_edit_score"], # Paper uses GPT-4o rubric scores + FakeShield judge.
task_type=TASK_TYPE_TEXT_PLUS_IMAGE_IMAGE,
reference="https://arxiv.org/abs/2505.20275",
),
Benchmark(
name="Long Text Bench",
description=(
"Text-to-image benchmark for long, detailed prompts. Evaluates model ability to "
"handle complex multi-clause descriptions and maintain coherence across long instructions."
"Text rendering benchmark evaluating whether T2I models correctly render specific text strings "
"provided in prompts. Uses ``text_score`` (normalized character accuracy in [0, 1]). "
"This is OCR correctness, not long-prompt semantic alignment."
),
metrics=["text_score"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2507.22058",
),
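As a reading aid for ``text_score`` (normalized character accuracy in [0, 1]), a rough illustration of the idea; this is not the metric's actual implementation, which may normalize case or whitespace or use an edit-distance formulation instead.

import difflib

def normalized_char_accuracy(rendered: str, expected: str) -> float:
    # Character-level similarity ratio, already bounded to [0, 1].
    if not expected:
        return 1.0 if not rendered else 0.0
    return difflib.SequenceMatcher(None, rendered, expected).ratio()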
Benchmark(
name="GEditBench",
description=(
"General image editing benchmark with 11 task types: background change, color alter, "
"material alter, motion change, style change, subject add/remove/replace, text change, "
"tone transfer, and human retouching."
"tone transfer, and human retouching. "
"Evaluated with VIEScore in text-image-edit (TIE) mode when source image bytes are available."
),
metrics=["vie_score"],
task_type="text+image_image",
metrics=["vie_score"], # VIEScore is explicitly named in GEditBench.
task_type=TASK_TYPE_TEXT_PLUS_IMAGE_IMAGE,
reference="https://arxiv.org/abs/2504.17761",
),
Benchmark(
name="OneIG Anime Stylization",
description="OneIG subset: anime and stylized imagery.",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG General Object",
description="OneIG subset: everyday objects and scenes.",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Knowledge Reasoning",
description="OneIG subset: knowledge- and reasoning-heavy prompts.",
metrics=["oneig_reasoning"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Multilingualism",
description="OneIG subset: multilingual prompts (incl. Chinese splits).",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Portrait",
description="OneIG subset: people and portraits.",
metrics=["oneig_alignment"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
name="OneIG Text Rendering",
description="OneIG subset: text and graphics painted into the image.",
metrics=["oneig_text_score"],
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2506.07977",
),
Benchmark(
@@ -320,7 +316,7 @@ def list(cls, task_type: str | None = None) -> list[str]:
"global, and other descriptive aspects with natural-language questions for alignment."
),
metrics=[], # Paper uses custom evaluation; not in Pruna
task_type="text_to_image",
task_type=TASK_TYPE_TEXT_IMAGE,
reference="https://arxiv.org/abs/2403.05135",
),
]: