diff --git a/docs/source/base_evaluator.mdx b/docs/source/base_evaluator.mdx index 9eea7cd99..557243586 100644 --- a/docs/source/base_evaluator.mdx +++ b/docs/source/base_evaluator.mdx @@ -8,9 +8,6 @@ Currently supported tasks are: - `"question-answering"`: will use the [`QuestionAnsweringEvaluator`]. - `"image-classification"`: will use the [`ImageClassificationEvaluator`]. - `"text-generation"`: will use the [`TextGenerationEvaluator`]. -- `"text2text-generation"`: will use the [`Text2TextGenerationEvaluator`]. -- `"summarization"`: will use the [`SummarizationEvaluator`]. -- `"translation"`: will use the [`TranslationEvaluator`]. - `"automatic-speech-recognition"`: will use the [`AutomaticSpeechRecognitionEvaluator`]. - `"audio-classification"`: will use the [`AudioClassificationEvaluator`]. diff --git a/docs/source/package_reference/evaluator_classes.mdx b/docs/source/package_reference/evaluator_classes.mdx index 1d5e74c37..cbcd653ff 100644 --- a/docs/source/package_reference/evaluator_classes.mdx +++ b/docs/source/package_reference/evaluator_classes.mdx @@ -37,21 +37,6 @@ The base class for all evaluator classes: [[autodoc]] evaluate.TextGenerationEvaluator - compute -### Text2TextGenerationEvaluator - -[[autodoc]] evaluate.Text2TextGenerationEvaluator - - compute - -### SummarizationEvaluator - -[[autodoc]] evaluate.SummarizationEvaluator - - compute - -### TranslationEvaluator - -[[autodoc]] evaluate.TranslationEvaluator - - compute - ### AutomaticSpeechRecognitionEvaluator [[autodoc]] evaluate.AutomaticSpeechRecognitionEvaluator @@ -60,4 +45,4 @@ The base class for all evaluator classes: ### AudioClassificationEvaluator [[autodoc]] evaluate.AudioClassificationEvaluator - - compute \ No newline at end of file + - compute diff --git a/src/evaluate/__init__.py b/src/evaluate/__init__.py index a8c25bd92..ce88f9ff8 100644 --- a/src/evaluate/__init__.py +++ b/src/evaluate/__init__.py @@ -33,12 +33,9 @@ Evaluator, ImageClassificationEvaluator, QuestionAnsweringEvaluator, - SummarizationEvaluator, - Text2TextGenerationEvaluator, TextClassificationEvaluator, TextGenerationEvaluator, TokenClassificationEvaluator, - TranslationEvaluator, evaluator, ) from .hub import push_to_hub diff --git a/src/evaluate/config.py b/src/evaluate/config.py index 4909fa251..fa953e906 100644 --- a/src/evaluate/config.py +++ b/src/evaluate/config.py @@ -153,7 +153,6 @@ HF_HUB_ALLOWED_TASKS = [ "image-classification", - "translation", "image-segmentation", "fill-mask", "automatic-speech-recognition", @@ -161,7 +160,6 @@ "sentence-similarity", "audio-classification", "question-answering", - "summarization", "zero-shot-classification", "table-to-text", "feature-extraction", @@ -169,7 +167,6 @@ "multiple-choice", "text-classification", "text-to-image", - "text2text-generation", "zero-shot-image-classification", "tabular-classification", "tabular-regression", diff --git a/src/evaluate/evaluator/__init__.py b/src/evaluate/evaluator/__init__.py index a2fe4be8a..d1b2bf650 100644 --- a/src/evaluate/evaluator/__init__.py +++ b/src/evaluate/evaluator/__init__.py @@ -29,7 +29,6 @@ from .base import Evaluator from .image_classification import ImageClassificationEvaluator from .question_answering import QuestionAnsweringEvaluator -from .text2text_generation import SummarizationEvaluator, Text2TextGenerationEvaluator, TranslationEvaluator from .text_classification import TextClassificationEvaluator from .text_generation import TextGenerationEvaluator from .token_classification import TokenClassificationEvaluator @@ -56,18 +55,6 @@ "implementation": TextGenerationEvaluator, "default_metric_name": "word_count", }, - "text2text-generation": { - "implementation": Text2TextGenerationEvaluator, - "default_metric_name": "bleu", - }, - "summarization": { - "implementation": SummarizationEvaluator, - "default_metric_name": "rouge", - }, - "translation": { - "implementation": TranslationEvaluator, - "default_metric_name": "bleu", - }, "automatic-speech-recognition": { "implementation": AutomaticSpeechRecognitionEvaluator, "default_metric_name": "wer", @@ -101,6 +88,8 @@ def check_task(task: str) -> Dict: Returns: task_defaults: `dict`, contains the implementasion class of a give Evaluator and the default metric name. """ + if task in ["text2text-generation", "summarization", "translation"]: + raise KeyError(f"Task {task} is no longer supported, instead use \"text_generation\".") if task in TASK_ALIASES: task = TASK_ALIASES[task] if not check_pipeline_task(task): diff --git a/src/evaluate/evaluator/base.py b/src/evaluate/evaluator/base.py index 09de31f19..f34ba101d 100644 --- a/src/evaluate/evaluator/base.py +++ b/src/evaluate/evaluator/base.py @@ -471,7 +471,7 @@ def prepare_pipeline( pipe = model_or_pipeline if tokenizer is not None and feature_extractor is not None: logger.warning("Ignoring the value of the preprocessor argument (`tokenizer` or `feature_extractor`).") - if (pipe.task != self.task) and not (self.task == "translation" and pipe.task.startswith("translation")): + if (pipe.task != self.task): raise ValueError( f"Incompatible `model_or_pipeline`. Please specify `model_or_pipeline` compatible with the `{self.task}` task." ) diff --git a/src/evaluate/evaluator/text2text_generation.py b/src/evaluate/evaluator/text2text_generation.py deleted file mode 100644 index 6dfd2c035..000000000 --- a/src/evaluate/evaluator/text2text_generation.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright 2022 The HuggingFace Evaluate Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union - -from datasets import Dataset -from typing_extensions import Literal - -from ..module import EvaluationModule -from ..utils.file_utils import add_start_docstrings -from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator - - -if TYPE_CHECKING: - from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel - - -TASK_DOCUMENTATION_KWARGS = r""" - input_column (`str`, defaults to `"text"`): - the name of the column containing the input text in the dataset specified by `data`. - label_column (`str`, defaults to `"label"`): - the name of the column containing the labels in the dataset specified by `data`. - generation_kwargs (`Dict`, *optional*, defaults to `None`): - The generation kwargs are passed to the pipeline and set the text generation strategy. -""" - -TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r""" - Examples: - ```python - >>> from evaluate import evaluator - >>> from datasets import load_dataset - >>> task_evaluator = evaluator("text2text-generation") - >>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]") - >>> results = task_evaluator.compute( - >>> model_or_pipeline="facebook/bart-large-cnn", - >>> data=data, - >>> input_column="article", - >>> label_column="highlights", - >>> metric="rouge", - >>> ) - ``` -""" - -SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r""" - Examples: - ```python - >>> from evaluate import evaluator - >>> from datasets import load_dataset - >>> task_evaluator = evaluator("summarization") - >>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]") - >>> results = task_evaluator.compute( - >>> model_or_pipeline="facebook/bart-large-cnn", - >>> data=data, - >>> input_column="article", - >>> label_column="highlights", - >>> ) - ``` -""" - - -TRANSLATION_TASK_DOCSTRING_EXAMPLE = r""" - Examples: - ```python - >>> from evaluate import evaluator - >>> from datasets import load_dataset - >>> task_evaluator = evaluator("translation") - >>> data = load_dataset("wmt19", "fr-de", split="validation[:40]") - >>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]}) - >>> results = task_evaluator.compute( - >>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr", - >>> data=data, - >>> ) - ``` -""" - - -class Text2TextGenerationEvaluator(Evaluator): - """ - Text2Text generation evaluator. - This Text2Text generation evaluator can currently be loaded from [`evaluator`] using the default task name - `text2text-generation`. - Methods in this class assume a data format compatible with the [`~transformers.Text2TextGenerationPipeline`]. - """ - - PREDICTION_PREFIX = "generated" - PIPELINE_KWARGS = {"truncation": True} - - def __init__(self, task="text2text-generation", default_metric_name=None): - super().__init__(task, default_metric_name=default_metric_name) - - def predictions_processor(self, predictions, label_mapping): - return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]} - - @add_start_docstrings( - EVALUTOR_COMPUTE_START_DOCSTRING, - TASK_DOCUMENTATION_KWARGS, - EVALUATOR_COMPUTE_RETURN_DOCSTRING, - TEXT2TEXT_TASK_DOCSTRING_EXAMPLE, - ) - def compute( - self, - model_or_pipeline: Union[ - str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821 - ] = None, - data: Union[str, Dataset] = None, - subset: Optional[str] = None, - split: Optional[str] = None, - metric: Union[str, EvaluationModule] = None, - tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821 - strategy: Literal["simple", "bootstrap"] = "simple", - confidence_level: float = 0.95, - n_resamples: int = 9999, - device: int = None, - random_state: Optional[int] = None, - input_column: str = "text", - label_column: str = "label", - generation_kwargs: dict = None, - ) -> Tuple[Dict[str, float], Any]: - if generation_kwargs is not None: - self.PIPELINE_KWARGS.update(generation_kwargs) - - result = super().compute( - model_or_pipeline=model_or_pipeline, - data=data, - subset=subset, - split=split, - metric=metric, - tokenizer=tokenizer, - strategy=strategy, - confidence_level=confidence_level, - n_resamples=n_resamples, - device=device, - random_state=random_state, - input_column=input_column, - label_column=label_column, - ) - - return result - - -class SummarizationEvaluator(Text2TextGenerationEvaluator): - """ - Text summarization evaluator. - This text summarization evaluator can currently be loaded from [`evaluator`] using the default task name - `summarization`. - Methods in this class assume a data format compatible with the [`SummarizationEvaluator`]. - """ - - PREDICTION_PREFIX = "summary" - PIPELINE_KWARGS = {"truncation": True} - - def __init__(self, task="summarization", default_metric_name=None): - super().__init__(task, default_metric_name=default_metric_name) - - @add_start_docstrings( - EVALUTOR_COMPUTE_START_DOCSTRING, - TASK_DOCUMENTATION_KWARGS, - EVALUATOR_COMPUTE_RETURN_DOCSTRING, - SUMMARIZATION_TASK_DOCSTRING_EXAMPLE, - ) - def compute( - self, - model_or_pipeline: Union[ - str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821 - ] = None, - data: Union[str, Dataset] = None, - subset: Optional[str] = None, - split: Optional[str] = None, - metric: Union[str, EvaluationModule] = None, - tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821 - strategy: Literal["simple", "bootstrap"] = "simple", - confidence_level: float = 0.95, - n_resamples: int = 9999, - device: int = None, - random_state: Optional[int] = None, - input_column: str = "text", - label_column: str = "label", - generation_kwargs: dict = None, - ) -> Tuple[Dict[str, float], Any]: - result = super().compute( - model_or_pipeline=model_or_pipeline, - data=data, - subset=subset, - split=split, - metric=metric, - tokenizer=tokenizer, - strategy=strategy, - confidence_level=confidence_level, - n_resamples=n_resamples, - device=device, - random_state=random_state, - input_column=input_column, - label_column=label_column, - generation_kwargs=generation_kwargs, - ) - - return result - - -class TranslationEvaluator(Text2TextGenerationEvaluator): - """ - Translation evaluator. - This translation generation evaluator can currently be loaded from [`evaluator`] using the default task name - `translation`. - Methods in this class assume a data format compatible with the [`~transformers.TranslationPipeline`]. - """ - - PREDICTION_PREFIX = "translation" - PIPELINE_KWARGS = {"truncation": True} - - def __init__(self, task="translation", default_metric_name=None): - super().__init__(task, default_metric_name=default_metric_name) - - @add_start_docstrings( - EVALUTOR_COMPUTE_START_DOCSTRING, - TASK_DOCUMENTATION_KWARGS, - EVALUATOR_COMPUTE_RETURN_DOCSTRING, - TRANSLATION_TASK_DOCSTRING_EXAMPLE, - ) - def compute( - self, - model_or_pipeline: Union[ - str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821 - ] = None, - data: Union[str, Dataset] = None, - subset: Optional[str] = None, - split: Optional[str] = None, - metric: Union[str, EvaluationModule] = None, - tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821 - strategy: Literal["simple", "bootstrap"] = "simple", - confidence_level: float = 0.95, - n_resamples: int = 9999, - device: int = None, - random_state: Optional[int] = None, - input_column: str = "text", - label_column: str = "label", - generation_kwargs: dict = None, - ) -> Tuple[Dict[str, float], Any]: - result = super().compute( - model_or_pipeline=model_or_pipeline, - data=data, - subset=subset, - split=split, - metric=metric, - tokenizer=tokenizer, - strategy=strategy, - confidence_level=confidence_level, - n_resamples=n_resamples, - device=device, - random_state=random_state, - input_column=input_column, - label_column=label_column, - generation_kwargs=generation_kwargs, - ) - - return result diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 259b5c7b9..365f50cc0 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -38,7 +38,6 @@ Evaluator, ImageClassificationEvaluator, QuestionAnsweringEvaluator, - Text2TextGenerationEvaluator, TextClassificationEvaluator, TextGenerationEvaluator, TokenClassificationEvaluator, @@ -59,15 +58,6 @@ def __call__(self, inputs, **kwargs): return [[{f"{self.prefix}_text": "Lorem ipsum"} for _ in range(self.num_return_sequences)] for _ in inputs] -class DummyText2TextGenerationPipeline: - def __init__(self, prefix="generated", task="text2text-generation"): - self.task = task - self.prefix = prefix - - def __call__(self, inputs, **kwargs): - return [{f"{self.prefix}_text": "Lorem ipsum"} for _ in inputs] - - class DummyTextClassificationPipeline: def __init__(self, sleep_time=None): self.task = "text-classification" @@ -910,77 +900,6 @@ def test_process_predictions_multiple_return_sequences(self): self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]}) -class TestText2TextGenerationEvaluator(TestCase): - def setUp(self): - self.data = Dataset.from_dict( - { - "text": ["Lorem ipsum"] * 4, - "label": ["Ipsum Lorem"] * 4, - } - ) - self.pipe = DummyText2TextGenerationPipeline() - self.evaluator = evaluator("text2text-generation") - - def test_pipe_init(self): - results = self.evaluator.compute( - model_or_pipeline=self.pipe, - data=self.data, - ) - self.assertEqual(results["bleu"], 0) - - def test_class_init(self): - evaluator = Text2TextGenerationEvaluator() - self.assertEqual(evaluator.task, "text2text-generation") - self.assertIsNone(evaluator.default_metric_name) - - results = evaluator.compute( - model_or_pipeline=self.pipe, - data=self.data, - metric="bleu", - ) - self.assertEqual(results["bleu"], 0) - - @slow - def test_default_pipe_init(self): - results = self.evaluator.compute(data=self.data) - self.assertEqual(results["bleu"], 0) - - def test_overwrite_default_metric(self): - rouge = load("rouge") - results = self.evaluator.compute( - model_or_pipeline=self.pipe, - data=self.data, - metric=rouge, - ) - self.assertEqual(results["rouge1"], 1.0) - results = self.evaluator.compute( - model_or_pipeline=self.pipe, - data=self.data, - metric="rouge", - ) - self.assertEqual(results["rouge1"], 1.0) - - def test_summarization(self): - pipe = DummyText2TextGenerationPipeline(task="summarization", prefix="summary") - e = evaluator("summarization") - - results = e.compute( - model_or_pipeline=pipe, - data=self.data, - ) - self.assertEqual(results["rouge1"], 1.0) - - def test_translation(self): - pipe = DummyText2TextGenerationPipeline(task="translation", prefix="translation") - e = evaluator("translation") - - results = e.compute( - model_or_pipeline=pipe, - data=self.data, - ) - self.assertEqual(results["bleu"], 0) - - class TestAutomaticSpeechRecognitionEvaluator(TestCase): def setUp(self): self.data = Dataset.from_dict(