diff --git a/docs/source/base_evaluator.mdx b/docs/source/base_evaluator.mdx
index 9eea7cd99..557243586 100644
--- a/docs/source/base_evaluator.mdx
+++ b/docs/source/base_evaluator.mdx
@@ -8,9 +8,6 @@ Currently supported tasks are:
 - `"question-answering"`: will use the [`QuestionAnsweringEvaluator`].
 - `"image-classification"`: will use the [`ImageClassificationEvaluator`].
 - `"text-generation"`: will use the [`TextGenerationEvaluator`].
-- `"text2text-generation"`: will use the [`Text2TextGenerationEvaluator`].
-- `"summarization"`: will use the [`SummarizationEvaluator`].
-- `"translation"`: will use the [`TranslationEvaluator`].
 - `"automatic-speech-recognition"`: will use the [`AutomaticSpeechRecognitionEvaluator`].
 - `"audio-classification"`: will use the [`AudioClassificationEvaluator`].
 
diff --git a/docs/source/package_reference/evaluator_classes.mdx b/docs/source/package_reference/evaluator_classes.mdx
index 1d5e74c37..cbcd653ff 100644
--- a/docs/source/package_reference/evaluator_classes.mdx
+++ b/docs/source/package_reference/evaluator_classes.mdx
@@ -37,21 +37,6 @@ The base class for all evaluator classes:
 [[autodoc]] evaluate.TextGenerationEvaluator
     - compute
 
-### Text2TextGenerationEvaluator
-
-[[autodoc]] evaluate.Text2TextGenerationEvaluator
-    - compute
-
-### SummarizationEvaluator
-
-[[autodoc]] evaluate.SummarizationEvaluator
-    - compute
-
-### TranslationEvaluator
-
-[[autodoc]] evaluate.TranslationEvaluator
-    - compute
-
 ### AutomaticSpeechRecognitionEvaluator
 
 [[autodoc]] evaluate.AutomaticSpeechRecognitionEvaluator
@@ -60,4 +45,4 @@ The base class for all evaluator classes:
 ### AudioClassificationEvaluator
 
 [[autodoc]] evaluate.AudioClassificationEvaluator
-    - compute
\ No newline at end of file
+    - compute
diff --git a/src/evaluate/__init__.py b/src/evaluate/__init__.py
index a8c25bd92..ce88f9ff8 100644
--- a/src/evaluate/__init__.py
+++ b/src/evaluate/__init__.py
@@ -33,12 +33,9 @@
     Evaluator,
     ImageClassificationEvaluator,
     QuestionAnsweringEvaluator,
-    SummarizationEvaluator,
-    Text2TextGenerationEvaluator,
     TextClassificationEvaluator,
     TextGenerationEvaluator,
     TokenClassificationEvaluator,
-    TranslationEvaluator,
     evaluator,
 )
 from .hub import push_to_hub
diff --git a/src/evaluate/config.py b/src/evaluate/config.py
index 4909fa251..fa953e906 100644
--- a/src/evaluate/config.py
+++ b/src/evaluate/config.py
@@ -153,7 +153,6 @@
 
 HF_HUB_ALLOWED_TASKS = [
     "image-classification",
-    "translation",
     "image-segmentation",
     "fill-mask",
     "automatic-speech-recognition",
@@ -161,7 +160,6 @@
     "sentence-similarity",
     "audio-classification",
     "question-answering",
-    "summarization",
     "zero-shot-classification",
     "table-to-text",
     "feature-extraction",
@@ -169,7 +167,6 @@
     "multiple-choice",
     "text-classification",
     "text-to-image",
-    "text2text-generation",
     "zero-shot-image-classification",
     "tabular-classification",
     "tabular-regression",
diff --git a/src/evaluate/evaluator/__init__.py b/src/evaluate/evaluator/__init__.py
index a2fe4be8a..d1b2bf650 100644
--- a/src/evaluate/evaluator/__init__.py
+++ b/src/evaluate/evaluator/__init__.py
@@ -29,7 +29,6 @@
 from .base import Evaluator
 from .image_classification import ImageClassificationEvaluator
 from .question_answering import QuestionAnsweringEvaluator
-from .text2text_generation import SummarizationEvaluator, Text2TextGenerationEvaluator, TranslationEvaluator
 from .text_classification import TextClassificationEvaluator
 from .text_generation import TextGenerationEvaluator
 from .token_classification import TokenClassificationEvaluator
@@ -56,18 +55,6 @@
         "implementation": TextGenerationEvaluator,
         "default_metric_name": "word_count",
     },
-    "text2text-generation": {
-        "implementation": Text2TextGenerationEvaluator,
-        "default_metric_name": "bleu",
-    },
-    "summarization": {
-        "implementation": SummarizationEvaluator,
-        "default_metric_name": "rouge",
-    },
-    "translation": {
-        "implementation": TranslationEvaluator,
-        "default_metric_name": "bleu",
-    },
     "automatic-speech-recognition": {
         "implementation": AutomaticSpeechRecognitionEvaluator,
         "default_metric_name": "wer",
@@ -101,6 +88,8 @@ def check_task(task: str) -> Dict:
     Returns:
         task_defaults: `dict`, contains the implementasion class of a give Evaluator and the default metric name.
     """
+    if task in ["text2text-generation", "summarization", "translation"]:  # evaluators removed from the library
+        raise KeyError(f"Task {task} is no longer supported, instead use \"text-generation\".")
     if task in TASK_ALIASES:
         task = TASK_ALIASES[task]
     if not check_pipeline_task(task):
diff --git a/src/evaluate/evaluator/base.py b/src/evaluate/evaluator/base.py
index 09de31f19..f34ba101d 100644
--- a/src/evaluate/evaluator/base.py
+++ b/src/evaluate/evaluator/base.py
@@ -471,7 +471,7 @@ def prepare_pipeline(
                 pipe = model_or_pipeline
             if tokenizer is not None and feature_extractor is not None:
                 logger.warning("Ignoring the value of the preprocessor argument (`tokenizer` or `feature_extractor`).")
-        if (pipe.task != self.task) and not (self.task == "translation" and pipe.task.startswith("translation")):
+        if pipe.task != self.task:
             raise ValueError(
                 f"Incompatible `model_or_pipeline`. Please specify `model_or_pipeline` compatible with the `{self.task}` task."
             )
diff --git a/src/evaluate/evaluator/text2text_generation.py b/src/evaluate/evaluator/text2text_generation.py
deleted file mode 100644
index 6dfd2c035..000000000
--- a/src/evaluate/evaluator/text2text_generation.py
+++ /dev/null
@@ -1,267 +0,0 @@
-# Copyright 2022 The HuggingFace Evaluate Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
-
-from datasets import Dataset
-from typing_extensions import Literal
-
-from ..module import EvaluationModule
-from ..utils.file_utils import add_start_docstrings
-from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
-
-
-if TYPE_CHECKING:
-    from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
-
-
-TASK_DOCUMENTATION_KWARGS = r"""
-        input_column (`str`, defaults to `"text"`):
-            the name of the column containing the input text in the dataset specified by `data`.
-        label_column (`str`, defaults to `"label"`):
-            the name of the column containing the labels in the dataset specified by `data`.
-        generation_kwargs (`Dict`, *optional*, defaults to `None`):
-            The generation kwargs are passed to the pipeline and set the text generation strategy.
-"""
-
-TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r"""
-    Examples:
-    ```python
-    >>> from evaluate import evaluator
-    >>> from datasets import load_dataset
-    >>> task_evaluator = evaluator("text2text-generation")
-    >>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
-    >>> results = task_evaluator.compute(
-    >>>     model_or_pipeline="facebook/bart-large-cnn",
-    >>>     data=data,
-    >>>     input_column="article",
-    >>>     label_column="highlights",
-    >>>     metric="rouge",
-    >>> )
-    ```
-"""
-
-SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r"""
-    Examples:
-    ```python
-    >>> from evaluate import evaluator
-    >>> from datasets import load_dataset
-    >>> task_evaluator = evaluator("summarization")
-    >>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
-    >>> results = task_evaluator.compute(
-    >>>     model_or_pipeline="facebook/bart-large-cnn",
-    >>>     data=data,
-    >>>     input_column="article",
-    >>>     label_column="highlights",
-    >>> )
-    ```
-"""
-
-
-TRANSLATION_TASK_DOCSTRING_EXAMPLE = r"""
-    Examples:
-    ```python
-    >>> from evaluate import evaluator
-    >>> from datasets import load_dataset
-    >>> task_evaluator = evaluator("translation")
-    >>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
-    >>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
-    >>> results = task_evaluator.compute(
-    >>>     model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
-    >>>     data=data,
-    >>> )
-    ```
-"""
-
-
-class Text2TextGenerationEvaluator(Evaluator):
-    """
-    Text2Text generation evaluator.
-    This Text2Text generation evaluator can currently be loaded from [`evaluator`] using the default task name
-    `text2text-generation`.
-    Methods in this class assume a data format compatible with the [`~transformers.Text2TextGenerationPipeline`].
-    """
-
-    PREDICTION_PREFIX = "generated"
-    PIPELINE_KWARGS = {"truncation": True}
-
-    def __init__(self, task="text2text-generation", default_metric_name=None):
-        super().__init__(task, default_metric_name=default_metric_name)
-
-    def predictions_processor(self, predictions, label_mapping):
-        return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]}
-
-    @add_start_docstrings(
-        EVALUTOR_COMPUTE_START_DOCSTRING,
-        TASK_DOCUMENTATION_KWARGS,
-        EVALUATOR_COMPUTE_RETURN_DOCSTRING,
-        TEXT2TEXT_TASK_DOCSTRING_EXAMPLE,
-    )
-    def compute(
-        self,
-        model_or_pipeline: Union[
-            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
-        ] = None,
-        data: Union[str, Dataset] = None,
-        subset: Optional[str] = None,
-        split: Optional[str] = None,
-        metric: Union[str, EvaluationModule] = None,
-        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
-        strategy: Literal["simple", "bootstrap"] = "simple",
-        confidence_level: float = 0.95,
-        n_resamples: int = 9999,
-        device: int = None,
-        random_state: Optional[int] = None,
-        input_column: str = "text",
-        label_column: str = "label",
-        generation_kwargs: dict = None,
-    ) -> Tuple[Dict[str, float], Any]:
-        if generation_kwargs is not None:
-            self.PIPELINE_KWARGS.update(generation_kwargs)
-
-        result = super().compute(
-            model_or_pipeline=model_or_pipeline,
-            data=data,
-            subset=subset,
-            split=split,
-            metric=metric,
-            tokenizer=tokenizer,
-            strategy=strategy,
-            confidence_level=confidence_level,
-            n_resamples=n_resamples,
-            device=device,
-            random_state=random_state,
-            input_column=input_column,
-            label_column=label_column,
-        )
-
-        return result
-
-
-class SummarizationEvaluator(Text2TextGenerationEvaluator):
-    """
-    Text summarization evaluator.
-    This text summarization evaluator can currently be loaded from [`evaluator`] using the default task name
-    `summarization`.
-    Methods in this class assume a data format compatible with the [`SummarizationEvaluator`].
-    """
-
-    PREDICTION_PREFIX = "summary"
-    PIPELINE_KWARGS = {"truncation": True}
-
-    def __init__(self, task="summarization", default_metric_name=None):
-        super().__init__(task, default_metric_name=default_metric_name)
-
-    @add_start_docstrings(
-        EVALUTOR_COMPUTE_START_DOCSTRING,
-        TASK_DOCUMENTATION_KWARGS,
-        EVALUATOR_COMPUTE_RETURN_DOCSTRING,
-        SUMMARIZATION_TASK_DOCSTRING_EXAMPLE,
-    )
-    def compute(
-        self,
-        model_or_pipeline: Union[
-            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
-        ] = None,
-        data: Union[str, Dataset] = None,
-        subset: Optional[str] = None,
-        split: Optional[str] = None,
-        metric: Union[str, EvaluationModule] = None,
-        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
-        strategy: Literal["simple", "bootstrap"] = "simple",
-        confidence_level: float = 0.95,
-        n_resamples: int = 9999,
-        device: int = None,
-        random_state: Optional[int] = None,
-        input_column: str = "text",
-        label_column: str = "label",
-        generation_kwargs: dict = None,
-    ) -> Tuple[Dict[str, float], Any]:
-        result = super().compute(
-            model_or_pipeline=model_or_pipeline,
-            data=data,
-            subset=subset,
-            split=split,
-            metric=metric,
-            tokenizer=tokenizer,
-            strategy=strategy,
-            confidence_level=confidence_level,
-            n_resamples=n_resamples,
-            device=device,
-            random_state=random_state,
-            input_column=input_column,
-            label_column=label_column,
-            generation_kwargs=generation_kwargs,
-        )
-
-        return result
-
-
-class TranslationEvaluator(Text2TextGenerationEvaluator):
-    """
-    Translation evaluator.
-    This translation generation evaluator can currently be loaded from [`evaluator`] using the default task name
-    `translation`.
-    Methods in this class assume a data format compatible with the [`~transformers.TranslationPipeline`].
-    """
-
-    PREDICTION_PREFIX = "translation"
-    PIPELINE_KWARGS = {"truncation": True}
-
-    def __init__(self, task="translation", default_metric_name=None):
-        super().__init__(task, default_metric_name=default_metric_name)
-
-    @add_start_docstrings(
-        EVALUTOR_COMPUTE_START_DOCSTRING,
-        TASK_DOCUMENTATION_KWARGS,
-        EVALUATOR_COMPUTE_RETURN_DOCSTRING,
-        TRANSLATION_TASK_DOCSTRING_EXAMPLE,
-    )
-    def compute(
-        self,
-        model_or_pipeline: Union[
-            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
-        ] = None,
-        data: Union[str, Dataset] = None,
-        subset: Optional[str] = None,
-        split: Optional[str] = None,
-        metric: Union[str, EvaluationModule] = None,
-        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
-        strategy: Literal["simple", "bootstrap"] = "simple",
-        confidence_level: float = 0.95,
-        n_resamples: int = 9999,
-        device: int = None,
-        random_state: Optional[int] = None,
-        input_column: str = "text",
-        label_column: str = "label",
-        generation_kwargs: dict = None,
-    ) -> Tuple[Dict[str, float], Any]:
-        result = super().compute(
-            model_or_pipeline=model_or_pipeline,
-            data=data,
-            subset=subset,
-            split=split,
-            metric=metric,
-            tokenizer=tokenizer,
-            strategy=strategy,
-            confidence_level=confidence_level,
-            n_resamples=n_resamples,
-            device=device,
-            random_state=random_state,
-            input_column=input_column,
-            label_column=label_column,
-            generation_kwargs=generation_kwargs,
-        )
-
-        return result
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py
index 259b5c7b9..365f50cc0 100644
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -38,7 +38,6 @@
     Evaluator,
     ImageClassificationEvaluator,
     QuestionAnsweringEvaluator,
-    Text2TextGenerationEvaluator,
     TextClassificationEvaluator,
     TextGenerationEvaluator,
     TokenClassificationEvaluator,
@@ -59,15 +58,6 @@ def __call__(self, inputs, **kwargs):
         return [[{f"{self.prefix}_text": "Lorem ipsum"} for _ in range(self.num_return_sequences)] for _ in inputs]
 
 
-class DummyText2TextGenerationPipeline:
-    def __init__(self, prefix="generated", task="text2text-generation"):
-        self.task = task
-        self.prefix = prefix
-
-    def __call__(self, inputs, **kwargs):
-        return [{f"{self.prefix}_text": "Lorem ipsum"} for _ in inputs]
-
-
 class DummyTextClassificationPipeline:
     def __init__(self, sleep_time=None):
         self.task = "text-classification"
@@ -910,77 +900,6 @@ def test_process_predictions_multiple_return_sequences(self):
         self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]})
 
 
-class TestText2TextGenerationEvaluator(TestCase):
-    def setUp(self):
-        self.data = Dataset.from_dict(
-            {
-                "text": ["Lorem ipsum"] * 4,
-                "label": ["Ipsum Lorem"] * 4,
-            }
-        )
-        self.pipe = DummyText2TextGenerationPipeline()
-        self.evaluator = evaluator("text2text-generation")
-
-    def test_pipe_init(self):
-        results = self.evaluator.compute(
-            model_or_pipeline=self.pipe,
-            data=self.data,
-        )
-        self.assertEqual(results["bleu"], 0)
-
-    def test_class_init(self):
-        evaluator = Text2TextGenerationEvaluator()
-        self.assertEqual(evaluator.task, "text2text-generation")
-        self.assertIsNone(evaluator.default_metric_name)
-
-        results = evaluator.compute(
-            model_or_pipeline=self.pipe,
-            data=self.data,
-            metric="bleu",
-        )
-        self.assertEqual(results["bleu"], 0)
-
-    @slow
-    def test_default_pipe_init(self):
-        results = self.evaluator.compute(data=self.data)
-        self.assertEqual(results["bleu"], 0)
-
-    def test_overwrite_default_metric(self):
-        rouge = load("rouge")
-        results = self.evaluator.compute(
-            model_or_pipeline=self.pipe,
-            data=self.data,
-            metric=rouge,
-        )
-        self.assertEqual(results["rouge1"], 1.0)
-        results = self.evaluator.compute(
-            model_or_pipeline=self.pipe,
-            data=self.data,
-            metric="rouge",
-        )
-        self.assertEqual(results["rouge1"], 1.0)
-
-    def test_summarization(self):
-        pipe = DummyText2TextGenerationPipeline(task="summarization", prefix="summary")
-        e = evaluator("summarization")
-
-        results = e.compute(
-            model_or_pipeline=pipe,
-            data=self.data,
-        )
-        self.assertEqual(results["rouge1"], 1.0)
-
-    def test_translation(self):
-        pipe = DummyText2TextGenerationPipeline(task="translation", prefix="translation")
-        e = evaluator("translation")
-
-        results = e.compute(
-            model_or_pipeline=pipe,
-            data=self.data,
-        )
-        self.assertEqual(results["bleu"], 0)
-
-
 class TestAutomaticSpeechRecognitionEvaluator(TestCase):
     def setUp(self):
         self.data = Dataset.from_dict(