diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index cebff3fbe1..3179338ffc 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -47,6 +47,10 @@ jobs: run: | pip install -e .[ray,forecast] pip install 'tensorboardX<=2.2' + - name: If python version > 3.6 and not on windows, install autogluon + if: matrix.python-version != '3.6' && (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') + run: | + pip install -e .[autogluon] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/flaml/automl.py b/flaml/automl.py index e3ebaa40ac..db18c84f41 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -47,6 +47,7 @@ REGRESSION, _is_nlp_task, NLG_TASKS, + MM_TASKS, ) from . import tune from .training_log import training_log_reader, training_log_writer @@ -1690,6 +1691,10 @@ def _decide_split_type(self, split_type): self._state.task = get_classification_objective( len(np.unique(self._y_train_all)) ) + elif self._state.task == "mm-classification": + self._state.task = "mm-" + get_classification_objective( + len(np.unique(self._y_train_all)) + ) if not isinstance(split_type, str): assert hasattr(split_type, "split") and hasattr( split_type, "get_n_splits" ) @@ -2452,6 +2457,9 @@ def is_to_reverse_metric(metric, task): estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"] elif _is_nlp_task(self._state.task): estimator_list = ["transformer"] + # NOTE: if multimodal task, use multimodal estimator + elif self._state.task in MM_TASKS: + estimator_list = ["multimodal"] else: try: import catboost diff --git a/flaml/data.py b/flaml/data.py index 149cd8983c..6b451e805e 100644 --- a/flaml/data.py +++ b/flaml/data.py @@ -23,9 +23,12 @@ SEQCLASSIFICATION, MULTICHOICECLASSIFICATION, TOKENCLASSIFICATION, + "mm-binary", + "mm-multiclass", + "mm-classification", ) SEQREGRESSION = "seq-regression" -REGRESSION = ("regression", SEQREGRESSION) 
+REGRESSION = ("regression", SEQREGRESSION, "mm-regression") TS_FORECASTREGRESSION = ( "forecast", "ts_forecast", @@ -46,6 +49,11 @@ MULTICHOICECLASSIFICATION, TOKENCLASSIFICATION, ) +MM_TASKS = ( + "mm-classification", + "mm-regression", + "mm-binary", + "mm-multiclass",) def _is_nlp_task(task): @@ -245,7 +253,6 @@ def concat(X1, X2): class DataTransformer: """Transform input training data.""" - def fit_transform(self, X: Union[DataFrame, np.array], y, task): """Fit transformer and process the input training data according to the task type. @@ -269,6 +276,10 @@ def fit_transform(self, X: Union[DataFrame, np.array], y, task): if len(str_columns) > 0: X[str_columns] = X[str_columns].astype("string") self._str_columns = str_columns + # NOTE: if multimodal task, no preprocessing on X + elif task in MM_TASKS: + # no-op by design: AutoGluon's TextPredictor consumes the raw columns as-is + pass elif isinstance(X, DataFrame): X = X.copy() n = X.shape[0] @@ -395,6 +406,9 @@ def transform(self, X: Union[DataFrame, np.array]): # ids (input ids, token type id, attention mask, etc.) 
if len(self._str_columns) > 0: X[self._str_columns] = X[self._str_columns].astype("string") + elif self._task in MM_TASKS: + # no-op by design: multimodal columns are passed to the estimator unchanged + pass elif isinstance(X, DataFrame): cat_columns, num_columns, datetime_columns = ( self._cat_columns, diff --git a/flaml/ml.py b/flaml/ml.py index 092a02565e..384fcd1d5b 100644 --- a/flaml/ml.py +++ b/flaml/ml.py @@ -37,6 +37,7 @@ ARIMA, SARIMAX, TransformersEstimator, + MultiModalEstimator, TransformersEstimatorModelSelection, ) from .data import CLASSIFICATION, group_counts, TS_FORECAST, TS_VALUE_COL @@ -122,6 +123,8 @@ def get_estimator_class(task, estimator_name): estimator_class = SARIMAX elif estimator_name == "transformer": estimator_class = TransformersEstimator + elif estimator_name == "multimodal": + estimator_class = MultiModalEstimator elif estimator_name == "transformer_ms": estimator_class = TransformersEstimatorModelSelection else: @@ -584,7 +587,7 @@ def compute_estimator( n_jobs=n_jobs, ) - if isinstance(estimator, TransformersEstimator): + if isinstance(estimator, (TransformersEstimator, MultiModalEstimator)): fit_kwargs["metric"] = eval_metric fit_kwargs["X_val"] = X_val fit_kwargs["y_val"] = y_val @@ -650,6 +653,8 @@ def train_estimator( ) if isinstance(estimator, TransformersEstimator): fit_kwargs["metric"] = eval_metric + elif isinstance(estimator, MultiModalEstimator): + fit_kwargs["metric"] = eval_metric if X_train is not None: train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs) diff --git a/flaml/model.py b/flaml/model.py index 78d423f4c1..9ee4f5dafb 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -32,6 +32,7 @@ SUMMARIZATION, NLG_TASKS, MULTICHOICECLASSIFICATION, + MM_TASKS ) try: @@ -133,6 +134,13 @@ def estimator(self): def _preprocess(self, X): return X + @staticmethod + def _join(X_train, y_train): + y_train = DataFrame(y_train, index=X_train.index) + y_train.columns = ["label"] + train_df = X_train.join(y_train) + return train_df + + def 
_fit(self, X_train, y_train, **kwargs): current_time = time.time() @@ -2127,6 +2135,115 @@ class XGBoostLimitDepth_TS(TS_SKLearn): base_class = XGBoostLimitDepthEstimator +class MultiModalEstimator(BaseEstimator): + """ + The class for tuning AutoGluon TextPredictor + """ + def __init__(self, task="binary", **config): + super().__init__(task, **config) + import uuid + + self.trial_id = str(uuid.uuid1().hex)[:8] + + @classmethod + def search_space(cls, **params): + """ + Add the possible search space configs here, e.g. 'optimization.lr' + reference: + https://auto.gluon.ai/stable/tutorials/text_prediction/customization.html#custom-hyperparameter-values + """ + search_space_dict = { + "model.fusion_mlp.hidden_sizes": { + "domain": tune.choice(list(range(32, 129))), + "init_value": 128, + }, + "optimization.learning_rate": { + "domain": tune.loguniform(lower=1E-5, upper=1E-4), + "init_value": 1E-4, + }, + "optimization.weight_decay": { + "domain": tune.choice([1E-4, 1E-3, 1E-2]), + "init_value": 1E-4, + }, + "optimization.warmup_steps": { + "domain": tune.choice([0.1, 0.2]), + "init_value": 0.1, + }, + } + return search_space_dict + + def fit(self, X_train=None, y_train=None, budget=None, **kwargs): + from autogluon.text import TextPredictor + from .nlp.utils import AGArgs + + self._kwargs = kwargs + self.ag_args = AGArgs(**kwargs["ag_args"]) + seed = self._kwargs.get("seed", 123) + + # get & set the hyperparameters, update with self.params + hyperparameters = self.ag_args.hyperparameters + for key, value in self.params.items(): + if key == "n_jobs": + continue + elif key == "model.fusion_mlp.hidden_sizes": + hyperparameters[key] = [value] + else: + hyperparameters[key] = value.item() if isinstance(value, np.float64) else value + + start_time = time.time() + self.model_path = os.path.join(self.ag_args.output_dir, self.trial_id) + assert self._task in MM_TASKS, f"The task is not multimodal, but {self._task}. 
" + model = TextPredictor(path=self.model_path, + label="label", + problem_type=self._task[3:], + eval_metric=kwargs["metric"], + backend="pytorch", + verbosity=0) + train_data = BaseEstimator._join(X_train, y_train) + # use valid data for early stopping + X_val = kwargs.get("X_val") + y_val = kwargs.get("y_val") + if X_val is not None and y_val is not None: + tuning_data = BaseEstimator._join(X_val, y_val) + else: + tuning_data = None + # NOTE: if no tuning_data, model.fit() will holdout a fraction from train_data for early stopping + model.fit(train_data=train_data, + tuning_data=tuning_data, + hyperparameters=hyperparameters, + num_gpus=kwargs.get("gpu_per_trial", None), + time_limit=budget, + seed=seed) + + training_time = time.time() - start_time + return training_time + + def predict(self, X): + from autogluon.text import TextPredictor + + model = TextPredictor.load(path=self.model_path, backend="pytorch") + output = model.predict(X, as_pandas=False) + return output + + def predict_proba(self, X): + from autogluon.text import TextPredictor + + # only works for classification tasks + assert ( + self._task in CLASSIFICATION + ), "predict_proba() only for classification tasks." 
+        model = TextPredictor.load(path=self.model_path, backend="pytorch") +        output = model.predict_proba(X, as_pandas=False) +        return output + +    def score(self, X_val: DataFrame, y_val: Series, **kwargs): +        from autogluon.text import TextPredictor + +        model = TextPredictor.load(path=self.model_path, backend="pytorch") +        val_data = BaseEstimator._join(X_val, y_val) +        return model.evaluate(val_data) + + class suppress_stdout_stderr(object): def __init__(self): # Open a pair of null files diff --git a/flaml/nlp/utils.py b/flaml/nlp/utils.py index cd2e7a409e..427f91fe5a 100644 --- a/flaml/nlp/utils.py +++ b/flaml/nlp/utils.py @@ -1,3 +1,5 @@ +import argparse +from dataclasses import dataclass, field from itertools import chain from typing import Dict, Any import numpy as np @@ -475,3 +477,57 @@ def _set_model_config(checkpoint_path): model_config = _set_model_config(checkpoint_path) this_model = get_this_model(checkpoint_path, task, model_config) return this_model + + +@dataclass +class AGArgs: +    """ +    The Autogluon configurations +    Args: +        output_dir (str): data root directory for outputting the logs, intermediate data, and models. +        hf_model_path (str, optional, defaults to "google/electra-base-discriminator"): the HF model checkpoint. 
+        per_device_batch_size (int, optional, defaults to 8) +        num_train_epochs (int, optional, defaults to 10) +        batch_size (int, optional, defaults to 128) +    """ +    output_dir: str = field(default="data/mm_output/", metadata={"help": "data dir", "required": True}) +    hf_model_path: str = field(default="google/electra-base-discriminator", metadata={"help": "Hugging Face model path"}) +    per_device_batch_size: int = field(default=8, metadata={"help": "per device batch size"}) +    num_train_epochs: int = field(default=10, metadata={"help": "number of train epochs"}) +    batch_size: int = field(default=128, metadata={"help": "batch size"}) +    hyperparameters: dict = field(init=False) + +    def __post_init__(self): +        """ +        Get the preset using the AGArgs. Save as self.hyperparameters. +        """ +        from autogluon.text.text_prediction.presets import get_text_preset + +        # get the override from the text preset tuple +        self.hyperparameters = get_text_preset("default")[1] + +        self.hyperparameters["model.hf_text.checkpoint_name"] = self.hf_model_path +        self.hyperparameters["env.per_gpu_batch_size"] = self.per_device_batch_size +        self.hyperparameters["env.batch_size"] = self.batch_size +        self.hyperparameters["optimization.max_epochs"] = self.num_train_epochs + +    @staticmethod +    def load_args(): +        from dataclasses import fields + +        arg_parser = argparse.ArgumentParser() +        for each_field in (f for f in fields(AGArgs) if f.init):  # skip init=False fields (no metadata/default for CLI) +            arg_parser.add_argument( +                "--" + each_field.name, +                type=each_field.type, +                help=each_field.metadata["help"], +                required=each_field.metadata["required"] +                if "required" in each_field.metadata +                else False, +                choices=each_field.metadata["choices"] +                if "choices" in each_field.metadata +                else None, +                default=each_field.default, +            ) +        console_args, unknown = arg_parser.parse_known_args() +        return console_args diff --git a/setup.py b/setup.py index 907f1fe50f..73a2ef2abb 100644 --- a/setup.py +++ b/setup.py @@ -63,6 +63,10 @@ "hcrystalball==0.1.10", "seqeval", ], +        "autogluon": [ +            
"autogluon.text==0.4.0", +            "autogluon.features==0.4.0", +        ], "catboost": ["catboost>=0.26"], "blendsearch": ["optuna==2.8.0"], "ray": [ diff --git a/test/nlp/test_multimodalestimator.py b/test/nlp/test_multimodalestimator.py new file mode 100644 index 0000000000..b80bf41138 --- /dev/null +++ b/test/nlp/test_multimodalestimator.py @@ -0,0 +1,70 @@ +from flaml import AutoML +import pandas as pd +import numpy as np +import os +import sys +import platform +import pickle +from sklearn.model_selection import train_test_split +os.environ["AUTOGLUON_TEXT_TRAIN_WITHOUT_GPU"] = "1" + + +def test_multimodalestimator(): +    if sys.version_info < (3, 7): +        # do not test on python3.6 +        return +    elif platform.system() == "Windows": +        # do not test on windows +        return +    train_data = { +        "sentence1": [ +            "Mary had a little lamb.", +            "Its fleece was white as snow." +        ], +        "numerical1": [1, 2], +        "label": [1, 2], +    } + +    valid_data = { +        "sentence1": [ +            "Mary had a little lamb.", +            "Its fleece was white as snow." 
+ ], + "numerical1": [1, 2], + "label": [1, 2], + } + train_dataset = pd.DataFrame(train_data) + valid_dataset = pd.DataFrame(valid_data) + + feature_columns = ["sentence1", "numerical1"] + metric = "r2" + automl = AutoML() + automl_settings = { + "gpu_per_trial": 0, + "max_iter": 2, + "time_budget": 30, + "task": "mm-regression", + "metric": "r2", + "seed": 123, + } + + automl_settings["ag_args"] = { + "output_dir": "test/ag_output/", + "hf_model_path": "google/electra-small-discriminator" + } + + automl.fit( + X_train=train_dataset[feature_columns], + y_train=train_dataset["label"], + X_val=valid_dataset[feature_columns], + y_val=valid_dataset["label"], + eval_method="holdout", + auto_augment=False, + **automl_settings + ) + automl.pickle("automl.pkl") + with open("automl.pkl", "rb") as f: + automl = pickle.load(f) + print("Try to run inference on validation set") + score = automl.score(valid_dataset[feature_columns], valid_dataset["label"]) + print(f"Inference on validation set complete, {metric}: {score}")