Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
0a0a4c6
Change readme to trigger test
Mar 15, 2022
002683f
add dependencies for AG
Mar 15, 2022
60a847c
add user permission to test_notebook_example L81
Mar 15, 2022
60a9e27
add mlflow dependency to setup
Mar 16, 2022
bc7f38d
add textpredictor estimator and test
Mar 16, 2022
f9ca56b
new estimator, no test file
Mar 16, 2022
fe0ecbb
Update automl.py
Qiaochu-Song Mar 16, 2022
4a52ac7
Update automl.py
Qiaochu-Song Mar 16, 2022
30cc834
add test with gc, narrow down mxnet version
Mar 16, 2022
14e6720
Merge branch 'test_main' of github.com:Qiaochu-Song/FLAML into test_main
Mar 16, 2022
6b75a73
skip test for py3.6 and win+py3.8, loose mxnet ver
Mar 16, 2022
d10945e
no ag on windows, remove mlflow dependency
Mar 16, 2022
06f64b2
no ag on windows, remove mlflow dependency
Mar 16, 2022
c9ff3d4
test with direct return
Mar 17, 2022
e7b6f6d
debug without new test
Mar 17, 2022
2307b37
w/o os.environ setting in new test, direct return
Mar 17, 2022
bf3203b
debug, import only in new test
Mar 17, 2022
10c93b2
move new test to automl
Mar 17, 2022
53b5f09
move new test to test/nlp/
Mar 17, 2022
ee3cacb
pass data with X_train
Mar 21, 2022
8096a89
pr fixes, debugging
Mar 24, 2022
fed989b
update with upstream
Mar 24, 2022
c40af7d
Rename to MultimodalEstimator, pr fix
Mar 24, 2022
d0b3b11
remove comment
Mar 24, 2022
30e9f60
Update data.py
Qiaochu-Song Mar 25, 2022
d15dd60
fix bug
Mar 25, 2022
6c42839
Merge branch 'new-test2' of github.com:Qiaochu-Song/FLAML into new-test2
Mar 25, 2022
301eb16
remove useless import
Mar 25, 2022
c59a3b2
remove useless import
Mar 25, 2022
f04b69e
Merge branch 'new-test2' of github.com:Qiaochu-Song/FLAML into new-test2
Mar 25, 2022
2f07223
resolve conflict
Mar 28, 2022
ea515d2
remove task mapping for AG
Mar 28, 2022
6cc2f9e
use 0.5 threshold for text/cat inference
Apr 13, 2022
4cc2b4e
add MM_TASKS; no preprocess on X; pass val_data for early stopping
Apr 14, 2022
4fa136d
adjust testing data and raise budget
Apr 14, 2022
c5d9914
Merge remote-tracking branch 'upstream/main' into new-test2
Apr 14, 2022
25c1baf
shrink test toy data and budget
Apr 14, 2022
f9d3b22
change to regression test
Apr 14, 2022
c1568b4
add metric to kwargs for mm in train_estimator, raise test budget
Apr 14, 2022
1e4201d
use valid data if any for early stopping, raise test budget
Apr 15, 2022
9692d4e
return to the original budget
Apr 15, 2022
1b2cb28
fix valid DF checking
Apr 16, 2022
05941bc
simplify isinstance in ml.py
Apr 18, 2022
984d000
Merge remote-tracking branch 'upstream/main' into new-test2
Apr 18, 2022
74f27b5
reduce text column and budget
Apr 19, 2022
c8848c7
use only 4-row toy test data
Apr 19, 2022
7be2c5c
test 10s budget
Apr 19, 2022
1c7f7ad
minimize test toy dataset
Apr 19, 2022
be60fa6
shorter test sentence
Apr 19, 2022
3a29c5b
give enough test budget
Apr 20, 2022
543b660
give enough test budget
Apr 20, 2022
4296129
solve conflict
May 4, 2022
ca30eab
Merge branch 'mxtextpredictor' of github.com:Qiaochu-Song/FLAML into …
May 6, 2022
5bd061f
add pytorch backend support
May 12, 2022
2b150e7
set pytorch backend to default
May 19, 2022
505c894
pytorch backend support only
May 19, 2022
cd98daf
solve merge conflict
May 19, 2022
98ee138
test remove os and python ver constraints
May 19, 2022
ff8c078
no support for python 3.6
May 19, 2022
24a5333
no support for python 3.6 or windows
May 19, 2022
2aeb563
Merge branch 'main' into mxtextpredictor
Qiaochu-Song May 20, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ jobs:
run: |
pip install -e .[ray,forecast]
pip install 'tensorboardX<=2.2'
- name: If python version > 3.6 and not on windows, install autogluon
if: matrix.python-version >= '3.7' && (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest')
run: |
pip install -e .[autogluon]
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand Down
8 changes: 8 additions & 0 deletions flaml/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
REGRESSION,
_is_nlp_task,
NLG_TASKS,
MM_TASKS,
)
from . import tune
from .training_log import training_log_reader, training_log_writer
Expand Down Expand Up @@ -1690,6 +1691,10 @@ def _decide_split_type(self, split_type):
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all))
)
elif self._state.task == "mm-classification":
self._state.task = "mm-" + get_classification_objective(
len(np.unique(self._y_train_all))
)
if not isinstance(split_type, str):
assert hasattr(split_type, "split") and hasattr(
split_type, "get_n_splits"
Expand Down Expand Up @@ -2452,6 +2457,9 @@ def is_to_reverse_metric(metric, task):
estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
elif _is_nlp_task(self._state.task):
estimator_list = ["transformer"]
# NOTE: if multimodal task, use multimodal estimator
elif self._state.task in MM_TASKS:
estimator_list = ["multimodal"]
else:
try:
import catboost
Expand Down
18 changes: 16 additions & 2 deletions flaml/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@
SEQCLASSIFICATION,
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
"mm-binary",
"mm-multiclass",
"mm-classification",
)
SEQREGRESSION = "seq-regression"
REGRESSION = ("regression", SEQREGRESSION)
REGRESSION = ("regression", SEQREGRESSION, "mm-regression")
TS_FORECASTREGRESSION = (
"forecast",
"ts_forecast",
Expand All @@ -46,6 +49,11 @@
MULTICHOICECLASSIFICATION,
TOKENCLASSIFICATION,
)
MM_TASKS = (
"mm-classification",
"mm-regression",
"mm-binary",
"mm-multiclass",)


def _is_nlp_task(task):
Expand Down Expand Up @@ -245,7 +253,6 @@ def concat(X1, X2):

class DataTransformer:
"""Transform input training data."""

def fit_transform(self, X: Union[DataFrame, np.array], y, task):
"""Fit transformer and process the input training data according to the task type.

Expand All @@ -269,6 +276,10 @@ def fit_transform(self, X: Union[DataFrame, np.array], y, task):
if len(str_columns) > 0:
X[str_columns] = X[str_columns].astype("string")
self._str_columns = str_columns
# NOTE: if multimodal task, no preprocessing on X
elif task in MM_TASKS:
for column in X.columns:
X[column].astype("object")
elif isinstance(X, DataFrame):
X = X.copy()
n = X.shape[0]
Expand Down Expand Up @@ -395,6 +406,9 @@ def transform(self, X: Union[DataFrame, np.array]):
# ids (input ids, token type id, attention mask, etc.)
if len(self._str_columns) > 0:
X[self._str_columns] = X[self._str_columns].astype("string")
elif self._task in MM_TASKS:
for column in X.columns:
X[column].astype("category")
elif isinstance(X, DataFrame):
cat_columns, num_columns, datetime_columns = (
self._cat_columns,
Expand Down
7 changes: 6 additions & 1 deletion flaml/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
ARIMA,
SARIMAX,
TransformersEstimator,
MultiModalEstimator,
TransformersEstimatorModelSelection,
)
from .data import CLASSIFICATION, group_counts, TS_FORECAST, TS_VALUE_COL
Expand Down Expand Up @@ -122,6 +123,8 @@ def get_estimator_class(task, estimator_name):
estimator_class = SARIMAX
elif estimator_name == "transformer":
estimator_class = TransformersEstimator
elif estimator_name == "multimodal":
estimator_class = MultiModalEstimator
elif estimator_name == "transformer_ms":
estimator_class = TransformersEstimatorModelSelection
else:
Expand Down Expand Up @@ -584,7 +587,7 @@ def compute_estimator(
n_jobs=n_jobs,
)

if isinstance(estimator, TransformersEstimator):
if isinstance(estimator, (TransformersEstimator, MultiModalEstimator)):
fit_kwargs["metric"] = eval_metric
fit_kwargs["X_val"] = X_val
fit_kwargs["y_val"] = y_val
Expand Down Expand Up @@ -650,6 +653,8 @@ def train_estimator(
)
if isinstance(estimator, TransformersEstimator):
fit_kwargs["metric"] = eval_metric
elif isinstance(estimator, MultiModalEstimator):
fit_kwargs["metric"] = eval_metric

if X_train is not None:
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
Expand Down
117 changes: 117 additions & 0 deletions flaml/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
SUMMARIZATION,
NLG_TASKS,
MULTICHOICECLASSIFICATION,
MM_TASKS
)

try:
Expand Down Expand Up @@ -133,6 +134,13 @@ def estimator(self):
def _preprocess(self, X):
return X

@staticmethod
def _join(X_train, y_train):
    """Join features and labels into one training DataFrame.

    Wraps ``y_train`` in a DataFrame aligned to ``X_train``'s index and names
    the single column "label" — the column name the AutoGluon TextPredictor
    is configured to predict (see ``MultiModalEstimator.fit``).

    Args:
        X_train: feature DataFrame.
        y_train: labels (array-like/Series), same length as X_train.

    Returns:
        ``X_train`` with an extra "label" column appended.
    """
    y_train = DataFrame(y_train, index=X_train.index)
    y_train.columns = ["label"]
    train_df = X_train.join(y_train)
    return train_df

def _fit(self, X_train, y_train, **kwargs):

current_time = time.time()
Expand Down Expand Up @@ -2127,6 +2135,115 @@ class XGBoostLimitDepth_TS(TS_SKLearn):
base_class = XGBoostLimitDepthEstimator


class MultiModalEstimator(BaseEstimator):
    """The class for tuning an AutoGluon TextPredictor on multimodal data.

    Each trial trains a TextPredictor persisted under ``self.model_path``;
    ``predict``/``predict_proba``/``score`` reload the model from disk, so the
    estimator instance itself stays lightweight.
    """

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        import uuid

        # Short unique id: each trial writes its model to its own subdirectory.
        # NOTE: uuid1().hex is already a str; no str() wrapper needed.
        self.trial_id = uuid.uuid1().hex[:8]

    @classmethod
    def search_space(cls, **params):
        """Return the hyperparameter search space for the TextPredictor.

        Keys follow AutoGluon's config naming, e.g. 'optimization.learning_rate'.
        Reference:
        https://auto.gluon.ai/stable/tutorials/text_prediction/customization.html#custom-hyperparameter-values
        """
        search_space_dict = {
            "model.fusion_mlp.hidden_sizes": {
                "domain": tune.choice(list(range(32, 129))),
                "init_value": 128,
            },
            "optimization.learning_rate": {
                "domain": tune.loguniform(lower=1e-5, upper=1e-4),
                "init_value": 1e-4,
            },
            "optimization.weight_decay": {
                "domain": tune.choice([1e-4, 1e-3, 1e-2]),
                "init_value": 1e-4,
            },
            "optimization.warmup_steps": {
                "domain": tune.choice([0.1, 0.2]),
                "init_value": 0.1,
            },
        }
        return search_space_dict

    def fit(self, X_train=None, y_train=None, budget=None, **kwargs):
        """Train a TextPredictor within the time budget.

        Args:
            X_train: training feature DataFrame (text/categorical columns).
            y_train: training labels.
            budget: time limit in seconds, forwarded to AutoGluon as time_limit.
            **kwargs: must contain "ag_args" (dict of AGArgs fields) and
                "metric"; may contain "seed", "X_val"/"y_val" (for early
                stopping) and "gpu_per_trial".

        Returns:
            Training time in seconds.
        """
        from autogluon.text import TextPredictor
        from .nlp.utils import AGArgs

        self._kwargs = kwargs
        self.ag_args = AGArgs(**kwargs["ag_args"])
        # 123 is the seed used in the reference AutoGluon experiments.
        seed = self._kwargs.get("seed", 123)

        # Merge the sampled config (self.params) into the preset hyperparameters.
        hyperparameters = self.ag_args.hyperparameters
        for key, value in self.params.items():
            if key == "n_jobs":
                # flaml bookkeeping parameter, not an AutoGluon hyperparameter
                continue
            elif key == "model.fusion_mlp.hidden_sizes":
                # AutoGluon expects a list of layer sizes, not a scalar
                hyperparameters[key] = [value]
            else:
                # flaml samples np.float64, which AutoGluon rejects; convert
                # to a builtin float via .item()
                hyperparameters[key] = (
                    value.item() if isinstance(value, np.float64) else value
                )

        start_time = time.time()
        self.model_path = os.path.join(self.ag_args.output_dir, self.trial_id)
        assert self._task in MM_TASKS, f"The task is not multimodal, but {self._task}. "
        model = TextPredictor(
            path=self.model_path,
            label="label",
            problem_type=self._task[3:],  # strip the "mm-" prefix
            eval_metric=kwargs["metric"],
            backend="pytorch",
            verbosity=0,
        )
        train_data = BaseEstimator._join(X_train, y_train)
        # use valid data for early stopping
        X_val = kwargs.get("X_val")
        y_val = kwargs.get("y_val")
        if X_val is not None and y_val is not None:
            tuning_data = BaseEstimator._join(X_val, y_val)
        else:
            tuning_data = None
        # NOTE: if no tuning_data, model.fit() will holdout a fraction from train_data for early stopping
        model.fit(
            train_data=train_data,
            tuning_data=tuning_data,
            hyperparameters=hyperparameters,
            num_gpus=kwargs.get("gpu_per_trial"),
            time_limit=budget,
            seed=seed,
        )

        training_time = time.time() - start_time
        return training_time

    def predict(self, X):
        """Predict labels for X with the trained model reloaded from disk."""
        from autogluon.text import TextPredictor

        model = TextPredictor.load(path=self.model_path, backend="pytorch")
        output = model.predict(X, as_pandas=False)
        return output

    def predict_proba(self, X):
        """Predict class probabilities for X (classification tasks only)."""
        from autogluon.text import TextPredictor

        # only works for classification tasks
        assert (
            self._task in CLASSIFICATION
        ), "predict_proba() only for classification tasks."
        model = TextPredictor.load(path=self.model_path, backend="pytorch")
        output = model.predict_proba(X, as_pandas=False)
        return output

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        """Evaluate the trained model on the given validation data."""
        from autogluon.text import TextPredictor

        model = TextPredictor.load(path=self.model_path, backend="pytorch")
        val_data = BaseEstimator._join(X_val, y_val)
        return model.evaluate(val_data)


class suppress_stdout_stderr(object):
def __init__(self):
# Open a pair of null files
Expand Down
56 changes: 56 additions & 0 deletions flaml/nlp/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import argparse
from dataclasses import dataclass, field
from itertools import chain
from typing import Dict, Any
import numpy as np
Expand Down Expand Up @@ -475,3 +477,57 @@ def _set_model_config(checkpoint_path):
model_config = _set_model_config(checkpoint_path)
this_model = get_this_model(checkpoint_path, task, model_config)
return this_model


@dataclass
class AGArgs:
    """The AutoGluon configurations.

    Args:
        output_dir (str): root directory for outputting the log,
            intermediate data, and the model.
        hf_model_path (str, optional, defaults to
            "google/electra-base-discriminator"): the Hugging Face model
            checkpoint name.
        per_device_batch_size (int, optional, defaults to 8): per-GPU batch size.
        num_train_epochs (int, optional, defaults to 10): maximum training epochs.
        batch_size (int, optional, defaults to 128): effective (global) batch size.

    The derived field ``hyperparameters`` (dict) is NOT a constructor argument:
    it is built in ``__post_init__`` from AutoGluon's "default" text preset
    overridden with the settings above.
    """

    output_dir: str = field(default="data/mm_output/", metadata={"help": "data dir", "required": True})
    hf_model_path: str = field(default="google/electra-base-discriminator", metadata={"help": "Hugging Face model path"})
    per_device_batch_size: int = field(default=8, metadata={"help": "per device batch size"})
    num_train_epochs: int = field(default=10, metadata={"help": "number of train epochs"})
    batch_size: int = field(default=128, metadata={"help": "batch size"})
    hyperparameters: dict = field(init=False)

    def __post_init__(self):
        """Get the preset using the AGArgs. Save as self.hyperparameters."""
        from autogluon.text.text_prediction.presets import get_text_preset

        # get the override dict from the text preset (config, override) tuple
        self.hyperparameters = get_text_preset("default")[1]

        self.hyperparameters["model.hf_text.checkpoint_name"] = self.hf_model_path
        self.hyperparameters["env.per_gpu_batch_size"] = self.per_device_batch_size
        self.hyperparameters["env.batch_size"] = self.batch_size
        self.hyperparameters["optimization.max_epochs"] = self.num_train_epochs

    @staticmethod
    def load_args():
        """Parse AGArgs init fields from the command line.

        Returns:
            argparse.Namespace with one attribute per init field;
            unknown command-line arguments are ignored.
        """
        from dataclasses import fields

        arg_parser = argparse.ArgumentParser()
        for each_field in fields(AGArgs):
            # BUGFIX: skip derived fields (init=False, e.g. `hyperparameters`).
            # They are computed in __post_init__ and carry no CLI metadata, so
            # the previous code crashed with KeyError on metadata["help"].
            if not each_field.init:
                continue
            arg_parser.add_argument(
                "--" + each_field.name,
                type=each_field.type,
                help=each_field.metadata["help"],
                required=each_field.metadata.get("required", False),
                choices=each_field.metadata.get("choices"),
                default=each_field.default,
            )
        console_args, unknown = arg_parser.parse_known_args()
        return console_args
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
"hcrystalball==0.1.10",
"seqeval",
],
"autogluon": [
"autogluon.text==0.4.0",
"autogluon.features==0.4.0",
],
"catboost": ["catboost>=0.26"],
"blendsearch": ["optuna==2.8.0"],
"ray": [
Expand Down
Loading