From 922b2b7924085d91410ecfdc86e950373477a4d6 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 27 Jun 2022 12:04:23 +0200 Subject: [PATCH 01/12] Upgraded pymatgen and matminer requirements --- README.md | 6 ------ modnet/featurizers/featurizers.py | 8 ++++---- modnet/preprocessing.py | 10 +++++----- setup.py | 8 ++++---- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d4ffd405..72a4b761 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ activate the environment: conda activate modnet ``` -Then, install pymatgen v2020.8.13 with conda, which will bundle several pre-built dependencies (e.g., numpy, scipy): - -```shell -conda install -c conda-forge pymatgen=2020.8.13 -``` - Finally, install MODNet from PyPI with pip: ```bash diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 0835668c..0fd3ec77 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -70,7 +70,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: The featurized DataFrame. @@ -137,7 +137,7 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame, or an empty @@ -184,7 +184,7 @@ def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame. @@ -206,7 +206,7 @@ def featurize_site( Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. aliases: optional dictionary to map matminer output column names to new aliases, mostly used for backwards-compatibility. diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 8cf3bed5..7b888eee 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -13,7 +13,7 @@ from typing import Dict, List, Union, Optional, Callable, Hashable, Iterable, Tuple from functools import partial -from pymatgen import Structure, Composition +from pymatgen.core import Structure, Composition from sklearn.feature_selection import mutual_info_regression, mutual_info_classif from sklearn.utils import resample @@ -539,14 +539,14 @@ def merge_ranked(lists: List[List[Hashable]]) -> List[Hashable]: class MODData: - """The MODData class takes takes a list of `pymatgen.Structure` + """The MODData class takes takes a list of `pymatgen.core.structure.Structure` objects and creates a `pandas.DataFrame` that contains many matminer features per structure. It then uses mutual information between features and targets, and between the features themselves, to perform feature selection using relevance-redundancy indices. Attributes: - df_structure (pd.DataFrame): dataframe storing the `pymatgen.Structure` + df_structure (pd.DataFrame): dataframe storing the `pymatgen.core.structure.Structure` representations for each structured, indexed by ID. df_targets (pd.Dataframe): dataframe storing the prediction targets per structure, indexed by ID. 
@@ -906,12 +906,12 @@ def rebalance(self): @property def structures(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of `pymatgen.Structure` objects.""" + """Returns the list of `pymatgen.core.structure.Structure` objects.""" return list(self.df_structure["structure"]) @property def compositions(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of materials as`pymatgen.Composition` objects.""" + """Returns the list of materials as`pymatgen.core.composition.Composition` objects.""" return [s.composition for s in self.df_structure["structure"]] @property diff --git a/setup.py b/setup.py index 45d311d3..bb05700a 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,10 @@ "pandas>=0.25.3", "tensorflow>=2.4", "tensorflow-probability>=0.12", - "pymatgen>=2020,<2020.9", - "matminer>=0.6.2", - "numpy>=1.18.3", - "scikit-learn>=0.23,<0.24", + "pymatgen>=2022.5.17", + "matminer>=0.7.6", + "numpy>=1.22.3", + "scikit-learn>=1.1.0", ], tests_require=tests_require, test_suite="modnet.tests", From f6c8b7354883afc264799d6ab563cdf9fa73176a Mon Sep 17 00:00:00 2001 From: gbrunin Date: Wed, 12 Apr 2023 16:47:54 +0200 Subject: [PATCH 02/12] Better handling of NaNs in features by adding the possibility to use the mean of the column. Fixes bug where they were all set to 0. --- modnet/hyper_opt/fit_genetic.py | 4 +++ modnet/models/bayesian.py | 11 ++++---- modnet/models/ensemble.py | 8 +++--- modnet/models/vanilla.py | 49 ++++++++++++++++++++++++--------- modnet/preprocessing.py | 11 +++----- 5 files changed, 54 insertions(+), 29 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index dd067109..1bb430fd 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -40,6 +40,7 @@ def __init__( self.weights = weights self.xscale_list = ["minmax", "standard"] + self.impute_missing_list = [-1, "mean"] self.lr_list = [0.1, 0.01, 0.005, 0.001] self.batch_size_list = [32, 64, 128, 256] self.fraction_list = [1, 0.75, 0.5, 0.25] @@ -52,6 +53,7 @@ def __init__( "fraction2": random.choice(self.fraction_list), "fraction3": random.choice(self.fraction_list), "xscale": random.choice(self.xscale_list), + "impute_missing": random.choice(self.impute_missing_list), "lr": random.choice(self.lr_list), "batch_size": random.choice(self.batch_size_list), "n_feat": 0, @@ -210,6 +212,7 @@ def evaluate( epochs=800 if not fast else 1, batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], + impute_missing=self.genes["impute_missing"], callbacks=callbacks, verbose=0, ) @@ -272,6 +275,7 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False): epochs=800 if not fast else 1, batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], + impute_missing=self.genes["impute_missing"], callbacks=callbacks, verbose=0, ) diff --git a/modnet/models/bayesian.py b/modnet/models/bayesian.py index 8790cd92..a1bc13bf 100644 --- a/modnet/models/bayesian.py +++ b/modnet/models/bayesian.py @@ -89,6 +89,7 @@ def __init__( self.out_act = out_act self._scaler = None + self._imputer = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -300,17 +301,17 @@ class OR only return the most probable class. 
# prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf, np.nan], 0)[ - self.optimal_descriptors[: self.n_feat] - ] + .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] .values ) # Scale the input features: - x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - x = np.nan_to_num(x) + + # Impute missing data + if self._imputer is not None: + x = self._imputer.transform(x) all_predictions = [] diff --git a/modnet/models/ensemble.py b/modnet/models/ensemble.py index 991c47b5..f4d1372b 100644 --- a/modnet/models/ensemble.py +++ b/modnet/models/ensemble.py @@ -142,9 +142,9 @@ def predict( Parameters: test_data: A featurized and feature-selected `MODData` object containing the descriptors used in training. - return_prob: For a classification tasks only: whether to return the probability of each + return_prob: For a classification task only: whether to return the probability of each class OR only return the most probable class. - return_unc: wheter to return a second dataframe containing the uncertainties + return_unc: whether to return a second dataframe containing the uncertainties Returns: A `pandas.DataFrame` containing the predicted values of the targets. @@ -276,8 +276,6 @@ def fit_preset( for k, _ in enumerate(presets): presets[k]["epochs"] = 5 - val_losses = 1e20 * np.ones((len(presets),)) - num_nested_folds = 5 if nested: num_nested_folds = nested @@ -445,6 +443,7 @@ def _validate_ensemble_model( act="relu", out_act="linear", xscale="minmax", + impute_missing=-1, callbacks=[], preset_id=None, fold_id=None, @@ -469,6 +468,7 @@ def _validate_ensemble_model( batch_size=batch_size, loss=loss, xscale=xscale, + impute_missing=impute_missing, callbacks=callbacks, verbose=verbose, val_fraction=0, diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 6aa62133..d6eeb4d3 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -3,7 +3,7 @@ """ -from typing import List, Tuple, Dict, Optional, Callable, Any +from typing import List, Tuple, Dict, Optional, Callable, Any, Union from pathlib import Path import multiprocessing @@ -12,6 +12,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error, roc_auc_score +from sklearn.impute import SimpleImputer import tensorflow as tf from modnet.preprocessing import MODData @@ -88,6 +89,7 @@ def __init__( self.out_act = out_act self._scaler = None + self._imputer = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -214,6 +216,7 @@ def fit( epochs: int = 200, batch_size: int = 128, xscale: Optional[str] = "minmax", + impute_missing: Optional[Union[float, str]] = -1, metrics: List[str] = ["mae"], callbacks: List[Callable] = None, verbose: int = 0, @@ -237,6 +240,13 @@ def fit( batch_size: The batch size to use for training. xscale: The feature scaler to use, either `None`, `'minmax'` or `'standard'`. + impute_missing: Determines how the NaN features are treated. + If float, sets the NaNs to the given float when the features + are scaled with xscale (default to -1). + If you use a StandardScaler (see xscale), make sure to use a value + that makes sense (most likely not -1 !). + If string, defines the strategy used in the scikit-learn SimpleImputer, + e.g., "mean" sets the NaNs to the mean of their feature column. metrics: A list of tf.keras metrics to pass to `compile(...)`. 
loss: The built-in tf.keras loss to pass to `compile(...)`. fit_params: Any additional parameters to pass to `fit(...)`, @@ -293,14 +303,25 @@ def fit( self._scaler = StandardScaler() x = self._scaler.fit_transform(x) - x = np.nan_to_num(x, nan=-1) + + # Handles NaN data + if isinstance(impute_missing, str): + imp = SimpleImputer(missing_values=np.nan, strategy=impute_missing) + else: + imp = SimpleImputer( + missing_values=np.nan, strategy="constant", fill_value=impute_missing + ) + + self._imputer = imp + + x = self._imputer.fit_transform(x) if val_data is not None: val_x = val_data.get_featurized_df()[ self.optimal_descriptors[: self.n_feat] ].values val_x = self._scaler.transform(val_x) - val_x = np.nan_to_num(val_x, nan=-1) + val_x = self._imputer.transform(val_x) val_y = [] for targ in self.targets_flatten: if self.num_classes[targ] >= 2: # Classification @@ -384,6 +405,7 @@ def fit_preset( nested: int = 5, callbacks: List[Any] = None, n_jobs=None, + **fit_params, ) -> Tuple[ List[List[Any]], np.ndarray, @@ -576,11 +598,13 @@ def fit_preset( loss=best_preset["loss"], callbacks=callbacks, verbose=verbose, + **fit_params, ) else: self.n_feat = best_model.n_feat self.model = best_model.model self._scaler = best_model._scaler + self._imputer = best_model._imputer os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" # reset @@ -603,17 +627,17 @@ class OR only return the most probable class. # prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf, np.nan], 0)[ - self.optimal_descriptors[: self.n_feat] - ] + .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] .values ) # Scale the input features: - x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - x = np.nan_to_num(x, nan=-1) + + # Handle the missing data (NaN features) + if self._imputer is not None: + x = self._imputer.transform(x) p = np.array(self.model.predict(x)) @@ -669,17 +693,16 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame: # prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf, np.nan], 0)[ - self.optimal_descriptors[: self.n_feat] - ] + .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] .values ) # Scale the input features: - x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - x = np.nan_to_num(x, nan=-1) + + if self._imputer is not None: + x = self._imputer.transform(x) y_pred = np.array(self.model.predict(x)) if len(y_pred.shape) == 2: diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index bdf3bb88..354d78eb 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -769,7 +769,8 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None): else: df_final = self.featurizer.featurize(self.df_structure) - df_final = df_final.replace([np.inf, -np.inf, np.nan], 0) + # replace infinite values by nan that are handled during the fit + df_final = df_final.replace([np.inf, -np.inf], np.nan) self.df_featurized = df_final LOG.info("Data has successfully been featurized!") @@ -801,7 +802,7 @@ def feature_selection( """ if getattr(self, "df_featurized", None) is None: raise RuntimeError( - "Mutual information feature selection requiresd featurized data, please call `.featurize()`" + "Mutual information feature selection requires featurized data, please call `.featurize()`" ) if getattr(self, "df_targets", None) is None: raise RuntimeError( @@ -813,8 +814,6 @@ def 
feature_selection( if cross_nmi is not None: self.cross_nmi = cross_nmi - elif getattr(self, "cross_nmi", None) is None: - self.cross_nmi = None # Loading mutual information between features if use_precomputed_cross_nmi: @@ -841,9 +840,7 @@ def feature_selection( ) if self.cross_nmi.isna().sum().sum() > 0: - raise RuntimeError( - "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero." - ) + raise RuntimeError("Cross NMI (`moddata.cross_nmi`) contains NaN values.") for i, name in enumerate(self.names): LOG.info(f"Starting target {i + 1}/{len(self.names)}: {self.names[i]} ...") From f247ac880e892de594b225f362df172ddbcccb41 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Wed, 12 Apr 2023 18:03:14 +0200 Subject: [PATCH 03/12] Small bug when adding keys to genes. --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 1bb430fd..a67ce551 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -79,14 +79,14 @@ def crossover(self, partner: Individual) -> Individual: """ genes_from_mother = random.sample( - range(10), k=5 + range(len(self.genes)), k=5 ) # creates indices to take randomly 5 genes from one parent, and 5 genes from the other child_genes = { list(self.genes.keys())[i]: list(self.genes.values())[i] if i in genes_from_mother else list(partner.genes.values())[i] - for i in range(10) + for i in range(len(self.genes)) } child = Individual( From 1ea13e4a9ccacf68162730d0d9c95fe059eb2806 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Thu, 13 Apr 2023 11:17:34 +0200 Subject: [PATCH 04/12] Small typo and bug fix. --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index a67ce551..4dc83aad 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -481,7 +481,7 @@ def run( n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None. early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4. refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0). - The amount corresponds the the number of models used in the ensemble. Defaults to 0. + The amount corresponds to the number of models used in the ensemble. Defaults to 0. fast (bool, optional): Use only for debugging and testing. A fast GA run with small number of epochs, generations, individuals and folds. Overrides the size_pop, num_generation and nested arguments.. Defaults to False. @@ -600,7 +600,7 @@ def run( else: ensemble = [] - for m in models[ranking[:10]]: + for m in models[ranking[:refit]]: ensemble += m.model self.best_model = EnsembleMODNetModel(modnet_models=ensemble) From eb42b1ee5fcbdb1d951e8f8f148d9d9bd862775d Mon Sep 17 00:00:00 2001 From: gbrunin Date: Thu, 13 Apr 2023 16:56:11 +0200 Subject: [PATCH 05/12] Update with a choice of order between scaling and imputing. 
--- modnet/hyper_opt/fit_genetic.py | 31 +++++++++---- modnet/models/bayesian.py | 11 +++-- modnet/models/vanilla.py | 79 +++++++++++++++++++++------------ modnet/preprocessing.py | 4 +- 4 files changed, 80 insertions(+), 45 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 4dc83aad..d5bc59ae 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -22,6 +22,7 @@ def __init__( multi_label: bool, targets: List = None, weights: Dict[str, float] = None, + **model_params, ) -> Individual: """ Args: @@ -33,6 +34,9 @@ def __init__( weights (Dict[str, float]): Optional (for joint learning only). The relative loss weights to apply for each target. """ + self.elu = "elu" + self.loss = "mae" + self.n_neurons_first_layer = 32 * random.randint(1, 10) self.max_feat = max_feat self.num_classes = num_classes self.multi_label = multi_label @@ -40,20 +44,25 @@ def __init__( self.weights = weights self.xscale_list = ["minmax", "standard"] - self.impute_missing_list = [-1, "mean"] + self.impute_missing_list = [0, "mean"] + self.xscale_before_impute = True self.lr_list = [0.1, 0.01, 0.005, 0.001] self.batch_size_list = [32, 64, 128, 256] self.fraction_list = [1, 0.75, 0.5, 0.25] + if model_params: + self.__dict__.update(model_params) + self.genes = { - "act": "elu", - "loss": "mae", - "n_neurons_first_layer": 32 * random.randint(1, 10), + "act": self.elu, + "loss": self.loss, + "n_neurons_first_layer": self.n_neurons_first_layer, "fraction1": random.choice(self.fraction_list), "fraction2": random.choice(self.fraction_list), "fraction3": random.choice(self.fraction_list), "xscale": random.choice(self.xscale_list), "impute_missing": random.choice(self.impute_missing_list), + "xscale_before_impute": self.xscale_before_impute, "lr": random.choice(self.lr_list), "batch_size": random.choice(self.batch_size_list), "n_feat": 0, @@ -79,8 +88,8 @@ def crossover(self, partner: Individual) -> Individual: """ genes_from_mother = random.sample( - range(len(self.genes)), k=5 - ) # creates indices to take randomly 5 genes from one parent, and 5 genes from the other + range(len(self.genes)), k=len(self.genes) // 2 + ) # creates indices to take randomly half the genes from one parent, and half the genes from the other child_genes = { list(self.genes.keys())[i]: list(self.genes.values())[i] @@ -213,6 +222,7 @@ def evaluate( batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], impute_missing=self.genes["impute_missing"], + xscale_before_impute=self.genes["xscale_before_impute"], callbacks=callbacks, verbose=0, ) @@ -276,6 +286,7 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False): batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], impute_missing=self.genes["impute_missing"], + xscale_before_impute=self.genes["xscale_before_impute"], callbacks=callbacks, verbose=0, ) @@ -338,7 +349,9 @@ def _end_run(self): self.pool.close() self.pool.join() - def initialization_population(self, size_pop: int, multi_label: bool) -> None: + def initialization_population( + self, size_pop: int, multi_label: bool, **model_params + ) -> None: """Initializes the initial population (Generation 0). 
Args: @@ -354,6 +367,7 @@ def initialization_population(self, size_pop: int, multi_label: bool) -> None: multi_label=multi_label, targets=self.targets, weights=self.weights, + **model_params, ) for _ in range(size_pop) ] @@ -468,6 +482,7 @@ def run( early_stopping: Optional[int] = 4, refit: Optional[int] = 5, fast=False, + **model_params, ) -> EnsembleMODNetModel: """Run the GA and return best model. @@ -495,7 +510,7 @@ def run( LOG.info("Generation number 0") self.initialization_population( - size_pop, multi_label=multi_label + size_pop, multi_label=multi_label, **model_params ) # initialization of the population val_loss, models, individuals = self.function_fitness( pop=self.pop, diff --git a/modnet/models/bayesian.py b/modnet/models/bayesian.py index a1bc13bf..8790cd92 100644 --- a/modnet/models/bayesian.py +++ b/modnet/models/bayesian.py @@ -89,7 +89,6 @@ def __init__( self.out_act = out_act self._scaler = None - self._imputer = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -301,17 +300,17 @@ class OR only return the most probable class. # prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] + .replace([np.inf, -np.inf, np.nan], 0)[ + self.optimal_descriptors[: self.n_feat] + ] .values ) # Scale the input features: + x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - - # Impute missing data - if self._imputer is not None: - x = self._imputer.transform(x) + x = np.nan_to_num(x) all_predictions = [] diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index d6eeb4d3..058e1715 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -13,6 +13,7 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error, roc_auc_score from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline import tensorflow as tf from modnet.preprocessing import MODData @@ -88,8 +89,11 @@ def __init__( self.act = act self.out_act = out_act + self.xscale = None self._scaler = None self._imputer = None + self.impute_missing = None + self._scale_impute = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -216,7 +220,8 @@ def fit( epochs: int = 200, batch_size: int = 128, xscale: Optional[str] = "minmax", - impute_missing: Optional[Union[float, str]] = -1, + impute_missing: Optional[Union[float, str]] = 0, + xscale_before_impute: bool = True, metrics: List[str] = ["mae"], callbacks: List[Callable] = None, verbose: int = 0, @@ -241,12 +246,16 @@ def fit( xscale: The feature scaler to use, either `None`, `'minmax'` or `'standard'`. impute_missing: Determines how the NaN features are treated. - If float, sets the NaNs to the given float when the features - are scaled with xscale (default to -1). - If you use a StandardScaler (see xscale), make sure to use a value - that makes sense (most likely not -1 !). - If string, defines the strategy used in the scikit-learn SimpleImputer, + If str, defines the strategy used in the scikit-learn SimpleImputer, e.g., "mean" sets the NaNs to the mean of their feature column. + If a float is provided, and if xscale_before_impute is False, this + float is used to replace NaNs in the original dataset. + If a float is provided but xscale_before_impute is True, the float + is not used and standard values are used. 
+ If you want to do something more sophisticated, make your own + modifications to MODData.df_featurized before fitting the model. + xscale_before_impute: whether to first scale the input and then impute values, or + first impute values and then scale the inputs. metrics: A list of tf.keras metrics to pass to `compile(...)`. loss: The built-in tf.keras loss to pass to `compile(...)`. fit_params: Any additional parameters to pass to `fit(...)`, @@ -262,6 +271,7 @@ def fit( ) self.xscale = xscale + self.impute_missing = impute_missing self.target_names = list(self.weights.keys()) self.optimal_descriptors = training_data.get_optimal_descriptors() @@ -295,33 +305,50 @@ def fit( ) y.append(y_inner) - # Scale the input features: + # Define the scaler if self.xscale == "minmax": self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5)) elif self.xscale == "standard": self._scaler = StandardScaler() - x = self._scaler.fit_transform(x) - - # Handles NaN data + # Define the imputer if isinstance(impute_missing, str): - imp = SimpleImputer(missing_values=np.nan, strategy=impute_missing) + self._imputer = SimpleImputer( + missing_values=np.nan, strategy=impute_missing + ) else: - imp = SimpleImputer( + if self.xscale == "minmax": + impute_missing = -1 if xscale_before_impute else impute_missing + elif self.xscale == "standard": + impute_missing = ( + 10 * np.max(StandardScaler().fit_transform(np.nan_to_num(x))) + if xscale_before_impute + else impute_missing + ) + self.impute_missing = impute_missing + + self._imputer = SimpleImputer( missing_values=np.nan, strategy="constant", fill_value=impute_missing ) - self._imputer = imp + # Scale and impute input features in the desired order + if xscale_before_impute: + self._scale_impute = Pipeline( + [("scaler", self._scaler), ("imputer", self._imputer)] + ) + else: + self._scale_impute = Pipeline( + [("imputer", self._imputer), ("scaler", self._scaler)] + ) - x = self._imputer.fit_transform(x) + x = self._scale_impute.fit_transform(x) if val_data is not None: val_x = val_data.get_featurized_df()[ self.optimal_descriptors[: self.n_feat] ].values - val_x = self._scaler.transform(val_x) - val_x = self._imputer.transform(val_x) + val_x = self._scale_impute.transform(val_x) val_y = [] for targ in self.targets_flatten: if self.num_classes[targ] >= 2: # Classification @@ -605,6 +632,7 @@ def fit_preset( self.model = best_model.model self._scaler = best_model._scaler self._imputer = best_model._imputer + self._scale_impute = best_model._scale_impute os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" # reset @@ -631,13 +659,9 @@ class OR only return the most probable class. 
.values ) - # Scale the input features: - if self._scaler is not None: - x = self._scaler.transform(x) - - # Handle the missing data (NaN features) - if self._imputer is not None: - x = self._imputer.transform(x) + # Scale and impute input features: + if self._scale_impute is not None: + x = self._scale_impute.transform(x) p = np.array(self.model.predict(x)) @@ -697,12 +721,9 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame: .values ) - # Scale the input features: - if self._scaler is not None: - x = self._scaler.transform(x) - - if self._imputer is not None: - x = self._imputer.transform(x) + # Scale and impute input features: + if self._scale_impute is not None: + x = self._scale_impute.transform(x) y_pred = np.array(self.model.predict(x)) if len(y_pred.shape) == 2: diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 354d78eb..0e6193b9 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -24,7 +24,7 @@ import tqdm from multiprocessing import Pool -from modnet.featurizers import MODFeaturizer +from modnet.featurizers import MODFeaturizer, clean_df from modnet import __version__ from modnet.utils import LOG @@ -770,7 +770,7 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None): df_final = self.featurizer.featurize(self.df_structure) # replace infinite values by nan that are handled during the fit - df_final = df_final.replace([np.inf, -np.inf], np.nan) + df_final = clean_df(df_final) self.df_featurized = df_final LOG.info("Data has successfully been featurized!") From 23f8ee460a3380e9ca7b54102e1fa51aacd6457e Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Apr 2023 08:35:50 +0200 Subject: [PATCH 06/12] Small bug fix. --- modnet/models/vanilla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 058e1715..e88184a0 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -322,7 +322,7 @@ def fit( impute_missing = -1 if xscale_before_impute else impute_missing elif self.xscale == "standard": impute_missing = ( - 10 * np.max(StandardScaler().fit_transform(np.nan_to_num(x))) + 10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x))) if xscale_before_impute else impute_missing ) From 3b3e4b318e5336aeed4446168de0610217945162 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Tue, 9 May 2023 16:47:41 +0200 Subject: [PATCH 07/12] Small name change. --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index d5bc59ae..4cebbb72 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -34,7 +34,7 @@ def __init__( weights (Dict[str, float]): Optional (for joint learning only). The relative loss weights to apply for each target. """ - self.elu = "elu" + self.act = "elu" self.loss = "mae" self.n_neurons_first_layer = 32 * random.randint(1, 10) self.max_feat = max_feat @@ -54,7 +54,7 @@ def __init__( self.__dict__.update(model_params) self.genes = { - "act": self.elu, + "act": self.act, "loss": self.loss, "n_neurons_first_layer": self.n_neurons_first_layer, "fraction1": random.choice(self.fraction_list), From 3b27d5c4903d2a54731e529bc2669efaacac5875 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Wed, 17 May 2023 08:13:55 +0200 Subject: [PATCH 08/12] Typos. 
--- modnet/hyper_opt/fit_genetic.py | 2 +- modnet/models/vanilla.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 4cebbb72..82a555eb 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -495,7 +495,7 @@ def run( In this case the softmax output-activation is replaced by a sigmoid. n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None. early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4. - refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0). + refit (Optional[int], optional): Whether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0). The amount corresponds to the number of models used in the ensemble. Defaults to 0. fast (bool, optional): Use only for debugging and testing. A fast GA run with small number of epochs, generations, individuals and folds. Overrides the size_pop, num_generation and nested arguments.. Defaults to False. diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index e88184a0..17d3b641 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -255,7 +255,7 @@ def fit( If you want to do something more sophisticated, make your own modifications to MODData.df_featurized before fitting the model. xscale_before_impute: whether to first scale the input and then impute values, or - first impute values and then scale the inputs. + first impute values and then scale the inputs. metrics: A list of tf.keras metrics to pass to `compile(...)`. loss: The built-in tf.keras loss to pass to `compile(...)`. fit_params: Any additional parameters to pass to `fit(...)`, From cf1edf9ca4af8d0145f8b2a2ef0542c1d6c78471 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:27:53 +0100 Subject: [PATCH 09/12] Rename according to PP's PR --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 50b8773c..efa18a3f 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -54,8 +54,8 @@ def __init__( self.batch_size_list = [32, 64, 128, 256] self.fraction_list = [1, 0.75, 0.5, 0.25] - if model_params: - self.__dict__.update(model_params) + if fit_params: + self.__dict__.update(fit_params) self.genes = { "act": self.act, From dd4e5e12535cf9d67d823bb57bcb7770d6381a43 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:32:40 +0100 Subject: [PATCH 10/12] Add some additional 'bad columns' in testing --- modnet/tests/test_preprocessing.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modnet/tests/test_preprocessing.py b/modnet/tests/test_preprocessing.py index f2d05a30..b88ab604 100644 --- a/modnet/tests/test_preprocessing.py +++ b/modnet/tests/test_preprocessing.py @@ -31,7 +31,17 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03): # different number of symm ops being detected. 
# We need a mechanism to allow these discrepancies through in certain cases: - allowed_bad_columns = ["GlobalSymmetryFeatures|n_symmetry_ops"] + allowed_bad_columns = [ + "GlobalSymmetryFeatures|n_symmetry_ops", + 'GlobalSymmetryFeatures|crystal_system', + 'YangSolidSolution|Yang delta', + 'Miedema|Miedema_deltaH_inter', + 'AtomicPackingEfficiency|mean simul. packing efficiency', + 'Miedema|Miedema_deltaH_amor', + 'AtomicPackingEfficiency|mean abs simul. packing efficiency', + 'Miedema|Miedema_deltaH_ss_min' + ] + for col in allowed_bad_columns: if col in error_cols: error_cols.remove(col) From 4542d072a73f652234848969f95c07fccf27de6f Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:33:54 +0100 Subject: [PATCH 11/12] Tidy up merge --- modnet/hyper_opt/fit_genetic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index efa18a3f..9ee0c0e1 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -38,7 +38,7 @@ def __init__( """ self.act = "elu" - self.loss = "mae" + self.loss = loss self.n_neurons_first_layer = 32 * random.randint(1, 10) self.max_feat = max_feat self.num_classes = num_classes @@ -61,8 +61,6 @@ def __init__( "act": self.act, "loss": self.loss, "n_neurons_first_layer": self.n_neurons_first_layer, - "loss": loss, - "n_neurons_first_layer": 32 * random.randint(1, 10), "fraction1": random.choice(self.fraction_list), "fraction2": random.choice(self.fraction_list), "fraction3": random.choice(self.fraction_list), From 9eeca8295fefbb44b8ec9439da5af3cd8993c82b Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:40:58 +0100 Subject: [PATCH 12/12] Fix linting --- modnet/tests/test_preprocessing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modnet/tests/test_preprocessing.py b/modnet/tests/test_preprocessing.py index b88ab604..e27fa477 100644 --- a/modnet/tests/test_preprocessing.py +++ b/modnet/tests/test_preprocessing.py @@ -32,14 +32,14 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03): # We need a mechanism to allow these discrepancies through in certain cases: allowed_bad_columns = [ - "GlobalSymmetryFeatures|n_symmetry_ops", - 'GlobalSymmetryFeatures|crystal_system', - 'YangSolidSolution|Yang delta', - 'Miedema|Miedema_deltaH_inter', - 'AtomicPackingEfficiency|mean simul. packing efficiency', - 'Miedema|Miedema_deltaH_amor', - 'AtomicPackingEfficiency|mean abs simul. packing efficiency', - 'Miedema|Miedema_deltaH_ss_min' + "GlobalSymmetryFeatures|n_symmetry_ops", + "GlobalSymmetryFeatures|crystal_system", + "YangSolidSolution|Yang delta", + "Miedema|Miedema_deltaH_inter", + "AtomicPackingEfficiency|mean simul. packing efficiency", + "Miedema|Miedema_deltaH_amor", + "AtomicPackingEfficiency|mean abs simul. packing efficiency", + "Miedema|Miedema_deltaH_ss_min", ] for col in allowed_bad_columns:
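Editor's note (not part of the patch series): the central change in patches 05 and 06 is that `MODNetModel.fit` now builds a scikit-learn `Pipeline` that either scales then imputes or imputes then scales, depending on `xscale_before_impute`, with the constant fill value chosen relative to the scaler. The sketch below reproduces that preprocessing logic in isolation so the behaviour can be checked outside MODNet; the helper name `build_scale_impute` is invented for illustration and does not exist in the codebase.

    import numpy as np
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    def build_scale_impute(x, xscale="minmax", impute_missing=0, xscale_before_impute=True):
        """Mirror the feature preprocessing added to MODNetModel.fit (illustrative only)."""
        scaler = MinMaxScaler(feature_range=(-0.5, 0.5)) if xscale == "minmax" else StandardScaler()

        if isinstance(impute_missing, str):
            # e.g. "mean": NaNs become the mean of their feature column
            imputer = SimpleImputer(missing_values=np.nan, strategy=impute_missing)
        else:
            if xscale_before_impute:
                # when scaling happens first, the fill value is chosen to lie
                # outside the scaled feature range, as in the patched fit()
                if xscale == "minmax":
                    impute_missing = -1
                else:
                    impute_missing = 10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x)))
            imputer = SimpleImputer(
                missing_values=np.nan, strategy="constant", fill_value=impute_missing
            )

        steps = (
            [("scaler", scaler), ("imputer", imputer)]
            if xscale_before_impute
            else [("imputer", imputer), ("scaler", scaler)]
        )
        return Pipeline(steps)

    # toy feature matrix with missing entries
    x = np.array([[1.0, np.nan], [2.0, 3.0], [np.nan, 4.0]])
    pipe = build_scale_impute(x, xscale="minmax", xscale_before_impute=True)
    print(pipe.fit_transform(x))  # NaNs end up at -1, outside the (-0.5, 0.5) scaled range

Both scalers in recent scikit-learn ignore NaNs when fitting and propagate them through `transform`, which is what makes the scale-then-impute ordering possible; the fitted `Pipeline` is then reused unchanged at predict and evaluate time via `self._scale_impute.transform(x)`.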