From 922b2b7924085d91410ecfdc86e950373477a4d6 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Mon, 27 Jun 2022 12:04:23 +0200 Subject: [PATCH 01/12] Upgraded pymatgen and matminer requirements --- README.md | 6 ------ modnet/featurizers/featurizers.py | 8 ++++---- modnet/preprocessing.py | 10 +++++----- setup.py | 8 ++++---- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d4ffd405..72a4b761 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ activate the environment: conda activate modnet ``` -Then, install pymatgen v2020.8.13 with conda, which will bundle several pre-built dependencies (e.g., numpy, scipy): - -```shell -conda install -c conda-forge pymatgen=2020.8.13 -``` - Finally, install MODNet from PyPI with pip: ```bash diff --git a/modnet/featurizers/featurizers.py b/modnet/featurizers/featurizers.py index 0835668c..0fd3ec77 100644 --- a/modnet/featurizers/featurizers.py +++ b/modnet/featurizers/featurizers.py @@ -70,7 +70,7 @@ def featurize(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: The featurized DataFrame. @@ -137,7 +137,7 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame, or an empty @@ -184,7 +184,7 @@ def featurize_structure(self, df: pd.DataFrame) -> pd.DataFrame: Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. Returns: pandas.DataFrame: the decorated DataFrame. @@ -206,7 +206,7 @@ def featurize_site( Arguments: df: the input dataframe with a `"structure"` column - containing `pymatgen.Structure` objects. + containing `pymatgen.core.structure.Structure` objects. aliases: optional dictionary to map matminer output column names to new aliases, mostly used for backwards-compatibility. diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 8cf3bed5..7b888eee 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -13,7 +13,7 @@ from typing import Dict, List, Union, Optional, Callable, Hashable, Iterable, Tuple from functools import partial -from pymatgen import Structure, Composition +from pymatgen.core import Structure, Composition from sklearn.feature_selection import mutual_info_regression, mutual_info_classif from sklearn.utils import resample @@ -539,14 +539,14 @@ def merge_ranked(lists: List[List[Hashable]]) -> List[Hashable]: class MODData: - """The MODData class takes takes a list of `pymatgen.Structure` + """The MODData class takes takes a list of `pymatgen.core.structure.Structure` objects and creates a `pandas.DataFrame` that contains many matminer features per structure. It then uses mutual information between features and targets, and between the features themselves, to perform feature selection using relevance-redundancy indices. Attributes: - df_structure (pd.DataFrame): dataframe storing the `pymatgen.Structure` + df_structure (pd.DataFrame): dataframe storing the `pymatgen.core.structure.Structure` representations for each structured, indexed by ID. df_targets (pd.Dataframe): dataframe storing the prediction targets per structure, indexed by ID. 
@@ -906,12 +906,12 @@ def rebalance(self): @property def structures(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of `pymatgen.Structure` objects.""" + """Returns the list of `pymatgen.core.structure.Structure` objects.""" return list(self.df_structure["structure"]) @property def compositions(self) -> List[Union[Structure, CompositionContainer]]: - """Returns the list of materials as`pymatgen.Composition` objects.""" + """Returns the list of materials as`pymatgen.core.composition.Composition` objects.""" return [s.composition for s in self.df_structure["structure"]] @property diff --git a/setup.py b/setup.py index 45d311d3..bb05700a 100644 --- a/setup.py +++ b/setup.py @@ -37,10 +37,10 @@ "pandas>=0.25.3", "tensorflow>=2.4", "tensorflow-probability>=0.12", - "pymatgen>=2020,<2020.9", - "matminer>=0.6.2", - "numpy>=1.18.3", - "scikit-learn>=0.23,<0.24", + "pymatgen>=2022.5.17", + "matminer>=0.7.6", + "numpy>=1.22.3", + "scikit-learn>=1.1.0", ], tests_require=tests_require, test_suite="modnet.tests", From f6c8b7354883afc264799d6ab563cdf9fa73176a Mon Sep 17 00:00:00 2001 From: gbrunin Date: Wed, 12 Apr 2023 16:47:54 +0200 Subject: [PATCH 02/12] Better handling of NaNs in features by adding the possibility to use the mean of the column. Fixes bug where they were all set to 0. --- modnet/hyper_opt/fit_genetic.py | 4 +++ modnet/models/bayesian.py | 11 ++++---- modnet/models/ensemble.py | 8 +++--- modnet/models/vanilla.py | 49 ++++++++++++++++++++++++--------- modnet/preprocessing.py | 11 +++----- 5 files changed, 54 insertions(+), 29 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index dd067109..1bb430fd 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -40,6 +40,7 @@ def __init__( self.weights = weights self.xscale_list = ["minmax", "standard"] + self.impute_missing_list = [-1, "mean"] self.lr_list = [0.1, 0.01, 0.005, 0.001] self.batch_size_list = [32, 64, 128, 256] self.fraction_list = [1, 0.75, 0.5, 0.25] @@ -52,6 +53,7 @@ def __init__( "fraction2": random.choice(self.fraction_list), "fraction3": random.choice(self.fraction_list), "xscale": random.choice(self.xscale_list), + "impute_missing": random.choice(self.impute_missing_list), "lr": random.choice(self.lr_list), "batch_size": random.choice(self.batch_size_list), "n_feat": 0, @@ -210,6 +212,7 @@ def evaluate( epochs=800 if not fast else 1, batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], + impute_missing=self.genes["impute_missing"], callbacks=callbacks, verbose=0, ) @@ -272,6 +275,7 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False): epochs=800 if not fast else 1, batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], + impute_missing=self.genes["impute_missing"], callbacks=callbacks, verbose=0, ) diff --git a/modnet/models/bayesian.py b/modnet/models/bayesian.py index 8790cd92..a1bc13bf 100644 --- a/modnet/models/bayesian.py +++ b/modnet/models/bayesian.py @@ -89,6 +89,7 @@ def __init__( self.out_act = out_act self._scaler = None + self._imputer = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -300,17 +301,17 @@ class OR only return the most probable class. 
# prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf, np.nan], 0)[ - self.optimal_descriptors[: self.n_feat] - ] + .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] .values ) # Scale the input features: - x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - x = np.nan_to_num(x) + + # Impute missing data + if self._imputer is not None: + x = self._imputer.transform(x) all_predictions = [] diff --git a/modnet/models/ensemble.py b/modnet/models/ensemble.py index 991c47b5..f4d1372b 100644 --- a/modnet/models/ensemble.py +++ b/modnet/models/ensemble.py @@ -142,9 +142,9 @@ def predict( Parameters: test_data: A featurized and feature-selected `MODData` object containing the descriptors used in training. - return_prob: For a classification tasks only: whether to return the probability of each + return_prob: For a classification task only: whether to return the probability of each class OR only return the most probable class. - return_unc: wheter to return a second dataframe containing the uncertainties + return_unc: whether to return a second dataframe containing the uncertainties Returns: A `pandas.DataFrame` containing the predicted values of the targets. @@ -276,8 +276,6 @@ def fit_preset( for k, _ in enumerate(presets): presets[k]["epochs"] = 5 - val_losses = 1e20 * np.ones((len(presets),)) - num_nested_folds = 5 if nested: num_nested_folds = nested @@ -445,6 +443,7 @@ def _validate_ensemble_model( act="relu", out_act="linear", xscale="minmax", + impute_missing=-1, callbacks=[], preset_id=None, fold_id=None, @@ -469,6 +468,7 @@ def _validate_ensemble_model( batch_size=batch_size, loss=loss, xscale=xscale, + impute_missing=impute_missing, callbacks=callbacks, verbose=verbose, val_fraction=0, diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 6aa62133..d6eeb4d3 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -3,7 +3,7 @@ """ -from typing import List, Tuple, Dict, Optional, Callable, Any +from typing import List, Tuple, Dict, Optional, Callable, Any, Union from pathlib import Path import multiprocessing @@ -12,6 +12,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error, roc_auc_score +from sklearn.impute import SimpleImputer import tensorflow as tf from modnet.preprocessing import MODData @@ -88,6 +89,7 @@ def __init__( self.out_act = out_act self._scaler = None + self._imputer = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -214,6 +216,7 @@ def fit( epochs: int = 200, batch_size: int = 128, xscale: Optional[str] = "minmax", + impute_missing: Optional[Union[float, str]] = -1, metrics: List[str] = ["mae"], callbacks: List[Callable] = None, verbose: int = 0, @@ -237,6 +240,13 @@ def fit( batch_size: The batch size to use for training. xscale: The feature scaler to use, either `None`, `'minmax'` or `'standard'`. + impute_missing: Determines how the NaN features are treated. + If float, sets the NaNs to the given float when the features + are scaled with xscale (default to -1). + If you use a StandardScaler (see xscale), make sure to use a value + that makes sense (most likely not -1 !). + If string, defines the strategy used in the scikit-learn SimpleImputer, + e.g., "mean" sets the NaNs to the mean of their feature column. metrics: A list of tf.keras metrics to pass to `compile(...)`. 
loss: The built-in tf.keras loss to pass to `compile(...)`. fit_params: Any additional parameters to pass to `fit(...)`, @@ -293,14 +303,25 @@ def fit( self._scaler = StandardScaler() x = self._scaler.fit_transform(x) - x = np.nan_to_num(x, nan=-1) + + # Handles NaN data + if isinstance(impute_missing, str): + imp = SimpleImputer(missing_values=np.nan, strategy=impute_missing) + else: + imp = SimpleImputer( + missing_values=np.nan, strategy="constant", fill_value=impute_missing + ) + + self._imputer = imp + + x = self._imputer.fit_transform(x) if val_data is not None: val_x = val_data.get_featurized_df()[ self.optimal_descriptors[: self.n_feat] ].values val_x = self._scaler.transform(val_x) - val_x = np.nan_to_num(val_x, nan=-1) + val_x = self._imputer.transform(val_x) val_y = [] for targ in self.targets_flatten: if self.num_classes[targ] >= 2: # Classification @@ -384,6 +405,7 @@ def fit_preset( nested: int = 5, callbacks: List[Any] = None, n_jobs=None, + **fit_params, ) -> Tuple[ List[List[Any]], np.ndarray, @@ -576,11 +598,13 @@ def fit_preset( loss=best_preset["loss"], callbacks=callbacks, verbose=verbose, + **fit_params, ) else: self.n_feat = best_model.n_feat self.model = best_model.model self._scaler = best_model._scaler + self._imputer = best_model._imputer os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" # reset @@ -603,17 +627,17 @@ class OR only return the most probable class. # prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf, np.nan], 0)[ - self.optimal_descriptors[: self.n_feat] - ] + .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] .values ) # Scale the input features: - x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - x = np.nan_to_num(x, nan=-1) + + # Handle the missing data (NaN features) + if self._imputer is not None: + x = self._imputer.transform(x) p = np.array(self.model.predict(x)) @@ -669,17 +693,16 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame: # prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf, np.nan], 0)[ - self.optimal_descriptors[: self.n_feat] - ] + .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] .values ) # Scale the input features: - x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - x = np.nan_to_num(x, nan=-1) + + if self._imputer is not None: + x = self._imputer.transform(x) y_pred = np.array(self.model.predict(x)) if len(y_pred.shape) == 2: diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index bdf3bb88..354d78eb 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -769,7 +769,8 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None): else: df_final = self.featurizer.featurize(self.df_structure) - df_final = df_final.replace([np.inf, -np.inf, np.nan], 0) + # replace infinite values by nan that are handled during the fit + df_final = df_final.replace([np.inf, -np.inf], np.nan) self.df_featurized = df_final LOG.info("Data has successfully been featurized!") @@ -801,7 +802,7 @@ def feature_selection( """ if getattr(self, "df_featurized", None) is None: raise RuntimeError( - "Mutual information feature selection requiresd featurized data, please call `.featurize()`" + "Mutual information feature selection requires featurized data, please call `.featurize()`" ) if getattr(self, "df_targets", None) is None: raise RuntimeError( @@ -813,8 +814,6 @@ def 
feature_selection( if cross_nmi is not None: self.cross_nmi = cross_nmi - elif getattr(self, "cross_nmi", None) is None: - self.cross_nmi = None # Loading mutual information between features if use_precomputed_cross_nmi: @@ -841,9 +840,7 @@ def feature_selection( ) if self.cross_nmi.isna().sum().sum() > 0: - raise RuntimeError( - "Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero." - ) + raise RuntimeError("Cross NMI (`moddata.cross_nmi`) contains NaN values.") for i, name in enumerate(self.names): LOG.info(f"Starting target {i + 1}/{len(self.names)}: {self.names[i]} ...") From f247ac880e892de594b225f362df172ddbcccb41 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Wed, 12 Apr 2023 18:03:14 +0200 Subject: [PATCH 03/12] Small bug when adding keys to genes. --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 1bb430fd..a67ce551 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -79,14 +79,14 @@ def crossover(self, partner: Individual) -> Individual: """ genes_from_mother = random.sample( - range(10), k=5 + range(len(self.genes)), k=5 ) # creates indices to take randomly 5 genes from one parent, and 5 genes from the other child_genes = { list(self.genes.keys())[i]: list(self.genes.values())[i] if i in genes_from_mother else list(partner.genes.values())[i] - for i in range(10) + for i in range(len(self.genes)) } child = Individual( From 1ea13e4a9ccacf68162730d0d9c95fe059eb2806 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Thu, 13 Apr 2023 11:17:34 +0200 Subject: [PATCH 04/12] Small typo and bug fix. --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index a67ce551..4dc83aad 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -481,7 +481,7 @@ def run( n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None. early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4. refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0). - The amount corresponds the the number of models used in the ensemble. Defaults to 0. + The amount corresponds to the number of models used in the ensemble. Defaults to 0. fast (bool, optional): Use only for debugging and testing. A fast GA run with small number of epochs, generations, individuals and folds. Overrides the size_pop, num_generation and nested arguments.. Defaults to False. @@ -600,7 +600,7 @@ def run( else: ensemble = [] - for m in models[ranking[:10]]: + for m in models[ranking[:refit]]: ensemble += m.model self.best_model = EnsembleMODNetModel(modnet_models=ensemble) From eb42b1ee5fcbdb1d951e8f8f148d9d9bd862775d Mon Sep 17 00:00:00 2001 From: gbrunin Date: Thu, 13 Apr 2023 16:56:11 +0200 Subject: [PATCH 05/12] Update with a choice of order between scaling and imputing. 
--- modnet/hyper_opt/fit_genetic.py | 31 +++++++++---- modnet/models/bayesian.py | 11 +++-- modnet/models/vanilla.py | 79 +++++++++++++++++++++------------ modnet/preprocessing.py | 4 +- 4 files changed, 80 insertions(+), 45 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 4dc83aad..d5bc59ae 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -22,6 +22,7 @@ def __init__( multi_label: bool, targets: List = None, weights: Dict[str, float] = None, + **model_params, ) -> Individual: """ Args: @@ -33,6 +34,9 @@ def __init__( weights (Dict[str, float]): Optional (for joint learning only). The relative loss weights to apply for each target. """ + self.elu = "elu" + self.loss = "mae" + self.n_neurons_first_layer = 32 * random.randint(1, 10) self.max_feat = max_feat self.num_classes = num_classes self.multi_label = multi_label @@ -40,20 +44,25 @@ def __init__( self.weights = weights self.xscale_list = ["minmax", "standard"] - self.impute_missing_list = [-1, "mean"] + self.impute_missing_list = [0, "mean"] + self.xscale_before_impute = True self.lr_list = [0.1, 0.01, 0.005, 0.001] self.batch_size_list = [32, 64, 128, 256] self.fraction_list = [1, 0.75, 0.5, 0.25] + if model_params: + self.__dict__.update(model_params) + self.genes = { - "act": "elu", - "loss": "mae", - "n_neurons_first_layer": 32 * random.randint(1, 10), + "act": self.elu, + "loss": self.loss, + "n_neurons_first_layer": self.n_neurons_first_layer, "fraction1": random.choice(self.fraction_list), "fraction2": random.choice(self.fraction_list), "fraction3": random.choice(self.fraction_list), "xscale": random.choice(self.xscale_list), "impute_missing": random.choice(self.impute_missing_list), + "xscale_before_impute": self.xscale_before_impute, "lr": random.choice(self.lr_list), "batch_size": random.choice(self.batch_size_list), "n_feat": 0, @@ -79,8 +88,8 @@ def crossover(self, partner: Individual) -> Individual: """ genes_from_mother = random.sample( - range(len(self.genes)), k=5 - ) # creates indices to take randomly 5 genes from one parent, and 5 genes from the other + range(len(self.genes)), k=len(self.genes) // 2 + ) # creates indices to take randomly half the genes from one parent, and half the genes from the other child_genes = { list(self.genes.keys())[i]: list(self.genes.values())[i] @@ -213,6 +222,7 @@ def evaluate( batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], impute_missing=self.genes["impute_missing"], + xscale_before_impute=self.genes["xscale_before_impute"], callbacks=callbacks, verbose=0, ) @@ -276,6 +286,7 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False): batch_size=self.genes["batch_size"], xscale=self.genes["xscale"], impute_missing=self.genes["impute_missing"], + xscale_before_impute=self.genes["xscale_before_impute"], callbacks=callbacks, verbose=0, ) @@ -338,7 +349,9 @@ def _end_run(self): self.pool.close() self.pool.join() - def initialization_population(self, size_pop: int, multi_label: bool) -> None: + def initialization_population( + self, size_pop: int, multi_label: bool, **model_params + ) -> None: """Initializes the initial population (Generation 0). 
Args: @@ -354,6 +367,7 @@ def initialization_population(self, size_pop: int, multi_label: bool) -> None: multi_label=multi_label, targets=self.targets, weights=self.weights, + **model_params, ) for _ in range(size_pop) ] @@ -468,6 +482,7 @@ def run( early_stopping: Optional[int] = 4, refit: Optional[int] = 5, fast=False, + **model_params, ) -> EnsembleMODNetModel: """Run the GA and return best model. @@ -495,7 +510,7 @@ def run( LOG.info("Generation number 0") self.initialization_population( - size_pop, multi_label=multi_label + size_pop, multi_label=multi_label, **model_params ) # initialization of the population val_loss, models, individuals = self.function_fitness( pop=self.pop, diff --git a/modnet/models/bayesian.py b/modnet/models/bayesian.py index a1bc13bf..8790cd92 100644 --- a/modnet/models/bayesian.py +++ b/modnet/models/bayesian.py @@ -89,7 +89,6 @@ def __init__( self.out_act = out_act self._scaler = None - self._imputer = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -301,17 +300,17 @@ class OR only return the most probable class. # prevents Nan predictions if some features are inf x = ( test_data.get_featurized_df() - .replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]] + .replace([np.inf, -np.inf, np.nan], 0)[ + self.optimal_descriptors[: self.n_feat] + ] .values ) # Scale the input features: + x = np.nan_to_num(x) if self._scaler is not None: x = self._scaler.transform(x) - - # Impute missing data - if self._imputer is not None: - x = self._imputer.transform(x) + x = np.nan_to_num(x) all_predictions = [] diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index d6eeb4d3..058e1715 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -13,6 +13,7 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error, roc_auc_score from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline import tensorflow as tf from modnet.preprocessing import MODData @@ -88,8 +89,11 @@ def __init__( self.act = act self.out_act = out_act + self.xscale = None self._scaler = None self._imputer = None + self.impute_missing = None + self._scale_impute = None self.optimal_descriptors = None self.target_names = None self.targets = targets @@ -216,7 +220,8 @@ def fit( epochs: int = 200, batch_size: int = 128, xscale: Optional[str] = "minmax", - impute_missing: Optional[Union[float, str]] = -1, + impute_missing: Optional[Union[float, str]] = 0, + xscale_before_impute: bool = True, metrics: List[str] = ["mae"], callbacks: List[Callable] = None, verbose: int = 0, @@ -241,12 +246,16 @@ def fit( xscale: The feature scaler to use, either `None`, `'minmax'` or `'standard'`. impute_missing: Determines how the NaN features are treated. - If float, sets the NaNs to the given float when the features - are scaled with xscale (default to -1). - If you use a StandardScaler (see xscale), make sure to use a value - that makes sense (most likely not -1 !). - If string, defines the strategy used in the scikit-learn SimpleImputer, + If str, defines the strategy used in the scikit-learn SimpleImputer, e.g., "mean" sets the NaNs to the mean of their feature column. + If a float is provided, and if xscale_before_impute is False, this + float is used to replace NaNs in the original dataset. + If a float is provided but xscale_before_impute is True, the float + is not used and standard values are used. 
+ If you want to do something more sophisticated, make your own + modifications to MODData.df_featurized before fitting the model. + xscale_before_impute: whether to first scale the input and then impute values, or + first impute values and then scale the inputs. metrics: A list of tf.keras metrics to pass to `compile(...)`. loss: The built-in tf.keras loss to pass to `compile(...)`. fit_params: Any additional parameters to pass to `fit(...)`, @@ -262,6 +271,7 @@ def fit( ) self.xscale = xscale + self.impute_missing = impute_missing self.target_names = list(self.weights.keys()) self.optimal_descriptors = training_data.get_optimal_descriptors() @@ -295,33 +305,50 @@ def fit( ) y.append(y_inner) - # Scale the input features: + # Define the scaler if self.xscale == "minmax": self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5)) elif self.xscale == "standard": self._scaler = StandardScaler() - x = self._scaler.fit_transform(x) - - # Handles NaN data + # Define the imputer if isinstance(impute_missing, str): - imp = SimpleImputer(missing_values=np.nan, strategy=impute_missing) + self._imputer = SimpleImputer( + missing_values=np.nan, strategy=impute_missing + ) else: - imp = SimpleImputer( + if self.xscale == "minmax": + impute_missing = -1 if xscale_before_impute else impute_missing + elif self.xscale == "standard": + impute_missing = ( + 10 * np.max(StandardScaler().fit_transform(np.nan_to_num(x))) + if xscale_before_impute + else impute_missing + ) + self.impute_missing = impute_missing + + self._imputer = SimpleImputer( missing_values=np.nan, strategy="constant", fill_value=impute_missing ) - self._imputer = imp + # Scale and impute input features in the desired order + if xscale_before_impute: + self._scale_impute = Pipeline( + [("scaler", self._scaler), ("imputer", self._imputer)] + ) + else: + self._scale_impute = Pipeline( + [("imputer", self._imputer), ("scaler", self._scaler)] + ) - x = self._imputer.fit_transform(x) + x = self._scale_impute.fit_transform(x) if val_data is not None: val_x = val_data.get_featurized_df()[ self.optimal_descriptors[: self.n_feat] ].values - val_x = self._scaler.transform(val_x) - val_x = self._imputer.transform(val_x) + val_x = self._scale_impute.transform(val_x) val_y = [] for targ in self.targets_flatten: if self.num_classes[targ] >= 2: # Classification @@ -605,6 +632,7 @@ def fit_preset( self.model = best_model.model self._scaler = best_model._scaler self._imputer = best_model._imputer + self._scale_impute = best_model._scale_impute os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" # reset @@ -631,13 +659,9 @@ class OR only return the most probable class. 
.values ) - # Scale the input features: - if self._scaler is not None: - x = self._scaler.transform(x) - - # Handle the missing data (NaN features) - if self._imputer is not None: - x = self._imputer.transform(x) + # Scale and impute input features: + if self._scale_impute is not None: + x = self._scale_impute.transform(x) p = np.array(self.model.predict(x)) @@ -697,12 +721,9 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame: .values ) - # Scale the input features: - if self._scaler is not None: - x = self._scaler.transform(x) - - if self._imputer is not None: - x = self._imputer.transform(x) + # Scale and impute input features: + if self._scale_impute is not None: + x = self._scale_impute.transform(x) y_pred = np.array(self.model.predict(x)) if len(y_pred.shape) == 2: diff --git a/modnet/preprocessing.py b/modnet/preprocessing.py index 354d78eb..0e6193b9 100644 --- a/modnet/preprocessing.py +++ b/modnet/preprocessing.py @@ -24,7 +24,7 @@ import tqdm from multiprocessing import Pool -from modnet.featurizers import MODFeaturizer +from modnet.featurizers import MODFeaturizer, clean_df from modnet import __version__ from modnet.utils import LOG @@ -770,7 +770,7 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None): df_final = self.featurizer.featurize(self.df_structure) # replace infinite values by nan that are handled during the fit - df_final = df_final.replace([np.inf, -np.inf], np.nan) + df_final = clean_df(df_final) self.df_featurized = df_final LOG.info("Data has successfully been featurized!") From 23f8ee460a3380e9ca7b54102e1fa51aacd6457e Mon Sep 17 00:00:00 2001 From: gbrunin Date: Fri, 14 Apr 2023 08:35:50 +0200 Subject: [PATCH 06/12] Small bug fix. --- modnet/models/vanilla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index 058e1715..e88184a0 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -322,7 +322,7 @@ def fit( impute_missing = -1 if xscale_before_impute else impute_missing elif self.xscale == "standard": impute_missing = ( - 10 * np.max(StandardScaler().fit_transform(np.nan_to_num(x))) + 10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x))) if xscale_before_impute else impute_missing ) From 3b3e4b318e5336aeed4446168de0610217945162 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Tue, 9 May 2023 16:47:41 +0200 Subject: [PATCH 07/12] Small name change. --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index d5bc59ae..4cebbb72 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -34,7 +34,7 @@ def __init__( weights (Dict[str, float]): Optional (for joint learning only). The relative loss weights to apply for each target. """ - self.elu = "elu" + self.act = "elu" self.loss = "mae" self.n_neurons_first_layer = 32 * random.randint(1, 10) self.max_feat = max_feat @@ -54,7 +54,7 @@ def __init__( self.__dict__.update(model_params) self.genes = { - "act": self.elu, + "act": self.act, "loss": self.loss, "n_neurons_first_layer": self.n_neurons_first_layer, "fraction1": random.choice(self.fraction_list), From 3b27d5c4903d2a54731e529bc2669efaacac5875 Mon Sep 17 00:00:00 2001 From: gbrunin Date: Wed, 17 May 2023 08:13:55 +0200 Subject: [PATCH 08/12] Typos. 
--- modnet/hyper_opt/fit_genetic.py | 2 +- modnet/models/vanilla.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 4cebbb72..82a555eb 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -495,7 +495,7 @@ def run( In this case the softmax output-activation is replaced by a sigmoid. n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None. early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4. - refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0). + refit (Optional[int], optional): Whether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0). The amount corresponds to the number of models used in the ensemble. Defaults to 0. fast (bool, optional): Use only for debugging and testing. A fast GA run with small number of epochs, generations, individuals and folds. Overrides the size_pop, num_generation and nested arguments.. Defaults to False. diff --git a/modnet/models/vanilla.py b/modnet/models/vanilla.py index e88184a0..17d3b641 100644 --- a/modnet/models/vanilla.py +++ b/modnet/models/vanilla.py @@ -255,7 +255,7 @@ def fit( If you want to do something more sophisticated, make your own modifications to MODData.df_featurized before fitting the model. xscale_before_impute: whether to first scale the input and then impute values, or - first impute values and then scale the inputs. + first impute values and then scale the inputs. metrics: A list of tf.keras metrics to pass to `compile(...)`. loss: The built-in tf.keras loss to pass to `compile(...)`. fit_params: Any additional parameters to pass to `fit(...)`, From cf1edf9ca4af8d0145f8b2a2ef0542c1d6c78471 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:27:53 +0100 Subject: [PATCH 09/12] Rename according to PP's PR --- modnet/hyper_opt/fit_genetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index 50b8773c..efa18a3f 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -54,8 +54,8 @@ def __init__( self.batch_size_list = [32, 64, 128, 256] self.fraction_list = [1, 0.75, 0.5, 0.25] - if model_params: - self.__dict__.update(model_params) + if fit_params: + self.__dict__.update(fit_params) self.genes = { "act": self.act, From dd4e5e12535cf9d67d823bb57bcb7770d6381a43 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:32:40 +0100 Subject: [PATCH 10/12] Add some additional 'bad columns' in testing --- modnet/tests/test_preprocessing.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/modnet/tests/test_preprocessing.py b/modnet/tests/test_preprocessing.py index f2d05a30..b88ab604 100644 --- a/modnet/tests/test_preprocessing.py +++ b/modnet/tests/test_preprocessing.py @@ -31,7 +31,17 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03): # different number of symm ops being detected. 
# We need a mechanism to allow these discrepancies through in certain cases: - allowed_bad_columns = ["GlobalSymmetryFeatures|n_symmetry_ops"] + allowed_bad_columns = [ + "GlobalSymmetryFeatures|n_symmetry_ops", + 'GlobalSymmetryFeatures|crystal_system', + 'YangSolidSolution|Yang delta', + 'Miedema|Miedema_deltaH_inter', + 'AtomicPackingEfficiency|mean simul. packing efficiency', + 'Miedema|Miedema_deltaH_amor', + 'AtomicPackingEfficiency|mean abs simul. packing efficiency', + 'Miedema|Miedema_deltaH_ss_min' + ] + for col in allowed_bad_columns: if col in error_cols: error_cols.remove(col) From 4542d072a73f652234848969f95c07fccf27de6f Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:33:54 +0100 Subject: [PATCH 11/12] Tidy up merge --- modnet/hyper_opt/fit_genetic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modnet/hyper_opt/fit_genetic.py b/modnet/hyper_opt/fit_genetic.py index efa18a3f..9ee0c0e1 100644 --- a/modnet/hyper_opt/fit_genetic.py +++ b/modnet/hyper_opt/fit_genetic.py @@ -38,7 +38,7 @@ def __init__( """ self.act = "elu" - self.loss = "mae" + self.loss = loss self.n_neurons_first_layer = 32 * random.randint(1, 10) self.max_feat = max_feat self.num_classes = num_classes @@ -61,8 +61,6 @@ def __init__( "act": self.act, "loss": self.loss, "n_neurons_first_layer": self.n_neurons_first_layer, - "loss": loss, - "n_neurons_first_layer": 32 * random.randint(1, 10), "fraction1": random.choice(self.fraction_list), "fraction2": random.choice(self.fraction_list), "fraction3": random.choice(self.fraction_list), From 9eeca8295fefbb44b8ec9439da5af3cd8993c82b Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 31 May 2023 15:40:58 +0100 Subject: [PATCH 12/12] Fix linting --- modnet/tests/test_preprocessing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modnet/tests/test_preprocessing.py b/modnet/tests/test_preprocessing.py index b88ab604..e27fa477 100644 --- a/modnet/tests/test_preprocessing.py +++ b/modnet/tests/test_preprocessing.py @@ -32,14 +32,14 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03): # We need a mechanism to allow these discrepancies through in certain cases: allowed_bad_columns = [ - "GlobalSymmetryFeatures|n_symmetry_ops", - 'GlobalSymmetryFeatures|crystal_system', - 'YangSolidSolution|Yang delta', - 'Miedema|Miedema_deltaH_inter', - 'AtomicPackingEfficiency|mean simul. packing efficiency', - 'Miedema|Miedema_deltaH_amor', - 'AtomicPackingEfficiency|mean abs simul. packing efficiency', - 'Miedema|Miedema_deltaH_ss_min' + "GlobalSymmetryFeatures|n_symmetry_ops", + "GlobalSymmetryFeatures|crystal_system", + "YangSolidSolution|Yang delta", + "Miedema|Miedema_deltaH_inter", + "AtomicPackingEfficiency|mean simul. packing efficiency", + "Miedema|Miedema_deltaH_amor", + "AtomicPackingEfficiency|mean abs simul. packing efficiency", + "Miedema|Miedema_deltaH_ss_min", ] for col in allowed_bad_columns:
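Editor's note (not part of the patch series): the central change in patches 05 and 06 is that `MODNetModel.fit` now builds a scikit-learn `Pipeline` that either scales then imputes or imputes then scales, depending on `xscale_before_impute`, with the constant fill value chosen relative to the scaler. The sketch below reproduces that preprocessing logic in isolation so the behaviour can be checked outside MODNet; the helper name `build_scale_impute` is invented for illustration and does not exist in the codebase.

    import numpy as np
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    def build_scale_impute(x, xscale="minmax", impute_missing=0, xscale_before_impute=True):
        """Mirror the feature preprocessing added to MODNetModel.fit (illustrative only)."""
        scaler = MinMaxScaler(feature_range=(-0.5, 0.5)) if xscale == "minmax" else StandardScaler()

        if isinstance(impute_missing, str):
            # e.g. "mean": NaNs become the mean of their feature column
            imputer = SimpleImputer(missing_values=np.nan, strategy=impute_missing)
        else:
            if xscale_before_impute:
                # when scaling happens first, the fill value is chosen to lie
                # outside the scaled feature range, as in the patched fit()
                if xscale == "minmax":
                    impute_missing = -1
                else:
                    impute_missing = 10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x)))
            imputer = SimpleImputer(
                missing_values=np.nan, strategy="constant", fill_value=impute_missing
            )

        steps = (
            [("scaler", scaler), ("imputer", imputer)]
            if xscale_before_impute
            else [("imputer", imputer), ("scaler", scaler)]
        )
        return Pipeline(steps)

    # toy feature matrix with missing entries
    x = np.array([[1.0, np.nan], [2.0, 3.0], [np.nan, 4.0]])
    pipe = build_scale_impute(x, xscale="minmax", xscale_before_impute=True)
    print(pipe.fit_transform(x))  # NaNs end up at -1, outside the (-0.5, 0.5) scaled range

Both scalers in recent scikit-learn ignore NaNs when fitting and propagate them through `transform`, which is what makes the scale-then-impute ordering possible; the fitted `Pipeline` is then reused unchanged at predict and evaluate time via `self._scale_impute.transform(x)`.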