32 changes: 23 additions & 9 deletions modnet/hyper_opt/fit_genetic.py
@@ -37,6 +37,9 @@ def __init__(
fit_params: Any additional parameters to pass to `MODNetModel.fit(...)`,
"""

self.act = "elu"
self.loss = loss
self.n_neurons_first_layer = 32 * random.randint(1, 10)
self.max_feat = max_feat
self.num_classes = num_classes
self.multi_label = multi_label
@@ -45,18 +48,25 @@ def __init__(
self.fit_params = fit_params

self.xscale_list = ["minmax", "standard"]
self.impute_missing_list = [0, "mean"]
self.xscale_before_impute = True
self.lr_list = [0.1, 0.01, 0.005, 0.001]
self.batch_size_list = [32, 64, 128, 256]
self.fraction_list = [1, 0.75, 0.5, 0.25]

if fit_params:
self.__dict__.update(fit_params)

self.genes = {
"act": "elu",
"loss": loss,
"n_neurons_first_layer": 32 * random.randint(1, 10),
"act": self.act,
"loss": self.loss,
"n_neurons_first_layer": self.n_neurons_first_layer,
"fraction1": random.choice(self.fraction_list),
"fraction2": random.choice(self.fraction_list),
"fraction3": random.choice(self.fraction_list),
"xscale": random.choice(self.xscale_list),
"impute_missing": random.choice(self.impute_missing_list),
"xscale_before_impute": self.xscale_before_impute,
"lr": random.choice(self.lr_list),
"batch_size": random.choice(self.batch_size_list),
"n_feat": 0,
@@ -82,14 +92,14 @@ def crossover(self, partner: Individual) -> Individual:
"""

genes_from_mother = random.sample(
range(10), k=5
) # creates indices to take randomly 5 genes from one parent, and 5 genes from the other
range(len(self.genes)), k=len(self.genes) // 2
) # indices for taking half of the genes at random from one parent, and the remaining half from the other

child_genes = {
list(self.genes.keys())[i]: list(self.genes.values())[i]
if i in genes_from_mother
else list(partner.genes.values())[i]
for i in range(10)
for i in range(len(self.genes))
}

child = Individual(
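The half-and-half split in `crossover` reduces to the following standalone sketch (toy gene dictionaries; the real genome holds the hyperparameters listed above):

```python
import random

mother = {"lr": 0.01, "batch_size": 64, "xscale": "minmax", "impute_missing": "mean"}
father = {"lr": 0.1, "batch_size": 128, "xscale": "standard", "impute_missing": 0}

# Pick half of the gene indices from one parent and the rest from the other,
# mirroring the `len(self.genes) // 2` sampling above.
keys = list(mother)
from_mother = random.sample(range(len(keys)), k=len(keys) // 2)
child_genes = {k: mother[k] if i in from_mother else father[k] for i, k in enumerate(keys)}
print(child_genes)
```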
@@ -221,6 +231,8 @@ def evaluate(
epochs=800 if not fast else 1,
batch_size=self.genes["batch_size"],
xscale=self.genes["xscale"],
impute_missing=self.genes["impute_missing"],
xscale_before_impute=self.genes["xscale_before_impute"],
callbacks=callbacks,
verbose=0,
**self.fit_params,
@@ -288,6 +300,8 @@ def refit_model(self, data: MODData, n_models=10, n_jobs=1, fast: bool = False):
epochs=800 if not fast else 1,
batch_size=self.genes["batch_size"],
xscale=self.genes["xscale"],
impute_missing=self.genes["impute_missing"],
xscale_before_impute=self.genes["xscale_before_impute"],
callbacks=callbacks,
verbose=0,
**self.fit_params,
@@ -522,8 +536,8 @@ def run(
loss: The built-in tf.keras loss to pass to `compile(...)`.
n_jobs (Optional[int], optional): Number of jobs to parallelize on. Defaults to None.
early_stopping (Optional[int], optional): Number of successive generations without improvement before stopping. Defaults to 4.
refit (Optional[int], optional): Wether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0).
The amount corresponds the the number of models used in the ensemble. Defaults to 0.
refit (Optional[int], optional): Whether to refit (>0) the best hyperparameters on the whole dataset or use the best Individual instead (=0).
The amount corresponds to the number of models used in the ensemble. Defaults to 0.
fast (bool, optional): Use only for debugging and testing. A fast GA run with a small number of epochs, generations, individuals and folds.
Overrides the size_pop, num_generation and nested arguments. Defaults to False.
fit_params: Any additional parameters to pass to `MODNetModel.fit(...)`,
@@ -646,7 +660,7 @@ def run(

else:
ensemble = []
for m in models[ranking[:10]]:
for m in models[ranking[:refit]]:
ensemble += m.model
self.best_model = EnsembleMODNetModel(modnet_models=ensemble)

8 changes: 4 additions & 4 deletions modnet/models/ensemble.py
@@ -142,9 +142,9 @@ def predict(
Parameters:
test_data: A featurized and feature-selected `MODData`
object containing the descriptors used in training.
return_prob: For a classification tasks only: whether to return the probability of each
return_prob: For a classification task only: whether to return the probability of each
class OR only return the most probable class.
return_unc: wheter to return a second dataframe containing the uncertainties
return_unc: whether to return a second dataframe containing the uncertainties

Returns:
A `pandas.DataFrame` containing the predicted values of the targets.
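Presumably the uncertainty dataframe is derived from the spread of the member predictions; a minimal sketch of that pattern (not `EnsembleMODNetModel`'s exact internals):

```python
import numpy as np
import pandas as pd

# Stacked predictions from each ensemble member: shape (n_members, n_samples).
member_preds = np.array([
    [1.0, 2.1, 0.9],
    [1.2, 1.9, 1.1],
    [0.9, 2.0, 1.0],
])

predictions = pd.DataFrame({"target": member_preds.mean(axis=0)})
uncertainties = pd.DataFrame({"target": member_preds.std(axis=0)})
```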
@@ -276,8 +276,6 @@ def fit_preset(
for k, _ in enumerate(presets):
presets[k]["epochs"] = 5

val_losses = 1e20 * np.ones((len(presets),))

num_nested_folds = 5
if nested:
num_nested_folds = nested
@@ -445,6 +443,7 @@ def _validate_ensemble_model(
act="relu",
out_act="linear",
xscale="minmax",
impute_missing=-1,
callbacks=[],
preset_id=None,
fold_id=None,
@@ -469,6 +468,7 @@ def _validate_ensemble_model(
batch_size=batch_size,
loss=loss,
xscale=xscale,
impute_missing=impute_missing,
callbacks=callbacks,
verbose=verbose,
val_fraction=0,
88 changes: 66 additions & 22 deletions modnet/models/vanilla.py
@@ -3,7 +3,7 @@

"""

from typing import List, Tuple, Dict, Optional, Callable, Any
from typing import List, Tuple, Dict, Optional, Callable, Any, Union
from pathlib import Path
import multiprocessing

@@ -12,6 +12,8 @@
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import tensorflow as tf

from modnet.preprocessing import MODData
@@ -87,7 +89,11 @@ def __init__(
self.act = act
self.out_act = out_act

self.xscale = None
self._scaler = None
self._imputer = None
self.impute_missing = None
self._scale_impute = None
self.optimal_descriptors = None
self.target_names = None
self.targets = targets
@@ -215,6 +221,8 @@ def fit(
epochs: int = 200,
batch_size: int = 128,
xscale: Optional[str] = "minmax",
impute_missing: Optional[Union[float, str]] = 0,
xscale_before_impute: bool = True,
metrics: List[str] = ["mae"],
callbacks: List[Callable] = None,
verbose: int = 0,
@@ -240,6 +248,17 @@
batch_size: The batch size to use for training.
xscale: The feature scaler to use, either `None`,
`'minmax'` or `'standard'`.
impute_missing: Determines how the NaN features are treated.
If str, defines the strategy used in the scikit-learn SimpleImputer,
e.g., "mean" sets the NaNs to the mean of their feature column.
If a float is provided, and if xscale_before_impute is False, this
float is used to replace NaNs in the original dataset.
If a float is provided but xscale_before_impute is True, the float
is ignored and a scale-dependent default is used instead (-1 for
minmax scaling, a large out-of-range value for standard scaling).
If you want to do something more sophisticated, make your own
modifications to MODData.df_featurized before fitting the model.
xscale_before_impute: whether to first scale the input and then impute values, or
first impute values and then scale the inputs.
metrics: A list of tf.keras metrics to pass to `compile(...)`.
loss: The built-in tf.keras loss to pass to `compile(...)`.
fit_params: Any additional parameters to pass to `fit(...)`,
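Under the new signature, the imputation options would be used along these lines (a sketch: `data` is an assumed featurized and feature-selected `MODData`, and "eform" a hypothetical target name):

```python
from modnet.models import MODNetModel

model = MODNetModel([[["eform"]]], weights={"eform": 1})

model.fit(
    data,
    xscale="standard",           # None, "minmax" or "standard"
    impute_missing="mean",       # str -> SimpleImputer strategy; float -> constant fill value
    xscale_before_impute=False,  # impute first, then scale
)
```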
@@ -255,6 +274,7 @@ def fit(
)

self.xscale = xscale
self.impute_missing = impute_missing
self.target_names = list(self.weights.keys())
self.optimal_descriptors = training_data.get_optimal_descriptors()

@@ -300,22 +320,50 @@
)
y.append(y_inner)

# Scale the input features:
# Define the scaler
if self.xscale == "minmax":
self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

elif self.xscale == "standard":
self._scaler = StandardScaler()

x = self._scaler.fit_transform(x)
x = np.nan_to_num(x, nan=-1)
# Define the imputer
if isinstance(impute_missing, str):
self._imputer = SimpleImputer(
missing_values=np.nan, strategy=impute_missing
)
else:
if self.xscale == "minmax":
impute_missing = -1 if xscale_before_impute else impute_missing
elif self.xscale == "standard":
impute_missing = (
10 * np.max(np.nan_to_num(StandardScaler().fit_transform(x)))
if xscale_before_impute
else impute_missing
)
self.impute_missing = impute_missing

self._imputer = SimpleImputer(
missing_values=np.nan, strategy="constant", fill_value=impute_missing
)

# Scale and impute input features in the desired order
if xscale_before_impute:
self._scale_impute = Pipeline(
[("scaler", self._scaler), ("imputer", self._imputer)]
)
else:
self._scale_impute = Pipeline(
[("imputer", self._imputer), ("scaler", self._scaler)]
)

x = self._scale_impute.fit_transform(x)

if val_data is not None:
val_x = val_data.get_featurized_df()[
self.optimal_descriptors[: self.n_feat]
].values
val_x = self._scaler.transform(val_x)
val_x = np.nan_to_num(val_x, nan=-1)
val_x = self._scale_impute.transform(val_x)
val_y = []
for targ in self.targets_flatten:
if self.num_classes[targ] >= 2: # Classification
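Stripped of the MODNet plumbing, the two pipeline orders reduce to this self-contained sketch (same minmax defaults; sklearn scalers pass NaNs through untouched, which is what makes the scale-then-impute order work):

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

x = np.array([[1.0, np.nan], [2.0, 4.0], [3.0, 5.0]])

# Scale first, then fill the surviving NaNs with an out-of-range constant (-1):
scale_then_impute = Pipeline([
    ("scaler", MinMaxScaler(feature_range=(-0.5, 0.5))),
    ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
])

# Or impute first (e.g. column mean), then scale the completed matrix:
impute_then_scale = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler(feature_range=(-0.5, 0.5))),
])

print(scale_then_impute.fit_transform(x))
print(impute_then_scale.fit_transform(x))
```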
@@ -404,6 +452,7 @@ def fit_preset(
nested: int = 5,
callbacks: List[Any] = None,
n_jobs=None,
**fit_params,
) -> Tuple[
List[List[Any]],
np.ndarray,
@@ -596,11 +645,14 @@ def fit_preset(
loss=best_preset["loss"],
callbacks=callbacks,
verbose=verbose,
**fit_params,
)
else:
self.n_feat = best_model.n_feat
self.model = best_model.model
self._scaler = best_model._scaler
self._imputer = best_model._imputer
self._scale_impute = best_model._scale_impute

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0" # reset

Expand All @@ -623,17 +675,13 @@ class OR only return the most probable class.
# prevents NaN predictions if some features are inf
x = (
test_data.get_featurized_df()
.replace([np.inf, -np.inf, np.nan], 0)[
self.optimal_descriptors[: self.n_feat]
]
.replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]]
.values
)

# Scale the input features:
x = np.nan_to_num(x)
if self._scaler is not None:
x = self._scaler.transform(x)
x = np.nan_to_num(x, nan=-1)
# Scale and impute input features:
if self._scale_impute is not None:
x = self._scale_impute.transform(x)

p = np.array(self.model.predict(x))

@@ -689,17 +737,13 @@ def evaluate(self, test_data: MODData) -> pd.DataFrame:
# prevents NaN predictions if some features are inf
x = (
test_data.get_featurized_df()
.replace([np.inf, -np.inf, np.nan], 0)[
self.optimal_descriptors[: self.n_feat]
]
.replace([np.inf, -np.inf], np.nan)[self.optimal_descriptors[: self.n_feat]]
.values
)

# Scale the input features:
x = np.nan_to_num(x)
if self._scaler is not None:
x = self._scaler.transform(x)
x = np.nan_to_num(x, nan=-1)
# Scale and impute input features:
if self._scale_impute is not None:
x = self._scale_impute.transform(x)

y_pred = np.array(self.model.predict(x))
if len(y_pred.shape) == 2:
13 changes: 5 additions & 8 deletions modnet/preprocessing.py
@@ -24,7 +24,7 @@
import tqdm
from multiprocessing import Pool

from modnet.featurizers import MODFeaturizer
from modnet.featurizers import MODFeaturizer, clean_df
from modnet import __version__
from modnet.utils import LOG

@@ -769,7 +769,8 @@ def featurize(self, fast: bool = False, db_file=None, n_jobs=None):
else:
df_final = self.featurizer.featurize(self.df_structure)

df_final = df_final.replace([np.inf, -np.inf, np.nan], 0)
# replace infinite values by NaN; these are handled during the fit
df_final = clean_df(df_final)

self.df_featurized = df_final
LOG.info("Data has successfully been featurized!")
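`clean_df` itself is not shown in this diff; judging from the comment, it replaces at least the infinities, along the lines of this assumed sketch (not the actual `modnet.featurizers` implementation):

```python
import numpy as np
import pandas as pd

def clean_df_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Replace +/-inf by NaN so that imputation during fit can handle them."""
    return df.replace([np.inf, -np.inf], np.nan)
```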
@@ -804,7 +805,7 @@ def feature_selection(
"""
if getattr(self, "df_featurized", None) is None:
raise RuntimeError(
"Mutual information feature selection requiresd featurized data, please call `.featurize()`"
"Mutual information feature selection requires featurized data, please call `.featurize()`"
)
if getattr(self, "df_targets", None) is None:
raise RuntimeError(
@@ -822,8 +823,6 @@

if cross_nmi is not None:
self.cross_nmi = cross_nmi
elif getattr(self, "cross_nmi", None) is None:
self.cross_nmi = None

# Loading mutual information between features
if use_precomputed_cross_nmi:
@@ -850,9 +849,7 @@
)

if self.cross_nmi.isna().sum().sum() > 0:
raise RuntimeError(
"Cross NMI (`moddata.cross_nmi`) contains NaN values, consider setting them to zero."
)
raise RuntimeError("Cross NMI (`moddata.cross_nmi`) contains NaN values.")

selection_names = list(set(self.names).difference(set(ignore_names)))
for i, name in enumerate(selection_names):
12 changes: 11 additions & 1 deletion modnet/tests/test_preprocessing.py
@@ -31,7 +31,17 @@ def check_column_values(new: MODData, reference: MODData, tolerance=0.03):
# different number of symm ops being detected.

# We need a mechanism to allow these discrepancies through in certain cases:
allowed_bad_columns = ["GlobalSymmetryFeatures|n_symmetry_ops"]
allowed_bad_columns = [
"GlobalSymmetryFeatures|n_symmetry_ops",
"GlobalSymmetryFeatures|crystal_system",
"YangSolidSolution|Yang delta",
"Miedema|Miedema_deltaH_inter",
"AtomicPackingEfficiency|mean simul. packing efficiency",
"Miedema|Miedema_deltaH_amor",
"AtomicPackingEfficiency|mean abs simul. packing efficiency",
"Miedema|Miedema_deltaH_ss_min",
]

for col in allowed_bad_columns:
if col in error_cols:
error_cols.remove(col)