
Commit 77cead1

--added new trainer arguments --upgraded version of pytorch lightning --switched metrics to torchmetrics
1 parent 5c7738f commit 77cead1

File tree

4 files changed: +65 −20 lines changed


pytorch_tabular/config/config.py

Lines changed: 50 additions & 9 deletions
@@ -3,7 +3,7 @@
 # For license information, see LICENSE.TXT
 """Config"""
 from dataclasses import MISSING, dataclass, field
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 import os
 from omegaconf import OmegaConf
 
@@ -168,18 +168,31 @@ class TrainerConfig:
 
         max_epochs (int): Maximum number of epochs to be run
 
-        min_epochs (int): Minimum number of epochs to be run
+        min_epochs (int): Force training for at least these many epochs. 1 by default
 
-        gpus (int): The index of the GPU to be used. If `None`, will use CPU
+        max_time (Optional[int]): Stop training after this amount of time has passed. Disabled by default (None)
+
+        gpus (int): Number of gpus to train on (int) or which GPUs to train on (list or str). -1 uses all available GPUs. By default uses CPU (None)
 
         accumulate_grad_batches (int): Accumulates grads every k batches or as set up in the dict.
             Trainer also calls optimizer.step() for the last indivisible step number.
 
         auto_lr_find (bool): Runs a learning rate finder algorithm (see this paper) when calling trainer.tune(),
             to find optimal initial learning rate.
 
+        auto_select_gpus (bool): If enabled and `gpus` is an integer, pick available gpus automatically.
+            This is especially useful when GPUs are configured to be in 'exclusive mode', such that only one
+            process at a time can access them.
+
         check_val_every_n_epoch (int): Check val every n train epochs.
 
+        deterministic (bool): If true enables cudnn.deterministic. Might make your system slower, but ensures reproducibility.
+
+        accelerator (str): The accelerator backend to use. Defaults to None. Check this link for detailed documentation about the functionality.
+            https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#accelerator
+
+        tpu_cores (int): How many TPU cores to train on (1 or 8) / Single TPU to train on [1]. Defaults to None
+
         gradient_clip_val (float): Gradient clipping value
 
         overfit_batches (float): Uses this much data of the training set. If nonzero, will use the same training set
@@ -219,17 +232,20 @@ class TrainerConfig:
         default=64, metadata={"help": "Number of samples in each batch of training"}
     )
     fast_dev_run: bool = field(
-        default=False, metadata={"help": "Quick Debug Run of Val"}
+        default=False, metadata={"help": "runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es) of train, val and test to find any bugs (ie: a sort of unit test)."}
     )
     max_epochs: int = field(
         default=10, metadata={"help": "Maximum number of epochs to be run"}
     )
-    min_epochs: int = field(
-        default=1, metadata={"help": "Minimum number of epochs to be run"}
+    min_epochs: Optional[int] = field(
+        default=1, metadata={"help": "Force training for at least these many epochs. 1 by default"}
+    )
+    max_time: Optional[int] = field(
+        default=None, metadata={"help": "Stop training after this amount of time has passed. Disabled by default (None)"}
     )
-    gpus: Optional[int] = field(
+    gpus: Union[int, list] = field(
         default=None,
-        metadata={"help": "The index of the GPU to be used. If None, will use CPU"},
+        metadata={"help": "Number of gpus to train on (int) or which GPUs to train on (list or str). -1 uses all available GPUs. By default uses CPU (None)"},
     )
     accumulate_grad_batches: int = field(
         default=1,
@@ -243,6 +259,12 @@ class TrainerConfig:
             "help": "Runs a learning rate finder algorithm (see this paper) when calling trainer.tune(), to find optimal initial learning rate."
         },
     )
+    auto_select_gpus: bool = field(
+        default=True,
+        metadata={
+            "help": "If enabled and `gpus` is an integer, pick available gpus automatically. This is especially useful when GPUs are configured to be in 'exclusive mode', such that only one process at a time can access them."
+        },
+    )
     check_val_every_n_epoch: int = field(
         default=1, metadata={"help": "Check val every n train epochs."}
     )
@@ -255,6 +277,25 @@ class TrainerConfig:
             "help": "Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it. Useful for quickly debugging or trying to overfit on purpose."
         },
     )
+    deterministic: bool = field(
+        default=False,
+        metadata={
+            "help": "If true enables cudnn.deterministic. Might make your system slower, but ensures reproducibility."
+        },
+    )
+    accelerator: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The accelerator backend to use. Defaults to None. Check this link for detailed documentation about the functionality. https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#accelerator",
+            "choices": [None, "dp", "ddp", "ddp_cpu", "ddp2"],
+        },
+    )
+    tpu_cores: Optional[Union[List[int], str, int]] = field(
+        default=None,
+        metadata={
+            "help": "How many TPU cores to train on (1 or 8) / Single TPU to train on [1]. Defaults to None",
+        },
+    )
     profiler: Optional[str] = field(
         default=None,
         metadata={
@@ -530,7 +571,7 @@ class ModelConfig:
     metrics: Optional[List[str]] = field(
         default=None,
         metadata={
-            "help": "the list of metrics you need to track during training. The metrics should be one of the functional metrics implemented in PyTorch Lightning. By default, it is accuracy if classification and MeanSquaredLogError for regression"
+            "help": "the list of metrics you need to track during training. The metrics should be one of the functional metrics implemented in ``torchmetrics``. By default, it is accuracy if classification and mean_squared_error for regression"
         },
     )
     metrics_params: Optional[List] = field(
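
The new trainer arguments land on TrainerConfig, so they can be set directly when building a config. A minimal sketch (the field names come from the diff above; the import path and values are illustrative, not part of this commit):

from pytorch_tabular.config import TrainerConfig

trainer_config = TrainerConfig(
    max_epochs=50,
    min_epochs=5,           # force at least 5 epochs of training
    max_time=None,          # no wall-clock limit (the default)
    gpus=-1,                # -1 = all available GPUs; None falls back to CPU
    auto_select_gpus=True,  # pick free GPUs automatically when `gpus` is an int
    deterministic=True,     # enable cudnn.deterministic for reproducibility
    accelerator="ddp",      # one of the documented choices: dp/ddp/ddp_cpu/ddp2
    tpu_cores=None,
)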

pytorch_tabular/models/base_model.py

Lines changed: 4 additions & 3 deletions
@@ -8,6 +8,7 @@
 
 import pytorch_lightning as pl
 import torch
+import torchmetrics
 import torch.nn as nn
 from omegaconf import DictConfig
 
@@ -73,13 +74,13 @@ def _setup_loss(self):
     def _setup_metrics(self):
         if self.custom_metrics is None:
             self.metrics = []
-            task_module = pl.metrics.functional
+            task_module = torchmetrics.functional
             for metric in self.hparams.metrics:
                 try:
                     self.metrics.append(getattr(task_module, metric))
                 except AttributeError as e:
                     logger.error(
-                        f"{metric} is not a valid functional metric defined in the pytorch_lightning.metrics.functional module"
+                        f"{metric} is not a valid functional metric defined in the torchmetrics.functional module"
                     )
                     raise e
         else:
@@ -124,7 +125,7 @@ def calculate_metrics(self, y, y_hat, tag):
             for i in range(self.hparams.output_dim):
                 if (
                     metric.__name__
-                    == pl.metrics.functional.mean_squared_log_error.__name__
+                    == torchmetrics.functional.mean_squared_log_error.__name__
                 ):
                     # MSLE should only be used in strictly positive targets. It is undefined otherwise
                     _metric = metric(
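
With this change, each metric name from the config is resolved with getattr against torchmetrics.functional instead of pl.metrics.functional. A small sketch of that lookup (the metric name and tensors are illustrative):

import torch
import torchmetrics

# Resolve a functional metric by name, as _setup_metrics does above.
metric_fn = getattr(torchmetrics.functional, "mean_squared_error")

target = torch.tensor([1.0, 2.0, 3.0])
preds = torch.tensor([1.1, 1.9, 3.2])
print(metric_fn(preds, target))  # tensor(0.0200)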

requirements.txt

Lines changed: 10 additions & 7 deletions
@@ -1,13 +1,16 @@
-torch>=1.3
+torch>=1.4
 category-encoders==2.2.2
-numpy>=1.16.6
+numpy>=1.17.2
 pandas==1.1.5
 scikit-learn==0.23.2
-pytorch-lightning==1.0.8 #works well with wandb
-omegaconf==2.0.5
-tensorboard>=2.2.0
+pytorch-lightning==1.3.6
+omegaconf>=2.0.1
+torchmetrics>=0.3.2
+tensorboard>=2.2.0, !=2.5.0
 pytorch-tabnet==3.0.0
-PyYAML>=5.1 # OmegaConf requirement >=5.1
+PyYAML>=5.1.* # OmegaConf requirement >=5.1
 # importlib-metadata <1,>=0.12
+matplotlib>3.1
 ipywidgets
-matplotlib
+# Use dataclasses backport for Python 3.6.
+dataclasses;python_version=='3.6'
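
A quick way to sanity-check an environment against the upgraded pins (a sketch; the expected versions are read off the requirements above):

import torch
import pytorch_lightning
import torchmetrics

print(torch.__version__)              # needs >= 1.4
print(pytorch_lightning.__version__)  # pinned to 1.3.6
print(torchmetrics.__version__)       # needs >= 0.3.2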

requirements_testing.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-pip==19.2.3
+pip==20.3.1
 bump2version==0.5.11
 wheel==0.33.6
 watchdog==0.9.0
