lightdatamodule

bartzbeielstein · bartzbeielstein · commit 83e5a7ac25d9 · 2023-11-12T23:43:56.000+01:00
diff --git a/notebooks/00_spotPython_tests.ipynb b/notebooks/00_spotPython_tests.ipynb
@@ -354,7 +354,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -365,34 +365,9 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
+      "execution_count": null,
       "metadata": {},
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Batch Size: 5\n",
-            "---------------\n",
-            "Inputs: tensor([[1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-            "        [1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
-            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,\n",
-            "         0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],\n",
-            "        [1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n",
-            "         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,\n",
-            "         0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],\n",
-            "        [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n",
-            "         1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
-            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
-            "        [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n",
-            "         0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
-            "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])\n",
-            "Targets: tensor([ 0,  1,  6,  9, 10])\n"
-          ]
-        }
-      ],
+      "outputs": [],
       "source": [
         "from torch.utils.data import DataLoader\n",
         "# Set batch size for DataLoader\n",
@@ -456,9 +431,9 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "# from spotPython.light.pkldataset import PKLDataset\n",
-        "# import torch\n",
-        "# dataset = PKLDataset(pkl_file='./data/spotPython/data_sensitive.pkl', target_column='N', feature_type=torch.float32, target_type=torch.float64, rmNA=False)"
+        "from spotPython.data.pkldataset import PKLDataset\n",
+        "import torch\n",
+        "dataset = PKLDataset(directory=\"./data/spotPython/\", filename=\"data_sensitive.pkl\", target_column='N', feature_type=torch.float32, target_type=torch.float64, rmNA=False)"
       ]
     },
     {
@@ -467,22 +442,140 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "# from torch.utils.data import DataLoader\n",
-        "# # Set batch size for DataLoader\n",
-        "# batch_size = 5\n",
-        "# # Create DataLoader\n",
-        "# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)\n",
+        "from torch.utils.data import DataLoader\n",
+        "# Set batch size for DataLoader\n",
+        "batch_size = 5\n",
+        "# Create DataLoader\n",
+        "dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)\n",
         "\n",
-        "# # Iterate over the data in the DataLoader\n",
-        "# for batch in dataloader:\n",
-        "#     inputs, targets = batch\n",
-        "#     print(f\"Batch Size: {inputs.size(0)}\")\n",
-        "#     print(\"---------------\")\n",
-        "#     print(f\"Inputs: {inputs}\")\n",
-        "#     print(f\"Targets: {targets}\")\n",
-        "#     break"
+        "# Iterate over the data in the DataLoader\n",
+        "for batch in dataloader:\n",
+        "    inputs, targets = batch\n",
+        "    print(f\"Batch Size: {inputs.size(0)}\")\n",
+        "    print(\"---------------\")\n",
+        "    print(f\"Inputs: {inputs}\")\n",
+        "    print(f\"Targets: {targets}\")\n",
+        "    break"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Test lightdatamodule"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Loading data from /Users/bartz/miniforge3/envs/spotCondaEnv/lib/python3.11/site-packages/spotPython/data/data.csv\n",
+            "11\n"
+          ]
+        }
+      ],
+      "source": [
+        "from spotPython.data.lightdatamodule import LightDataModule\n",
+        "from spotPython.data.csvdataset import CSVDataset\n",
+        "from spotPython.data.pkldataset import PKLDataset\n",
+        "import torch\n",
+        "dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)\n",
+        "# dataset = PKLDataset(directory=\"./data/spotPython/\", filename=\"data_sensitive.pkl\", target_column='N', feature_type=torch.float32, target_type=torch.float64, rmNA=False)\n",
+        "print(len(dataset))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "data_module = LightDataModule(dataset=dataset, batch_size=5, test_size=0.5)"
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "full_train_size: 0.5\n",
+            "val_size: 0.25\n",
+            "train_size: 0.25\n",
+            "test_size: 0.5\n"
+          ]
+        }
+      ],
+      "source": [
+        "data_module.setup()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Training set size: 3\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(f\"Training set size: {len(data_module.data_train)}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Validation set size: 3\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(f\"Validation set size: {len(data_module.data_val)}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Test set size: 6\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(f\"Test set size: {len(data_module.data_test)}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": []
+    },
     {
       "cell_type": "code",
       "execution_count": null,
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotPython"
-version = "0.6.42"
+version = "0.6.43"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/test/test_lightdatamodule.py b/test/test_lightdatamodule.py
@@ -0,0 +1,22 @@
+import pytest
+import torch
+from spotPython.data.lightdatamodule import LightDataModule
+from spotPython.data.csvdataset import CSVDataset
+
+
+def test_light_data_module():
+    """Check that LightDataModule yields equally sized train and validation splits."""
+    # Build the CSV dataset with 'prognosis' as target column and torch.long features.
+    csv_data = CSVDataset(target_column='prognosis', feature_type=torch.long)
+    # The dataset must contain at least one sample.
+    assert len(csv_data) > 0
+
+    module = LightDataModule(dataset=csv_data, batch_size=5, test_size=0.5)
+    module.setup()
+    # With test_size=0.5 the train and validation splits are expected to match in size.
+    n_train, n_val = len(module.data_train), len(module.data_val)
+    assert n_train == n_val
+
+
+if __name__ == "__main__":  # entry point when this file is executed directly
+    pytest.main(["-v", __file__])  # invoke pytest in verbose mode on this file

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotPython"`
`10`		`-version = "0.6.42"`
	`10`	`+version = "0.6.43"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`