sequential-parameter-optimization
diff --git a/‎makeSpot.sh‎
Lines changed: 0 additions & 1 deletion b/‎makeSpot.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎notebooks/00_spotPython_tests.ipynb‎
Lines changed: 163 additions & 46 deletions b/‎notebooks/00_spotPython_tests.ipynb‎
Lines changed: 163 additions & 46 deletions
diff --git a/‎src/spotPython/data/csvdataset.py‎
Lines changed: 5 additions & 0 deletions b/‎src/spotPython/data/csvdataset.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/spotPython/data/lightdatamodule.py‎
Lines changed: 4 additions & 4 deletions b/‎src/spotPython/data/lightdatamodule.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/spotPython/hyperparameters/values.py‎
Lines changed: 29 additions & 0 deletions b/‎src/spotPython/hyperparameters/values.py‎
Lines changed: 29 additions & 0 deletions
@@ -2,4 +2,3 @@
 cd ~/workspace/spotPython
 rm -f dist/spotPython*; python -m build; python -m pip install dist/spotPython*.tar.gz
 python -m mkdocs build
-pytest
@@ -247,29 +247,29 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 1,
       "metadata": {},
       "outputs": [],
       "source": [
-        "# from pyhcf.data.daten_sensitive import DatenSensitive\n",
-        "# from pyhcf.utils.names import get_short_parameter_names\n",
-        "# daten = DatenSensitive()\n",
-        "# df = daten.load()\n",
-        "# names =  df.columns\n",
-        "# names = get_short_parameter_names(names)\n",
-        "# # rename columns with short names\n",
-        "# df.columns = names\n",
-        "# df.head()\n",
-        "# # save the df as a csv file\n",
-        "# df.to_csv('./data/spotPython/data_sensitive.csv', index=False)\n",
-        "# # save the df as a pickle file\n",
-        "# df.to_pickle('./data/spotPython/data_sensitive.pkl')\n",
-        "# # remove all rows with NaN values\n",
-        "# df = df.dropna()\n",
-        "# # save the df as a csv file\n",
-        "# df.to_csv('./data/spotPython/data_sensitive_rmNA.csv', index=False)\n",
-        "# # save the df as a pickle file\n",
-        "# df.to_pickle('./data/spotPython/data_sensitive_rmNA.pkl')\n"
+        "from pyhcf.data.daten_sensitive import DatenSensitive\n",
+        "from pyhcf.utils.names import get_short_parameter_names\n",
+        "daten = DatenSensitive()\n",
+        "df = daten.load()\n",
+        "names =  df.columns\n",
+        "names = get_short_parameter_names(names)\n",
+        "# rename columns with short names\n",
+        "df.columns = names\n",
+        "df.head()\n",
+        "# save the df as a csv file\n",
+        "df.to_csv('./data/spotPython/data_sensitive.csv', index=False)\n",
+        "# save the df as a pickle file\n",
+        "df.to_pickle('./data/spotPython/data_sensitive.pkl')\n",
+        "# remove all rows with NaN values\n",
+        "df = df.dropna()\n",
+        "# save the df as a csv file\n",
+        "df.to_csv('./data/spotPython/data_sensitive_rmNA.csv', index=False)\n",
+        "# save the df as a pickle file\n",
+        "df.to_pickle('./data/spotPython/data_sensitive_rmNA.pkl')\n"
       ]
     },
     {
@@ -398,9 +398,9 @@
       "metadata": {},
       "outputs": [],
       "source": [
-        "# from spotPython.light.pkldataset import PKLDataset\n",
-        "# import torch\n",
-        "# dataset = PKLDataset(pkl_file='./data/spotPython/data_sensitive.pkl', target_column='A', feature_type=torch.long, rmNA=False)"
+        "from spotPython.light.pkldataset import PKLDataset\n",
+        "import torch\n",
+        "dataset = PKLDataset(pkl_file='./data/spotPython/data_sensitive.pkl', target_column='A', feature_type=torch.long, rmNA=False)"
       ]
     },
     {
@@ -427,13 +427,13 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 3,
       "metadata": {},
       "outputs": [],
       "source": [
         "from spotPython.data.pkldataset import PKLDataset\n",
         "import torch\n",
-        "dataset = PKLDataset(directory=\"./data/spotPython/\", filename=\"data_sensitive.pkl\", target_column='N', feature_type=torch.float32, target_type=torch.float64, rmNA=False)"
+        "dataset = PKLDataset(directory=\"/Users/bartz/workspace/spotPython/notebooks/data/spotPython/\", filename=\"data_sensitive.pkl\", target_column='N', feature_type=torch.float32, target_type=torch.float64, rmNA=False)"
       ]
     },
     {
@@ -467,9 +467,17 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 5,
       "metadata": {},
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "11\n"
+          ]
+        }
+      ],
       "source": [
         "from spotPython.data.lightdatamodule import LightDataModule\n",
         "from spotPython.data.csvdataset import CSVDataset\n",
@@ -482,7 +490,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 6,
       "metadata": {},
       "outputs": [],
       "source": [
@@ -491,36 +499,71 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 7,
       "metadata": {},
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "full_train_size: 4\n",
+            "val_size: 2\n",
+            "train_size: 2\n",
+            "test_size: 7\n"
+          ]
+        }
+      ],
       "source": [
         "data_module.setup()"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 8,
       "metadata": {},
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Training set size: 2\n"
+          ]
+        }
+      ],
       "source": [
         "print(f\"Training set size: {len(data_module.data_train)}\")"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 9,
       "metadata": {},
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Validation set size: 2\n"
+          ]
+        }
+      ],
       "source": [
         "print(f\"Validation set size: {len(data_module.data_val)}\")"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 10,
       "metadata": {},
-      "outputs": [],
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Test set size: 7\n"
+          ]
+        }
+      ],
       "source": [
         "print(f\"Test set size: {len(data_module.data_test)}\")"
       ]
@@ -541,7 +584,36 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from spotPython.utils.init import fun_control_init\n",
+        "from spotPython.hyperparameters.values import set_data_module\n",
+        "from spotPython.data.lightdatamodule import LightDataModule\n",
+        "from spotPython.data.csvdataset import CSVDataset\n",
+        "from spotPython.data.pkldataset import PKLDataset\n",
+        "import torch\n",
+        "fun_control = fun_control_init()\n",
+        "dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)\n",
+        "dm = LightDataModule(dataset=dataset, batch_size=5, test_size=7)\n",
+        "dm.setup()\n",
+        "set_data_module(fun_control=fun_control,\n",
+        "                data_module=dm)\n",
+        "data_module = fun_control[\"data_module\"]\n",
+        "print(f\"Test set size: {len(data_module.data_test)}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## same with the sensitive data set"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
       "metadata": {},
       "outputs": [
         {
@@ -555,25 +627,70 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Loading data from /Users/bartz/miniforge3/envs/py311/lib/python3.11/site-packages/spotPython/data/data.csv\n",
-            "full_train_size: 4\n",
-            "val_size: 2\n",
-            "train_size: 2\n",
-            "test_size: 7\n",
-            "Test set size: 7\n"
+            "full_train_size: 56925\n",
+            "val_size: 76\n",
+            "train_size: 56849\n",
+            "test_size: 77\n",
+            "Test set size: 77\n"
           ]
         }
       ],
       "source": [
         "from spotPython.utils.init import fun_control_init\n",
         "from spotPython.hyperparameters.values import set_data_module\n",
         "from spotPython.data.lightdatamodule import LightDataModule\n",
-        "from spotPython.data.csvdataset import CSVDataset\n",
         "from spotPython.data.pkldataset import PKLDataset\n",
         "import torch\n",
         "fun_control = fun_control_init()\n",
-        "dataset = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)\n",
-        "dm = LightDataModule(dataset=dataset, batch_size=5, test_size=7)\n",
+        "dataset = PKLDataset(directory=\"/Users/bartz/workspace/spotPython/notebooks/data/spotPython/\", filename=\"data_sensitive.pkl\", target_column='N', feature_type=torch.float32, target_type=torch.float64, rmNA=False)\n",
+        "dm = LightDataModule(dataset=dataset, batch_size=5, test_size=77)\n",
+        "dm.setup()\n",
+        "set_data_module(fun_control=fun_control,\n",
+        "                data_module=dm)\n",
+        "data_module = fun_control[\"data_module\"]\n",
+        "print(f\"Test set size: {len(data_module.data_test)}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## same, but VBDP data set"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 15,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Seed set to 42\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "full_train_size: 630\n",
+            "val_size: 68\n",
+            "train_size: 562\n",
+            "test_size: 77\n",
+            "Test set size: 77\n"
+          ]
+        }
+      ],
+      "source": [
+        "from spotPython.utils.init import fun_control_init\n",
+        "from spotPython.hyperparameters.values import set_data_module\n",
+        "from spotPython.data.lightdatamodule import LightDataModule\n",
+        "from spotPython.data.csvdataset import CSVDataset\n",
+        "import torch\n",
+        "fun_control = fun_control_init()\n",
+        "dataset = CSVDataset(directory=\"/Users/bartz/workspace/spotPython/notebooks/data/VBDP/\", filename=\"train.csv\",target_column='prognosis', feature_type=torch.long)\n",
+        "dm = LightDataModule(dataset=dataset, batch_size=5, test_size=77)\n",
         "dm.setup()\n",
         "set_data_module(fun_control=fun_control,\n",
         "                data_module=dm)\n",
 
@@ -17,6 +17,7 @@ class CSVDataset(Dataset):
         target_type (torch.dtype): The data type of the targets. Defaults to torch.long.
         train (bool): Whether the dataset is for training or not. Defaults to True.
         rmNA (bool): Whether to remove rows with NA values or not. Defaults to True.
+        dropId (bool): Whether to drop the "id" column or not. Defaults to False.
         **desc: Additional keyword arguments.
 
     Attributes:
@@ -51,6 +52,7 @@ def __init__(
         target_type: torch.dtype = torch.long,
         train: bool = True,
         rmNA=True,
+        dropId=False,
         **desc,
     ) -> None:
         super().__init__()
@@ -61,6 +63,7 @@ def __init__(
         self.target_column = target_column
         self.train = train
         self.rmNA = rmNA
+        self.dropId = dropId
         self.data, self.targets = self._load_data()
 
     @property
@@ -81,6 +84,8 @@ def _load_data(self) -> tuple:
         # rm rows with NA
         if self.rmNA:
             df = df.dropna()
+        if self.dropId:
+            df = df.drop(columns=["id"])
         # Apply LabelEncoder to string columns
         le = LabelEncoder()
         df = df.apply(lambda col: le.fit_transform(col) if col.dtypes == object else col)
 
@@ -84,10 +84,10 @@ def setup(self, stage: Optional[str] = None) -> None:
             val_size = int(full_train_size * test_size / len(self.data_full))
             train_size = full_train_size - val_size
 
-        print(f"full_train_size: {full_train_size}")
-        print(f"val_size: {val_size}")
-        print(f"train_size: {train_size}")
-        print(f"test_size: {test_size}")
+        # print(f"full_train_size: {full_train_size}")
+        # print(f"val_size: {val_size}")
+        # print(f"train_size: {train_size}")
+        # print(f"test_size: {test_size}")
 
         # Assign train/val datasets for use in dataloaders
         if stage == "fit" or stage is None:
 
@@ -869,6 +869,35 @@ def get_default_hyperparameters_for_core_model(fun_control) -> dict:
     return values
 
 
+def set_data_set(fun_control, data_set) -> dict:
+    """
+    Store a dataset in the fun_control dictionary under the key "data_set".
+
+    The dictionary is updated in place and is also returned, so the call can
+    be used either for its side effect or for its return value.
+
+    Args:
+        fun_control (dict):
+            fun_control dictionary
+        data_set (torch.utils.data.Dataset):
+            dataset object to store, e.g., a CSVDataset instance
+
+    Returns:
+        fun_control (dict):
+            the same dictionary object, updated with the "data_set" entry
+
+    Examples:
+        >>> from spotPython.utils.init import fun_control_init
+            from spotPython.hyperparameters.values import set_data_set
+            from spotPython.data.csvdataset import CSVDataset
+            import torch
+            fun_control = fun_control_init()
+            ds = CSVDataset(csv_file='data.csv', target_column='prognosis', feature_type=torch.long)
+            set_data_set(fun_control=fun_control,
+                         data_set=ds)
+            fun_control["data_set"]
+    """
+    fun_control.update({"data_set": data_set})
+    # Return the dictionary so the behavior matches the "-> dict" annotation
+    # and the documented return value (previously the function returned None).
+    return fun_control
+
+
 def set_data_module(fun_control, data_module) -> dict:
     """
     This function sets the lightning datamodule in the fun_control dictionary.