0.14.47

bartzbeielstein · bartzbeielstein · commit 98bfa5429855 · 2024-07-14T20:41:21.000+02:00
pkl updated
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.14.46"
+version = "0.14.47"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/data/csvdataset.py b/src/spotPython/data/csvdataset.py
@@ -149,3 +149,20 @@ def extra_repr(self) -> str:
         """
         split = "Train" if self.train else "Test"
         return f"Split: {split}"
+
+    def __ncols__(self) -> int:
+        """
+        Returns the number of columns in the dataset.
+
+        The value is the size of dimension 1 of `self.data`.
+
+        Returns:
+            int: The number of columns in the dataset.
+
+        Examples:
+            >>> from spotPython.data.pkldataset import PKLDataset
+                import torch
+                from torch.utils.data import DataLoader
+                # PKLDataset requires feature_type and target_type to be the same torch type
+                dataset = PKLDataset(target_column='prognosis', feature_type=torch.long, target_type=torch.long)
+                print(dataset.__ncols__())
+                64
+        """
+        # NOTE(review): the example above was written for PKLDataset, but this method
+        # belongs to the dataset class in csvdataset.py -- confirm and adapt the example.
+        return self.data.size(1)
diff --git a/src/spotPython/data/pkldataset.py b/src/spotPython/data/pkldataset.py
@@ -34,10 +34,12 @@ class PKLDataset(Dataset):
             The directory where the pkl file is located.
         feature_type (torch.dtype):
             The data type of the features.
+            Defaults to torch.float.
         target_column (str):
             The name of the target column.
         target_type (torch.dtype):
             The data type of the targets.
+            Defaults to torch.float.
         train (bool):
             Whether the dataset is for training or not.
         rmNA (bool):
@@ -73,7 +75,7 @@ class PKLDataset(Dataset):
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
-                    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
                     0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                     [1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
                     1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
@@ -104,9 +106,11 @@ def __init__(
         directory: None = None,
         feature_type: torch.dtype = torch.float,
         target_column: str = "y",
-        target_type: torch.dtype = torch.long,
+        target_type: torch.dtype = torch.float,
         train: bool = True,
         rmNA=True,
+        oe=OrdinalEncoder(),
+        le=LabelEncoder(),
         **desc,
     ) -> None:
         super().__init__()
@@ -117,16 +121,15 @@ def __init__(
         self.target_column = target_column
         self.train = train
         self.rmNA = rmNA
+        self.oe = oe
+        self.le = le
         self.data, self.targets = self._load_data()
 
     @property
     def path(self):
-        # user defined directory:
         if self.directory:
             return pathlib.Path(self.directory).joinpath(self.filename)
-        # no user defined directory, use package directory
-        else:
-            return pathlib.Path(__file__).parent.joinpath(self.filename)
+        return pathlib.Path(__file__).parent.joinpath(self.filename)
 
     @property
     def _repr_content(self):
@@ -135,26 +138,37 @@ def _repr_content(self):
         return content
 
     def _load_data(self) -> tuple:
+        """
+        Loads the pickled DataFrame and converts it to a feature and a target tensor.
+
+        Rows with missing values are dropped if `self.rmNA` is set. Non-numerical
+        feature columns are encoded with `self.oe`, a non-numerical target column
+        is encoded with `self.le`.
+
+        Returns:
+            tuple: (feature_tensor, target_tensor) with dtypes `self.feature_type`
+                and `self.target_type`.
+
+        Raises:
+            ValueError: If `self.target_type` and `self.feature_type` differ.
+        """
+        # ensure that self.target_type and self.feature_type are the same torch types
+        if self.target_type != self.feature_type:
+            raise ValueError("target_type and feature_type must be the same torch type")
+        # NOTE: unpickling can execute arbitrary code, only load files from trusted sources
         with open(self.path, "rb") as f:
             df = pd.read_pickle(f)
         # rm rows with NA
         if self.rmNA:
             df = df.dropna()
 
-        oe = OrdinalEncoder()
-        # Apply LabelEncoder to string columns
-        le = LabelEncoder()
-        # df = df.apply(lambda col: le.fit_transform(col) if col.dtypes == object else col)
-
         # Split DataFrame into feature and target DataFrames
         feature_df = df.drop(columns=[self.target_column])
-        feature_df = oe.fit_transform(feature_df)
+
+        # Identify non-numerical columns in the feature DataFrame
+        non_numerical_columns = feature_df.select_dtypes(exclude=["number"]).columns.tolist()
+
+        # Apply OrdinalEncoder to non-numerical feature columns
+        if non_numerical_columns:
+            feature_df[non_numerical_columns] = self.oe.fit_transform(feature_df[non_numerical_columns])
+
         target_df = df[self.target_column]
-        target_df = le.fit_transform(target_df)
 
-        # Convert DataFrames to PyTorch tensors
-        feature_tensor = torch.tensor(feature_df, dtype=self.feature_type)
-        target_tensor = torch.tensor(target_df, dtype=self.target_type)
+        # Encode a non-numerical target column; a numerical one is only converted to a NumPy array
+        if not pd.api.types.is_numeric_dtype(target_df):
+            target_array = self.le.fit_transform(target_df)
+        else:
+            # pass an ndarray (not a pandas Series) to torch.tensor, so that the
+            # conversion does not depend on the Series index left over after dropna()
+            target_array = target_df.to_numpy()
+
+        # Convert the features to a NumPy array and both arrays to PyTorch tensors
+        feature_array = feature_df.to_numpy()
+
+        feature_tensor = torch.tensor(feature_array, dtype=self.feature_type)
+        target_tensor = torch.tensor(target_array, dtype=self.target_type)
 
         return feature_tensor, target_tensor
 
@@ -214,3 +228,20 @@ def extra_repr(self) -> str:
                 print(dataset)
         """
         return "filename={}, directory={}".format(self.filename, self.directory)
+
+    def __ncols__(self) -> int:
+        """
+        Returns the number of feature columns in the dataset.
+
+        The value is the size of dimension 1 of `self.data`, i.e., of the feature
+        tensor built by `_load_data()`. The target column is not counted.
+
+        Returns:
+            int: The number of feature columns in the dataset.
+
+        Examples:
+            >>> from spotPython.data.pkldataset import PKLDataset
+                import torch
+                from torch.utils.data import DataLoader
+                # feature_type and target_type must match, otherwise _load_data() raises a ValueError
+                dataset = PKLDataset(target_column='prognosis', feature_type=torch.long, target_type=torch.long)
+                print(dataset.__ncols__())
+                64
+        """
+        return self.data.size(1)
diff --git a/src/spotPython/fun/hyperlight.py b/src/spotPython/fun/hyperlight.py
@@ -146,6 +146,7 @@ def fun(self, X: np.ndarray, fun_control: dict = None) -> np.ndarray:
             except Exception as err:
                 if fun_control["verbosity"] > 0:
                     print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+                    pprint.pprint(fun_control)
                     print(f"Error in fun(). Call to train_model failed. {err=}, {type(err)=}")
                     print("Setting df_eval to np.nan\n")
                 logger.error(f"Error in fun(). Call to train_model failed. {err=}, {type(err)=}")
diff --git a/src/spotPython/hyperdict/light_hyper_dict.json b/src/spotPython/hyperdict/light_hyper_dict.json
@@ -349,17 +349,11 @@
         },
         "optimizer": {
             "levels": [
-                "Adadelta",
-                "Adagrad",
                 "Adam",
                 "AdamW",
-                "SparseAdam",
                 "Adamax",
-                "ASGD",
                 "NAdam",
                 "RAdam",
-                "RMSprop",
-                "Rprop",
                 "SGD"
             ],
             "type": "factor",
@@ -393,16 +387,14 @@
         },
         "initialization": {
             "levels": [
-                "Default",
-                "Kaiming",
-                "Xavier"
+                "Default"
             ],
             "type": "factor",
             "default": "Default",
             "transform": "None",
             "core_model_parameter_type": "str",
             "lower": 0,
-            "upper": 2
+            "upper": 0
         }
     }
 }
diff --git a/src/spotPython/spot/spot.py b/src/spotPython/spot/spot.py
@@ -696,6 +696,14 @@ def write_db_dict(self) -> None:
         print("The following dictionaries are written to the json file spotPython_db.json:")
         print("fun_control:")
         pprint.pprint(fun_control)
+        # check if all the values in the dictionary are of a serializable type
+        for key in list(fun_control):  # snapshot: popping while iterating a dict raises RuntimeError
+            if not isinstance(fun_control[key], (int, float, str, list, dict)):
+                # remove the key from the dictionary
+                print(f"Removing non-serializable key: {key}")
+                fun_control.pop(key)
+        print("fun_control after removing non-serializable keys:")
+        pprint.pprint(fun_control)
         print("design_control:")
         pprint.pprint(design_control)
         print("optimizer_control:")
diff --git a/src/spotPython/utils/init.py b/src/spotPython/utils/init.py
@@ -242,8 +242,10 @@ def fun_control_init(
             The weight coefficient of the objective function. Positive values mean minimization.
             If set to -1, scores that are better when maximized will be minimized, e.g, accuracy.
             Can be an array, so that different weights can be used for different (multiple) objectives.
+            Default is 1.0.
         weight_coeff (float):
             Determines how to weight older measures. Default is 1.0. Used in the OML algorithm eval_oml.py.
+            Default is 0.0.
         weights_entry (str):
             The weights entry used in the GUI. Default is None.
 

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.14.46"`
	`10`	`+version = "0.14.47"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`