|
4055 | 4055 | }, |
4056 | 4056 | { |
4057 | 4057 | "cell_type": "code", |
4058 | | - "execution_count": 2, |
| 4058 | + "execution_count": null, |
4059 | 4059 | "metadata": {}, |
4060 | | - "outputs": [ |
4061 | | - { |
4062 | | - "name": "stderr", |
4063 | | - "output_type": "stream", |
4064 | | - "text": [ |
4065 | | - "Seed set to 123\n" |
4066 | | - ] |
4067 | | - }, |
4068 | | - { |
4069 | | - "name": "stdout", |
4070 | | - "output_type": "stream", |
4071 | | - "text": [ |
4072 | | - "Before modification:\n", |
4073 | | - "| name | type | default | lower | upper | transform |\n", |
4074 | | - "|-----------------|--------|-----------|---------|---------|-------------|\n", |
4075 | | - "| n_estimators | int | 10 | 2 | 1000 | None |\n", |
4076 | | - "| step | float | 1 | 0.1 | 10 | None |\n", |
4077 | | - "| use_aggregation | factor | 1 | 0 | 1 | None |\n", |
4078 | | - "Setting hyperparameter n_estimators to value [2, 5].\n", |
4079 | | - "Variable type is int.\n", |
4080 | | - "Core type is None.\n", |
4081 | | - "Calling modify_hyper_parameter_bounds().\n", |
4082 | | - "After modification:\n", |
4083 | | - "| name | type | default | lower | upper | transform |\n", |
4084 | | - "|-----------------|--------|-----------|---------|---------|-------------|\n", |
4085 | | - "| n_estimators | int | 10 | 2 | 5 | None |\n", |
4086 | | - "| step | float | 1 | 0.1 | 10 | None |\n", |
4087 | | - "| use_aggregation | factor | 1 | 0 | 1 | None |\n" |
4088 | | - ] |
4089 | | - } |
4090 | | - ], |
| 4060 | + "outputs": [], |
4091 | 4061 | "source": [ |
4092 | 4062 | "from spotRiver.hyperdict.river_hyper_dict import RiverHyperDict\n", |
4093 | 4063 | "from spotPython.utils.init import fun_control_init\n", |
|
4106 | 4076 | }, |
4107 | 4077 | { |
4108 | 4078 | "cell_type": "code", |
4109 | | - "execution_count": 6, |
| 4079 | + "execution_count": null, |
4110 | 4080 | "metadata": {}, |
4111 | | - "outputs": [ |
4112 | | - { |
4113 | | - "name": "stderr", |
4114 | | - "output_type": "stream", |
4115 | | - "text": [ |
4116 | | - "Seed set to 123\n" |
4117 | | - ] |
4118 | | - }, |
4119 | | - { |
4120 | | - "name": "stdout", |
4121 | | - "output_type": "stream", |
4122 | | - "text": [ |
4123 | | - "Before modification:\n", |
4124 | | - "| name | type | default | lower | upper | transform |\n", |
4125 | | - "|------------------------|--------|------------------|---------|----------|------------------------|\n", |
4126 | | - "| grace_period | int | 200 | 10 | 1000 | None |\n", |
4127 | | - "| max_depth | int | 20 | 2 | 20 | transform_power_2_int |\n", |
4128 | | - "| delta | float | 1e-07 | 1e-08 | 1e-06 | None |\n", |
4129 | | - "| tau | float | 0.05 | 0.01 | 0.1 | None |\n", |
4130 | | - "| leaf_prediction | factor | mean | 0 | 2 | None |\n", |
4131 | | - "| leaf_model | factor | LinearRegression | 0 | 2 | None |\n", |
4132 | | - "| model_selector_decay | float | 0.95 | 0.9 | 0.99 | None |\n", |
4133 | | - "| splitter | factor | EBSTSplitter | 0 | 2 | None |\n", |
4134 | | - "| min_samples_split | int | 5 | 2 | 10 | None |\n", |
4135 | | - "| binary_split | factor | 0 | 0 | 1 | None |\n", |
4136 | | - "| max_size | float | 500.0 | 100 | 1000 | None |\n", |
4137 | | - "| memory_estimate_period | int | 6 | 3 | 8 | transform_power_10_int |\n", |
4138 | | - "| stop_mem_management | factor | 0 | 0 | 1 | None |\n", |
4139 | | - "| remove_poor_attrs | factor | 0 | 0 | 1 | None |\n", |
4140 | | - "| merit_preprune | factor | 1 | 0 | 1 | None |\n", |
4141 | | - "After modification:\n", |
4142 | | - "| name | type | default | lower | upper | transform |\n", |
4143 | | - "|------------------------|--------|------------------|---------|----------|------------------------|\n", |
4144 | | - "| grace_period | int | 200 | 10 | 1000 | None |\n", |
4145 | | - "| max_depth | int | 20 | 2 | 20 | transform_power_2_int |\n", |
4146 | | - "| delta | float | 1e-07 | 1e-08 | 1e-06 | None |\n", |
4147 | | - "| tau | float | 0.05 | 0.01 | 0.1 | None |\n", |
4148 | | - "| leaf_prediction | factor | mean | 0 | 2 | None |\n", |
4149 | | - "| leaf_model | factor | LinearRegression | 0 | 1 | None |\n", |
4150 | | - "| model_selector_decay | float | 0.95 | 0.9 | 0.99 | None |\n", |
4151 | | - "| splitter | factor | EBSTSplitter | 0 | 2 | None |\n", |
4152 | | - "| min_samples_split | int | 5 | 2 | 10 | None |\n", |
4153 | | - "| binary_split | factor | 0 | 0 | 1 | None |\n", |
4154 | | - "| max_size | float | 500.0 | 100 | 1000 | None |\n", |
4155 | | - "| memory_estimate_period | int | 6 | 3 | 8 | transform_power_10_int |\n", |
4156 | | - "| stop_mem_management | factor | 0 | 0 | 1 | None |\n", |
4157 | | - "| remove_poor_attrs | factor | 0 | 0 | 1 | None |\n", |
4158 | | - "| merit_preprune | factor | 1 | 0 | 1 | None |\n" |
4159 | | - ] |
4160 | | - } |
4161 | | - ], |
| 4081 | + "outputs": [], |
4162 | 4082 | "source": [ |
4163 | 4083 | "import pprint\n", |
4164 | 4084 | "from spotRiver.hyperdict.river_hyper_dict import RiverHyperDict\n", |
|
4179 | 4099 | }, |
4180 | 4100 | { |
4181 | 4101 | "cell_type": "code", |
4182 | | - "execution_count": 7, |
| 4102 | + "execution_count": null, |
4183 | 4103 | "metadata": {}, |
4184 | | - "outputs": [ |
4185 | | - { |
4186 | | - "name": "stderr", |
4187 | | - "output_type": "stream", |
4188 | | - "text": [ |
4189 | | - "Seed set to 123\n" |
4190 | | - ] |
4191 | | - }, |
4192 | | - { |
4193 | | - "name": "stdout", |
4194 | | - "output_type": "stream", |
4195 | | - "text": [ |
4196 | | - "{'grace_period': {'type': 'int', 'default': 200, 'transform': 'None', 'lower': 10, 'upper': 1000}, 'max_depth': {'type': 'int', 'default': 20, 'transform': 'transform_power_2_int', 'lower': 2, 'upper': 20}, 'delta': {'type': 'float', 'default': 1e-07, 'transform': 'None', 'lower': 1e-08, 'upper': 1e-06}, 'tau': {'type': 'float', 'default': 0.05, 'transform': 'None', 'lower': 0.01, 'upper': 0.1}, 'leaf_prediction': {'levels': ['mean', 'model', 'adaptive'], 'type': 'factor', 'default': 'mean', 'transform': 'None', 'core_model_parameter_type': 'str', 'lower': 0, 'upper': 2}, 'leaf_model': {'levels': ['LinearRegression', 'Perceptron'], 'type': 'factor', 'default': 'LinearRegression', 'transform': 'None', 'class_name': 'river.linear_model', 'core_model_parameter_type': 'instance()', 'lower': 0, 'upper': 1}, 'model_selector_decay': {'type': 'float', 'default': 0.95, 'transform': 'None', 'lower': 0.9, 'upper': 0.99}, 'splitter': {'levels': ['EBSTSplitter', 'TEBSTSplitter', 'QOSplitter'], 'type': 'factor', 'default': 'EBSTSplitter', 'transform': 'None', 'class_name': 'river.tree.splitter', 'core_model_parameter_type': 'instance()', 'lower': 0, 'upper': 2}, 'min_samples_split': {'type': 'int', 'default': 5, 'transform': 'None', 'lower': 2, 'upper': 10}, 'binary_split': {'levels': [0, 1], 'type': 'factor', 'default': 0, 'transform': 'None', 'core_model_parameter_type': 'bool', 'lower': 0, 'upper': 1}, 'max_size': {'type': 'float', 'default': 500.0, 'transform': 'None', 'lower': 100.0, 'upper': 1000.0}, 'memory_estimate_period': {'type': 'int', 'default': 6, 'transform': 'transform_power_10_int', 'lower': 3, 'upper': 8}, 'stop_mem_management': {'levels': [0, 1], 'type': 'factor', 'default': 0, 'transform': 'None', 'core_model_parameter_type': 'bool', 'lower': 0, 'upper': 1}, 'remove_poor_attrs': {'levels': [0, 1], 'type': 'factor', 'default': 0, 'transform': 'None', 'core_model_parameter_type': 'bool', 'lower': 0, 'upper': 1}, 'merit_preprune': {'levels': [0, 1], 
'type': 'factor', 'default': 1, 'transform': 'None', 'core_model_parameter_type': 'bool', 'lower': 0, 'upper': 1}}\n" |
4197 | | - ] |
4198 | | - } |
4199 | | - ], |
| 4104 | + "outputs": [], |
4200 | 4105 | "source": [ |
4201 | 4106 | "fun_control = fun_control_init(\n", |
4202 | 4107 | " core_model_name=\"tree.HoeffdingTreeRegressor\",\n", |
|
4213 | 4118 | }, |
4214 | 4119 | { |
4215 | 4120 | "cell_type": "code", |
4216 | | - "execution_count": 9, |
| 4121 | + "execution_count": null, |
4217 | 4122 | "metadata": {}, |
4218 | | - "outputs": [ |
4219 | | - { |
4220 | | - "name": "stderr", |
4221 | | - "output_type": "stream", |
4222 | | - "text": [ |
4223 | | - "Seed set to 123\n" |
4224 | | - ] |
4225 | | - }, |
4226 | | - { |
4227 | | - "name": "stdout", |
4228 | | - "output_type": "stream", |
4229 | | - "text": [ |
4230 | | - "Before modification:\n", |
4231 | | - "| name | type | default | lower | upper | transform |\n", |
4232 | | - "|-----------------|--------|-----------|---------|---------|-------------|\n", |
4233 | | - "| n_estimators | int | 10 | 2 | 1000 | None |\n", |
4234 | | - "| step | float | 1 | 0.1 | 10 | None |\n", |
4235 | | - "| use_aggregation | factor | 1 | 0 | 1 | None |\n", |
4236 | | - "Setting hyperparameter use_aggregation to value [0, 0].\n", |
4237 | | - "Variable type is factor.\n", |
4238 | | - "Core type is bool.\n", |
4239 | | - "Calling modify_boolean_hyper_parameter_levels().\n", |
4240 | | - "After modification:\n", |
4241 | | - "| name | type | default | lower | upper | transform |\n", |
4242 | | - "|-----------------|--------|-----------|---------|---------|-------------|\n", |
4243 | | - "| n_estimators | int | 10 | 2 | 1000 | None |\n", |
4244 | | - "| step | float | 1 | 0.1 | 10 | None |\n", |
4245 | | - "| use_aggregation | factor | 1 | 0 | 0 | None |\n" |
4246 | | - ] |
4247 | | - }, |
4248 | | - { |
4249 | | - "ename": "", |
4250 | | - "evalue": "", |
4251 | | - "output_type": "error", |
4252 | | - "traceback": [ |
4253 | | - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", |
4254 | | - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", |
4255 | | - "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n", |
4256 | | - "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details." |
4257 | | - ] |
4258 | | - } |
4259 | | - ], |
| 4123 | + "outputs": [], |
4260 | 4124 | "source": [ |
4261 | 4125 | "from spotRiver.hyperdict.river_hyper_dict import RiverHyperDict\n", |
4262 | 4126 | "from spotPython.utils.init import fun_control_init\n", |
|
4273 | 4137 | "print(gen_design_table(fun_control))" |
4274 | 4138 | ] |
4275 | 4139 | }, |
| 4140 | + { |
| 4141 | + "cell_type": "markdown", |
| 4142 | + "metadata": {}, |
| 4143 | + "source": [ |
| 4144 | + "# Scaler" |
| 4145 | + ] |
| 4146 | + }, |
| 4147 | + { |
| 4148 | + "cell_type": "markdown", |
| 4149 | + "metadata": {}, |
| 4150 | + "source": [ |
| 4151 | + "## Dataset" |
| 4152 | + ] |
| 4153 | + }, |
| 4154 | + { |
| 4155 | + "cell_type": "code", |
| 4156 | + "execution_count": 19, |
| 4157 | + "metadata": {}, |
| 4158 | + "outputs": [], |
| 4159 | + "source": [ |
| 4160 | + "import torch\n", |
| 4161 | + "from torch.utils.data import Dataset\n", |
| 4162 | + "\n", |
| 4163 | + "class MyDataset(Dataset):\n", |
| 4164 | + " def __init__(self, data, labels):\n", |
| 4165 | + " self.data = data\n", |
| 4166 | + " self.labels = labels\n", |
| 4167 | + "\n", |
| 4168 | + " def __len__(self):\n", |
| 4169 | + " return len(self.data)\n", |
| 4170 | + "\n", |
| 4171 | + " def __getitem__(self, idx):\n", |
| 4172 | + " return self.data[idx], self.labels[idx]" |
| 4173 | + ] |
| 4174 | + }, |
| 4175 | + { |
| 4176 | + "cell_type": "markdown", |
| 4177 | + "metadata": {}, |
| 4178 | + "source": [ |
| 4179 | + "## DataModule" |
| 4180 | + ] |
| 4181 | + }, |
| 4182 | + { |
| 4183 | + "cell_type": "code", |
| 4184 | + "execution_count": 20, |
| 4185 | + "metadata": {}, |
| 4186 | + "outputs": [], |
| 4187 | + "source": [ |
| 4188 | + "import pytorch_lightning as pl\n", |
| 4189 | + "from sklearn.preprocessing import StandardScaler\n", |
| 4190 | + "from torch.utils.data import DataLoader, random_split\n", |
| 4191 | + "\n", |
| 4192 | + "class MyDataModule(pl.LightningDataModule):\n", |
| 4193 | + " def __init__(self, full_dataset, train_size=0.8, batch_size=32, num_workers=4):\n", |
| 4194 | + " super().__init__()\n", |
| 4195 | + " self.dataset = full_dataset\n", |
| 4196 | + " self.train_size = train_size\n", |
| 4197 | + " self.batch_size = batch_size\n", |
| 4198 | + " self.num_workers = num_workers\n", |
| 4199 | + " self.scaler = StandardScaler()\n", |
| 4200 | + "\n", |
| 4201 | + " def setup(self, stage=None):\n", |
| 4202 | + " # Split the dataset\n", |
| 4203 | + " train_len = int(len(self.dataset) * self.train_size)\n", |
| 4204 | + " val_len = len(self.dataset) - train_len\n", |
| 4205 | + " self.train_set, self.val_set = random_split(self.dataset, [train_len, val_len])\n", |
| 4206 | + " \n", |
| 4207 | + " # Fit scaler on training data\n", |
| 4208 | + " train_data = torch.stack([item[0] for item in self.train_set])\n", |
| 4209 | + " print(f\"train_data before scaling\\n: {train_data}\") \n", |
| 4210 | + " self.scaler.fit(train_data)\n", |
| 4211 | + " \n", |
| 4212 | + " # Transform training data\n", |
| 4213 | + " scaled_train_data = self.scaler.transform(train_data)\n", |
| 4214 | + " self.train_set = self._update_dataset(self.train_set, scaled_train_data)\n", |
| 4215 | + " print(f\"train_data after scaling\\n: {self.train_set}\") \n", |
| 4216 | + " \n", |
| 4217 | + " # Transform validation data\n", |
| 4218 | + " val_data = torch.stack([item[0] for item in self.val_set])\n", |
| 4219 | + " scaled_val_data = self.scaler.transform(val_data)\n", |
| 4220 | + " self.val_set = self._update_dataset(self.val_set, scaled_val_data)\n", |
| 4221 | + "\n", |
| 4222 | + " def _update_dataset(self, original_dataset, scaled_data):\n", |
| 4223 | + " updated_dataset = []\n", |
| 4224 | + " for i, (data, label) in enumerate(original_dataset):\n", |
| 4225 | + " updated_dataset.append((torch.tensor(scaled_data[i]), label))\n", |
| 4226 | + " return updated_dataset\n", |
| 4227 | + "\n", |
| 4228 | + " def train_dataloader(self):\n", |
| 4229 | + " return DataLoader(self.train_set, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)\n", |
| 4230 | + "\n", |
| 4231 | + " def val_dataloader(self):\n", |
| 4232 | + " return DataLoader(self.val_set, batch_size=self.batch_size, num_workers=self.num_workers)\n", |
| 4233 | + "\n", |
| 4234 | + " def test_dataloader(self):\n", |
| 4235 | + " test_data = torch.stack([item[0] for item in self.test_set])\n", |
| 4236 | + " scaled_test_data = self.scaler.transform(test_data)\n", |
| 4237 | + " self.test_set = self._update_dataset(self.test_set, scaled_test_data)\n", |
| 4238 | + " return DataLoader(self.test_set, batch_size=self.batch_size, num_workers=self.num_workers)\n", |
| 4239 | + "\n", |
| 4240 | + " def prepare_data(self):\n", |
| 4241 | + " # Here you can download datasets if needed\n", |
| 4242 | + " pass" |
| 4243 | + ] |
| 4244 | + }, |
| 4245 | + { |
| 4246 | + "cell_type": "markdown", |
| 4247 | + "metadata": {}, |
| 4248 | + "source": [ |
| 4249 | + "## Example" |
| 4250 | + ] |
| 4251 | + }, |
| 4252 | + { |
| 4253 | + "cell_type": "code", |
| 4254 | + "execution_count": 21, |
| 4255 | + "metadata": {}, |
| 4256 | + "outputs": [ |
| 4257 | + { |
| 4258 | + "name": "stdout", |
| 4259 | + "output_type": "stream", |
| 4260 | + "text": [ |
| 4261 | + "data: tensor([[0.1279, 0.1770, 0.1569],\n", |
| 4262 | + " [0.6378, 0.3699, 0.0971],\n", |
| 4263 | + " [0.1516, 0.7931, 0.8748],\n", |
| 4264 | + " [0.8640, 0.3450, 0.7994],\n", |
| 4265 | + " [0.1711, 0.5990, 0.5109],\n", |
| 4266 | + " [0.2568, 0.1260, 0.3945],\n", |
| 4267 | + " [0.9566, 0.3997, 0.4479],\n", |
| 4268 | + " [0.5616, 0.4342, 0.3842],\n", |
| 4269 | + " [0.9247, 0.1204, 0.4356],\n", |
| 4270 | + " [0.2621, 0.9219, 0.2392]])\n", |
| 4271 | + "labels: tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1.])\n", |
| 4272 | + "train_data before scaling\n", |
| 4273 | + ": tensor([[0.1279, 0.1770, 0.1569],\n", |
| 4274 | + " [0.1516, 0.7931, 0.8748],\n", |
| 4275 | + " [0.6378, 0.3699, 0.0971],\n", |
| 4276 | + " [0.5616, 0.4342, 0.3842],\n", |
| 4277 | + " [0.2621, 0.9219, 0.2392],\n", |
| 4278 | + " [0.1711, 0.5990, 0.5109],\n", |
| 4279 | + " [0.9566, 0.3997, 0.4479],\n", |
| 4280 | + " [0.2568, 0.1260, 0.3945]])\n", |
| 4281 | + "train_data after scaling\n", |
| 4282 | + ": [(tensor([-0.9444, -1.1516, -1.0144], dtype=torch.float64), tensor(0.)), (tensor([-0.8594, 1.2085, 2.1341], dtype=torch.float64), tensor(0.)), (tensor([ 0.8881, -0.4127, -1.2768], dtype=torch.float64), tensor(1.)), (tensor([ 0.6144, -0.1661, -0.0176], dtype=torch.float64), tensor(1.)), (tensor([-0.4621, 1.7019, -0.6533], dtype=torch.float64), tensor(1.)), (tensor([-0.7891, 0.4651, 0.5384], dtype=torch.float64), tensor(0.)), (tensor([ 2.0336, -0.2985, 0.2620], dtype=torch.float64), tensor(0.)), (tensor([-0.4812, -1.3467, 0.0277], dtype=torch.float64), tensor(1.))]\n", |
| 4283 | + "Batch data shape: torch.Size([8, 3])\n", |
| 4284 | + "tensor([[-0.7891, 0.4651, 0.5384],\n", |
| 4285 | + " [ 0.8881, -0.4127, -1.2768],\n", |
| 4286 | + " [ 2.0336, -0.2985, 0.2620],\n", |
| 4287 | + " [-0.8594, 1.2085, 2.1341],\n", |
| 4288 | + " [-0.9444, -1.1516, -1.0144],\n", |
| 4289 | + " [-0.4812, -1.3467, 0.0277],\n", |
| 4290 | + " [ 0.6144, -0.1661, -0.0176],\n", |
| 4291 | + " [-0.4621, 1.7019, -0.6533]], dtype=torch.float64)\n", |
| 4292 | + "tensor([0., 1., 0., 0., 0., 1., 1., 1.])\n" |
| 4293 | + ] |
| 4294 | + } |
| 4295 | + ], |
| 4296 | + "source": [ |
| 4297 | + "# generate a random 2-D tensor of shape (n, 3): n samples with 3 features each\n", |
| 4298 | + "n = 10\n", |
| 4299 | + "data = torch.rand((n, 3))\n", |
| 4300 | + "print(f\"data: {data}\")\n", |
| 4301 | + "labels = torch.tensor([i % 2 for i in range(n)], dtype=torch.float32)\n", |
| 4302 | + "print(f\"labels: {labels}\")\n", |
| 4303 | + "full_dataset = MyDataset(data, labels)\n", |
| 4304 | + "\n", |
| 4305 | + "# Creating DataModule instance\n", |
| 4306 | + "data_module = MyDataModule(full_dataset)\n", |
| 4307 | + "\n", |
| 4308 | + "# Setup the data module\n", |
| 4309 | + "data_module.setup()\n", |
| 4310 | + "\n", |
| 4311 | + "# Example of fetching a single batch\n", |
| 4312 | + "train_loader = data_module.train_dataloader()\n", |
| 4313 | + "for batch in train_loader:\n", |
| 4314 | + " print(f\"Batch data shape: {batch[0].shape}\")\n", |
| 4315 | + " x, y = batch\n", |
| 4316 | + " print(x)\n", |
| 4317 | + " print(y)\n", |
| 4318 | + " break" |
| 4319 | + ] |
| 4320 | + }, |
4276 | 4321 | { |
4277 | 4322 | "cell_type": "code", |
4278 | 4323 | "execution_count": null, |
|
0 commit comments