@@ -52,9 +52,9 @@ def validate_fold(self, valloader, criterion, device):
5252 val_steps += 1
5353 return 100.0 * (correct / total )
5454
55- def evaluate_cv (self , dataset , shuffle = False ):
55+ def evaluate_cv (self , dataset , shuffle = False , num_workers = 0 , device = None ):
5656 try :
57- device = getDevice ()
57+ device = getDevice (device = device )
5858 # if torch.cuda.device_count() > 1:
5959 # self = nn.DataParallel(self)
6060 self .to (device )
@@ -65,13 +65,15 @@ def evaluate_cv(self, dataset, shuffle=False):
6565 train_subsampler = torch .utils .data .SubsetRandomSampler (train_ids )
6666 val_subsampler = torch .utils .data .SubsetRandomSampler (val_ids )
6767 trainloader = torch .utils .data .DataLoader (
68- dataset , batch_size = self .batch_size , sampler = train_subsampler , num_workers = 4
68+ dataset , batch_size = self .batch_size , sampler = train_subsampler , num_workers = num_workers
6969 )
7070 valloader = torch .utils .data .DataLoader (
71- dataset , batch_size = self .batch_size , sampler = val_subsampler , num_workers = 4
71+ dataset , batch_size = self .batch_size , sampler = val_subsampler , num_workers = num_workers
7272 )
7373 self .reset_weights ()
74+ # Train fold for several epochs:
7475 self .train_fold (trainloader , criterion , optimizer , device )
76+ # Validate fold:
7577 self .results [fold ] = self .validate_fold (valloader , criterion , device )
7678 df_eval = sum (self .results .values ()) / len (self .results .values ())
7779 df_preds = np .nan
@@ -81,11 +83,11 @@ def evaluate_cv(self, dataset, shuffle=False):
8183 df_preds = np .nan
8284 return df_eval , df_preds
8385
84- def evaluate_hold_out (self , dataset , shuffle , test_dataset = None ):
86+ def evaluate_hold_out (self , dataset , shuffle , test_dataset = None , device = None ):
8587 lr = self .lr
8688 epochs = self .epochs
8789 try :
88- device = getDevice ()
90+ device = getDevice (device = device )
8991 self .to (device )
9092 criterion = nn .CrossEntropyLoss ()
9193 # TODO: optimizer = optim.Adam(self.parameters(), lr=lr)
@@ -99,10 +101,14 @@ def evaluate_hold_out(self, dataset, shuffle, test_dataset=None):
99101 patience = 5
100102 best_val_loss = float ("inf" )
101103 counter = 0
104+ # We only have "one fold" which is trained for several epochs
105+ # (we do not have to reset the weights for each fold):
102106 for epoch in range (epochs ):
103- self .train_hold_out (trainloader , criterion , optimizer , device = device , epoch = epoch )
107+ print (f"Epoch: { epoch + 1 } " )
108+ # training loss from one epoch:
109+ _ = self .train_hold_out (trainloader , criterion , optimizer , device = device )
104110 # TODO: scheduler.step()
105- # Early stopping check
111+ # Early stopping check. Calculate validation loss from one epoch:
106112 val_accuracy , val_loss = self .validate_hold_out (valloader = valloader , criterion = criterion , device = device )
107113 if val_loss < best_val_loss :
108114 best_val_loss = val_loss
@@ -119,29 +125,30 @@ def evaluate_hold_out(self, dataset, shuffle, test_dataset=None):
119125 df_eval = np .nan
120126 df_preds = np .nan
121127 print (f"Returned to Spot: Validation loss: { df_eval } " )
128+ print ("----------------------------------------------" )
122129 return df_eval , df_preds
123130
def create_train_val_data_loaders(self, dataset, shuffle, num_workers=0, train_fraction=0.6):
    """Randomly split `dataset` and wrap both parts in data loaders.

    Args:
        dataset: Sized dataset that is split with `random_split`.
        shuffle: Passed unchanged as `shuffle` to both loaders.
        num_workers: Number of loader worker processes for both loaders.
        train_fraction: Share of the samples that goes into the training
            subset; the remainder becomes the validation subset.
            Defaults to 0.6, the previously hard-coded value.

    Returns:
        Tuple `(trainloader, valloader)`.

    Raises:
        ValueError: If `train_fraction` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(f"train_fraction must be in (0, 1), got {train_fraction!r}")
    n_total = len(dataset)
    # Truncate towards zero so both sizes always add up to len(dataset).
    n_train = int(n_total * train_fraction)
    train_subset, val_subset = random_split(dataset, [n_train, n_total - n_train])
    batch_size = int(self.batch_size)
    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers
    )
    return trainloader, valloader
134141
def create_train_test_data_loaders(self, dataset, shuffle, test_dataset, num_workers=0):
    """Wrap a training dataset and a separate test dataset in data loaders.

    Args:
        dataset: Dataset used for training; it is not split here.
        shuffle: Passed unchanged as `shuffle` to both loaders.
        test_dataset: Dataset used for testing. Must not be None.
        num_workers: Number of loader worker processes for both loaders.

    Returns:
        Tuple `(trainloader, testloader)`.

    Raises:
        ValueError: If `test_dataset` is None.
    """
    # Fail early with a clear message instead of an opaque TypeError that
    # would otherwise surface only when the loader is sized or iterated.
    if test_dataset is None:
        raise ValueError("test_dataset must not be None")
    batch_size = int(self.batch_size)
    trainloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers
    )
    testloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers
    )
    return trainloader, testloader
143150
144- def train_hold_out (self , trainloader , criterion , optimizer , device , epoch ):
151+ def train_hold_out (self , trainloader , criterion , optimizer , device ):
145152 running_loss = 0.0
146153 epoch_steps = 0
147154 for i , data in enumerate (trainloader , 0 ):
@@ -158,10 +165,11 @@ def train_hold_out(self, trainloader, criterion, optimizer, device, epoch):
158165 epoch_steps += 1
159166 if i % 1000 == 999 : # print every 1000 mini-batches
160167 print (
161- "Epoch: %d, Batch: %5d. Batch Size: %d. Training Loss: %.3f"
162- % (epoch + 1 , i + 1 , int (self .batch_size ), running_loss / epoch_steps )
168+ "Batch: %5d. Batch Size: %d. Training Loss (running) : %.3f"
169+ % (i + 1 , int (self .batch_size ), running_loss / epoch_steps )
163170 )
164171 running_loss = 0.0
172+ return loss .item ()
165173
166174 def validate_hold_out (self , valloader , criterion , device ):
167175 val_loss = 0.0
0 commit comments