diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..71ff575 --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,3 @@ +## 2024-05-22 - [Optimizing Triplet Data Generation] +**Learning:** Naive data sampling in triplet loss generators can lead to O(N^2) complexity per epoch if not careful. Specifically, iterating through the entire dataset to find positive/negative samples for each anchor is extremely expensive. +**Action:** Use a hash map (dictionary) to pre-group data by class labels. This allows O(1) sampling for positive/negative pairs, drastically reducing data loading overhead. diff --git a/deeptuner/datagenerators/triplet_data_generator.py b/deeptuner/datagenerators/triplet_data_generator.py index 18523e7..32b37b6 100644 --- a/deeptuner/datagenerators/triplet_data_generator.py +++ b/deeptuner/datagenerators/triplet_data_generator.py @@ -13,6 +13,17 @@ def __init__(self, image_paths, labels, batch_size, image_size, num_classes): self.num_classes = num_classes self.label_encoder = LabelEncoder() self.encoded_labels = self.label_encoder.fit_transform(labels) + + # Optimize: Pre-group paths by label to avoid O(N) search in _generate_triplet_batch + self.label_to_paths = {} + for path, label in zip(self.image_paths, self.encoded_labels): + if label not in self.label_to_paths: + self.label_to_paths[label] = [] + self.label_to_paths[label].append(path) + + if len(self.label_to_paths) < 2: + raise ValueError("TripletDataGenerator requires at least 2 classes to generate negative pairs.") + self.image_data_generator = ImageDataGenerator(preprocessing_function=resnet.preprocess_input) self.on_epoch_end() print(f"Initialized TripletDataGenerator with {len(self.image_paths)} images") @@ -36,16 +47,23 @@ def _generate_triplet_batch(self, batch_image_paths, batch_labels): positive_images = [] negative_images = [] + # Get list of all unique labels for negative sampling + all_labels = list(self.label_to_paths.keys()) + for i in range(len(batch_image_paths)): anchor_path = batch_image_paths[i] anchor_label = batch_labels[i] - positive_path = np.random.choice( - [p for p, l in zip(self.image_paths, self.encoded_labels) if l == anchor_label] - ) - negative_path = np.random.choice( - [p for p, l in zip(self.image_paths, self.encoded_labels) if l != anchor_label] - ) + # Optimized: O(1) lookup instead of O(N) search + positive_path = np.random.choice(self.label_to_paths[anchor_label]) + + # Optimized: Pick a random different label, then pick a random path from it + # This avoids iterating over all non-matching paths + negative_label = anchor_label + while negative_label == anchor_label: + negative_label = np.random.choice(all_labels) + + negative_path = np.random.choice(self.label_to_paths[negative_label]) anchor_image = load_img(anchor_path, target_size=self.image_size) positive_image = load_img(positive_path, target_size=self.image_size)