Use typed external allocator for some thrust items

mconcas · mconcas · commit d19df0fbd258 · 2025-06-06T09:52:36.000+02:00
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -16,9 +16,12 @@
 #include "DetectorsBase/Propagator.h"
 #include "GPUCommonDef.h"
 
-namespace o2::its
+namespace o2
+{
+namespace its
 {
 class CellSeed;
+class ExternalAllocator;
 namespace gpu
 {
 #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler
@@ -178,7 +181,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
 
 int filterCellNeighboursHandler(gpuPair<int, int>*,
                                 int*,
-                                unsigned int);
+                                unsigned int,
+                                o2::its::ExternalAllocator* = nullptr);
 
 template <int nLayers = 7>
 void processNeighboursHandler(const int startLayer,
@@ -191,6 +195,7 @@ void processNeighboursHandler(const int startLayer,
                               gsl::span<int*> neighboursDeviceLUTs,
                               const TrackingFrameInfo** foundTrackingFrameInfo,
                               bounded_vector<CellSeed>& seedsHost,
+                              o2::its::ExternalAllocator*,
                               const float bz,
                               const float MaxChi2ClusterAttachment,
                               const float maxChi2NDF,
@@ -212,5 +217,6 @@ void trackSeedHandler(CellSeed* trackSeeds,
                       const o2::base::PropagatorF::MatCorrType matCorrType,
                       const int nBlocks,
                       const int nThreads);
-} // namespace o2::its
+} // namespace its
+} // namespace o2
 #endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
@@ -13,7 +13,7 @@
 if(CUDA_ENABLED)
 find_package(CUDAToolkit)
 message(STATUS "Building ITS CUDA tracker")
-# add_compile_options(-O0 -g -lineinfo -fPIC)
+add_compile_options(-O0 -g -lineinfo -fPIC)
 # add_compile_definitions(ITS_MEASURE_GPU_TIME)
 o2_add_library(ITStrackingCUDA
                SOURCES ClusterLinesGPU.cu
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -18,6 +18,7 @@
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
 #include "ITStrackingGPU/TrackingKernels.h"
 #include "ITStracking/TrackingConfigParam.h"
+
 namespace o2::its
 {
 constexpr int UnusedIndex{-1};
@@ -209,7 +210,8 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
 
     filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
                                 mTimeFrameGPU->getDeviceNeighbours(iLayer),
-                                nNeigh);
+                                nNeigh,
+                                mTimeFrameGPU->getExternalAllocator());
   }
   mTimeFrameGPU->createNeighboursDeviceArray();
   mTimeFrameGPU->unregisterRest();
@@ -236,6 +238,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
                                         mTimeFrameGPU->getDeviceNeighboursLUTs(),
                                         mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
                                         trackSeeds,
+                                        mTimeFrameGPU->getExternalAllocator(),
                                         this->mBz,
                                         this->mTrkParams[0].MaxChi2ClusterAttachment,
                                         this->mTrkParams[0].MaxChi2NDF,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -28,15 +28,14 @@
 #include "ITStracking/Constants.h"
 #include "ITStracking/IndexTableUtils.h"
 #include "ITStracking/MathUtils.h"
+#include "ITStracking/ExternalAllocator.h"
 #include "DataFormatsITS/TrackITS.h"
 #include "ReconstructionDataFormats/Vertex.h"
 
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
 #include "ITStrackingGPU/TrackingKernels.h"
 #include "ITStrackingGPU/Utils.h"
 
-#include "GPUCommonHelpers.h"
-
 #ifndef __HIPCC__
 #define THRUST_NAMESPACE thrust::cuda
 #else
@@ -64,6 +63,37 @@ GPUdii() float Sq(float v)
 namespace gpu
 {
 
+template <typename T>
+class TypedAllocator : public thrust::device_allocator<T>
+{
+ public:
+  using value_type = T;
+  using pointer = T*;
+
+  template <typename U>
+  struct rebind {
+    using other = TypedAllocator<U>;
+  };
+
+  explicit TypedAllocator(ExternalAllocator* allocPtr)
+    : mInternalAllocator(allocPtr) {}
+
+  T* allocate(size_t n)
+  {
+    return reinterpret_cast<T*>(mInternalAllocator->allocate(n * sizeof(T)));
+  }
+
+  void deallocate(T* p, size_t n)
+  {
+    char* raw_ptr = reinterpret_cast<char*>(p);
+    size_t bytes = n * sizeof(T);
+    mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op.
+  }
+
+ private:
+  ExternalAllocator* mInternalAllocator;
+};
+
 GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
                               const o2::its::IndexTableUtils& utils,
                               const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
@@ -1117,7 +1147,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
 
 int filterCellNeighboursHandler(gpuPair<int, int>* cellNeighbourPairs,
                                 int* cellNeighbours,
-                                unsigned int nNeigh)
+                                unsigned int nNeigh,
+                                o2::its::ExternalAllocator* allocator)
 {
   thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
   thrust::device_ptr<int> validNeighs(cellNeighbours);
@@ -1140,6 +1171,7 @@ void processNeighboursHandler(const int startLayer,
                               gsl::span<int*> neighboursDeviceLUTs,
                               const TrackingFrameInfo** foundTrackingFrameInfo,
                               bounded_vector<CellSeed>& seedsHost,
+                              o2::its::ExternalAllocator* allocator,
                               const float bz,
                               const float maxChi2ClusterAttachment,
                               const float maxChi2NDF,
@@ -1148,8 +1180,10 @@ void processNeighboursHandler(const int startLayer,
                               const int nBlocks,
                               const int nThreads)
 {
-  thrust::device_vector<int> foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
-                                                                      // TODO: fix this.
+  auto allocInt = gpu::TypedAllocator<int>(allocator);
+  auto allocCellSeed = gpu::TypedAllocator<CellSeed>(allocator);
+  thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
+                                                                                                             // TODO: fix this.
 
   gpu::processNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
                                        o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
@@ -1172,8 +1206,8 @@ void processNeighboursHandler(const int startLayer,
     matCorrType);
   gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1);
 
-  thrust::device_vector<int> updatedCellId(foundSeedsTable.back());
-  thrust::device_vector<CellSeed> updatedCellSeed(foundSeedsTable.back());
+  thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt);
+  thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed);
   gpu::processNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
                                         o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
     startLayer,
@@ -1195,13 +1229,13 @@ void processNeighboursHandler(const int startLayer,
     matCorrType);
 
   int level = startLevel;
-  thrust::device_vector<int> lastCellId;
-  thrust::device_vector<CellSeed> lastCellSeed;
+  thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt);
+  thrust::device_vector<CellSeed,gpu::TypedAllocator<CellSeed>> lastCellSeed(allocCellSeed);
   for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
     lastCellSeed.swap(updatedCellSeed);
     lastCellId.swap(updatedCellId);
-    thrust::device_vector<CellSeed>().swap(updatedCellSeed);
-    thrust::device_vector<int>().swap(updatedCellId);
+    thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>>(allocCellSeed).swap(updatedCellSeed);
+    thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(updatedCellId);
     auto lastCellSeedSize{lastCellSeed.size()};
     foundSeedsTable.resize(lastCellSeedSize + 1);
     thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
@@ -1253,8 +1287,7 @@ void processNeighboursHandler(const int startLayer,
       propagator,
       matCorrType);
   }
-
-  thrust::device_vector<CellSeed> outSeeds(updatedCellSeed.size());
+  thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> outSeeds(updatedCellSeed.size(), allocCellSeed);
   auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
   auto s{end - outSeeds.begin()};
   seedsHost.reserve(seedsHost.size() + s);
@@ -1367,6 +1400,7 @@ template void processNeighboursHandler<7>(const int startLayer,
                                           gsl::span<int*> neighboursDeviceLUTs,
                                           const TrackingFrameInfo** foundTrackingFrameInfo,
                                           bounded_vector<CellSeed>& seedsHost,
+                                          o2::its::ExternalAllocator*,
                                           const float bz,
                                           const float maxChi2ClusterAttachment,
                                           const float maxChi2NDF,
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h
@@ -23,8 +23,8 @@ class ExternalAllocator
 {
  public:
   virtual void* allocate(size_t) = 0;
+  virtual void deallocate(char*, size_t) = 0;
 };
-
 } // namespace o2::its
 
 #endif
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -230,7 +230,23 @@ struct TimeFrame {
   void setBz(float bz) { mBz = bz; }
   float getBz() const { return mBz; }
 
-  virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*) { return; }
+  void setExternalAllocator(ExternalAllocator* allocator)
+  {
+    if (mIsGPU) {
+      LOGP(debug, "Setting timeFrame allocator to external");
+      mAllocator = allocator;
+      mExtAllocator = true; // to be removed
+    } else {
+      LOGP(debug, "External allocator is currently only supported for GPU");
+    }
+  }
+
+  ExternalAllocator* getExternalAllocator() { return mAllocator; }
+
+  virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*)
+  {
+    return;
+  };
   const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }
 
   template <typename... T>
@@ -277,17 +293,6 @@ struct TimeFrame {
   // State if memory will be externally managed.
   bool mExtAllocator = false;
   ExternalAllocator* mAllocator = nullptr;
-  void setExternalAllocator(ExternalAllocator* allocator)
-  {
-    if (mIsGPU) {
-      LOGP(debug, "Setting timeFrame allocator to external");
-      mAllocator = allocator;
-      mExtAllocator = true; // to be removed
-    } else {
-      LOGP(fatal, "External allocator is currently only supported for GPU");
-    }
-  }
-  void setExtAllocator(bool ext) { mExtAllocator = ext; }
   bool getExtAllocator() const { return mExtAllocator; }
 
   std::array<bounded_vector<Cluster>, nLayers> mUnsortedClusters;
diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx
@@ -30,7 +30,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
   {
     return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);
   }
-
+  void deallocate(char* ptr, size_t) {}
   void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
 
  private:

Original file line number	Diff line number	Diff line change
`@@ -23,8 +23,8 @@ class ExternalAllocator`
`23`	`23`	`{`
`24`	`24`	`public:`
`25`	`25`	`virtual void* allocate(size_t) = 0;`
	`26`	`+ virtual void deallocate(char*, size_t) = 0;`
`26`	`27`	`};`
`27`		`-`
`28`	`28`	`} // namespace o2::its`
`29`	`29`
`30`	`30`	`#endif`
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator`
`30`	`30`	`{`
`31`	`31`	`return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);`
`32`	`32`	`}`
`33`		`-`
	`33`	`+ void deallocate(char* ptr, size_t) {}`
`34`	`34`	`void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }`
`35`	`35`
`36`	`36`	`private:`