2828#include "ITStracking/Constants.h"
2929#include "ITStracking/IndexTableUtils.h"
3030#include "ITStracking/MathUtils.h"
31+ #include "ITStracking/ExternalAllocator.h"
3132#include "DataFormatsITS/TrackITS.h"
3233#include "ReconstructionDataFormats/Vertex.h"
3334
3435#include "ITStrackingGPU/TrackerTraitsGPU.h"
3536#include "ITStrackingGPU/TrackingKernels.h"
3637#include "ITStrackingGPU/Utils.h"
3738
38- #include "GPUCommonHelpers.h"
39-
4039#ifndef __HIPCC__
4140#define THRUST_NAMESPACE thrust::cuda
4241#else
@@ -64,6 +63,37 @@ GPUdii() float Sq(float v)
6463namespace gpu
6564{
6665
66+ template <typename T>
67+ class TypedAllocator : public thrust::device_allocator<T>
68+ {
69+ public:
70+ using value_type = T;
71+ using pointer = T*;
72+
73+ template <typename U>
74+ struct rebind {
75+ using other = TypedAllocator<U>;
76+ };
77+
78+ explicit TypedAllocator(ExternalAllocator* allocPtr)
79+ : mInternalAllocator(allocPtr) {}
80+
81+ T* allocate(size_t n)
82+ {
83+ return reinterpret_cast<T*>(mInternalAllocator->allocate(n * sizeof(T)));
84+ }
85+
86+ void deallocate(T* p, size_t n)
87+ {
88+ char* raw_ptr = reinterpret_cast<char*>(p);
89+ size_t bytes = n * sizeof(T);
90+ mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op.
91+ }
92+
93+ private:
94+ ExternalAllocator* mInternalAllocator;
95+ };
96+
6797GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
6898 const o2::its::IndexTableUtils& utils,
6999 const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
@@ -1117,7 +1147,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
11171147
11181148int filterCellNeighboursHandler(gpuPair<int, int>* cellNeighbourPairs,
11191149 int* cellNeighbours,
1120- unsigned int nNeigh)
1150+ unsigned int nNeigh,
1151+ o2::its::ExternalAllocator* allocator)
11211152{
11221153 thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
11231154 thrust::device_ptr<int> validNeighs(cellNeighbours);
@@ -1140,6 +1171,7 @@ void processNeighboursHandler(const int startLayer,
11401171 gsl::span<int*> neighboursDeviceLUTs,
11411172 const TrackingFrameInfo** foundTrackingFrameInfo,
11421173 bounded_vector<CellSeed>& seedsHost,
1174+ o2::its::ExternalAllocator* allocator,
11431175 const float bz,
11441176 const float maxChi2ClusterAttachment,
11451177 const float maxChi2NDF,
@@ -1148,8 +1180,10 @@ void processNeighboursHandler(const int startLayer,
11481180 const int nBlocks,
11491181 const int nThreads)
11501182{
1151- thrust::device_vector<int> foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
1152- // TODO: fix this.
1183+ auto allocInt = gpu::TypedAllocator<int>(allocator);
1184+ auto allocCellSeed = gpu::TypedAllocator<CellSeed>(allocator);
1185+ thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
1186+ // TODO: fix this.
11531187
11541188 gpu::processNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
11551189 o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
@@ -1172,8 +1206,8 @@ void processNeighboursHandler(const int startLayer,
11721206 matCorrType);
11731207 gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1);
11741208
1175- thrust::device_vector<int> updatedCellId(foundSeedsTable.back());
1176- thrust::device_vector<CellSeed> updatedCellSeed(foundSeedsTable.back());
1209+ thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt );
1210+ thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed );
11771211 gpu::processNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
11781212 o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
11791213 startLayer,
@@ -1195,13 +1229,13 @@ void processNeighboursHandler(const int startLayer,
11951229 matCorrType);
11961230
11971231 int level = startLevel;
1198- thrust::device_vector<int> lastCellId;
1199- thrust::device_vector<CellSeed> lastCellSeed;
1232+ thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt) ;
1233+ thrust::device_vector<CellSeed,gpu::TypedAllocator<CellSeed>> lastCellSeed(allocCellSeed) ;
12001234 for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
12011235 lastCellSeed.swap(updatedCellSeed);
12021236 lastCellId.swap(updatedCellId);
1203- thrust::device_vector<CellSeed>( ).swap(updatedCellSeed);
1204- thrust::device_vector<int>( ).swap(updatedCellId);
1237+ thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>>(allocCellSeed ).swap(updatedCellSeed);
1238+ thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt ).swap(updatedCellId);
12051239 auto lastCellSeedSize{lastCellSeed.size()};
12061240 foundSeedsTable.resize(lastCellSeedSize + 1);
12071241 thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
@@ -1253,8 +1287,7 @@ void processNeighboursHandler(const int startLayer,
12531287 propagator,
12541288 matCorrType);
12551289 }
1256-
1257- thrust::device_vector<CellSeed> outSeeds(updatedCellSeed.size());
1290+ thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> outSeeds(updatedCellSeed.size(), allocCellSeed);
12581291 auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
12591292 auto s{end - outSeeds.begin()};
12601293 seedsHost.reserve(seedsHost.size() + s);
@@ -1367,6 +1400,7 @@ template void processNeighboursHandler<7>(const int startLayer,
13671400 gsl::span<int*> neighboursDeviceLUTs,
13681401 const TrackingFrameInfo** foundTrackingFrameInfo,
13691402 bounded_vector<CellSeed>& seedsHost,
1403+ o2::its::ExternalAllocator*,
13701404 const float bz,
13711405 const float maxChi2ClusterAttachment,
13721406 const float maxChi2NDF,
0 commit comments