From 6c326eb72ee782d42dc66cbce98c5a695c896401 Mon Sep 17 00:00:00 2001 From: Gabriele Cimador Date: Fri, 22 Aug 2025 16:30:49 +0200 Subject: [PATCH 1/2] ITS: GPU: added launch bounds for ITS kernels, not fully optimised for MI50 --- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 14 ++++++++------ .../include/ITStracking/TrackingConfigParam.h | 10 +++++----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 94c6610ab9430..ab59e27993436 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -311,7 +311,7 @@ GPUdii() gpuSpan getClustersOnLayer(const int rof, } template -GPUg() void fitTrackSeedsKernel( +GPUg() void __launch_bounds__(256, 1) fitTrackSeedsKernel( CellSeed* trackSeeds, const TrackingFrameInfo** foundTrackingFrameInfo, o2::its::TrackITSExt* tracks, @@ -374,7 +374,7 @@ GPUg() void fitTrackSeedsKernel( } template -GPUg() void computeLayerCellNeighboursKernel( +GPUg() void __launch_bounds__(256, 1) computeLayerCellNeighboursKernel( CellSeed** cellSeedArray, int* neighboursLUT, int* neighboursIndexTable, @@ -438,7 +438,7 @@ GPUg() void computeLayerCellNeighboursKernel( } template -GPUg() void computeLayerCellsKernel( +GPUg() void __launch_bounds__(256, 1) computeLayerCellsKernel( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, @@ -525,7 +525,7 @@ GPUg() void computeLayerCellsKernel( } template -GPUg() void computeLayerTrackletsMultiROFKernel( +GPUg() void __launch_bounds__(256, 1) computeLayerTrackletsMultiROFKernel( const IndexTableUtils* utils, const uint8_t* multMask, const int layerIndex, @@ -652,7 +652,8 @@ GPUg() void computeLayerTrackletsMultiROFKernel( } } -GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, +GPUg() void __launch_bounds__(256, 1) compileTrackletsLookupTableKernel( + const Tracklet* tracklets, int* trackletsLookUpTable, const int nTracklets) { @@ -662,7 +663,8 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, } template -GPUg() void processNeighboursKernel(const int layer, +GPUg() void __launch_bounds__(256, 1) processNeighboursKernel( + const int layer, const int level, CellSeed** allCellSeeds, CellSeed* currentCellSeeds, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h index e8d3692909d05..4c445bdbbfa16 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h @@ -121,19 +121,19 @@ struct ITSGpuTrackingParamConfig : public o2::conf::ConfigurableParamHelper Date: Tue, 2 Sep 2025 09:02:32 +0000 Subject: [PATCH 2/2] Please consider the following formatting changes --- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index ab59e27993436..0e5fd21e5e90e 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -653,9 +653,9 @@ GPUg() void __launch_bounds__(256, 1) computeLayerTrackletsMultiROFKernel( } GPUg() void __launch_bounds__(256, 1) compileTrackletsLookupTableKernel( - const Tracklet* tracklets, - int* trackletsLookUpTable, - const int nTracklets) + const Tracklet* tracklets, + int* trackletsLookUpTable, + const int nTracklets) { for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); @@ -664,23 +664,23 @@ GPUg() void __launch_bounds__(256, 1) compileTrackletsLookupTableKernel( template GPUg() void __launch_bounds__(256, 1) processNeighboursKernel( - const int layer, - const int level, - CellSeed** allCellSeeds, - CellSeed* currentCellSeeds, - const int* currentCellIds, - const unsigned int nCurrentCells, - CellSeed* updatedCellSeeds, - int* updatedCellsIds, - int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration - const unsigned char** usedClusters, // Used clusters - int* neighbours, - int* neighboursLUT, - const TrackingFrameInfo** foundTrackingFrameInfo, - const float bz, - const float maxChi2ClusterAttachment, - const o2::base::Propagator* propagator, - const o2::base::PropagatorF::MatCorrType matCorrType) + const int layer, + const int level, + CellSeed** allCellSeeds, + CellSeed* currentCellSeeds, + const int* currentCellIds, + const unsigned int nCurrentCells, + CellSeed* updatedCellSeeds, + int* updatedCellsIds, + int* foundSeedsTable, // auxiliary only in GPU code to compute the number of cells per iteration + const unsigned char** usedClusters, // Used clusters + int* neighbours, + int* neighboursLUT, + const TrackingFrameInfo** foundTrackingFrameInfo, + const float bz, + const float maxChi2ClusterAttachment, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType) { constexpr float layerxX0[7] = {5.e-3f, 5.e-3f, 5.e-3f, 1.e-2f, 1.e-2f, 1.e-2f, 1.e-2f}; // Hardcoded here for the moment. for (unsigned int iCurrentCell = blockIdx.x * blockDim.x + threadIdx.x; iCurrentCell < nCurrentCells; iCurrentCell += blockDim.x * gridDim.x) {