From 173ade5df983370e2512623a62e9ca756092d63b Mon Sep 17 00:00:00 2001 From: Gabriele Cimador Date: Wed, 27 Aug 2025 08:47:09 +0200 Subject: [PATCH] GPU Framework: fixed mismatch between CUDA and HIP launch bounds definitions --- .../Base/GPUReconstructionKernelMacros.h | 15 ++++++++++++ .../Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 2 +- .../Base/cuda/GPUReconstructionCUDAKernels.cu | 2 +- .../cuda/GPUReconstructionCUDARTCCalls.cu | 2 +- .../GPUReconstructionCUDAkernel.template.cu | 2 +- .../GPUReconstructionHIPkernel.template.hip | 2 +- GPU/GPUTracking/Definitions/GPUDefMacros.h | 2 ++ .../Definitions/GPUDefParametersDefaults.h | 5 ++++ .../GPUDefParametersLoad.template.inc | 24 +++++++++---------- GPU/GPUTracking/kernels.cmake | 3 ++- 10 files changed, 41 insertions(+), 18 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h index a03d9de13ef8f..cc1c62bed507d 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h +++ b/GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h @@ -32,6 +32,21 @@ #define GPUCA_M_KRNL_NAME(...) GPUCA_M_KRNL_NAME_A(GPUCA_M_STRIP(__VA_ARGS__)) #if defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_HOSTONLY) + +#if defined(__HIPCC__) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) + static_assert(GPUCA_PAR_AMD_EUS_PER_CU > 0); + #define GPUCA_MIN_WARPS_PER_EU(maxThreadsPerBlock, minBlocksPerCU) GPUCA_CEIL_INT_DIV((minBlocksPerCU) * (maxThreadsPerBlock), (GPUCA_WARP_SIZE * GPUCA_PAR_AMD_EUS_PER_CU)) + + #define GPUCA_LB_ARGS_1(maxThreadsPerBlock) maxThreadsPerBlock + #define GPUCA_LB_ARGS_2(maxThreadsPerBlock, minBlocksPerCU) maxThreadsPerBlock, GPUCA_MIN_WARPS_PER_EU(maxThreadsPerBlock, minBlocksPerCU) + + #define GPUCA_LAUNCH_BOUNDS_SELECT(n, ...) GPUCA_M_CAT(GPUCA_LB_ARGS_, n)(__VA_ARGS__) + #define GPUCA_LAUNCH_BOUNDS_DISP(...) GPUCA_LAUNCH_BOUNDS_SELECT(GPUCA_M_COUNT(__VA_ARGS__), __VA_ARGS__) + #define GPUCA_KRNL_REG_DEFAULT(args) __launch_bounds__(GPUCA_LAUNCH_BOUNDS_DISP(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))) +#elif !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) + #define GPUCA_KRNL_REG_DEFAULT(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) +#endif + #ifndef GPUCA_KRNL_REG #define GPUCA_KRNL_REG(...) #endif diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 8c3fb92c11c9e..dba7e680d0b2c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -74,7 +74,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } fclose(fp); } - const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) + + const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true, mParDevice->par_AMD_EUS_PER_CU ? (mParDevice->par_AMD_EUS_PER_CU * mWarpSize) : 0) + "#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n"; if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) { GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str()); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu index e6ed94bba2cec..e8779415f0ea4 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu @@ -74,7 +74,7 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c } #undef GPUCA_KRNL_REG -#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) +#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__) // clang-format off #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = perkernel ---------- diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu index 3e4d3113fb995..571428dc39e21 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu @@ -15,7 +15,7 @@ #define GPUCA_GPUCODE_HOSTONLY #define GPUCA_GPUCODE_NO_LAUNCH_BOUNDS -#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) +#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args)) #include "GPUReconstructionCUDAIncludesSystem.h" #include "GPUReconstructionCUDADef.h" diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu index 847011a70f7f9..82759aab48d70 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu @@ -14,7 +14,7 @@ #define GPUCA_GPUCODE_COMPILEKERNELS #include "GPUReconstructionCUDAIncludesSystem.h" -#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) +#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__) #define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__); #include "GPUReconstructionCUDADef.h" #include "GPUReconstructionKernelMacros.h" diff --git a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip index 30a84dfa135eb..7cb895cadd770 100644 --- a/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip +++ b/GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip @@ -14,7 +14,7 @@ #define GPUCA_GPUCODE_COMPILEKERNELS #include "GPUReconstructionHIPIncludesSystem.h" -#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))) +#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__) #define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__); #include "GPUReconstructionHIPDef.h" #include "GPUReconstructionKernelMacros.h" diff --git a/GPU/GPUTracking/Definitions/GPUDefMacros.h b/GPU/GPUTracking/Definitions/GPUDefMacros.h index caf2d1670f84e..ea62d7f34edb0 100644 --- a/GPU/GPUTracking/Definitions/GPUDefMacros.h +++ b/GPU/GPUTracking/Definitions/GPUDefMacros.h @@ -50,5 +50,7 @@ #define GPUCA_UNROLL(...) #endif +#define GPUCA_CEIL_INT_DIV(a, b) (((a) + (b) - 1) / (b)) + #endif // clang-format on diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 48d00b274dc9c..06f5deb11fca0 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -25,6 +25,7 @@ // GPU-architecture-dependent default settings #if defined(GPUCA_GPUTYPE_MI2xx) #define GPUCA_WARP_SIZE 64 + #define GPUCA_PAR_AMD_EUS_PER_CU 4 #define GPUCA_THREAD_COUNT_DEFAULT 256 #define GPUCA_LB_GPUTPCCreateTrackingData 256 #define GPUCA_LB_GPUTPCStartHitsSorter 512, 1 @@ -88,6 +89,7 @@ #define GPUCA_PAR_COMP_GATHER_MODE 3 #elif defined(GPUCA_GPUTYPE_VEGA) #define GPUCA_WARP_SIZE 64 + #define GPUCA_PAR_AMD_EUS_PER_CU 4 #define GPUCA_THREAD_COUNT_DEFAULT 256 #define GPUCA_LB_GPUTPCCreateTrackingData 128 #define GPUCA_LB_GPUTPCStartHitsSorter 1024, 2 @@ -276,6 +278,9 @@ #ifndef GPUCA_WARP_SIZE #define GPUCA_WARP_SIZE 32 #endif + #ifndef GPUCA_PAR_AMD_EUS_PER_CU + #define GPUCA_PAR_AMD_EUS_PER_CU 0 + #endif #ifndef GPUCA_THREAD_COUNT_DEFAULT #define GPUCA_THREAD_COUNT_DEFAULT 256 #endif diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc index ac71adc6232a6..8b7a79a9e48bf 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc +++ b/GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc @@ -39,23 +39,23 @@ static GPUDefParameters GPUDefParametersLoad() }; } -#define GPUCA_EXPORT_KERNEL_LB(name) \ - if (par.par_LB_maxThreads[i] > 0) { \ - o << "#define GPUCA_LB_" GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \ - if (par.par_LB_minBlocks[i] > 0) { \ - o << ", " << par.par_LB_minBlocks[i]; \ - } \ - if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \ - o << ", " << par.par_LB_forceBlocks[i]; \ - } \ - o << "\n"; \ - } \ +#define GPUCA_EXPORT_KERNEL_LB(name) \ + if (par.par_LB_maxThreads[i] > 0) { \ + o << "#define GPUCA_LB_" GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \ + if (par.par_LB_minBlocks[i] > 0) { \ + o << ", " << GPUCA_CEIL_INT_DIV(par.par_LB_maxThreads[i] * par.par_LB_minBlocks[i], (minBlockFactor ? minBlockFactor : par.par_LB_maxThreads[i])); \ + } \ + if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \ + o << ", " << par.par_LB_forceBlocks[i]; \ + } \ + o << "\n"; \ + } \ i++; #define GPUCA_EXPORT_KERNEL_PARAM(name) \ o << "#define GPUCA_PAR_" GPUCA_M_STR(name) " " << GPUCA_M_CAT(par.par_, name) << "\n"; -static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC) +static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC, int32_t minBlockFactor = 0) { std::stringstream o; // clang-format off int32_t i = 0; diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 7ebe631d86e92..46f8367f2c83c 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -147,7 +147,8 @@ o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP COMP_GATHER_KERNEL COMP_GATHER_MODE SORT_STARTHITS - CF_SCAN_WORKGROUP_SIZE) + CF_SCAN_WORKGROUP_SIZE + AMD_EUS_PER_CU) o2_gpu_kernel_add_string_parameter(DEDX_STORAGE_TYPE MERGER_INTERPOLATION_ERROR_TYPE)