From 4cb6ffec8ad98f2269ba43d299f0e1d54afd132e Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 30 Sep 2025 12:06:38 +0200
Subject: [PATCH 1/3] GPU: Prevent CUDA 13 from breaking the GPUbenchmark

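Some fields (e.g. `clockRate`, `memoryClockRate`, `computeMode`) are no longer
members of the `cudaDeviceProp` struct starting from CUDA 13. Query them with
`cudaDeviceGetAttribute` when `CUDART_VERSION >= 13000` and keep reading them
from the struct on older toolkits.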
---
 GPU/GPUbenchmark/cuda/Kernels.cu | 158 +++++++++++++++++--------------
 1 file changed, 89 insertions(+), 69 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index c309e7b2dbc5d..ccbd734d1dbeb 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -301,82 +301,102 @@ void printDeviceProp(int32_t deviceId)
   cudaDeviceProp props;
   GPUCHECK(cudaGetDeviceProperties(&props, deviceId));
 
-  std::cout << std::setw(w1) << "Name: " << props.name << std::endl;
-  std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl;
-  std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl;
-  std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl;
-  std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl;
-  std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor
-            << std::endl;
-  std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl;
-  std::cout << std::setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << std::endl;
-  std::cout << std::setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz"
-            << std::endl;
-  std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl;
-  std::cout << std::setw(w1) << "clockInstructionRate: " << (float)props.clockRate / 1000.0
-            << " Mhz" << std::endl;
-  std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2)
-            << bytesToGB(props.totalGlobalMem) << " GB" << std::endl;
+        int32_t clockRateKHz = 0;
+      int32_t memoryClockRateKHz = 0;
+      int32_t computeMode = 0;
+
+#if (CUDART_VERSION >= 13000)
+      GPUCHECK(cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceId));
+      GPUCHECK(cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, deviceId));
+      GPUCHECK(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, deviceId));
+#else
+      clockRateKHz = props.clockRate;
+      memoryClockRateKHz = props.memoryClockRate;
+      computeMode = props.computeMode;
+      cooperativeMultiDevice = props.cooperativeMultiDeviceLaunch;
+#endif
+      std::cout << std::setw(w1) << "Name: " << props.name << std::endl;
+      std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl;
+      std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl;
+      std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl;
+      std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl;
+      std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor
+                << std::endl;
+      std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl;
+
+      // Use the variables we populated above for the moved properties
+      std::cout << std::setw(w1) << "clockRate: " << (float)clockRateKHz / 1000.0 << " Mhz" << std::endl;
+      std::cout << std::setw(w1) << "memoryClockRate: " << (float)memoryClockRateKHz / 1000.0 << " Mhz"
+                << std::endl;
+
+      std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl;
+
+      // clockInstructionRate is just another name for clockRate in this context
+      std::cout << std::setw(w1) << "clockInstructionRate: " << (float)clockRateKHz / 1000.0
+                << " Mhz" << std::endl;
+      std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2)
+                << bytesToGB(props.totalGlobalMem) << " GB" << std::endl;
 #if !defined(__CUDACC__)
-  std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
-            << bytesToconfig(props.sharedMemPerMultiprocessor) << " config" << std::endl;
+      std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
+                << bytesToconfig(props.sharedMemPerMultiprocessor) << " config" << std::endl;
 #endif
 #if defined(__HIPCC__)
-  std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
-            << bytesToconfig(props.maxSharedMemoryPerMultiProcessor) << " config" << std::endl;
+      std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
+                << bytesToconfig(props.maxSharedMemoryPerMultiProcessor) << " config" << std::endl;
 #endif
-  std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl;
-  std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " config"
-            << std::endl;
-  std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl;
-  std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl;
-  std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl;
-  std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl;
-  std::cout << std::setw(w1) << "computeMode: " << props.computeMode << std::endl;
-  std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl;
-  std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl;
-  std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl;
-  std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl;
-  std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl;
-  std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl;
-  std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl;
-  std::cout << std::setw(w1) << "major: " << props.major << std::endl;
-  std::cout << std::setw(w1) << "minor: " << props.minor << std::endl;
-  std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl;
-  std::cout << std::setw(w1) << "cooperativeLaunch: " << props.cooperativeLaunch << std::endl;
-  std::cout << std::setw(w1) << "cooperativeMultiDeviceLaunch: " << props.cooperativeMultiDeviceLaunch << std::endl;
+      std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl;
+      std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " config"
+                << std::endl;
+      std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl;
+      std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl;
+      std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl;
+      std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl;
+
+      // Use the variable for computeMode
+      std::cout << std::setw(w1) << "computeMode: " << computeMode << std::endl;
+
+      std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl;
+      std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl;
+      std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl;
+      std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl;
+      std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl;
+      std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl;
+      std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl;
+      std::cout << std::setw(w1) << "major: " << props.major << std::endl;
+      std::cout << std::setw(w1) << "minor: " << props.minor << std::endl;
+      std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl;
 #if defined(__HIPCC__)
-  std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl;
-  std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch
-            << std::endl;
-  std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl;
-  std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch
-            << std::endl;
-  std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl;
-  std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl;
-  std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl;
-  std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl;
-  std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl;
-  std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl;
-  std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl;
-  std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl;
-  std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl;
-  std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl;
-  std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl;
-  std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl;
-  std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl;
-  std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl;
+      std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl;
+      std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch
+                << std::endl;
+      std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl;
+      std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch
+                << std::endl;
+      std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl;
+      std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl;
+      std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl;
+      std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl;
+      std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl;
+      std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl;
+      std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl;
+      std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl;
+      std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl;
+      std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl;
+      std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl;
+      std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl;
+      std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl;
+      std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl;
 #endif
-  std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl;
-  std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl;
-  std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl;
-  std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl;
-  std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl;
-  std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl;
-  std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl;
+      std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl;
+      std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl;
+      std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl;
+      std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl;
+      std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl;
+      std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl;
+      std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl;
 #if defined(__HIPCC__)
-  std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl;
-  std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl;
+      std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl;
+      std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl;
 #endif
 
   int32_t deviceCnt;

From 8a00fbb46d6cde3af211a0ac0f381ee480187d6b Mon Sep 17 00:00:00 2001
From: ALICE Builder <alibuild@users.noreply.github.com>
Date: Tue, 30 Sep 2025 12:09:50 +0200
Subject: [PATCH 2/3] Please consider the following formatting changes (#14707)

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 170 +++++++++++++++----------------
 1 file changed, 85 insertions(+), 85 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index ccbd734d1dbeb..be622b30474e5 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -301,102 +301,102 @@ void printDeviceProp(int32_t deviceId)
   cudaDeviceProp props;
   GPUCHECK(cudaGetDeviceProperties(&props, deviceId));
 
-        int32_t clockRateKHz = 0;
-      int32_t memoryClockRateKHz = 0;
-      int32_t computeMode = 0;
+  int32_t clockRateKHz = 0;
+  int32_t memoryClockRateKHz = 0;
+  int32_t computeMode = 0;
 
 #if (CUDART_VERSION >= 13000)
-      GPUCHECK(cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceId));
-      GPUCHECK(cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, deviceId));
-      GPUCHECK(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, deviceId));
+  GPUCHECK(cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceId));
+  GPUCHECK(cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, deviceId));
+  GPUCHECK(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, deviceId));
 #else
-      clockRateKHz = props.clockRate;
-      memoryClockRateKHz = props.memoryClockRate;
-      computeMode = props.computeMode;
-      cooperativeMultiDevice = props.cooperativeMultiDeviceLaunch;
+  clockRateKHz = props.clockRate;
+  memoryClockRateKHz = props.memoryClockRate;
+  computeMode = props.computeMode;
+  cooperativeMultiDevice = props.cooperativeMultiDeviceLaunch;
 #endif
-      std::cout << std::setw(w1) << "Name: " << props.name << std::endl;
-      std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl;
-      std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl;
-      std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl;
-      std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl;
-      std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor
-                << std::endl;
-      std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl;
-
-      // Use the variables we populated above for the moved properties
-      std::cout << std::setw(w1) << "clockRate: " << (float)clockRateKHz / 1000.0 << " Mhz" << std::endl;
-      std::cout << std::setw(w1) << "memoryClockRate: " << (float)memoryClockRateKHz / 1000.0 << " Mhz"
-                << std::endl;
-
-      std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl;
-
-      // clockInstructionRate is just another name for clockRate in this context
-      std::cout << std::setw(w1) << "clockInstructionRate: " << (float)clockRateKHz / 1000.0
-                << " Mhz" << std::endl;
-      std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2)
-                << bytesToGB(props.totalGlobalMem) << " GB" << std::endl;
+  std::cout << std::setw(w1) << "Name: " << props.name << std::endl;
+  std::cout << std::setw(w1) << "pciBusID: " << props.pciBusID << std::endl;
+  std::cout << std::setw(w1) << "pciDeviceID: " << props.pciDeviceID << std::endl;
+  std::cout << std::setw(w1) << "pciDomainID: " << props.pciDomainID << std::endl;
+  std::cout << std::setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << std::endl;
+  std::cout << std::setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor
+            << std::endl;
+  std::cout << std::setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << std::endl;
+
+  // Use the variables we populated above for the moved properties
+  std::cout << std::setw(w1) << "clockRate: " << (float)clockRateKHz / 1000.0 << " Mhz" << std::endl;
+  std::cout << std::setw(w1) << "memoryClockRate: " << (float)memoryClockRateKHz / 1000.0 << " Mhz"
+            << std::endl;
+
+  std::cout << std::setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << std::endl;
+
+  // clockInstructionRate is just another name for clockRate in this context
+  std::cout << std::setw(w1) << "clockInstructionRate: " << (float)clockRateKHz / 1000.0
+            << " Mhz" << std::endl;
+  std::cout << std::setw(w1) << "totalGlobalMem: " << std::fixed << std::setprecision(2)
+            << bytesToGB(props.totalGlobalMem) << " GB" << std::endl;
 #if !defined(__CUDACC__)
-      std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
-                << bytesToconfig(props.sharedMemPerMultiprocessor) << " config" << std::endl;
+  std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
+            << bytesToconfig(props.sharedMemPerMultiprocessor) << " config" << std::endl;
 #endif
 #if defined(__HIPCC__)
-      std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
-                << bytesToconfig(props.maxSharedMemoryPerMultiProcessor) << " config" << std::endl;
+  std::cout << std::setw(w1) << "maxSharedMemoryPerMultiProcessor: " << std::fixed << std::setprecision(2)
+            << bytesToconfig(props.maxSharedMemoryPerMultiProcessor) << " config" << std::endl;
 #endif
-      std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl;
-      std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " config"
-                << std::endl;
-      std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl;
-      std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl;
-      std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl;
-      std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl;
-
-      // Use the variable for computeMode
-      std::cout << std::setw(w1) << "computeMode: " << computeMode << std::endl;
-
-      std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl;
-      std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl;
-      std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl;
-      std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl;
-      std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl;
-      std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl;
-      std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl;
-      std::cout << std::setw(w1) << "major: " << props.major << std::endl;
-      std::cout << std::setw(w1) << "minor: " << props.minor << std::endl;
-      std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl;
+  std::cout << std::setw(w1) << "totalConstMem: " << props.totalConstMem << std::endl;
+  std::cout << std::setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " config"
+            << std::endl;
+  std::cout << std::setw(w1) << "canMapHostMemory: " << props.canMapHostMemory << std::endl;
+  std::cout << std::setw(w1) << "regsPerBlock: " << props.regsPerBlock << std::endl;
+  std::cout << std::setw(w1) << "warpSize: " << props.warpSize << std::endl;
+  std::cout << std::setw(w1) << "l2CacheSize: " << props.l2CacheSize << std::endl;
+
+  // Use the variable for computeMode
+  std::cout << std::setw(w1) << "computeMode: " << computeMode << std::endl;
+
+  std::cout << std::setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << std::endl;
+  std::cout << std::setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << std::endl;
+  std::cout << std::setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << std::endl;
+  std::cout << std::setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << std::endl;
+  std::cout << std::setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << std::endl;
+  std::cout << std::setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << std::endl;
+  std::cout << std::setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << std::endl;
+  std::cout << std::setw(w1) << "major: " << props.major << std::endl;
+  std::cout << std::setw(w1) << "minor: " << props.minor << std::endl;
+  std::cout << std::setw(w1) << "concurrentKernels: " << props.concurrentKernels << std::endl;
 #if defined(__HIPCC__)
-      std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl;
-      std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch
-                << std::endl;
-      std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl;
-      std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch
-                << std::endl;
-      std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl;
-      std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl;
-      std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl;
-      std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl;
-      std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl;
-      std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl;
-      std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl;
-      std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl;
-      std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl;
-      std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl;
-      std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl;
-      std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl;
-      std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl;
-      std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl;
+  std::cout << std::setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << std::endl;
+  std::cout << std::setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch
+            << std::endl;
+  std::cout << std::setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << std::endl;
+  std::cout << std::setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch
+            << std::endl;
+  std::cout << std::setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << std::endl;
+  std::cout << std::setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << std::endl;
+  std::cout << std::setw(w1) << "arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << std::endl;
+  std::cout << std::setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << std::endl;
+  std::cout << std::setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << std::endl;
+  std::cout << std::setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << std::endl;
+  std::cout << std::setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << std::endl;
+  std::cout << std::setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << std::endl;
+  std::cout << std::setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << std::endl;
+  std::cout << std::setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << std::endl;
+  std::cout << std::setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << std::endl;
+  std::cout << std::setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << std::endl;
+  std::cout << std::setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << std::endl;
+  std::cout << std::setw(w1) << "gcnArchName: " << props.gcnArchName << std::endl;
 #endif
-      std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl;
-      std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl;
-      std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl;
-      std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl;
-      std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl;
-      std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl;
-      std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl;
+  std::cout << std::setw(w1) << "isIntegrated: " << props.integrated << std::endl;
+  std::cout << std::setw(w1) << "maxTexture1D: " << props.maxTexture1D << std::endl;
+  std::cout << std::setw(w1) << "maxTexture2D.width: " << props.maxTexture2D[0] << std::endl;
+  std::cout << std::setw(w1) << "maxTexture2D.height: " << props.maxTexture2D[1] << std::endl;
+  std::cout << std::setw(w1) << "maxTexture3D.width: " << props.maxTexture3D[0] << std::endl;
+  std::cout << std::setw(w1) << "maxTexture3D.height: " << props.maxTexture3D[1] << std::endl;
+  std::cout << std::setw(w1) << "maxTexture3D.depth: " << props.maxTexture3D[2] << std::endl;
 #if defined(__HIPCC__)
-      std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl;
-      std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl;
+  std::cout << std::setw(w1) << "isLargeBar: " << props.isLargeBar << std::endl;
+  std::cout << std::setw(w1) << "asicRevision: " << props.asicRevision << std::endl;
 #endif
 
   int32_t deviceCnt;

From cf6ce9ab3817ae31cc25e8c82915f7761f2bac4f Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 30 Sep 2025 12:11:29 +0200
Subject: [PATCH 3/3] Add cooperativeMultiDevice variable in Kernels.cu

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index be622b30474e5..16dc138ae466f 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -304,6 +304,7 @@ void printDeviceProp(int32_t deviceId)
   int32_t clockRateKHz = 0;
   int32_t memoryClockRateKHz = 0;
   int32_t computeMode = 0;
+  int32_t cooperativeMultiDevice = 0;
 
 #if (CUDART_VERSION >= 13000)
   GPUCHECK(cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceId));