From 499857616ee904b353187a99b03b15e3f2dabbe3 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Fri, 5 Sep 2025 20:44:57 +0200
Subject: [PATCH 01/11] Adding verbosity and fixing off-by-one error

---
 .../Global/GPUChainTrackingClusterizer.cxx    | 49 +++++++++++++++++--
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 49 +++++++++++++++++++
 .../GPUTPCNNClusterizerKernels.cxx            |  1 +
 3 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index c92049b040c46..8b15a8a06c276 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -665,7 +665,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       nnTimers[11] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
     }
-    mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
+    for (int32_t lane = 0; lane < numLanes; lane++) {
       nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
       if (nnApplications[lane].mModelsUsed[0]) {
         SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
@@ -706,10 +706,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
         (nnApplications[lane].mModelReg2).initSession();
       }
-      if (nn_settings.nnClusterizerVerbosity < 3) {
+      if (nn_settings.nnClusterizerVerbosity > 0) {
         LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
       }
-    });
+    };
     for (int32_t sector = 0; sector < NSECTORS; sector++) {
       GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
       GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
@@ -724,12 +724,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
         nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
       }
+      if (nn_settings.nnClusterizerVerbosity > 2) {
+        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+      }
       AllocateRegisteredMemory(clustererNN.mMemoryId);
+      if (nn_settings.nnClusterizerVerbosity > 2) {
+        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+      }
       // nnApplications[lane].createBoundary(clustererNNShadow);
       // nnApplications[lane].createIndexLookup(clustererNNShadow);
     }
     if (doGPU) {
+      if (nn_settings.nnClusterizerVerbosity > 2) {
+        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
+      }
       WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
+      if (nn_settings.nnClusterizerVerbosity > 2) {
+        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
+      }
     }
   }
 #endif
@@ -1010,9 +1022,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
         // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
+        if (nn_settings.nnClusterizerVerbosity > 2) {
+          LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << ", numLoops=" << std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode) << ", numClusters=" << clusterer.mPmemory->counters.nClusters << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+        }
         for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
+          if (nn_settings.nnClusterizerVerbosity > 3) {
+            LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+          }
           uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
-          size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
+          size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart - 1));
 
           // Filling the data
           if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
@@ -1022,9 +1040,18 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
             runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
           }
+          if (doGPU) { // This is to make sure that the network does not start the evaluation before all data is filled
+            SynchronizeStream(lane);
+          }
+          if (nn_settings.nnClusterizerVerbosity > 3) {
+            LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+          }
           if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
             runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
+            if (nn_settings.nnClusterizerVerbosity > 3) {
+              LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+            }
           }
 
           // NN evaluations
@@ -1044,6 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             }
           }
           if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
+          if (nn_settings.nnClusterizerVerbosity > 3) {
+            LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+          }
         }
         if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
           if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
@@ -1078,6 +1108,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             }
             if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
           }
+          if (nn_settings.nnClusterizerVerbosity > 3) {
+            LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+          }
         }
 
         // Publishing kernels for class labels and regression results
@@ -1092,6 +1125,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
           }
         }
+        if (nn_settings.nnClusterizerVerbosity > 3) {
+          LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+        }
       }
 
       if (clustererNNShadow.mNnClusterizerUseCfRegression) {
@@ -1100,6 +1136,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
         DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
         runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+        if (nn_settings.nnClusterizerVerbosity > 3) {
+          LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+        }
       }
 #else
       GPUFatal("Project not compiled with neural network clusterization. Aborting.");
@@ -1202,7 +1241,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   }
   for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
 #ifdef GPUCA_HAS_ONNX
-    if (GetProcessingSettings().nn.applyNNclusterizer) {
+    if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
       LOG(info) << "(ORT) Environment releasing...";
       GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
       nnApplication.mModelClass.release(true);

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 3dd8b0d621a56..2d09ff1bb80c0 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -16,6 +16,11 @@
 #include "ML/3rdparty/GPUORTFloat16.h"
 #include "GPUTPCNNClusterizer.h"
 #include "GPUSettings.h"
+#include "GPULogging.h"
+#include // uintptr_t
+#include // setprecision
+#include
+#include
 
 using namespace o2::gpu;
 
@@ -25,6 +30,8 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {}
 
 void* GPUTPCNNClusterizer::setIOPointers(void* mem)
 {
+  // Keep track of the start address to compute how much memory we assign
+  void* startMem = mem;
   if (mNnClusterizerBatchedMode > 0) {
     if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) {
       computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
@@ -62,6 +69,48 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
   if (mNnClusterizerTotalClusters > 0) {
     computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters);
   }
+
+  if (mNnClusterizerVerbosity > 2) {
+    if (mNnClusterizerVerbosity > 3) {
+      auto fmt = [](size_t bytes) {
+        std::ostringstream os;
+        double mb = bytes / (1024.0 * 1024.0);
+        os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
+        return os.str();
+      };
+
+      // Safely compute sizes only if corresponding pointer was allocated (and dimensions positive)
+      size_t szClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode * sizeof(int8_t) : 0;
+      size_t szInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(OrtDataType::Float16_t) : 0;
+      size_t szInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(float) : 0;
+      size_t szProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
+      size_t szProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(float) : 0;
+      size_t szReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
+      size_t szReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
+      size_t szReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(float) : 0;
+      size_t szReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(float) : 0;
+      size_t szOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters * sizeof(int32_t) : 0;
+
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataClass pointer: " << mOutputDataClass << " | " << fmt(szOutputDataClass) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast(mClusterFlags) << " | " << fmt(szClusterFlags) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16 << " | " << fmt(szInput16) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16 << " | " << fmt(szProb16) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16 << " | " << fmt(szReg1_16) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_16 pointer: " << mOutputDataReg2_16 << " | " << fmt(szReg2_16) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32 << " | " << fmt(szInput32) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32 << " | " << fmt(szProb32) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32 << " | " << fmt(szReg1_32) << " MB";
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_32 pointer: " << mOutputDataReg2_32 << " | " << fmt(szReg2_32) << " MB";
+    }
+    // Compute allocated bytes (difference between advanced pointer and start pointer)
+    size_t allocatedBytes = static_cast(reinterpret_cast(mem) - reinterpret_cast(startMem));
+    double allocatedMB = static_cast(allocatedBytes) / (1024.0 * 1024.0);
+    LOG(info) << std::fixed << std::setprecision(3)
+              << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Total scratch allocation in setIOPointers: " << allocatedBytes
+              << " bytes (" << allocatedMB << " MB)";
+  }
+
   return mem;
 }

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 1882acd2a45c6..0e9ffaa8caeb0 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -360,6 +360,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread UpdateClusterError2ByState
   bool notSinglePad = false, notSingleTime = false;
   for (uint16_t i = 0; i < 8; i++) {
     Delta2 d = cfconsts::InnerNeighbors[i];

From 5368d52b3e6e24cfe168912d6b40d924af091b40 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Fri, 5 Sep 2025 22:46:54 +0200
Subject: [PATCH 02/11] removing unnecessary include, using GPUCommonLogger to
 fix CI build

---
 GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx   | 1 -
 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +-----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 8b15a8a06c276..f73069b9290a8 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -43,7 +43,6 @@
 #include "DataFormatsTPC/Digit.h"
 #include "DataFormatsTPC/Constants.h"
 #include "TPCBase/RDHUtils.h"
-#include "GPULogging.h"
 
 #ifdef GPUCA_HAS_ONNX
 #include "GPUTPCNNClusterizerKernels.h"

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 2d09ff1bb80c0..e7e0a3c6d52cf 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -16,11 +16,7 @@
 #include "ML/3rdparty/GPUORTFloat16.h"
 #include "GPUTPCNNClusterizer.h"
 #include "GPUSettings.h"
-#include "GPULogging.h"
-#include // uintptr_t
-#include // setprecision
-#include
-#include
+#include "GPUCommonLogger.h"
 
 using namespace o2::gpu;

From d87f8fae212be5266694045c92001ebd93407615 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Sat, 6 Sep 2025 14:09:23 +0200
Subject: [PATCH 03/11] GetGrid spawns more threads than actual number -> Most
 probably explains out-of-bounds accesses and memory faults

---
 Common/ML/src/OrtInterface.cxx                |  2 +-
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  4 +-
 .../Global/GPUChainTrackingClusterizer.cxx    | 13 ++-
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 82 +++++++++++++------
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  2 +-
 .../GPUTPCNNClusterizerKernels.cxx            | 69 ++++++++++++----
 6 files changed, 119 insertions(+), 53 deletions(-)

diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index 58d80eb9c0bf0..d30d05d1d1a00 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -54,7 +54,7 @@ void OrtModel::initOptions(std::unordered_map optionsM
 
   // Load from options map
   if (!optionsMap.contains("model-path")) {
-    LOG(fatal) << "(ORT) Model path cannot be empty!";
+    LOG(fatal) << "(ORT) Model path must be contained in options map!";
   }
 
   if (!optionsMap["model-path"].empty()) {

diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 65697b7f7c08b..6419d63bb7ada 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -259,7 +259,7 @@ AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables gr
 AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime were possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
 AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
 AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
-AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
+AddOption(nnInferenceVerbosity, int, 2, "", 0, "0: All debugs; 1: Warnings + major debugs; 2: Warnings; >=3: No messages")
 AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
 AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
 AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
 AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
 AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
 AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
 AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
-AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
+AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed. Higher number = higher verbosity")
 AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
 AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
 AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF")

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index f73069b9290a8..04f91b58e677a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -708,7 +708,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     if (nn_settings.nnClusterizerVerbosity > 0) {
       LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
     }
-  };
+  }
   for (int32_t sector = 0; sector < NSECTORS; sector++) {
     GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
     GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
@@ -1029,7 +1029,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
     }
     uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
-    size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart - 1));
+    size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
 
     // Filling the data
     if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
@@ -1039,9 +1039,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
       runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
     }
-    if (doGPU) { // This is to make sure that the network does not start the evaluation before all data is filled
-      SynchronizeStream(lane);
-    }
     if (nn_settings.nnClusterizerVerbosity > 3) {
       LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
     }
@@ -1240,8 +1237,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   }
   for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
 #ifdef GPUCA_HAS_ONNX
-    if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
-      LOG(info) << "(ORT) Environment releasing...";
+    if (GetProcessingSettings().nn.applyNNclusterizer) {
+      if (GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
+        LOG(info) << "(ORT) Environment releasing...";
+      }
       GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
       nnApplication.mModelClass.release(true);
       nnApplication.mModelReg1.release(true);

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index e7e0a3c6d52cf..48596bed44c46 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -69,35 +69,67 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
   if (mNnClusterizerVerbosity > 2) {
     if (mNnClusterizerVerbosity > 3) {
       auto fmt = [](size_t bytes) {
-        std::ostringstream os;
-        double mb = bytes / (1024.0 * 1024.0);
-        os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
-        return os.str();
+        std::ostringstream os;
+        double mb = bytes / (1024.0 * 1024.0);
+        os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
+        return os.str();
       };
 
-      // Safely compute sizes only if corresponding pointer was allocated (and dimensions positive)
-      size_t szClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode * sizeof(int8_t) : 0;
-      size_t szInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(OrtDataType::Float16_t) : 0;
-      size_t szInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(float) : 0;
-      size_t szProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
-      size_t szProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(float) : 0;
-      size_t szReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
-      size_t szReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
-      size_t szReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(float) : 0;
-      size_t szReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(float) : 0;
-      size_t szOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters * sizeof(int32_t) : 0;
+      // Element counts (number of array entries, not bytes)
+      size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0;
+      size_t elemsInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
+      size_t elemsInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
+      size_t elemsProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
+      size_t elemsProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
+      size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
+      size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
+      size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
+      size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
+      size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0;
+
+      // Byte sizes
+      size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t);
+      size_t szInput16 = elemsInput16 * sizeof(OrtDataType::Float16_t);
+      size_t szInput32 = elemsInput32 * sizeof(float);
+      size_t szProb16 = elemsProb16 * sizeof(OrtDataType::Float16_t);
+      size_t szProb32 = elemsProb32 * sizeof(float);
+      size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t);
+      size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t);
+      size_t szReg1_32 = elemsReg1_32 * sizeof(float);
+      size_t szReg2_32 = elemsReg2_32 * sizeof(float);
+      size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t);
 
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
-      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataClass pointer: " << mOutputDataClass
+                << " | elements=" << elemsOutputDataClass << " (= mNnClusterizerTotalClusters)"
+                << " | " << fmt(szOutputDataClass);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast(mClusterFlags)
+                << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)"
+                << " | " << fmt(szClusterFlags);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16
+                << " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16
+                << " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16
+                << " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
+                << " | " << fmt(szReg1_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_16 pointer: " << mOutputDataReg2_16
+                << " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
+                << " | " << fmt(szReg2_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32
+                << " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32
+                << " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32
+                << " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
+                << " | " << fmt(szReg1_32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_32 pointer: " << mOutputDataReg2_32
+                << " | elements=" << elemsReg2_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
+                << " | " << fmt(szReg2_32);
     }
     // Compute allocated bytes (difference between advanced pointer and start pointer)
     size_t allocatedBytes = static_cast(reinterpret_cast(mem) - reinterpret_cast(startMem));

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index 69972c8a0651c..a6b0b081fc3dd 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -50,7 +50,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int32_t mNnClusterizerUseCfRegression = 0;
   int32_t mNnClusterizerBatchedMode = 1;
   int32_t mNnClusterizerTotalClusters = 1;
-  int32_t mNnClusterizerVerbosity = 0;
+  int32_t mNnClusterizerVerbosity = 1;
   int32_t mNnClusterizerBoundaryFillValue = -1;
   int32_t mNnClusterizerModelClassNumOutputNodes = -1;
   int32_t mNnClusterizerModelReg1NumOutputNodes = -1;

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 0e9ffaa8caeb0..89f7cc6143fbc 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) {
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
@@ -145,11 +145,13 @@
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
-  auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
 
-  // Optimized division using bit operations
+  if (glo_idx >= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) {
+    return;
+  }
+
+  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
   uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);
@@ -249,6 +251,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) {
+      //   printf("Error: Write index out of bounds (central array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
+      //          write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
+      // }
+      // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) {
+      //   if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) {
+      //     printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
+      //            write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
+      //   }
+      //   if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) {
+      //     printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
+      //            write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
+      //   }
+      // }
+
       write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
     }
   }
@@ -258,18 +275,27 @@
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
+  auto& clusterer = processors.tpcClusterer[sector];
+  auto& clustererNN = processors.tpcNNClusterer[sector];
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+    return;
+  }
   if (dtype == 0) {
-    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
+    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int32_t)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
   } else if (dtype == 1) {
-    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].mModelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].mNnClassThreshold);
+    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int32_t)(processors.tpcNNClusterer[sector].mModelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].mNnClassThreshold);
   }
 }
 
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  auto& clustererNN = processors.tpcNNClusterer[sector];
   uint32_t glo_idx = get_global_id(0);
+  auto& clusterer = processors.tpcClusterer[sector];
+  auto& clustererNN = processors.tpcNNClusterer[sector];
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+    return;
+  }
   uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
   float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
   uint32_t class_label = 0;
@@ -302,6 +328,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode) {
+    return;
+  }
   uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
   uint32_t full_glo_idx = glo_idx + batchStart;
@@ -426,6 +455,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode) {
+    return;
+  }
   uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
   CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap));
@@ -580,27 +612,30 @@
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
 {
   // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel
-  uint32_t idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+    return;
+  }
   CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap));
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
 
-  clustererNN.mClusterFlags[2 * idx] = 0;
-  clustererNN.mClusterFlags[2 * idx + 1] = 0;
+  clustererNN.mClusterFlags[2 * glo_idx] = 0;
+  clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
   for (int i = 0; i < 8; i++) {
     Delta2 d = cfconsts::InnerNeighbors[i];
     CfChargePos tmp_pos = peak.delta(d);
     PackedCharge charge = chargeMap[tmp_pos];
-    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit());
-    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit());
+    clustererNN.mClusterFlags[2 * glo_idx] += (d.y != 0 && charge.isSplit());
+    clustererNN.mClusterFlags[2 * glo_idx + 1] += (d.x != 0 && charge.isSplit());
   }
   for (int i = 0; i < 16; i++) {
     Delta2 d = cfconsts::OuterNeighbors[i];
     CfChargePos tmp_pos = peak.delta(d);
     PackedCharge charge = chargeMap[tmp_pos];
-    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
-    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
+    clustererNN.mClusterFlags[2 * glo_idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
+    clustererNN.mClusterFlags[2 * glo_idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
   }
 }
@@ -608,7 +643,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= o2::tpc::constants::MAXGLOBALPADROW) {
-    return 0; // Short-circuit for negative rows
+    return 0; // Short-circuit for out-of-bound rows
   } else {
     return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
   }
@@ -624,11 +659,11 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int
   if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
   } else if (row < 63) {
-    return ((pad < 0) || (pad >= static_cast(GPUTPCGeometry::NPads(row))));
+    return (pad >= static_cast(GPUTPCGeometry::NPads(row)));
   } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
     return true;
   } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) {
-    return ((pad < 0) || (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))));
+    return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset)));
   } else {
     return true;
   }

From d3f439e3fcbabf5175026f1d332ea52fbbd979ff Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Sat, 6 Sep 2025 23:17:28 +0200
Subject: [PATCH 04/11] Fixing smem usage from CFClusterizer and adding
 rejection flag -> No out-of-bounds in QC anymore

---
 .../Global/GPUChainTrackingClusterizer.cxx    |  10 +-
 .../TPCClusterFinder/GPUTPCCFClusterizer.cxx  |   2 +-
 .../TPCClusterFinder/GPUTPCCFClusterizer.h    |   2 +-
 .../TPCClusterFinder/GPUTPCCFClusterizer.inc  |   6 +-
 .../GPUTPCNNClusterizerKernels.cxx            | 362 +++++++++---------
 .../GPUTPCNNClusterizerKernels.h              |  13 +-
 6 files changed, 188 insertions(+), 207 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 04f91b58e677a..5e1a67af10994 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1110,10 +1110,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
 
         // Publishing kernels for class labels and regression results
-        if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
-          runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
-        } else {
-          runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
+        if(clustererNNShadow.mNnClusterizerUseClassification) {
+          if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
+            runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
+          } else {
+            runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
+          }
         }
         if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
           runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 1 regression results

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx
index c9c6b157499f2..49ee5957b8b36 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx
@@ -35,5 +35,5 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
 
-  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, true);
 }

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h
index 466d13d3254de..70e21db81756c 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h
@@ -57,7 +57,7 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate
   template
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t);
 
-  static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*);
+  static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*, int8_t);
 
   static GPUd() void buildCluster(const GPUSettingsRec&, const CfArray2D&, CfChargePos, CfChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc
index e32abbf37584f..c2c104809990e 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc
@@ -27,7 +27,8 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t
                                                        uint32_t maxClusterPerRow,
                                                        uint32_t* clusterInRow,
                                                        tpc::ClusterNative* clusterByRow,
-                                                       uint32_t* clusterPosInRow)
+                                                       uint32_t* clusterPosInRow,
+                                                       int8_t isAccepted)
 {
   uint32_t idx = get_global_id(0);
@@ -62,6 +63,9 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t
   tpc::ClusterNative myCluster;
   pc.finalize(pos, charge, fragment.start);
   bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), chargeMap);
+  if (!isAccepted) {
+    rejectCluster = true;
+  }
 
   if (rejectCluster) {
     if (clusterPosInRow) {

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 89f7cc6143fbc..1d13df440b3a4 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -12,7 +12,11 @@
 /// \file GPUTPCNNClusterizerKernels.cxx
 /// \author Christian Sonnabend
 
+#include "clusterFinderDefs.h"
+#include "PackedCharge.h"
 #include "GPUTPCNNClusterizerKernels.h"
+#include "GPUConstantMem.h"
+#include "GPUTPCClusterFinder.h"
 #include "GPUTPCCFClusterizer.h"
 #include "GPUTPCGeometry.h"
@@ -40,14 +44,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
-  o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new;
-  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+  int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] : 1);
+  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
 }
 
 template <>
@@ -95,12 +96,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[tmp_pos].unpack()) / central_charge;
-
-        if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) {
-          clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
-          clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
-        }
-
         if (dtype == 0) {
           clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
         } else {
@@ -187,7 +182,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcollect(peak, central_charge));
-    GPUTPCCFClusterizer::buildCluster(
-      clusterer.Param().rec,
-      chargeMap,
-      peak,
-      smem.posBcast,
-      smem.buf,
-      smem.innerAboveThreshold,
-      &dummy_pc,
-      labelAcc);
-  }
-  if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
-    if (clusterer.mPclusterPosInRow) {
-      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
-    }
-    return;
-  }
-
-  // For flag influence on cluster error setting: O2/GPU/GPUTracking/Base/GPUParam.inc -> UpdateClusterError2ByState
-  bool notSinglePad = false, notSingleTime = false;
-  for (uint16_t i = 0; i < 8; i++) {
-    Delta2 d = cfconsts::InnerNeighbors[i];
-    CfChargePos tmp_pos = peak.delta(d);
-    notSinglePad |= (d.x != 0) && (static_cast(chargeMap[tmp_pos].unpack()) > 0);
-    notSingleTime |= (d.y != 0) && (static_cast(chargeMap[tmp_pos].unpack()) > 0);
+  ClusterAccumulator pc;
+
+  // Publishing logic is taken from default clusterizer
+  if (withMC) {
+    ClusterAccumulator dummy_pc;
+    CPU_ONLY(labelAcc->collect(peak, central_charge));
+    GPUTPCCFClusterizer::buildCluster(
+      clusterer.Param().rec,
+      chargeMap,
+      peak,
+      smem.posBcast,
+      smem.buf,
+      smem.innerAboveThreshold,
+      &dummy_pc,
+      labelAcc);
+  }
+  if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
+    if (clusterer.mPclusterPosInRow) {
+      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
     }
+    return;
+  }
 
+  // For flag influence on cluster error setting: O2/GPU/GPUTracking/Base/GPUParam.inc -> UpdateClusterError2ByState
+  bool notSinglePad = false, notSingleTime = false;
+  for (uint16_t i = 0; i < 8; i++) {
+    Delta2 d = cfconsts::InnerNeighbors[i];
+    CfChargePos tmp_pos = peak.delta(d);
+    notSinglePad |= (d.x != 0) && (static_cast(chargeMap[tmp_pos].unpack()) > 0);
+    notSingleTime |= (d.y != 0) && (static_cast(chargeMap[tmp_pos].unpack()) > 0);
+  }
 
-    if (dtype == 0) {
-      pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
-                 notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
-                 notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
-    } else if (dtype == 1) {
-      pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
-                 notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
-                 notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
-    }
+  if (dtype == 0) {
+    pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
+               notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
+               notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
+  } else if (dtype == 1) {
+    pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
+               notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f, + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); + } - uint32_t rowIndex = 0; - if (clusterOut != nullptr) { - rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( - clusterer, - myCluster, - peak.row(), - clusterer.mNMaxClusterPerRow, - clusterer.mPclusterInRow, - clusterOut); - if (clusterer.mPclusterPosInRow != nullptr) { - clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; - } - } else if (clusterer.mPclusterPosInRow) { - rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; - } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); - } else { + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); + rejectCluster &= (clustererNN.mNnClusterizerUseClassification ? clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] : 1); + if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } + + uint32_t rowIndex = 0; + if (clusterOut != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + peak.row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); } template <> @@ -488,123 +476,117 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 0) || (clustererNN.mNnClusterizerUseClassification <= 0)) { - - ClusterAccumulator pc; - - if (withMC) { - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(peak, central_charge)); - GPUTPCCFClusterizer::buildCluster( - clusterer.Param().rec, - chargeMap, - peak, - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - } - if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } - - // Cluster 1 - if (dtype == 0) { - pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(), - static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(), - clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(), - clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(), - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); - } else if (dtype == 1) { - pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8], - static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index], - clustererNN.mOutputDataReg2_32[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2], - clustererNN.mOutputDataReg2_32[model_output_index + 6], - clustererNN.mClusterFlags[2 * glo_idx], - clustererNN.mClusterFlags[2 * glo_idx + 1]); + ClusterAccumulator pc; + + if (withMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak, central_charge)); + 
+      GPUTPCCFClusterizer::buildCluster(
+        clusterer.Param().rec,
+        chargeMap,
+        peak,
+        smem.posBcast,
+        smem.buf,
+        smem.innerAboveThreshold,
+        &dummy_pc,
+        labelAcc);
+    }
+    if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
+      if (clusterer.mPclusterPosInRow) {
+        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
+      return;
+    }
-    tpc::ClusterNative myCluster;
-    bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
-    if (rejectCluster) {
-      if (clusterer.mPclusterPosInRow) {
-        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
-      }
-      return;
-    }
+    // Cluster 1
+    if (dtype == 0) {
+      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
+                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
+                 clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
+                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
+                 clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
+                 clustererNN.mClusterFlags[2 * glo_idx],
+                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+    } else if (dtype == 1) {
+      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
+                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
+                 clustererNN.mOutputDataReg2_32[model_output_index + 4],
+                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
+                 clustererNN.mOutputDataReg2_32[model_output_index + 6],
+                 clustererNN.mClusterFlags[2 * glo_idx],
+                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+    }
-    uint32_t rowIndex = 0;
-    if (clusterOut != nullptr) {
-      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
-        clusterer,
-        myCluster,
-        peak.row(),
-        clusterer.mNMaxClusterPerRow,
-        clusterer.mPclusterInRow,
-        clusterOut);
-      if (clusterer.mPclusterPosInRow != nullptr) {
-        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
-      }
-    } else if (clusterer.mPclusterPosInRow) {
-      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
+    tpc::ClusterNative myCluster;
+    bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
+    rejectCluster &= (clustererNN.mNnClusterizerUseClassification ? clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] : 1);
+    if (rejectCluster) {
+      if (clusterer.mPclusterPosInRow) {
+        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
-      CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
+      return;
+    }
-    // Cluster 2
-    if (dtype == 0) {
-      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
-                 clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
-                 clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
-    } else if (dtype == 1) {
-      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
-                 clustererNN.mOutputDataReg2_32[model_output_index + 5],
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
-                 clustererNN.mOutputDataReg2_32[model_output_index + 7],
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+    uint32_t rowIndex = 0;
+    if (clusterOut != nullptr) {
+      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
+        clusterer,
+        myCluster,
+        peak.row(),
+        clusterer.mNMaxClusterPerRow,
+        clusterer.mPclusterInRow,
+        clusterOut);
+      if (clusterer.mPclusterPosInRow != nullptr) {
+        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
+    } else if (clusterer.mPclusterPosInRow) {
+      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
+    }
+    CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
-    rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
-    if (rejectCluster) {
-      if (clusterer.mPclusterPosInRow) {
-        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
-      }
-      return;
-    }
+    // Cluster 2
+    if (dtype == 0) {
+      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
+                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
+                 clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
+                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
+                 clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
+                 clustererNN.mClusterFlags[2 * glo_idx],
+                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+    } else if (dtype == 1) {
+      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
+                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
+                 clustererNN.mOutputDataReg2_32[model_output_index + 5],
+                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
+                 clustererNN.mOutputDataReg2_32[model_output_index + 7],
+                 clustererNN.mClusterFlags[2 * glo_idx],
+                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+    }
-    if (clusterOut != nullptr) {
-      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
-        clusterer,
-        myCluster,
-        peak.row(),
-        clusterer.mNMaxClusterPerRow,
-        clusterer.mPclusterInRow,
-        clusterOut);
-      if (clusterer.mPclusterPosInRow != nullptr) {
-        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
-      }
-    } else if (clusterer.mPclusterPosInRow) {
-      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
-    }
-    // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
-  } else {
+    rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
+    rejectCluster &= (clustererNN.mNnClusterizerUseClassification ? clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] : 1);
+    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }
+
+    if (clusterOut != nullptr) {
+      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
+        clusterer,
+        myCluster,
+        peak.row(),
+        clusterer.mNMaxClusterPerRow,
+        clusterer.mPclusterInRow,
+        clusterOut);
+      if (clusterer.mPclusterPosInRow != nullptr) {
+        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
+      }
+    } else if (clusterer.mPclusterPosInRow) {
+      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
+    }
+    // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
 }

 // ---------------------------------
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index 7469754594124..c48abc6f84889 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -15,12 +15,8 @@
 #ifndef O2_GPU_NN_CLUSTERIZER_H
 #define O2_GPU_NN_CLUSTERIZER_H

-#include "clusterFinderDefs.h"
-#include "GPUGeneralKernels.h"
-#include "GPUConstantMem.h"
-#include "GPUTPCClusterFinder.h"
 #include "CfArray2D.h"
-#include "PackedCharge.h"
+#include "GPUGeneralKernels.h"
 #include "GPUTPCNNClusterizer.h"

 namespace o2::tpc
@@ -33,6 +29,8 @@ namespace o2::gpu

 class ClusterAccumulator;
 class MCLabelAccumulator;
+class CfChargePos;
+class PackedCharge;

 class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 {
@@ -66,11 +64,6 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
   GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...);

 private:
-  static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint);
-  static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint);
-  static GPUd() uint32_t sortIntoBuckets(GPUTPCClusterFinder&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t);
-  static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint);
-  static GPUd() int32_t padOffset(int32_t, int32_t);
   static GPUd() int32_t rowOffset(int32_t, int32_t);
   static GPUd() bool isBoundary(int32_t, int32_t, int32_t);
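The refactored publishing code above emits two cluster candidates per peak ("Cluster 1" and "Cluster 2") from a single flat regression output: for each quantity the two candidates sit at adjacent offsets (pads at +0/+1, times at +2/+3, pad sigmas at +4/+5, time sigmas at +6/+7, charge scales at +8/+9). A minimal host-side sketch of that layout; the struct and helper names are invented for illustration and are not part of the O2 API:

#include <cstddef>

// Mirrors the flat per-peak layout implied by the indices used above:
// [pad0, pad1, time0, time1, sigPad0, sigPad1, sigTime0, sigTime1, q0, q1]
struct SplitCandidate {
  float pad, time, sigmaPad, sigmaTime, charge;
};

// Extracts candidate c (0 or 1) of peak p from the flat regression buffer.
inline SplitCandidate unpackReg2(const float* out, std::size_t p, int c)
{
  const std::size_t base = p * 10; // 10 regression outputs per peak
  return {out[base + 0 + c],       // pad offset relative to the peak
          out[base + 2 + c],       // time offset relative to the peak
          out[base + 4 + c],       // sigma in pad direction
          out[base + 6 + c],       // sigma in time direction
          out[base + 8 + c]};      // charge scale w.r.t. the central charge
}

The open question flagged in the commented-out commit above (how to attach MC labels to the second candidate) is independent of this layout; only the first candidate is committed to the label accumulator.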
From 1bee99aa95d64587224e799fc1a9e23f4be3d9ec Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Sun, 7 Sep 2025 00:02:35 +0200
Subject: [PATCH 05/11] Adjusting kernels for GPU safe rejection

---
 .../Global/GPUChainTrackingClusterizer.cxx    | 11 ++-
 .../GPUTPCNNClusterizerKernels.cxx            | 72 +++++++++++--------
 2 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 5e1a67af10994..30a4d869a1fb5 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1110,12 +1110,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   }

   // Publishing kernels for class labels and regression results
-  if(clustererNNShadow.mNnClusterizerUseClassification) {
-    if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
-      runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
-    } else {
-      runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
-    }
+  // In case classification should not be used, this kernel should still be executed to fill the mOutputDataClass array with default values
+  if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
+    runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
+  } else {
+    runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
   }
   if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
     runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 1 regression results
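The comment in the hunk above is the core of this patch: the label-assignment kernel now runs unconditionally, so mOutputDataClass holds a defined value for every cluster even when classification is switched off. A stripped-down, CPU-side sketch of one thread of that kernel; the function name and signature are illustrative stand-ins, not the O2 kernel interface:

#include <cstdint>

// With classification disabled, every in-range element still receives the
// accept-by-default label 1, so downstream publishing kernels can read
// outputDataClass unconditionally without touching uninitialized memory.
inline void assignClassLabel(int32_t* outputDataClass, const float* probabilities,
                             float threshold, bool useClassification,
                             uint32_t gloIdx, uint32_t batchStart,
                             uint32_t nClusters, uint32_t batchedMode)
{
  if (gloIdx + batchStart >= nClusters || gloIdx >= batchedMode) {
    return; // same out-of-range guard as the device kernel
  }
  outputDataClass[gloIdx + batchStart] =
    useClassification ? (int32_t)(probabilities[gloIdx] > threshold) : 1;
}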
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 1d13df440b3a4..170493202748d 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -47,7 +47,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
-  int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] : 1);
+  int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1);
   GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
 }

@@ -275,10 +275,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
-  if (dtype == 0) {
-    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int32_t)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
-  } else if (dtype == 1) {
-    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int32_t)(processors.tpcNNClusterer[sector].mModelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].mNnClassThreshold);
+  if(clustererNN.mNnClusterizerUseClassification) {
+    if (dtype == 0) {
+      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
+    } else if (dtype == 1) {
+      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold);
+    }
+  } else {
+    clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
   }
 }

@@ -291,29 +295,33 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
-  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
-  float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
-  uint32_t class_label = 0;
-  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
-    if (pIdx == elem_iterator) {
-      if (dtype == 0) {
-        current_max_prob = static_cast(clustererNN.mModelProbabilities_16[pIdx]);
-      } else if (dtype == 1) {
-        current_max_prob = clustererNN.mModelProbabilities_32[pIdx];
-      }
-    } else {
-      if (dtype == 0) {
-        current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
-      } else if (dtype == 1) {
-        current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]);
+  if(clustererNN.mNnClusterizerUseClassification) {
+    uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
+    float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
+    uint32_t class_label = 0;
+    for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
+      if (pIdx == elem_iterator) {
+        if (dtype == 0) {
+          current_max_prob = static_cast(clustererNN.mModelProbabilities_16[pIdx]);
+        } else if (dtype == 1) {
+          current_max_prob = clustererNN.mModelProbabilities_32[pIdx];
+        }
+      } else {
+        if (dtype == 0) {
+          current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
+        } else if (dtype == 1) {
+          current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]);
+        }
       }
     }
-  }
-  // uint32_t class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
-  clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
-  if (class_label > 1) {
-    clustererNN.mClusterFlags[2 * glo_idx] = 1;
-    clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
+    // uint32_t class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
+    clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
+    if (class_label > 1) {
+      clustererNN.mClusterFlags[2 * glo_idx] = 1;
+      clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
+    }
+  } else {
+    clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
   }
 }

@@ -411,7 +419,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1)] : 1);
+  if (clustererNN.mNnClusterizerUseClassification) {
+    rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
+  }
   if (rejectCluster) {
     if (clusterer.mPclusterPosInRow) {
       clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
@@ -519,7 +529,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1)] : 1);
+  if (clustererNN.mNnClusterizerUseClassification) {
+    rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
+  }
   if (rejectCluster) {
     if (clusterer.mPclusterPosInRow) {
       clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
@@ -564,7 +576,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1)] : 1);
+  if (clustererNN.mNnClusterizerUseClassification) {
+    rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
+  }
   if (rejectCluster) {
     if (clusterer.mPclusterPosInRow) {
       clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
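The three rejection hunks above all follow the same GPU-safe pattern: the network veto is ORed onto the geometric veto from toNative(), and the read index is clamped with CAMath::Min so threads of the last, partially filled batch never index past the end of mOutputDataClass. The same logic as a compact host-side sketch, with std::min standing in for CAMath::Min and assuming nClusters > 0:

#include <algorithm>
#include <cstdint>

// Combines the cluster-finder veto with the NN classification veto.
inline bool isRejected(bool geometricReject, bool useClassification,
                       const int32_t* outputDataClass,
                       uint32_t fullGloIdx, uint32_t nClusters)
{
  bool reject = geometricReject;
  if (useClassification) {
    // The clamp keeps the access in bounds for threads beyond the last cluster.
    const uint32_t idx = std::min(fullGloIdx, nClusters - 1);
    reject |= (outputDataClass[idx] <= 0); // label <= 0 means "not a cluster"
  }
  return reject;
}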
From fe9b699b6d53657513218b3e4d2083d4c50f96d7 Mon Sep 17 00:00:00 2001
From: ALICE Action Bot
Date: Sat, 6 Sep 2025 22:03:10 +0000
Subject: [PATCH 06/11] Please consider the following formatting changes

---
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 48 ++++++------
 .../GPUTPCNNClusterizerKernels.cxx            | 76 +++++++++----------
 2 files changed, 62 insertions(+), 62 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 48596bed44c46..d189b79df74f3 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -69,10 +69,10 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
   if (mNnClusterizerVerbosity > 2) {
     if (mNnClusterizerVerbosity > 3) {
       auto fmt = [](size_t bytes) {
-          std::ostringstream os;
-          double mb = bytes / (1024.0 * 1024.0);
-          os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
-          return os.str();
+        std::ostringstream os;
+        double mb = bytes / (1024.0 * 1024.0);
+        os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
+        return os.str();
       };

       // Element counts (number of array entries, not bytes)
@@ -101,35 +101,35 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataClass pointer: " << mOutputDataClass
-        << " | elements=" << elemsOutputDataClass << " (= mNnClusterizerTotalClusters)"
-        << " | " << fmt(szOutputDataClass);
+                << " | elements=" << elemsOutputDataClass << " (= mNnClusterizerTotalClusters)"
+                << " | " << fmt(szOutputDataClass);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast(mClusterFlags)
-        << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)"
-        << " | " << fmt(szClusterFlags);
+                << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)"
+                << " | " << fmt(szClusterFlags);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16
-        << " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
-        << " | " << fmt(szInput16);
+                << " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16
-        << " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
-        << " | " << fmt(szProb16);
+                << " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16
-        << " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
-        << " | " << fmt(szReg1_16);
+                << " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
+                << " | " << fmt(szReg1_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_16 pointer: " << mOutputDataReg2_16
-        << " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
-        << " | " << fmt(szReg2_16);
+                << " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
+                << " | " << fmt(szReg2_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32
-        << " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
-        << " | " << fmt(szInput32);
+                << " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32
-        << " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
-        << " | " << fmt(szProb32);
+                << " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32
-        << " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
-        << " | " << fmt(szReg1_32);
+                << " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
+                << " | " << fmt(szReg1_32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_32 pointer: " << mOutputDataReg2_32
-        << " | elements=" << elemsReg2_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
-        << " | " << fmt(szReg2_32);
+                << " | elements=" << elemsReg2_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
+                << " | " << fmt(szReg2_32);
     }
     // Compute allocated bytes (difference between advanced pointer and start pointer)
     size_t allocatedBytes = static_cast(reinterpret_cast(mem) - reinterpret_cast(startMem));
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 170493202748d..779a95df88031 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -275,7 +275,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
-  if(clustererNN.mNnClusterizerUseClassification) {
+  if (clustererNN.mNnClusterizerUseClassification) {
     if (dtype == 0) {
       clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
     } else if (dtype == 1) {
@@ -295,7 +295,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
-  if(clustererNN.mNnClusterizerUseClassification) {
+  if (clustererNN.mNnClusterizerUseClassification) {
     uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
     float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
     uint32_t class_label = 0;
@@ -401,20 +401,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
-                 notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
-                 notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
+               notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
+               notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   } else if (dtype == 1) {
     pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
-                 notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
-                 notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
+               notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
+               notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }

   tpc::ClusterNative myCluster;
@@ -511,20 +511,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
-                 clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
-                 clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
+               clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
+               clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   } else if (dtype == 1) {
     pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
-                 clustererNN.mOutputDataReg2_32[model_output_index + 4],
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
-                 clustererNN.mOutputDataReg2_32[model_output_index + 6],
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
+               clustererNN.mOutputDataReg2_32[model_output_index + 4],
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
+               clustererNN.mOutputDataReg2_32[model_output_index + 6],
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }

   tpc::ClusterNative myCluster;
@@ -559,20 +559,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
-                 clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
+               clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   } else if (dtype == 1) {
     pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
-                 static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
-                 clustererNN.mOutputDataReg2_32[model_output_index + 5],
-                 (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
-                 clustererNN.mOutputDataReg2_32[model_output_index + 7],
-                 clustererNN.mClusterFlags[2 * glo_idx],
-                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
+               static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
+               clustererNN.mOutputDataReg2_32[model_output_index + 5],
+               (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
+               clustererNN.mOutputDataReg2_32[model_output_index + 7],
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }

   rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);

From 902ddc6437bb0975165d1aaf3b59d30163168993 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Sun, 7 Sep 2025 10:35:47 +0200
Subject: [PATCH 07/11] Casting to avoid CI build failures

---
 .../GPUTPCNNClusterizerKernels.cxx            | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 779a95df88031..15ee6b6119022 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -58,7 +58,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
@@ -143,7 +143,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) {
+  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) {
     return;
   }
@@ -272,7 +272,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
   if (clustererNN.mNnClusterizerUseClassification) {
@@ -292,7 +292,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
   if (clustererNN.mNnClusterizerUseClassification) {
@@ -331,7 +331,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode) {
+  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
@@ -453,7 +453,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode) {
+  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
@@ -611,7 +611,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= clustererNN.mNnClusterizerBatchedMode) {
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
   CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap));
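The casts added in PATCH 07 silence -Wsign-compare, which the CI build promotes to an error: glo_idx is uint32_t while the mNnClusterizer* members are int32_t, and in a mixed comparison the signed operand is implicitly converted to unsigned, so a (hypothetically) negative value would wrap to a huge number and flip the result. A standalone reproduction of the warning and the fix:

#include <cstdint>

bool outOfRange(uint32_t gloIdx, int32_t batchedMode)
{
  // return gloIdx >= batchedMode;        // -Wsign-compare: batchedMode is
  //                                      // implicitly converted to uint32_t
  return gloIdx >= (uint32_t)batchedMode; // explicit cast documents that
                                          // batchedMode is known non-negative
}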
From 79cc38e4d63d012467015f061354220e3ecc75ec Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Sun, 7 Sep 2025 11:40:38 +0200
Subject: [PATCH 08/11] Changing formatter to not use std::

---
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index d189b79df74f3..7794db66c908b 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -68,11 +68,12 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)

   if (mNnClusterizerVerbosity > 2) {
     if (mNnClusterizerVerbosity > 3) {
-      auto fmt = [](size_t bytes) {
-        std::ostringstream os;
-        double mb = bytes / (1024.0 * 1024.0);
-        os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
-        return os.str();
+      auto fmt = [](size_t bytes) -> const char* {
+        static char buf[64];
+        double mb = (double)bytes / (1024.0 * 1024.0);
+        int n = snprintf(buf, sizeof(buf), "%zu bytes (%.3f MB)", bytes, mb);
+        (void)n;
+        return buf;
       };

       // Element counts (number of array entries, not bytes)

From b50afbb74db14002aff74e0bf2efc0a622bf0d6b Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Sun, 7 Sep 2025 17:00:14 +0200
Subject: [PATCH 09/11] Remove usage of std::

---
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 7794db66c908b..6fac0e417ac26 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -135,9 +135,14 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
     // Compute allocated bytes (difference between advanced pointer and start pointer)
     size_t allocatedBytes = static_cast(reinterpret_cast(mem) - reinterpret_cast(startMem));
     double allocatedMB = static_cast(allocatedBytes) / (1024.0 * 1024.0);
-    LOG(info) << std::fixed << std::setprecision(3)
-              << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Total scratch allocation in setIOPointers: " << allocatedBytes
-              << " bytes (" << allocatedMB << " MB)";
+    {
+      char allocMsg[256];
+      int nn = snprintf(allocMsg, sizeof(allocMsg),
+                        "(NNCLUS, GPUTPCNNClusterizer, this=%p) Total scratch allocation in setIOPointers: %zu bytes (%.3f MB)",
+                        (void*)this, (size_t)allocatedBytes, allocatedMB);
+      (void)nn;
+      LOG(info) << allocMsg;
+    }
   }

   return mem;
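PATCH 08 and PATCH 09 drop std::ostringstream/std::setprecision in favor of snprintf, removing the dependency on <sstream>/<iomanip> formatting. One caveat of the PATCH 08 lambda: it returns a pointer to a single static buffer, so two fmt() calls within the same LOG statement would overwrite each other. A reentrant variant with a caller-owned buffer, shown only as an illustrative alternative to the committed code:

#include <cstddef>
#include <cstdio>

// Formats a byte count as "N bytes (X.XXX MB)" into the caller's buffer and
// returns it, so several results can coexist within one statement.
inline const char* formatBytes(char* buf, std::size_t bufSize, std::size_t bytes)
{
  const double mb = (double)bytes / (1024.0 * 1024.0);
  snprintf(buf, bufSize, "%zu bytes (%.3f MB)", bytes, mb);
  return buf;
}

// Usage sketch: char a[64], b[64];
// LOG(info) << formatBytes(a, sizeof(a), szIn) << " / " << formatBytes(b, sizeof(b), szOut);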
From a545f086fa5357ecf0d32887927f9d3fde4f9e6b Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Mon, 8 Sep 2025 11:00:43 +0200
Subject: [PATCH 10/11] Adding back the runParallelOuterLoop

---
 GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 30a4d869a1fb5..fd3699ae4d125 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -664,7 +664,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     nnTimers[11] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
   }

-  for (int32_t lane = 0; lane < numLanes; lane++) {
+  mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
     nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
     if (nnApplications[lane].mModelsUsed[0]) {
       SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
@@ -708,7 +708,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     if (nn_settings.nnClusterizerVerbosity > 0) {
       LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
     }
-  }
+  });
   for (int32_t sector = 0; sector < NSECTORS; sector++) {
     GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
     GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;

From 7c47304ff03ce3c0470e12b261df22cb6c493c1d Mon Sep 17 00:00:00 2001
From: Christian Sonnabend
Date: Tue, 9 Sep 2025 11:08:12 +0200
Subject: [PATCH 11/11] Declaring CfChargePos as struct, not class

---
 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index c48abc6f84889..9c93726a097b7 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -29,7 +29,7 @@ namespace o2::gpu

 class ClusterAccumulator;
 class MCLabelAccumulator;
-class CfChargePos;
+struct CfChargePos;
 class PackedCharge;

 class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
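The tag change in PATCH 11 matters because CfChargePos is defined with the struct keyword: forward-declaring it as class makes Clang and recent GCC emit -Wmismatched-tags (an error in -Werror CI builds), and MSVC even encodes the struct/class tag in its name mangling. A minimal illustration, with the body elided:

struct CfChargePos {
  // ... definition elsewhere in the clusterizer headers ...
};

// class CfChargePos;  // -Wmismatched-tags: declared 'class' but defined 'struct'
struct CfChargePos;    // tag matches the definition, no warning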