From 768fd50181c595ac7fde1833264d334dfa5fd32c Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 15 Oct 2025 15:03:24 +0200
Subject: [PATCH 1/3] GPU Standalone: Make setO2Settings compatible with debug
 mode

---
 GPU/GPUTracking/Standalone/Benchmark/standalone.cxx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index 1fa41d55ebbec..f9c53e3ffd59c 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -214,11 +214,11 @@ int32_t ReadConfiguration(int argc, char** argv)
     }
   }
   if (configStandalone.setO2Settings) {
-    if (!(configStandalone.inputcontrolmem && configStandalone.outputcontrolmem)) {
-      printf("setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n");
-      return 1;
-    }
-    if (configStandalone.runGPU) {
+    if (configStandalone.runGPU && configStandalone.proc.debugLevel <= 1) {
+      if (!(configStandalone.inputcontrolmem && configStandalone.outputcontrolmem)) {
+        printf("setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n");
+        return 1;
+      }
       configStandalone.proc.forceHostMemoryPoolSize = 1024 * 1024 * 1024;
     }
     configStandalone.rec.tpc.trackReferenceX = 83;

From a8aa0133377abec0966c1223cbd88aa1181765db Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 15 Oct 2025 17:23:02 +0200
Subject: [PATCH 2/3] GPU RTC: Add overrideWarpSize option

---
 GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 4 ++--
 GPU/GPUTracking/Definitions/GPUSettingsList.h      | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 9e7cfa5495040..62b490a59d0dc 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -113,7 +113,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
   constexpr int32_t reqVerMin = 0;
 #endif
   if (GetProcessingSettings().rtc.enable && GetProcessingSettings().rtctech.runTest == 2) {
-    mWarpSize = GPUCA_WARP_SIZE;
+    mWarpSize = GetProcessingSettings().rtc.overrideWarpSize != -1 ? GetProcessingSettings().rtc.overrideWarpSize : GPUCA_WARP_SIZE;
     genAndLoadRTC();
     exit(0);
   }
@@ -245,7 +245,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
       GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment);
       GPUInfo(" ");
     }
-    if (deviceProp.warpSize != GPUCA_WARP_SIZE && !GetProcessingSettings().rtc.enable) {
+    if (GetProcessingSettings().rtc.enable ? (GetProcessingSettings().rtc.overrideWarpSize != -1 && deviceProp.warpSize != GetProcessingSettings().rtc.overrideWarpSize) : (deviceProp.warpSize != GPUCA_WARP_SIZE)) {
       throw std::runtime_error("Invalid warp size on GPU");
     }
     mWarpSize = deviceProp.warpSize;
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 30477d67fdc4f..bde082b8a10c4 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -216,6 +216,7 @@ AddOption(optSpecialCode, int8_t, -1, "", 0, "Insert GPUCA_RTC_SPECIAL_CODE spec
 AddOption(deterministic, bool, false, "", 0, "Compile RTC in deterministic mode, with NO_FAST_MATH flags and GPUCA_DETERMINISTIC_MODE define")
 AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel")
 AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code")
+AddOption(overrideWarpSize, int32_t, -1, "", 0, "Override the warp size to be used for RTC")
 AddHelp("help", 'h')
 EndConfig()
 

From 7b8f4063624dff58c20180fc60fd97c274134f09 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 15 Oct 2025 19:14:42 +0200
Subject: [PATCH 3/3] GPU Workflow: Add dumpFirst and dumpLast options

---
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  3 ++
 GPU/GPUTracking/Interface/GPUO2Interface.cxx  | 23 +++++++-------
 GPU/GPUTracking/Interface/GPUO2Interface.h    |  4 +--
 .../include/GPUWorkflow/GPUWorkflowSpec.h     |  1 +
 GPU/Workflow/src/GPUWorkflowInternal.h        |  1 +
 GPU/Workflow/src/GPUWorkflowPipeline.cxx      |  4 +--
 GPU/Workflow/src/GPUWorkflowSpec.cxx          | 31 ++++++++++++++++---
 prodtests/full-system-test/dpl-workflow.sh    |  3 ++
 8 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index bde082b8a10c4..fc08b063ff16a 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -625,6 +625,9 @@ AddOption(deviceType, std::string, "CPU", "", 0, "Device type, CPU | CUDA | HIP
 AddOption(forceDeviceType, bool, true, "", 0, "force device type, otherwise allows fall-back to CPU")
 AddOption(synchronousProcessing, bool, false, "", 0, "Apply performance shortcuts for synchronous processing, disable unneeded steps")
 AddOption(dump, int32_t, 0, "", 0, "Dump events for standalone benchmark: 1 = dump events, 2 = dump events and skip processing in workflow")
+AddOption(dumpFirst, int32_t, 0, "", 0, "First event to dump (referring to tfCounter)")
+AddOption(dumpLast, int32_t, -1, "", 0, "Last event to dump (-1 = all)")
+AddOption(dumpFolder, std::string, "", "", 0, "Folder to which to write dump files, [P] is replaced by process id")
 AddOption(display, bool, false, "", 0, "Enable standalone gpu tracking visualizaion")
 AddOption(rundEdx, int32_t, -1, "", 0, "Enable/disable dEdx processing (-1 for autoselect)")
 AddOption(dEdxSplineTopologyCorrFile, std::string, "", "", 0, "File name of the dE/dx spline track topology correction file")
diff --git a/GPU/GPUTracking/Interface/GPUO2Interface.cxx b/GPU/GPUTracking/Interface/GPUO2Interface.cxx
index 65907528a3dba..d04db5e9bf271 100644
--- a/GPU/GPUTracking/Interface/GPUO2Interface.cxx
+++ b/GPU/GPUTracking/Interface/GPUO2Interface.cxx
@@ -137,29 +137,35 @@ void GPUO2Interface::Deinitialize()
   mNContexts = 0;
 }
 
-void GPUO2Interface::DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data)
+// Dump the event referenced by *data as "<dir>event.<nEvent>.dump" using the chain of context iThread.
+// dir is prepended verbatim to the file names, so it must be empty or end with a path separator.
+// The chain's IO pointers are replaced by *data only for the duration of the dump and restored before returning.
+void GPUO2Interface::DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data, uint32_t iThread, const char* dir)
 {
-  mCtx[0].mChain->ClearIOPointers();
-  mCtx[0].mChain->mIOPtrs = *data;
+  const auto oldPtrs = mCtx[iThread].mChain->mIOPtrs;
+  mCtx[iThread].mChain->mIOPtrs = *data;
   char fname[1024];
-  snprintf(fname, 1024, "event.%d.dump", nEvent);
-  mCtx[0].mChain->DumpData(fname);
+  snprintf(fname, 1024, "%sevent.%d.dump", dir, nEvent);
+  mCtx[iThread].mChain->DumpData(fname);
   if (nEvent == 0) {
 #ifdef GPUCA_BUILD_QA
     if (mConfig->configProcessing.runMC) {
-      mCtx[0].mChain->ForceInitQA();
-      snprintf(fname, 1024, "mc.%d.dump", nEvent);
-      mCtx[0].mChain->GetQA()->UpdateChain(mCtx[0].mChain);
-      mCtx[0].mChain->GetQA()->DumpO2MCData(fname);
+      mCtx[iThread].mChain->ForceInitQA();
+      // Write the MC dump into the same folder as the event dump, not into the working directory
+      snprintf(fname, 1024, "%smc.%d.dump", dir, nEvent);
+      mCtx[iThread].mChain->GetQA()->UpdateChain(mCtx[iThread].mChain);
+      mCtx[iThread].mChain->GetQA()->DumpO2MCData(fname);
     }
 #endif
   }
+  mCtx[iThread].mChain->mIOPtrs = oldPtrs;
 }
 
-void GPUO2Interface::DumpSettings()
+// Apply the queued updates of context iThread's chain, then dump that context's settings, forwarding dir to mRec->DumpSettings.
+void GPUO2Interface::DumpSettings(uint32_t iThread, const char* dir)
 {
-  mCtx[0].mChain->DoQueuedUpdates(-1);
-  mCtx[0].mRec->DumpSettings();
+  mCtx[iThread].mChain->DoQueuedUpdates(-1);
+  mCtx[iThread].mRec->DumpSettings(dir);
 }
 
 int32_t GPUO2Interface::RunTracking(GPUTrackingInOutPointers* data, GPUInterfaceOutputs* outputs, uint32_t iThread, GPUInterfaceInputUpdate* inputUpdateCallback)
diff --git a/GPU/GPUTracking/Interface/GPUO2Interface.h b/GPU/GPUTracking/Interface/GPUO2Interface.h
index 9b7390f2ed663..0e2020b306984 100644
--- a/GPU/GPUTracking/Interface/GPUO2Interface.h
+++ b/GPU/GPUTracking/Interface/GPUO2Interface.h
@@ -77,8 +77,8 @@ class GPUO2Interface
 
   int32_t RunTracking(GPUTrackingInOutPointers* data, GPUInterfaceOutputs* outputs = nullptr, uint32_t iThread = 0, GPUInterfaceInputUpdate* inputUpdateCallback = nullptr);
   void Clear(bool clearOutputs, uint32_t iThread = 0);
-  void DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data);
-  void DumpSettings();
+  void DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data, uint32_t iThread, const char* dir = "");
+  void DumpSettings(uint32_t iThread, const char* dir = "");
 
   void GetITSTraits(o2::its::TrackerTraits<7>*& trackerTraits, o2::its::VertexerTraits<7>*& vertexerTraits, o2::its::TimeFrame<7>*& timeFrame);
   const o2::base::Propagator* GetDeviceO2Propagator(int32_t iThread = 0) const;
diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
index 4f62f07593bff..160efd4048af0 100644
--- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
+++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h
@@ -225,6 +225,7 @@ class GPURecoWorkflowSpec : public o2::framework::Task
   int64_t mCreationForCalib = -1; ///< creation time for calib manipulation
   int32_t mVerbosity = 0;
   uint32_t mNTFs = 0;
+  uint32_t mNTFDumps = 0;
   uint32_t mNDebugDumps = 0;
   uint32_t mNextThreadIndex = 0;
   bool mUpdateGainMapCCDB = true;
diff --git a/GPU/Workflow/src/GPUWorkflowInternal.h b/GPU/Workflow/src/GPUWorkflowInternal.h
index 7ac9c60048e20..73d3676f3d84a 100644
--- a/GPU/Workflow/src/GPUWorkflowInternal.h
+++ b/GPU/Workflow/src/GPUWorkflowInternal.h
@@ -47,6 +47,7 @@ struct GPURecoWorkflow_QueueObject {
   bool jobSubmitted = false;
   bool jobFinished = false;
   int32_t jobReturnValue = 0;
+  volatile int32_t jobThreadIndex = -1;
   std::mutex jobFinishedMutex;
   std::condition_variable jobFinishedNotify;
   bool jobInputFinal = false;
diff --git a/GPU/Workflow/src/GPUWorkflowPipeline.cxx b/GPU/Workflow/src/GPUWorkflowPipeline.cxx
index 8867b6c336f97..ba395cd98d64d 100644
--- a/GPU/Workflow/src/GPUWorkflowPipeline.cxx
+++ b/GPU/Workflow/src/GPUWorkflowPipeline.cxx
@@ -90,6 +90,7 @@ void GPURecoWorkflowSpec::RunWorkerThread(int32_t id)
       context = workerContext.inputQueue.front();
       workerContext.inputQueue.pop();
     }
+    context->jobThreadIndex = id;
     context->jobReturnValue = runMain(nullptr, context->jobPtrs, context->jobOutputRegions, id, context->jobInputUpdateCallback.get());
     {
       std::lock_guard lk(context->jobFinishedMutex);
@@ -179,8 +180,7 @@ int32_t GPURecoWorkflowSpec::handlePipeline(ProcessingContext& pc, GPUTrackingIn
       }
       mPipeline->completionPolicyQueue.pop();
     }
-  }
-  if (mSpecConfig.enableDoublePipeline == 2) {
+  } else if (mSpecConfig.enableDoublePipeline == 2) {
     auto prepareDummyMessage = pc.outputs().make<DataAllocator::UninitializedVector<char>>(Output{gDataOriginGPU, "PIPELINEPREPARE", 0}, 0u);
 
     size_t ptrsTotal = 0;
diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx
index 6c76f13c9bbd0..d3d3eb14869e0 100644
--- a/GPU/Workflow/src/GPUWorkflowSpec.cxx
+++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx
@@ -825,11 +825,31 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)
 
   lockDecodeInput.reset();
 
+  uint32_t threadIndex;
   if (mConfParam->dump) {
-    if (mNTFs == 1) {
-      mGPUReco->DumpSettings();
+    if (mSpecConfig.enableDoublePipeline && pipelineContext->jobSubmitted) {
+      while (pipelineContext->jobThreadIndex == -1) {
+      }
+      threadIndex = pipelineContext->jobThreadIndex;
+    } else {
+      threadIndex = 0; // TODO: Not sure if this is safe, but it is not yet known which threadIndex will pick up the enqueued job
+    }
+
+    std::string dir = "";
+    if (mConfParam->dumpFolder != "") {
+      dir = std::regex_replace(mConfParam->dumpFolder, std::regex("\\[P\\]"), std::to_string(getpid()));
+      if (mNTFs == 1) {
+        mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+      }
+      dir += "/";
+    }
+    if (mNTFs == 1) { // Must dump with first TF, since will enforce enqueued calib updates
+      mGPUReco->DumpSettings(threadIndex, dir.c_str());
+    }
+    if (tinfo.tfCounter >= mConfParam->dumpFirst && (mConfParam->dumpLast == -1 || tinfo.tfCounter <= mConfParam->dumpLast)) {
+      mGPUReco->DumpEvent(mNTFDumps, &ptrs, threadIndex, dir.c_str());
+      mNTFDumps++;
     }
-    mGPUReco->DumpEvent(mNTFs - 1, &ptrs);
   }
   std::unique_ptr<GPUTrackingInOutPointers> ptrsDump;
   if (mConfParam->dumpBadTFMode == 2) {
@@ -847,9 +867,10 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)
     std::unique_lock lk(pipelineContext->jobFinishedMutex);
     pipelineContext->jobFinishedNotify.wait(lk, [context = pipelineContext.get()]() { return context->jobFinished; });
     retVal = pipelineContext->jobReturnValue;
+    threadIndex = pipelineContext->jobThreadIndex;
   } else {
     // uint32_t threadIndex = pc.services().get<ThreadPool>().threadIndex;
-    uint32_t threadIndex = mNextThreadIndex;
+    threadIndex = mNextThreadIndex;
     if (mConfig->configProcessing.doublePipeline) {
       mNextThreadIndex = (mNextThreadIndex + 1) % 2;
     }
@@ -879,7 +900,7 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)
       }
       fclose(fp);
     } else if (mConfParam->dumpBadTFMode == 2) {
-      mGPUReco->DumpEvent(mNDebugDumps - 1, ptrsDump.get());
+      mGPUReco->DumpEvent(mNDebugDumps - 1, ptrsDump.get(), threadIndex);
     }
   }
 
diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh
index 2f0e761366e18..5d47ae84b130b 100755
--- a/prodtests/full-system-test/dpl-workflow.sh
+++ b/prodtests/full-system-test/dpl-workflow.sh
@@ -235,6 +235,10 @@ if [[ $EPNSYNCMODE == 1 ]]; then
     fi
   fi
 fi
+# When running on more than one GPU, use a per-process dump folder ([P] is replaced by the process id)
+if [[ $GPUTYPE != "CPU" && $NGPUS -gt 1 ]]; then
+  GPU_CONFIG_KEY+="GPU_global.dumpFolder=gpu_dump_[P];"
+fi
 if [[ $SYNCRAWMODE == 1 ]]; then
   GPU_CONFIG_KEY+="GPU_proc.tpcIncreasedMinClustersPerRow=500000;GPU_proc.ignoreNonFatalGPUErrors=1;GPU_proc.throttleAlarms=1;"
   if [[ $RUNTYPE == "PHYSICS" || $RUNTYPE == "COSMICS" || $RUNTYPE == "TECHNICAL" ]]; then