From 768fd50181c595ac7fde1833264d334dfa5fd32c Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 15 Oct 2025 15:03:24 +0200 Subject: [PATCH 1/3] GPU Standalone: Make setO2Settings compatible with debug mode --- GPU/GPUTracking/Standalone/Benchmark/standalone.cxx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 1fa41d55ebbec..f9c53e3ffd59c 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -214,11 +214,11 @@ int32_t ReadConfiguration(int argc, char** argv) } } if (configStandalone.setO2Settings) { - if (!(configStandalone.inputcontrolmem && configStandalone.outputcontrolmem)) { - printf("setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n"); - return 1; - } - if (configStandalone.runGPU) { + if (configStandalone.runGPU && configStandalone.proc.debugLevel <= 1) { + if (!(configStandalone.inputcontrolmem && configStandalone.outputcontrolmem)) { + printf("setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n"); + return 1; + } configStandalone.proc.forceHostMemoryPoolSize = 1024 * 1024 * 1024; } configStandalone.rec.tpc.trackReferenceX = 83; From a8aa0133377abec0966c1223cbd88aa1181765db Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 15 Oct 2025 17:23:02 +0200 Subject: [PATCH 2/3] GPU RTC: Add overrideWarpSize option --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 4 ++-- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 9e7cfa5495040..62b490a59d0dc 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -113,7 +113,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() constexpr int32_t reqVerMin = 0; #endif if (GetProcessingSettings().rtc.enable && GetProcessingSettings().rtctech.runTest == 2) { - mWarpSize = GPUCA_WARP_SIZE; + mWarpSize = GetProcessingSettings().rtc.overrideWarpSize != -1 ? GetProcessingSettings().rtc.overrideWarpSize : GPUCA_WARP_SIZE; genAndLoadRTC(); exit(0); } @@ -245,7 +245,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() GPUInfo("\ttextureAlignment = %ld", (uint64_t)deviceProp.textureAlignment); GPUInfo(" "); } - if (deviceProp.warpSize != GPUCA_WARP_SIZE && !GetProcessingSettings().rtc.enable) { + if (GetProcessingSettings().rtc.enable ? (GetProcessingSettings().rtc.overrideWarpSize != -1 && deviceProp.warpSize != GetProcessingSettings().rtc.overrideWarpSize) : (deviceProp.warpSize != GPUCA_WARP_SIZE)) { throw std::runtime_error("Invalid warp size on GPU"); } mWarpSize = deviceProp.warpSize; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 30477d67fdc4f..bde082b8a10c4 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -216,6 +216,7 @@ AddOption(optSpecialCode, int8_t, -1, "", 0, "Insert GPUCA_RTC_SPECIAL_CODE spec AddOption(deterministic, bool, false, "", 0, "Compile RTC in deterministic mode, with NO_FAST_MATH flags and GPUCA_DETERMINISTIC_MODE define") AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel") AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code") +AddOption(overrideWarpSize, int32_t, -1, "", 0, "Override the warp size to be used for RTC") AddHelp("help", 'h') EndConfig() From 7b8f4063624dff58c20180fc60fd97c274134f09 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 15 Oct 2025 19:14:42 +0200 Subject: [PATCH 3/3] GPU Workflow: Add dumpFirst and dumpLast options --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 ++ GPU/GPUTracking/Interface/GPUO2Interface.cxx | 23 +++++++------- GPU/GPUTracking/Interface/GPUO2Interface.h | 4 +-- .../include/GPUWorkflow/GPUWorkflowSpec.h | 1 + GPU/Workflow/src/GPUWorkflowInternal.h | 1 + GPU/Workflow/src/GPUWorkflowPipeline.cxx | 4 +-- GPU/Workflow/src/GPUWorkflowSpec.cxx | 31 ++++++++++++++++--- prodtests/full-system-test/dpl-workflow.sh | 3 ++ 8 files changed, 50 insertions(+), 20 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index bde082b8a10c4..fc08b063ff16a 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -625,6 +625,9 @@ AddOption(deviceType, std::string, "CPU", "", 0, "Device type, CPU | CUDA | HIP AddOption(forceDeviceType, bool, true, "", 0, "force device type, otherwise allows fall-back to CPU") AddOption(synchronousProcessing, bool, false, "", 0, "Apply performance shortcuts for synchronous processing, disable unneeded steps") AddOption(dump, int32_t, 0, "", 0, "Dump events for standalone benchmark: 1 = dump events, 2 = dump events and skip processing in workflow") +AddOption(dumpFirst, int32_t, 0, "", 0, "First event to dump (referring to tfCounter)") +AddOption(dumpLast, int32_t, -1, "", 0, "Last event to dump (-1 = all)") +AddOption(dumpFolder, std::string, "", "", 0, "Folder to which to write dump files, [P] is replaced by process id") AddOption(display, bool, false, "", 0, "Enable standalone gpu tracking visualizaion") AddOption(rundEdx, int32_t, -1, "", 0, "Enable/disable dEdx processing (-1 for autoselect)") AddOption(dEdxSplineTopologyCorrFile, std::string, "", "", 0, "File name of the dE/dx spline track topology correction file") diff --git a/GPU/GPUTracking/Interface/GPUO2Interface.cxx b/GPU/GPUTracking/Interface/GPUO2Interface.cxx index 65907528a3dba..d04db5e9bf271 100644 --- a/GPU/GPUTracking/Interface/GPUO2Interface.cxx +++ b/GPU/GPUTracking/Interface/GPUO2Interface.cxx @@ -137,29 +137,30 @@ void GPUO2Interface::Deinitialize() mNContexts = 0; } -void GPUO2Interface::DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data) +void GPUO2Interface::DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data, uint32_t iThread, const char* dir) { - mCtx[0].mChain->ClearIOPointers(); - mCtx[0].mChain->mIOPtrs = *data; + const auto oldPtrs = mCtx[iThread].mChain->mIOPtrs; + mCtx[iThread].mChain->mIOPtrs = *data; char fname[1024]; - snprintf(fname, 1024, "event.%d.dump", nEvent); - mCtx[0].mChain->DumpData(fname); + snprintf(fname, 1024, "%sevent.%d.dump", dir, nEvent); + mCtx[iThread].mChain->DumpData(fname); if (nEvent == 0) { #ifdef GPUCA_BUILD_QA if (mConfig->configProcessing.runMC) { - mCtx[0].mChain->ForceInitQA(); + mCtx[iThread].mChain->ForceInitQA(); snprintf(fname, 1024, "mc.%d.dump", nEvent); - mCtx[0].mChain->GetQA()->UpdateChain(mCtx[0].mChain); - mCtx[0].mChain->GetQA()->DumpO2MCData(fname); + mCtx[iThread].mChain->GetQA()->UpdateChain(mCtx[iThread].mChain); + mCtx[iThread].mChain->GetQA()->DumpO2MCData(fname); } #endif } + mCtx[iThread].mChain->mIOPtrs = oldPtrs; } -void GPUO2Interface::DumpSettings() +void GPUO2Interface::DumpSettings(uint32_t iThread, const char* dir) { - mCtx[0].mChain->DoQueuedUpdates(-1); - mCtx[0].mRec->DumpSettings(); + mCtx[iThread].mChain->DoQueuedUpdates(-1); + mCtx[iThread].mRec->DumpSettings(dir); } int32_t GPUO2Interface::RunTracking(GPUTrackingInOutPointers* data, GPUInterfaceOutputs* outputs, uint32_t iThread, GPUInterfaceInputUpdate* inputUpdateCallback) diff --git a/GPU/GPUTracking/Interface/GPUO2Interface.h b/GPU/GPUTracking/Interface/GPUO2Interface.h index 9b7390f2ed663..0e2020b306984 100644 --- a/GPU/GPUTracking/Interface/GPUO2Interface.h +++ b/GPU/GPUTracking/Interface/GPUO2Interface.h @@ -77,8 +77,8 @@ class GPUO2Interface int32_t RunTracking(GPUTrackingInOutPointers* data, GPUInterfaceOutputs* outputs = nullptr, uint32_t iThread = 0, GPUInterfaceInputUpdate* inputUpdateCallback = nullptr); void Clear(bool clearOutputs, uint32_t iThread = 0); - void DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data); - void DumpSettings(); + void DumpEvent(int32_t nEvent, GPUTrackingInOutPointers* data, uint32_t iThread, const char* dir = ""); + void DumpSettings(uint32_t iThread, const char* dir = ""); void GetITSTraits(o2::its::TrackerTraits<7>*& trackerTraits, o2::its::VertexerTraits<7>*& vertexerTraits, o2::its::TimeFrame<7>*& timeFrame); const o2::base::Propagator* GetDeviceO2Propagator(int32_t iThread = 0) const; diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index 4f62f07593bff..160efd4048af0 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -225,6 +225,7 @@ class GPURecoWorkflowSpec : public o2::framework::Task int64_t mCreationForCalib = -1; ///< creation time for calib manipulation int32_t mVerbosity = 0; uint32_t mNTFs = 0; + uint32_t mNTFDumps = 0; uint32_t mNDebugDumps = 0; uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; diff --git a/GPU/Workflow/src/GPUWorkflowInternal.h b/GPU/Workflow/src/GPUWorkflowInternal.h index 7ac9c60048e20..73d3676f3d84a 100644 --- a/GPU/Workflow/src/GPUWorkflowInternal.h +++ b/GPU/Workflow/src/GPUWorkflowInternal.h @@ -47,6 +47,7 @@ struct GPURecoWorkflow_QueueObject { bool jobSubmitted = false; bool jobFinished = false; int32_t jobReturnValue = 0; + volatile int32_t jobThreadIndex = -1; std::mutex jobFinishedMutex; std::condition_variable jobFinishedNotify; bool jobInputFinal = false; diff --git a/GPU/Workflow/src/GPUWorkflowPipeline.cxx b/GPU/Workflow/src/GPUWorkflowPipeline.cxx index 8867b6c336f97..ba395cd98d64d 100644 --- a/GPU/Workflow/src/GPUWorkflowPipeline.cxx +++ b/GPU/Workflow/src/GPUWorkflowPipeline.cxx @@ -90,6 +90,7 @@ void GPURecoWorkflowSpec::RunWorkerThread(int32_t id) context = workerContext.inputQueue.front(); workerContext.inputQueue.pop(); } + context->jobThreadIndex = id; context->jobReturnValue = runMain(nullptr, context->jobPtrs, context->jobOutputRegions, id, context->jobInputUpdateCallback.get()); { std::lock_guard lk(context->jobFinishedMutex); @@ -179,8 +180,7 @@ int32_t GPURecoWorkflowSpec::handlePipeline(ProcessingContext& pc, GPUTrackingIn } mPipeline->completionPolicyQueue.pop(); } - } - if (mSpecConfig.enableDoublePipeline == 2) { + } else if (mSpecConfig.enableDoublePipeline == 2) { auto prepareDummyMessage = pc.outputs().make>(Output{gDataOriginGPU, "PIPELINEPREPARE", 0}, 0u); size_t ptrsTotal = 0; diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 6c76f13c9bbd0..d3d3eb14869e0 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -825,11 +825,31 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) lockDecodeInput.reset(); + uint32_t threadIndex; if (mConfParam->dump) { - if (mNTFs == 1) { - mGPUReco->DumpSettings(); + if (mSpecConfig.enableDoublePipeline && pipelineContext->jobSubmitted) { + while (pipelineContext->jobThreadIndex == -1) { + } + threadIndex = pipelineContext->jobThreadIndex; + } else { + threadIndex = 0; // TODO: Not sure if this is safe, but it is not yet known which threadIndex will pick up the enqueued job + } + + std::string dir = ""; + if (mConfParam->dumpFolder != "") { + dir = std::regex_replace(mConfParam->dumpFolder, std::regex("\\[P\\]"), std::to_string(getpid())); + if (mNTFs == 1) { + mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + } + dir += "/"; + } + if (mNTFs == 1) { // Must dump with first TF, since will enforce enqueued calib updates + mGPUReco->DumpSettings(threadIndex, dir.c_str()); + } + if (tinfo.tfCounter >= mConfParam->dumpFirst && (mConfParam->dumpLast == -1 || tinfo.tfCounter <= mConfParam->dumpLast)) { + mGPUReco->DumpEvent(mNTFDumps, &ptrs, threadIndex, dir.c_str()); + mNTFDumps++; } - mGPUReco->DumpEvent(mNTFs - 1, &ptrs); } std::unique_ptr ptrsDump; if (mConfParam->dumpBadTFMode == 2) { @@ -847,9 +867,10 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) std::unique_lock lk(pipelineContext->jobFinishedMutex); pipelineContext->jobFinishedNotify.wait(lk, [context = pipelineContext.get()]() { return context->jobFinished; }); retVal = pipelineContext->jobReturnValue; + threadIndex = pipelineContext->jobThreadIndex; } else { // uint32_t threadIndex = pc.services().get().threadIndex; - uint32_t threadIndex = mNextThreadIndex; + threadIndex = mNextThreadIndex; if (mConfig->configProcessing.doublePipeline) { mNextThreadIndex = (mNextThreadIndex + 1) % 2; } @@ -879,7 +900,7 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) } fclose(fp); } else if (mConfParam->dumpBadTFMode == 2) { - mGPUReco->DumpEvent(mNDebugDumps - 1, ptrsDump.get()); + mGPUReco->DumpEvent(mNDebugDumps - 1, ptrsDump.get(), threadIndex); } } diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index 2f0e761366e18..5d47ae84b130b 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -235,6 +235,9 @@ if [[ $EPNSYNCMODE == 1 ]]; then fi fi fi +if [[ $GPUTYPE != "CPU" && $NGPUS > 1 ]]; then + GPU_CONFIG_KEY+="GPU_global.dumpFolder=gpu_dump_[P];" +fi if [[ $SYNCRAWMODE == 1 ]]; then GPU_CONFIG_KEY+="GPU_proc.tpcIncreasedMinClustersPerRow=500000;GPU_proc.ignoreNonFatalGPUErrors=1;GPU_proc.throttleAlarms=1;" if [[ $RUNTYPE == "PHYSICS" || $RUNTYPE == "COSMICS" || $RUNTYPE == "TECHNICAL" ]]; then