From 4cd7589243f01f02cbd875b422a058d080825da8 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 16 Oct 2025 14:01:40 +0200 Subject: [PATCH 1/6] GPU: Make memoryStat work from GPUWorkflow --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 2 +- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 28 +++++++++++-------- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- GPU/GPUTracking/Global/GPUChainTracking.cxx | 4 +++ .../GPUChainTrackingDebugAndProfiling.cxx | 26 +++++++++++------ .../Standalone/Benchmark/standalone.cxx | 8 ------ 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 6d64fb3daca6a..e24b76678e710 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -999,7 +999,7 @@ void GPUReconstruction::PrintMemoryStatistics() } printf("%59s CPU / %9s GPU\n", "", ""); for (auto it = sizes.begin(); it != sizes.end(); it++) { - printf("Allocation %30s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]); + printf("Allocation %50s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]); } PrintMemoryOverview(); for (uint32_t i = 0; i < mChains.size(); i++) { diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 641b0a2d095ca..bdf1ade37868c 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -231,26 +231,24 @@ int32_t GPUReconstructionCPU::RunChains() } mTimerTotal.Start(); const std::clock_t cpuTimerStart = std::clock(); + int32_t retVal = 0; if (GetProcessingSettings().doublePipeline) { - int32_t retVal = EnqueuePipeline(); - if (retVal) { - return retVal; - } + retVal = EnqueuePipeline(); } else { if (mSlaves.size() || mMaster) { WriteConstantParams(); // Reinitialize // TODO: Get this in sync with GPUChainTracking::DoQueuedUpdates, and consider the doublePipeline } for (uint32_t i = 0; i < mChains.size(); i++) { - int32_t retVal = mChains[i]->RunChain(); - if (retVal) { - return retVal; - } - } - if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) { - ClearAllocatedMemory(); + retVal = mChains[i]->RunChain(); } } + if (retVal != 0 && retVal != 2) { + return retVal; + } mTimerTotal.Stop(); + if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) { + ClearAllocatedMemory(); + } mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC; if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) { GPUInfo("Allocated memory when ending processing %36s", ""); @@ -339,7 +337,13 @@ int32_t GPUReconstructionCPU::RunChains() mTimerTotal.Reset(); } - return 0; + if (GetProcessingSettings().memoryStat) { + PrintMemoryStatistics(); + } else if (GetProcessingSettings().debugLevel >= 2) { + PrintMemoryOverview(); + } + + return retVal; } void GPUReconstructionCPU::ResetDeviceProcessorTypes() diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index fc08b063ff16a..8cf6b29a43d96 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -377,6 +377,7 @@ AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to ha AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB") AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump") AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug") +AddOption(memoryStat, bool, false, "", 0, "Print memory statistics") AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech) @@ -587,7 +588,6 @@ AddOption(zsVersion, int32_t, 2, "", 0, "ZS Version: 1 = 10-bit ADC row based, 2 AddOption(dumpEvents, bool, false, "", 0, "Dump events (after transformation such as encodeZS") AddOption(stripDumpedEvents, bool, false, "", 0, "Remove redundant inputs (e.g. digits and ZS) before dumping") AddOption(printSettings, int32_t, 0, "", 0, "Print all settings", def(1)) -AddOption(memoryStat, bool, false, "", 0, "Print memory statistics") AddOption(testSyncAsync, bool, false, "syncAsync", 0, "Test first synchronous and then asynchronous processing") AddOption(testSync, bool, false, "sync", 0, "Test settings for synchronous phase") AddOption(timeFrameTime, bool, false, "tfTime", 0, "Print some debug information about time frame processing time") diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 91870f981d542..14d0e04eb4dd3 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -278,6 +278,10 @@ bool GPUChainTracking::ValidateSettings() return false; } if (GetProcessingSettings().doublePipeline) { + if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) { + GPUError("Cannot use double pipeline with tpcFreeAllocatedMemoryAfterProcessing"); + return false; + } if (!GetRecoStepsOutputs().isOnlySet(GPUDataTypes::InOutType::TPCMergedTracks, GPUDataTypes::InOutType::TPCCompressedClusters, GPUDataTypes::InOutType::TPCClusters)) { GPUError("Invalid outputs for double pipeline mode 0x%x", (uint32_t)GetRecoStepsOutputs()); return false; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx index 15846246bca0a..fab7179876c04 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx @@ -142,8 +142,10 @@ void GPUChainTracking::PrintMemoryStatistics() std::map usageMap; for (int32_t i = 0; i < NSECTORS; i++) { #ifdef GPUCA_TPC_GEOMETRY_O2 - addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks); - addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters); + if (processors()->tpcClusterer[i].mPmemory) { + addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks); + addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters); + } #endif addToMap("TPC Sector Start Hits", usageMap, *processors()->tpcTrackers[i].NStartHits(), processors()->tpcTrackers[i].NMaxStartHits()); addToMap("TPC Sector Tracklets", usageMap, *processors()->tpcTrackers[i].NTracklets(), processors()->tpcTrackers[i].NMaxTracklets()); @@ -152,8 +154,10 @@ void GPUChainTracking::PrintMemoryStatistics() addToMap("TPC Sector TrackHits", usageMap, *processors()->tpcTrackers[i].NTrackHits(), processors()->tpcTrackers[i].NMaxTrackHits()); } addToMap("TPC Clusterer Clusters", usageMap, mRec->MemoryScalers()->nTPCHits, mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits)); - addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks()); - addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters()); + if (processors()->tpcMerger.Memory()) { + addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks()); + addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters()); + } if (mRec->GetProcessingSettings().createO2Output) { addToMap("TPC O2 Tracks", usageMap, processors()->tpcMerger.NOutputTracksTPCO2(), processors()->tpcMerger.NOutputTracksTPCO2()); @@ -161,9 +165,11 @@ void GPUChainTracking::PrintMemoryStatistics() } #ifdef GPUCA_TPC_GEOMETRY_O2 - addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters); - addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache); - addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks); + if (processors()->tpcCompressor.mOutput) { + addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters); + addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache); + addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks); + } #endif for (auto& elem : usageMap) { @@ -180,8 +186,10 @@ void GPUChainTracking::PrintMemoryRelations() GPUInfo("MEMREL SectorTracks NCl %d NTrk %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracks()); GPUInfo("MEMREL SectorTrackHits NCl %d NTrkH %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTrackHits()); } - GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks()); - GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters()); + if (processors()->tpcMerger.Memory()) { + GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks()); + GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters()); + } } void GPUChainTracking::PrepareKernelDebugOutput() diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index f9c53e3ffd59c..4fe1691afef50 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -649,11 +649,6 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU if (tmpRetVal == 0 || tmpRetVal == 2) { OutputStat(chainTrackingUse, iRun == 0 ? nTracksTotal : nullptr, iRun == 0 ? nClustersTotal : nullptr); - if (configStandalone.memoryStat) { - recUse->PrintMemoryStatistics(); - } else if (configStandalone.proc.debugLevel >= 2) { - recUse->PrintMemoryOverview(); - } } if (tmpRetVal == 0 && configStandalone.testSyncAsync) { @@ -685,9 +680,6 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { OutputStat(chainTrackingAsync, nullptr, nullptr); - if (configStandalone.memoryStat) { - recAsync->PrintMemoryStatistics(); - } } recAsync->ClearAllocatedMemory(); } From b02aeda485cca4fda78e69813c93edeef2d30dc7 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 16 Oct 2025 15:06:20 +0200 Subject: [PATCH 2/6] GPU TPC Decompression: all temporary memory should go to the stack and be freed --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 6 +++++- .../DataCompression/GPUTPCDecompression.cxx | 12 ++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index e24b76678e710..a05736d519bd0 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -639,7 +639,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, res->mPtr = GPUProcessor::alignPointer(res->mPtrDevice); res->SetPointers(res->mPtr); if (GetProcessingSettings().allocDebugLevel >= 2) { - std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n"; + std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n"; } if (res->mType & GPUMemoryResource::MEMORY_STACK) { mNonPersistentIndividualAllocations.emplace_back(res); @@ -896,8 +896,12 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) } mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back()); mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back()); + std::cout << "FOOOO POP " << std::get<2>(mNonPersistentMemoryStack.back()) << " - " << mNonPersistentIndividualAllocations.size(); for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) { GPUMemoryResource* res = mNonPersistentIndividualAllocations[i]; + if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) { + std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n"; + } if (res->mReuse < 0) { operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); } diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index fd0c929dd2ba7..397695b051a86 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -106,12 +106,12 @@ void GPUTPCDecompression::RegisterMemoryAllocation() { AllocateAndInitializeLate(); mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionInput"); - mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU"); - mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput"); - mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput"); - mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering"); - mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering"); - mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering"); + mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersGPU"); + mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersOutput"); + mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersInput"); + mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBufferForFiltering"); + mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpClusterAccessForFiltering"); + mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpClusterCountForFiltering"); } void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io) From 772ddbecf955e7743b883c3f9048c4bc4adedc70 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 17 Oct 2025 11:25:19 +0200 Subject: [PATCH 3/6] GPU TPC: Tracklet memory during seeding when running on the host should be on the stack --- GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx index 41530cb629ce8..7897de4f2002e 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx @@ -102,9 +102,9 @@ void GPUTPCTracker::RegisterMemoryAllocation() uint32_t type = GPUMemoryResource::MEMORY_SCRATCH; if (mRec->GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { // For individual scheme, we allocate tracklets separately, and change the type for the following allocations to custom type |= GPUMemoryResource::MEMORY_CUSTOM; - mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type, "TPCTrackerTracklets"); + mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type | GPUMemoryResource::MEMORY_STACK, "TPCTrackerTracklets"); } - mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersOutput, type, "TPCTrackerTracks"); + mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersOutput, type, "TPCTrackerTracks"); // TODO: Ideally this should eventually go on the stack, so that we can free it after the first phase of track merging } GPUhd() void* GPUTPCTracker::SetPointersTracklets(void* mem) From 901e3e4cb5968b2fa415d6e7c518bc866dafdd93 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 17 Oct 2025 11:25:52 +0200 Subject: [PATCH 4/6] GPU: Add option to free individual stacked allocations per processor on the host --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 23 ++++++++++++++-------- GPU/GPUTracking/Base/GPUReconstruction.h | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index a05736d519bd0..f00c856ad1ff2 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -877,8 +877,11 @@ void GPUReconstruction::PushNonPersistentMemory(uint64_t tag) mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), mNonPersistentIndividualDirectAllocations.size(), tag); } -void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) +void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor* proc) { + if (proc && GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + GPUFatal("Processor-depending memory-free works only with allocation strategy ALLOCATION_INDIVIDUAL"); + } if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) { return; } @@ -888,17 +891,17 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) { GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str()); } - if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) { + if (!proc && (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) { printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size()); PrintMemoryOverview(); printf("%76s", ""); PrintMemoryMax(); } - mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back()); - mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back()); - std::cout << "FOOOO POP " << std::get<2>(mNonPersistentMemoryStack.back()) << " - " << mNonPersistentIndividualAllocations.size(); for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) { GPUMemoryResource* res = mNonPersistentIndividualAllocations[i]; + if (proc && res->mProcessor != proc) { + continue; + } if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) { std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n"; } @@ -908,9 +911,13 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) res->mPtr = nullptr; res->mPtrDevice = nullptr; } - mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back())); - mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back())); - mNonPersistentMemoryStack.pop_back(); + if (!proc) { + mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back()); + mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back()); + mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back())); + mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back())); + mNonPersistentMemoryStack.pop_back(); + } } void GPUReconstruction::BlockStackedMemory(GPUReconstruction* rec) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 420e602e61352..b98f5660a933e 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -179,7 +179,7 @@ class GPUReconstruction void ReturnVolatileMemory(); ThrustVolatileAllocator getThrustVolatileDeviceAllocator(); void PushNonPersistentMemory(uint64_t tag); - void PopNonPersistentMemory(RecoStep step, uint64_t tag); + void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor* proc = nullptr); void BlockStackedMemory(GPUReconstruction* rec); void UnblockStackedMemory(); void ResetRegisteredMemoryPointers(GPUProcessor* proc); From bb4a15b111eb27eff3f341b59b931b4a87dd14f4 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 17 Oct 2025 16:26:51 +0200 Subject: [PATCH 5/6] GPU: Make memory allocation and freeing of individual stacked memory thread-safe --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 7 ++++ GPU/GPUTracking/Base/GPUReconstruction.h | 2 + GPU/GPUTracking/utils/stdspinlock.h | 44 ++++++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 GPU/GPUTracking/utils/stdspinlock.h diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index f00c856ad1ff2..5129ccc4becf1 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -40,6 +40,7 @@ #include "GPULogging.h" #include "utils/strtag.h" +#include "utils/stdspinlock.h" #ifdef GPUCA_O2_LIB #include "GPUO2InterfaceConfiguration.h" @@ -589,6 +590,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, throw std::bad_alloc(); } size_t retVal; + stdspinlock spinlock(mMemoryMutex); if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) { retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1)); memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod(memorypoolend)); @@ -642,6 +644,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n"; } if (res->mType & GPUMemoryResource::MEMORY_STACK) { + stdspinlock spinlock(mMemoryMutex); mNonPersistentIndividualAllocations.emplace_back(res); } if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) { @@ -722,6 +725,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type) { + stdspinlock spinlock(mMemoryMutex); if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]; if ((type & GPUMemoryResource::MEMORY_STACK)) { @@ -763,6 +767,7 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type) void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size) { + stdspinlock spinlock(mMemoryMutex); if (mVolatileMemoryStart == nullptr) { mVolatileMemoryStart = mDeviceMemoryPool; } @@ -788,6 +793,7 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device) return AllocateVolatileDeviceMemory(size); } char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]; + stdspinlock spinlock(mMemoryMutex); mVolatileChunks.emplace_back(retVal, alignedDeleter()); return retVal; } @@ -912,6 +918,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, cons res->mPtrDevice = nullptr; } if (!proc) { + stdspinlock spinlock(mMemoryMutex); mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back()); mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back()); mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back())); diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index b98f5660a933e..b7eda77aeb9fe 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "GPUDataTypes.h" #include "GPUMemoryResource.h" @@ -390,6 +391,7 @@ class GPUReconstruction std::vector> mNonPersistentIndividualDirectAllocations; std::vector> mDirectMemoryChunks; std::vector> mVolatileChunks; + std::atomic_flag mMemoryMutex = ATOMIC_FLAG_INIT; std::unique_ptr mPipelineContext; diff --git a/GPU/GPUTracking/utils/stdspinlock.h b/GPU/GPUTracking/utils/stdspinlock.h new file mode 100644 index 0000000000000..14bf95c45968e --- /dev/null +++ b/GPU/GPUTracking/utils/stdspinlock.h @@ -0,0 +1,44 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file stdspinlock.h +/// \author David Rohr + +#ifndef Q_STDSPINLOCK_H +#define Q_STDSPINLOCK_H + +#include + +class stdspinlock +{ + public: + stdspinlock(std::atomic_flag& flag) : mFlag(&flag) + { + while (flag.test_and_set(std::memory_order_acquire)) { + } + } + void release() + { + if (mFlag) { + mFlag->clear(std::memory_order_release); + mFlag = nullptr; + } + } + ~stdspinlock() + { + release(); + } + + private: + std::atomic_flag* mFlag; +}; + +#endif // Q_STDSPINLOCK_H From bb67843210308617274df59654ee0252c495cffc Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 17 Oct 2025 11:26:17 +0200 Subject: [PATCH 6/6] GPU TPC: Free sector tracking memory earlier --- GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx | 3 +++ GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx | 7 ++++++- GPU/GPUTracking/SectorTracker/GPUTPCTracker.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index d13e8d5544631..7ab2cfeccee80 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -224,6 +224,9 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() GPUInfo("Sector %u, Number of tracks: %d", iSector, *trk.NTracks()); } DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCSectorTracks, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile); + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && !trk.MemoryReuseAllowed()) { + mRec->PopNonPersistentMemory(RecoStep::TPCSectorTracking, qStr2Tag("TPCSLTRK"), &trk); + } }); mRec->SetNActiveThreadsOuterLoop(1); if (error) { diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx index 7897de4f2002e..c5e6a21460a36 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx @@ -84,10 +84,15 @@ void* GPUTPCTracker::SetPointersCommon(void* mem) return mem; } +bool GPUTPCTracker::MemoryReuseAllowed() +{ + return !mRec->GetProcessingSettings().keepDisplayMemory && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSectorTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1); +} + void GPUTPCTracker::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSectorTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1); + bool reuseCondition = MemoryReuseAllowed(); GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (uint16_t)(mISector % mRec->GetProcessingSettings().nStreams)}; mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSectorLinks", reLinks); mMemoryResSectorScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSectorScratch"); diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h index 2667da4a53977..aee429c959e98 100644 --- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h +++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h @@ -103,6 +103,7 @@ class GPUTPCTracker : public GPUProcessor void* SetPointersTracklets(void* mem); void* SetPointersOutput(void* mem); void RegisterMemoryAllocation(); + bool MemoryReuseAllowed(); int16_t MemoryResLinks() const { return mMemoryResLinks; } int16_t MemoryResScratchHost() const { return mMemoryResScratchHost; }