diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index 6d64fb3daca6a..5129ccc4becf1 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -40,6 +40,7 @@
 #include "GPULogging.h"
 #include "utils/strtag.h"
+#include "utils/stdspinlock.h"

 #ifdef GPUCA_O2_LIB
 #include "GPUO2InterfaceConfiguration.h"
@@ -589,6 +590,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res,
     throw std::bad_alloc();
   }
   size_t retVal;
+  stdspinlock spinlock(mMemoryMutex);
   if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {
     retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));
     memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod(memorypoolend));
@@ -639,9 +641,10 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
     res->mPtr = GPUProcessor::alignPointer(res->mPtrDevice);
     res->SetPointers(res->mPtr);
     if (GetProcessingSettings().allocDebugLevel >= 2) {
-      std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n";
+      std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";
     }
     if (res->mType & GPUMemoryResource::MEMORY_STACK) {
+      stdspinlock spinlock(mMemoryMutex);
       mNonPersistentIndividualAllocations.emplace_back(res);
     }
     if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
@@ -722,6 +725,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro

 void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 {
+  stdspinlock spinlock(mMemoryMutex);
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
     char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
     if ((type & GPUMemoryResource::MEMORY_STACK)) {
@@ -763,6 +767,7 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)

 void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
 {
+  stdspinlock spinlock(mMemoryMutex);
   if (mVolatileMemoryStart == nullptr) {
     mVolatileMemoryStart = mDeviceMemoryPool;
   }
@@ -788,6 +793,7 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
     return AllocateVolatileDeviceMemory(size);
   }
   char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
+  stdspinlock spinlock(mMemoryMutex);
   mVolatileChunks.emplace_back(retVal, alignedDeleter());
   return retVal;
 }
@@ -877,8 +883,11 @@ void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
   mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), mNonPersistentIndividualDirectAllocations.size(), tag);
 }

-void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
+void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor* proc)
 {
+  if (proc && GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
+    GPUFatal("Processor-dependent memory freeing works only with allocation strategy ALLOCATION_INDIVIDUAL");
+  }
   if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) {
     return;
   }
@@ -888,25 +897,34 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
   if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
     GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
   }
-  if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
+  if (!proc && (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
     printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
     PrintMemoryOverview();
     printf("%76s", "");
     PrintMemoryMax();
   }
-  mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
-  mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
   for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) {
     GPUMemoryResource* res = mNonPersistentIndividualAllocations[i];
+    if (proc && res->mProcessor != proc) {
+      continue;
+    }
+    if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
+      std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
+    }
     if (res->mReuse < 0) {
       operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
     }
     res->mPtr = nullptr;
     res->mPtrDevice = nullptr;
   }
-  mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
-  mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back()));
-  mNonPersistentMemoryStack.pop_back();
+  if (!proc) {
+    stdspinlock spinlock(mMemoryMutex);
+    mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
+    mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
+    mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
+    mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back()));
+    mNonPersistentMemoryStack.pop_back();
+  }
 }

 void GPUReconstruction::BlockStackedMemory(GPUReconstruction* rec)
@@ -999,7 +1017,7 @@ void GPUReconstruction::PrintMemoryStatistics()
   }
   printf("%59s CPU / %9s GPU\n", "", "");
   for (auto it = sizes.begin(); it != sizes.end(); it++) {
-    printf("Allocation %30s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
+    printf("Allocation %50s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
   }
   PrintMemoryOverview();
   for (uint32_t i = 0; i < mChains.size(); i++) {
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 420e602e61352..b7eda77aeb9fe 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <atomic>

 #include "GPUDataTypes.h"
 #include "GPUMemoryResource.h"
@@ -179,7 +180,7 @@ class GPUReconstruction
   void ReturnVolatileMemory();
   ThrustVolatileAllocator getThrustVolatileDeviceAllocator();
   void PushNonPersistentMemory(uint64_t tag);
-  void PopNonPersistentMemory(RecoStep step, uint64_t tag);
+  void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor* proc = nullptr);
   void BlockStackedMemory(GPUReconstruction* rec);
   void UnblockStackedMemory();
   void ResetRegisteredMemoryPointers(GPUProcessor* proc);
@@ -390,6 +391,7 @@ class GPUReconstruction
   std::vector<std::unique_ptr<char[], alignedDeleter>> mNonPersistentIndividualDirectAllocations;
   std::vector<std::unique_ptr<char[], alignedDeleter>> mDirectMemoryChunks;
   std::vector<std::unique_ptr<char[], alignedDeleter>> mVolatileChunks;
+  std::atomic_flag mMemoryMutex = ATOMIC_FLAG_INIT;

   std::unique_ptr<GPUReconstructionPipelineContext> mPipelineContext;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 641b0a2d095ca..bdf1ade37868c 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -231,26 +231,24 @@ int32_t GPUReconstructionCPU::RunChains()
   }
   mTimerTotal.Start();
   const std::clock_t cpuTimerStart = std::clock();
+  int32_t retVal = 0;
   if (GetProcessingSettings().doublePipeline) {
-    int32_t retVal = EnqueuePipeline();
-    if (retVal) {
-      return retVal;
-    }
+    retVal = EnqueuePipeline();
   } else {
     if (mSlaves.size() || mMaster) {
       WriteConstantParams(); // Reinitialize // TODO: Get this in sync with GPUChainTracking::DoQueuedUpdates, and consider the doublePipeline
     }
     for (uint32_t i = 0; i < mChains.size(); i++) {
-      int32_t retVal = mChains[i]->RunChain();
-      if (retVal) {
-        return retVal;
-      }
-    }
-    if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
-      ClearAllocatedMemory();
+      retVal = mChains[i]->RunChain();
     }
   }
+  if (retVal != 0 && retVal != 2) {
+    return retVal;
+  }
   mTimerTotal.Stop();
+  if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
+    ClearAllocatedMemory();
+  }
   mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
   if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
     GPUInfo("Allocated memory when ending processing %36s", "");
@@ -339,7 +337,13 @@ int32_t GPUReconstructionCPU::RunChains()
     mTimerTotal.Reset();
   }

-  return 0;
+  if (GetProcessingSettings().memoryStat) {
+    PrintMemoryStatistics();
+  } else if (GetProcessingSettings().debugLevel >= 2) {
+    PrintMemoryOverview();
+  }
+
+  return retVal;
 }

 void GPUReconstructionCPU::ResetDeviceProcessorTypes()
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index fd0c929dd2ba7..397695b051a86 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -106,12 +106,12 @@ void GPUTPCDecompression::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionInput");
-  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU");
-  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput");
-  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput");
-  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering");
-  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering");
-  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering");
+  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersGPU");
+  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersOutput");
+  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersInput");
+  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBufferForFiltering");
+  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpClusterAccessForFiltering");
+  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpClusterCountForFiltering");
 }

 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index fc08b063ff16a..8cf6b29a43d96 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -377,6 +377,7 @@ AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to ha
 AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
 AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
 AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug")
+AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
 AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)
@@ -587,7 +588,6 @@ AddOption(zsVersion, int32_t, 2, "", 0, "ZS Version: 1 = 10-bit ADC row based, 2
 AddOption(dumpEvents, bool, false, "", 0, "Dump events (after transformation such as encodeZS")
 AddOption(stripDumpedEvents, bool, false, "", 0, "Remove redundant inputs (e.g. digits and ZS) before dumping")
 AddOption(printSettings, int32_t, 0, "", 0, "Print all settings", def(1))
-AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
 AddOption(testSyncAsync, bool, false, "syncAsync", 0, "Test first synchronous and then asynchronous processing")
 AddOption(testSync, bool, false, "sync", 0, "Test settings for synchronous phase")
 AddOption(timeFrameTime, bool, false, "tfTime", 0, "Print some debug information about time frame processing time")
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 91870f981d542..14d0e04eb4dd3 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -278,6 +278,10 @@ bool GPUChainTracking::ValidateSettings()
     return false;
   }
   if (GetProcessingSettings().doublePipeline) {
+    if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
+      GPUError("Cannot use double pipeline with tpcFreeAllocatedMemoryAfterProcessing");
+      return false;
+    }
     if (!GetRecoStepsOutputs().isOnlySet(GPUDataTypes::InOutType::TPCMergedTracks, GPUDataTypes::InOutType::TPCCompressedClusters, GPUDataTypes::InOutType::TPCClusters)) {
       GPUError("Invalid outputs for double pipeline mode 0x%x", (uint32_t)GetRecoStepsOutputs());
       return false;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index 15846246bca0a..fab7179876c04 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -142,8 +142,10 @@ void GPUChainTracking::PrintMemoryStatistics()
   std::map usageMap;
   for (int32_t i = 0; i < NSECTORS; i++) {
 #ifdef GPUCA_TPC_GEOMETRY_O2
-    addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks);
-    addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters);
+    if (processors()->tpcClusterer[i].mPmemory) {
+      addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks);
+      addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters);
+    }
 #endif
     addToMap("TPC Sector Start Hits", usageMap, *processors()->tpcTrackers[i].NStartHits(), processors()->tpcTrackers[i].NMaxStartHits());
     addToMap("TPC Sector Tracklets", usageMap, *processors()->tpcTrackers[i].NTracklets(), processors()->tpcTrackers[i].NMaxTracklets());
@@ -152,8 +154,10 @@ void GPUChainTracking::PrintMemoryStatistics()
     addToMap("TPC Sector TrackHits", usageMap, *processors()->tpcTrackers[i].NTrackHits(), processors()->tpcTrackers[i].NMaxTrackHits());
   }
   addToMap("TPC Clusterer Clusters", usageMap, mRec->MemoryScalers()->nTPCHits, mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits));
-  addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks());
-  addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters());
+  if (processors()->tpcMerger.Memory()) {
+    addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks());
+    addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters());
+  }
   if (mRec->GetProcessingSettings().createO2Output) {
     addToMap("TPC O2 Tracks", usageMap, processors()->tpcMerger.NOutputTracksTPCO2(), processors()->tpcMerger.NOutputTracksTPCO2());
@@ -161,9 +165,11 @@ void GPUChainTracking::PrintMemoryStatistics()
   }

 #ifdef GPUCA_TPC_GEOMETRY_O2
-  addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters);
-  addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache);
-  addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks);
+  if (processors()->tpcCompressor.mOutput) {
+    addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters);
+    addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache);
+    addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks);
+  }
 #endif

   for (auto& elem : usageMap) {
@@ -180,8 +186,10 @@ void GPUChainTracking::PrintMemoryRelations()
     GPUInfo("MEMREL SectorTracks NCl %d NTrk %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracks());
     GPUInfo("MEMREL SectorTrackHits NCl %d NTrkH %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTrackHits());
   }
-  GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks());
-  GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters());
+  if (processors()->tpcMerger.Memory()) {
+    GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks());
+    GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters());
+  }
 }

 void GPUChainTracking::PrepareKernelDebugOutput()
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
index d13e8d5544631..7ab2cfeccee80 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
@@ -224,6 +224,9 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
         GPUInfo("Sector %u, Number of tracks: %d", iSector, *trk.NTracks());
       }
       DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCSectorTracks, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile);
+      if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && !trk.MemoryReuseAllowed()) {
+        mRec->PopNonPersistentMemory(RecoStep::TPCSectorTracking, qStr2Tag("TPCSLTRK"), &trk);
+      }
     });
     mRec->SetNActiveThreadsOuterLoop(1);
     if (error) {
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
index 41530cb629ce8..c5e6a21460a36 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
@@ -84,10 +84,15 @@ void* GPUTPCTracker::SetPointersCommon(void* mem)
   return mem;
 }

+bool GPUTPCTracker::MemoryReuseAllowed()
+{
+  return !mRec->GetProcessingSettings().keepDisplayMemory && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSectorTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1);
+}
+
 void GPUTPCTracker::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
-  bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSectorTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1);
+  bool reuseCondition = MemoryReuseAllowed();
   GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (uint16_t)(mISector % mRec->GetProcessingSettings().nStreams)};
   mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSectorLinks", reLinks);
   mMemoryResSectorScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSectorScratch");
@@ -102,9 +107,9 @@ void GPUTPCTracker::RegisterMemoryAllocation()
   uint32_t type = GPUMemoryResource::MEMORY_SCRATCH;
   if (mRec->GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { // For individual scheme, we allocate tracklets separately, and change the type for the following allocations to custom
     type |= GPUMemoryResource::MEMORY_CUSTOM;
-    mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type, "TPCTrackerTracklets");
+    mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type | GPUMemoryResource::MEMORY_STACK, "TPCTrackerTracklets");
   }
-  mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersOutput, type, "TPCTrackerTracks");
+  mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersOutput, type, "TPCTrackerTracks"); // TODO: Ideally this should eventually go on the stack, so that we can free it after the first phase of track merging
 }

 GPUhd() void* GPUTPCTracker::SetPointersTracklets(void* mem)
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h
index 2667da4a53977..aee429c959e98 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h
@@ -103,6 +103,7 @@ class GPUTPCTracker : public GPUProcessor
   void* SetPointersTracklets(void* mem);
   void* SetPointersOutput(void* mem);
   void RegisterMemoryAllocation();
+  bool MemoryReuseAllowed();
   int16_t MemoryResLinks() const { return mMemoryResLinks; }
   int16_t MemoryResScratchHost() const { return mMemoryResScratchHost; }
diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index f9c53e3ffd59c..4fe1691afef50 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -649,11 +649,6 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU

     if (tmpRetVal == 0 || tmpRetVal == 2) {
       OutputStat(chainTrackingUse, iRun == 0 ? nTracksTotal : nullptr, iRun == 0 ? nClustersTotal : nullptr);
-      if (configStandalone.memoryStat) {
-        recUse->PrintMemoryStatistics();
-      } else if (configStandalone.proc.debugLevel >= 2) {
-        recUse->PrintMemoryOverview();
-      }
     }

     if (tmpRetVal == 0 && configStandalone.testSyncAsync) {
@@ -685,9 +680,6 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
       tmpRetVal = recAsync->RunChains();
       if (tmpRetVal == 0 || tmpRetVal == 2) {
         OutputStat(chainTrackingAsync, nullptr, nullptr);
-        if (configStandalone.memoryStat) {
-          recAsync->PrintMemoryStatistics();
-        }
       }
       recAsync->ClearAllocatedMemory();
     }
diff --git a/GPU/GPUTracking/utils/stdspinlock.h b/GPU/GPUTracking/utils/stdspinlock.h
new file mode 100644
index 0000000000000..14bf95c45968e
--- /dev/null
+++ b/GPU/GPUTracking/utils/stdspinlock.h
@@ -0,0 +1,44 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file stdspinlock.h
+/// \author David Rohr
+
+#ifndef Q_STDSPINLOCK_H
+#define Q_STDSPINLOCK_H
+
+#include <atomic>
+
+class stdspinlock
+{
+ public:
+  stdspinlock(std::atomic_flag& flag) : mFlag(&flag)
+  {
+    while (flag.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+  void release()
+  {
+    if (mFlag) {
+      mFlag->clear(std::memory_order_release);
+      mFlag = nullptr;
+    }
+  }
+  ~stdspinlock()
+  {
+    release();
+  }
+
+ private:
+  std::atomic_flag* mFlag;
+};
+
+#endif // Q_STDSPINLOCK_H
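
Usage note on the new stdspinlock helper: it is a RAII guard over a std::atomic_flag. The constructor busy-waits on test_and_set() with acquire ordering until it obtains the flag; the destructor, or an explicit early release(), clears the flag with release ordering, and release() nulls the stored pointer so a second release is harmless. The guard is not recursive, so re-acquiring a flag already held by the same thread spins forever. Below is a minimal sketch of the intended pattern; stdspinlock, ATOMIC_FLAG_INIT, and the guard semantics come from the patch above, while sCounterLock, sCounter, and incrementCounter() are hypothetical names for illustration:

    #include <atomic>
    #include "utils/stdspinlock.h"

    static std::atomic_flag sCounterLock = ATOMIC_FLAG_INIT; // same initialization style as mMemoryMutex
    static int sCounter = 0;                                 // hypothetical shared state to protect

    void incrementCounter()
    {
      stdspinlock lock(sCounterLock); // constructor spins on test_and_set(std::memory_order_acquire)
      sCounter++;                     // critical section: at most one thread at a time
      // lock.release() would drop the lock early; otherwise ~stdspinlock
      // clears the flag with std::memory_order_release at scope exit
    }

This mirrors how the patch guards GPUReconstruction's memory bookkeeping, e.g. the stdspinlock spinlock(mMemoryMutex) lines in AllocateDirectMemory() and PopNonPersistentMemory().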