From 4cd7589243f01f02cbd875b422a058d080825da8 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 16 Oct 2025 14:01:40 +0200
Subject: [PATCH 1/6] GPU: Make memoryStat work from GPUWorkflow

---
 GPU/GPUTracking/Base/GPUReconstruction.cxx    |  2 +-
 GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 28 +++++++++++--------
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  2 +-
 GPU/GPUTracking/Global/GPUChainTracking.cxx   |  4 +++
 .../GPUChainTrackingDebugAndProfiling.cxx     | 26 +++++++++++------
 .../Standalone/Benchmark/standalone.cxx       |  8 ------
 6 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index 6d64fb3daca6a..e24b76678e710 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -999,7 +999,7 @@ void GPUReconstruction::PrintMemoryStatistics()
   }
   printf("%59s CPU / %9s GPU\n", "", "");
   for (auto it = sizes.begin(); it != sizes.end(); it++) {
-    printf("Allocation %30s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
+    printf("Allocation %50s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
   }
   PrintMemoryOverview();
   for (uint32_t i = 0; i < mChains.size(); i++) {
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 641b0a2d095ca..bdf1ade37868c 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -231,26 +231,27 @@ int32_t GPUReconstructionCPU::RunChains()
   }
   mTimerTotal.Start();
   const std::clock_t cpuTimerStart = std::clock();
+  int32_t retVal = 0;
   if (GetProcessingSettings().doublePipeline) {
-    int32_t retVal = EnqueuePipeline();
-    if (retVal) {
-      return retVal;
-    }
+    retVal = EnqueuePipeline();
   } else {
     if (mSlaves.size() || mMaster) {
       WriteConstantParams(); // Reinitialize // TODO: Get this in sync with GPUChainTracking::DoQueuedUpdates, and consider the doublePipeline
     }
     for (uint32_t i = 0; i < mChains.size(); i++) {
-      int32_t retVal = mChains[i]->RunChain();
-      if (retVal) {
-        return retVal;
-      }
-    }
-    if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
-      ClearAllocatedMemory();
+      retVal = mChains[i]->RunChain();
+      if (retVal) { // Stop at the first chain reporting a non-zero status, so that a later chain can neither run after a failure nor overwrite the status
+        break;
+      }
     }
   }
+  if (retVal != 0 && retVal != 2) { // Only 0 and 2 continue to the timer / memory cleanup / statistics code below, any other status is returned immediately
+    return retVal;
+  }
   mTimerTotal.Stop();
+  if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
+    ClearAllocatedMemory();
+  }
   mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
   if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
     GPUInfo("Allocated memory when ending processing %36s", "");
@@ -339,7 +340,13 @@ int32_t GPUReconstructionCPU::RunChains()
     mTimerTotal.Reset();
   }
 
-  return 0;
+  if (GetProcessingSettings().memoryStat) {
+    PrintMemoryStatistics();
+  } else if (GetProcessingSettings().debugLevel >= 2) {
+    PrintMemoryOverview();
+  }
+
+  return retVal;
 }
 
 void GPUReconstructionCPU::ResetDeviceProcessorTypes()
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index fc08b063ff16a..8cf6b29a43d96 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -377,6 +377,7 @@ AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to ha
 AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB")
 AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump")
 AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug")
+AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
 AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr)
 AddSubConfig(GPUSettingsProcessingRTC, rtc)
 AddSubConfig(GPUSettingsProcessingRTCtechnical, rtctech)
@@ -587,7 +588,6 @@ AddOption(zsVersion, int32_t, 2, "", 0, "ZS Version: 1 = 10-bit ADC row based, 2
 AddOption(dumpEvents, bool, false, "", 0, "Dump events (after transformation such as encodeZS")
 AddOption(stripDumpedEvents, bool, false, "", 0, "Remove redundant inputs (e.g. digits and ZS) before dumping")
 AddOption(printSettings, int32_t, 0, "", 0, "Print all settings", def(1))
-AddOption(memoryStat, bool, false, "", 0, "Print memory statistics")
 AddOption(testSyncAsync, bool, false, "syncAsync", 0, "Test first synchronous and then asynchronous processing")
 AddOption(testSync, bool, false, "sync", 0, "Test settings for synchronous phase")
 AddOption(timeFrameTime, bool, false, "tfTime", 0, "Print some debug information about time frame processing time")
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 91870f981d542..14d0e04eb4dd3 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -278,6 +278,10 @@ bool GPUChainTracking::ValidateSettings()
     return false;
   }
   if (GetProcessingSettings().doublePipeline) {
+    if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
+      GPUError("Cannot use double pipeline with tpcFreeAllocatedMemoryAfterProcessing");
+      return false;
+    }
     if (!GetRecoStepsOutputs().isOnlySet(GPUDataTypes::InOutType::TPCMergedTracks, GPUDataTypes::InOutType::TPCCompressedClusters, GPUDataTypes::InOutType::TPCClusters)) {
       GPUError("Invalid outputs for double pipeline mode 0x%x", (uint32_t)GetRecoStepsOutputs());
       return false;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index 15846246bca0a..fab7179876c04 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -142,8 +142,10 @@ void GPUChainTracking::PrintMemoryStatistics()
   std::map<std::string, GPUChainTrackingMemUsage> usageMap;
   for (int32_t i = 0; i < NSECTORS; i++) {
 #ifdef GPUCA_TPC_GEOMETRY_O2
-    addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks);
-    addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters);
+    if (processors()->tpcClusterer[i].mPmemory) {
+      addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks);
+      addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters);
+    }
 #endif
     addToMap("TPC Sector Start Hits", usageMap, *processors()->tpcTrackers[i].NStartHits(), processors()->tpcTrackers[i].NMaxStartHits());
     addToMap("TPC Sector Tracklets", usageMap, *processors()->tpcTrackers[i].NTracklets(), processors()->tpcTrackers[i].NMaxTracklets());
@@ -152,8 +154,10 @@ void GPUChainTracking::PrintMemoryStatistics()
     addToMap("TPC Sector TrackHits", usageMap, *processors()->tpcTrackers[i].NTrackHits(), processors()->tpcTrackers[i].NMaxTrackHits());
   }
   addToMap("TPC Clusterer Clusters", usageMap, mRec->MemoryScalers()->nTPCHits, mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits));
-  addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks());
-  addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters());
+  if (processors()->tpcMerger.Memory()) {
+    addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks());
+    addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters());
+  }
 
   if (mRec->GetProcessingSettings().createO2Output) {
     addToMap("TPC O2 Tracks", usageMap, processors()->tpcMerger.NOutputTracksTPCO2(), processors()->tpcMerger.NOutputTracksTPCO2());
@@ -161,9 +165,11 @@ void GPUChainTracking::PrintMemoryStatistics()
   }
 
 #ifdef GPUCA_TPC_GEOMETRY_O2
-  addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters);
-  addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache);
-  addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks);
+  if (processors()->tpcCompressor.mOutput) {
+    addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters);
+    addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache);
+    addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks);
+  }
 #endif
 
   for (auto& elem : usageMap) {
@@ -180,8 +186,10 @@ void GPUChainTracking::PrintMemoryRelations()
     GPUInfo("MEMREL SectorTracks NCl %d NTrk %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracks());
     GPUInfo("MEMREL SectorTrackHits NCl %d NTrkH %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTrackHits());
   }
-  GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks());
-  GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters());
+  if (processors()->tpcMerger.Memory()) {
+    GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks());
+    GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters());
+  }
 }
 
 void GPUChainTracking::PrepareKernelDebugOutput()
diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index f9c53e3ffd59c..4fe1691afef50 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -649,11 +649,6 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
 
     if (tmpRetVal == 0 || tmpRetVal == 2) {
       OutputStat(chainTrackingUse, iRun == 0 ? nTracksTotal : nullptr, iRun == 0 ? nClustersTotal : nullptr);
-      if (configStandalone.memoryStat) {
-        recUse->PrintMemoryStatistics();
-      } else if (configStandalone.proc.debugLevel >= 2) {
-        recUse->PrintMemoryOverview();
-      }
     }
 
     if (tmpRetVal == 0 && configStandalone.testSyncAsync) {
@@ -685,9 +680,6 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
       tmpRetVal = recAsync->RunChains();
       if (tmpRetVal == 0 || tmpRetVal == 2) {
         OutputStat(chainTrackingAsync, nullptr, nullptr);
-        if (configStandalone.memoryStat) {
-          recAsync->PrintMemoryStatistics();
-        }
       }
       recAsync->ClearAllocatedMemory();
     }

From b02aeda485cca4fda78e69813c93edeef2d30dc7 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 16 Oct 2025 15:06:20 +0200
Subject: [PATCH 2/6] GPU TPC Decompression: all temporary memory should go to
 the stack and be freed

---
 GPU/GPUTracking/Base/GPUReconstruction.cxx           |  6 +++++-
 .../DataCompression/GPUTPCDecompression.cxx          | 12 ++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index e24b76678e710..a05736d519bd0 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -639,7 +639,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
       res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(res->mPtrDevice);
       res->SetPointers(res->mPtr);
       if (GetProcessingSettings().allocDebugLevel >= 2) {
-        std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n";
+        std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";
       }
       if (res->mType & GPUMemoryResource::MEMORY_STACK) {
         mNonPersistentIndividualAllocations.emplace_back(res);
@@ -896,8 +896,12 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
   }
   mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
   mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
+  std::cout << "FOOOO POP " << std::get<2>(mNonPersistentMemoryStack.back()) << " - " << mNonPersistentIndividualAllocations.size();
   for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) {
     GPUMemoryResource* res = mNonPersistentIndividualAllocations[i];
+    if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
+      std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
+    }
     if (res->mReuse < 0) {
       operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
     }
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index fd0c929dd2ba7..397695b051a86 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -106,12 +106,12 @@ void GPUTPCDecompression::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionInput");
-  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU");
-  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput");
-  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput");
-  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering");
-  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering");
-  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering");
+  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersGPU");
+  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersOutput");
+  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBuffersInput");
+  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpBufferForFiltering");
+  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpClusterAccessForFiltering");
+  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_STACK, "TPCDecompressionTmpClusterCountForFiltering");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)

From 772ddbecf955e7743b883c3f9048c4bc4adedc70 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 17 Oct 2025 11:25:19 +0200
Subject: [PATCH 3/6] GPU TPC: Tracklet memory during seeding when running on
 the host should be on the stack

---
 GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
index 41530cb629ce8..7897de4f2002e 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
@@ -102,9 +102,9 @@ void GPUTPCTracker::RegisterMemoryAllocation()
   uint32_t type = GPUMemoryResource::MEMORY_SCRATCH;
   if (mRec->GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { // For individual scheme, we allocate tracklets separately, and change the type for the following allocations to custom
     type |= GPUMemoryResource::MEMORY_CUSTOM;
-    mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type, "TPCTrackerTracklets");
+    mMemoryResTracklets = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersTracklets, type | GPUMemoryResource::MEMORY_STACK, "TPCTrackerTracklets");
   }
-  mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersOutput, type, "TPCTrackerTracks");
+  mMemoryResOutput = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersOutput, type, "TPCTrackerTracks"); // TODO: Ideally this should eventually go on the stack, so that we can free it after the first phase of track merging
 }
 
 GPUhd() void* GPUTPCTracker::SetPointersTracklets(void* mem)

From 901e3e4cb5968b2fa415d6e7c518bc866dafdd93 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 17 Oct 2025 11:25:52 +0200
Subject: [PATCH 4/6] GPU: Add option to free individual stacked allocations
 per processor on the host

---
 GPU/GPUTracking/Base/GPUReconstruction.cxx | 23 ++++++++++++++--------
 GPU/GPUTracking/Base/GPUReconstruction.h   |  2 +-
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index a05736d519bd0..f00c856ad1ff2 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -877,8 +877,11 @@ void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
   mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), mNonPersistentIndividualDirectAllocations.size(), tag);
 }
 
-void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
+void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor* proc)
 {
+  if (proc && GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
+    GPUFatal("Processor-depending memory-free works only with allocation strategy ALLOCATION_INDIVIDUAL");
+  }
   if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) {
     return;
   }
@@ -888,17 +891,17 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
   if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
     GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
   }
-  if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
+  if (!proc && (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
     printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
     PrintMemoryOverview();
     printf("%76s", "");
     PrintMemoryMax();
   }
-  mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
-  mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
-  std::cout << "FOOOO POP " << std::get<2>(mNonPersistentMemoryStack.back()) << " - " << mNonPersistentIndividualAllocations.size();
   for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) {
     GPUMemoryResource* res = mNonPersistentIndividualAllocations[i];
+    if (proc && res->mProcessor != proc) {
+      continue;
+    }
     if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
       std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
     }
@@ -908,9 +911,13 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
     res->mPtr = nullptr;
     res->mPtrDevice = nullptr;
   }
-  mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
-  mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back()));
-  mNonPersistentMemoryStack.pop_back();
+  if (!proc) {
+    mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
+    mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
+    mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
+    mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back()));
+    mNonPersistentMemoryStack.pop_back();
+  }
 }
 
 void GPUReconstruction::BlockStackedMemory(GPUReconstruction* rec)
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 420e602e61352..b98f5660a933e 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -179,7 +179,7 @@ class GPUReconstruction
   void ReturnVolatileMemory();
   ThrustVolatileAllocator getThrustVolatileDeviceAllocator();
   void PushNonPersistentMemory(uint64_t tag);
-  void PopNonPersistentMemory(RecoStep step, uint64_t tag);
+  void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor* proc = nullptr);
   void BlockStackedMemory(GPUReconstruction* rec);
   void UnblockStackedMemory();
   void ResetRegisteredMemoryPointers(GPUProcessor* proc);

From bb4a15b111eb27eff3f341b59b931b4a87dd14f4 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 17 Oct 2025 16:26:51 +0200
Subject: [PATCH 5/6] GPU: Make memory allocation and freeing of individual
 stacked memory thread-safe

---
 GPU/GPUTracking/Base/GPUReconstruction.cxx |  7 ++++
 GPU/GPUTracking/Base/GPUReconstruction.h   |  2 +
 GPU/GPUTracking/utils/stdspinlock.h        | 44 ++++++++++++++++++++++
 3 files changed, 53 insertions(+)
 create mode 100644 GPU/GPUTracking/utils/stdspinlock.h

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index f00c856ad1ff2..5129ccc4becf1 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -40,6 +40,7 @@
 
 #include "GPULogging.h"
 #include "utils/strtag.h"
+#include "utils/stdspinlock.h"
 
 #ifdef GPUCA_O2_LIB
 #include "GPUO2InterfaceConfiguration.h"
@@ -589,6 +590,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res,
     throw std::bad_alloc();
   }
   size_t retVal;
+  stdspinlock spinlock(mMemoryMutex);
   if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {
     retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));
     memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));
@@ -642,6 +644,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
         std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";
       }
       if (res->mType & GPUMemoryResource::MEMORY_STACK) {
+        stdspinlock spinlock(mMemoryMutex);
         mNonPersistentIndividualAllocations.emplace_back(res);
       }
       if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
@@ -722,6 +725,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro
 
 void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 {
+  stdspinlock spinlock(mMemoryMutex);
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
     char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
     if ((type & GPUMemoryResource::MEMORY_STACK)) {
@@ -763,6 +767,7 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 
 void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
 {
+  stdspinlock spinlock(mMemoryMutex);
   if (mVolatileMemoryStart == nullptr) {
     mVolatileMemoryStart = mDeviceMemoryPool;
   }
@@ -788,6 +793,7 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
     return AllocateVolatileDeviceMemory(size);
   }
   char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
+  stdspinlock spinlock(mMemoryMutex);
   mVolatileChunks.emplace_back(retVal, alignedDeleter());
   return retVal;
 }
@@ -912,6 +918,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, cons
     res->mPtrDevice = nullptr;
   }
   if (!proc) {
+    stdspinlock spinlock(mMemoryMutex);
     mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
     mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
     mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index b98f5660a933e..b7eda77aeb9fe 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -25,6 +25,7 @@
 #include <functional>
 #include <unordered_map>
 #include <unordered_set>
+#include <atomic>
 
 #include "GPUDataTypes.h"
 #include "GPUMemoryResource.h"
@@ -390,6 +391,7 @@ class GPUReconstruction
   std::vector<std::unique_ptr<char[], alignedDeleter>> mNonPersistentIndividualDirectAllocations;
   std::vector<std::unique_ptr<char[], alignedDeleter>> mDirectMemoryChunks;
   std::vector<std::unique_ptr<char[], alignedDeleter>> mVolatileChunks;
+  std::atomic_flag mMemoryMutex = ATOMIC_FLAG_INIT;
 
   std::unique_ptr<GPUReconstructionPipelineContext> mPipelineContext;
 
diff --git a/GPU/GPUTracking/utils/stdspinlock.h b/GPU/GPUTracking/utils/stdspinlock.h
new file mode 100644
index 0000000000000..14bf95c45968e
--- /dev/null
+++ b/GPU/GPUTracking/utils/stdspinlock.h
@@ -0,0 +1,44 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file stdspinlock.h
+/// \author David Rohr
+
+#ifndef Q_STDSPINLOCK_H
+#define Q_STDSPINLOCK_H
+
+#include <atomic>
+
+// Scoped spinlock on a std::atomic_flag: the constructor busy-waits until it owns the flag, the destructor (or an earlier release()) clears it.
+class stdspinlock
+{
+ public:
+  explicit stdspinlock(std::atomic_flag& flag) : mFlag(&flag)
+  {
+    while (flag.test_and_set(std::memory_order_acquire)) { // Spin until test_and_set reports that the flag was clear before
+    }
+  }
+  stdspinlock(const stdspinlock&) = delete; // A copy would clear the flag a second time, possibly while another thread holds it
+  stdspinlock& operator=(const stdspinlock&) = delete;
+  void release() // Unlock before the end of the scope; further calls and the destructor are then no-ops
+  {
+    if (mFlag) {
+      mFlag->clear(std::memory_order_release);
+      mFlag = nullptr;
+    }
+  }
+  ~stdspinlock() { release(); }
+
+ private:
+  std::atomic_flag* mFlag; // Flag currently held, nullptr after release()
+};
+
+#endif // Q_STDSPINLOCK_H

From bb67843210308617274df59654ee0252c495cffc Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Fri, 17 Oct 2025 11:26:17 +0200
Subject: [PATCH 6/6] GPU TPC: Free sector tracking memory earlier

---
 GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx | 3 +++
 GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx          | 7 ++++++-
 GPU/GPUTracking/SectorTracker/GPUTPCTracker.h            | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
index d13e8d5544631..7ab2cfeccee80 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
@@ -224,6 +224,9 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
       GPUInfo("Sector %u, Number of tracks: %d", iSector, *trk.NTracks());
     }
     DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCSectorTracks, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile);
+    if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && !trk.MemoryReuseAllowed()) {
+      mRec->PopNonPersistentMemory(RecoStep::TPCSectorTracking, qStr2Tag("TPCSLTRK"), &trk);
+    }
   });
   mRec->SetNActiveThreadsOuterLoop(1);
   if (error) {
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
index 7897de4f2002e..c5e6a21460a36 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.cxx
@@ -84,10 +84,15 @@ void* GPUTPCTracker::SetPointersCommon(void* mem)
   return mem;
 }
 
+bool GPUTPCTracker::MemoryReuseAllowed() // True if keepDisplayMemory is off and at least one of: TPCSectorTracking is among the GPU reco steps, inKernelParallel == 1, nHostThreads == 1
+{
+  return !mRec->GetProcessingSettings().keepDisplayMemory && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSectorTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1);
+}
+
 void GPUTPCTracker::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
-  bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSectorTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1);
+  bool reuseCondition = MemoryReuseAllowed();
   GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (uint16_t)(mISector % mRec->GetProcessingSettings().nStreams)};
   mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSectorLinks", reLinks);
   mMemoryResSectorScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSectorScratch");
diff --git a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h
index 2667da4a53977..aee429c959e98 100644
--- a/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h
+++ b/GPU/GPUTracking/SectorTracker/GPUTPCTracker.h
@@ -103,6 +103,7 @@ class GPUTPCTracker : public GPUProcessor
   void* SetPointersTracklets(void* mem);
   void* SetPointersOutput(void* mem);
   void RegisterMemoryAllocation();
+  bool MemoryReuseAllowed();
 
   int16_t MemoryResLinks() const { return mMemoryResLinks; }
   int16_t MemoryResScratchHost() const { return mMemoryResScratchHost; }