From cf642627b91cd993f573b47787e6af5d832c495f Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 12 Nov 2025 15:45:11 +0100 Subject: [PATCH 1/3] GPU TPC: Dynamically increase the protection / decrease the removal tube at inner pad rows, edge pads, high local occupancy --- GPU/GPUTracking/Base/GPUReconstruction.cxx | 2 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 14 +++++--- GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx | 34 +++++++++++-------- .../TPCConvert/GPUTPCConvertImpl.h | 2 +- GPU/TPCFastTransformation/TPCFastTransform.h | 8 +++++ 5 files changed, 39 insertions(+), 21 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 5129ccc4becf1..5e1bccc9c95db 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -748,7 +748,7 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type) void*& poolend = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd; char* retVal; if ((type & GPUMemoryResource::MEMORY_STACK)) { - poolend = (char*)poolend - size; + poolend = (char*)poolend - size; // TODO: Implement overflow check poolend = (char*)poolend - GPUProcessor::getAlignmentMod(poolend); retVal = (char*)poolend; } else { diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 2f1eb72cb3d00..f4fd513370949 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -68,11 +68,12 @@ AddOptionRTC(minNClustersFinalTrack, int32_t, -1, "", 0, "required min number of AddOptionRTC(searchWindowDZDR, float, 2.5f, "", 0, "Use DZDR window for seeding instead of neighboursSearchArea") AddOptionRTC(trackReferenceX, float, 1000.f, "", 0, "Transport all tracks to this X after tracking (disabled if > 500, auto = 1000)") AddOptionRTC(zsThreshold, float, 2.0f, "", 0, "Zero-Suppression threshold") -AddOptionRTC(tubeProtectSigma2, float, 5.f * 5.f, "", 0, "Max sigma2 to mark adjacent cluster for protection") -AddOptionRTC(tubeProtectMaxSize2, float, 3.5f * 3.5f, "", 0, "Square of max tube size (if smaller than tubeProtectChi2)") -AddOptionRTC(tubeProtectMinSize2, float, 1.0f * 1.0f, "", 0, "Square of min tube size (if larger than tubeProtectChi2)") -AddOptionRTC(tubeRemoveSigma2, float, 1.f * 1.f, "", 0, "Max sigma2 to mark adjacent cluster for removal") -AddOptionRTC(tubeRemoveMaxSize2, float, 1.5f * 1.5f, "", 0, "Square of max tube size (if smaller than tubeRejectChi2)") +AddOptionRTC(tubeProtectSigma2, float, 4.f * 4.f, "", 0, "Max sigma2 to mark adjacent cluster for protection") +AddOptionRTC(tubeProtectMaxSize2, float, 2.f * 2.f, "", 0, "Square of max tube size (if smaller than tubeProtectChi2)") +AddOptionRTC(tubeProtectMinSize2, float, 0.5f * 0.5f, "", 0, "Square of min tube size (if larger than tubeProtectChi2)") +AddOptionRTC(tubeRemoveSigma2, float, 1.25f * 1.25f, "", 0, "Max sigma2 to mark adjacent cluster for removal") +AddOptionRTC(tubeRemoveMaxSize2, float, 2.5f * 2.5f, "", 0, "Square of max tube size (if smaller than tubeRejectChi2)") +AddOptionRTC(tubeExtraProtectMinOccupancy, uint32_t, 1500, "", 0, "Increase Protection, decrease removal by factor 2, when above this lokal occupancy / rowx") AddOptionRTC(clustersShiftTimebins, float, 0, "", 0, "Shift of TPC clusters (applied during CTF cluster decoding)") AddOptionRTC(clustersShiftTimebinsClusterizer, float, 0, "", 0, "Shift of TPC clusters (applied during CTF clusterization)") AddOptionRTC(clustersEdgeFixDistance, float, 0.f, "", 0, "If >0, revert cluster.flag edge bit distance to edge exceeds this parameter (fixed during CTF decoding)") @@ -162,6 +163,9 @@ AddOptionRTC(dEdxClusterRejectionFlagMask, int8_t, o2::gpu::GPUTPCGMMergedTrackH AddOptionRTC(dEdxClusterRejectionFlagMaskAlt, int8_t, o2::gpu::GPUTPCGMMergedTrackHit::flagEdge, "", 0, "OR mask of TPC flags that will reject the cluster in alternative dEdx") AddOptionRTC(rejectEdgeClustersInSeeding, int8_t, 0, "", 0, "Reject edge clusters based on uncorrected track Y during seeding") AddOptionRTC(rejectEdgeClustersInTrackFit, int8_t, 0, "", 0, "Reject edge clusters based on uncorrected track Y during track fit") +AddOptionRTC(tubeExtraProtectMinRow, uint8_t, 20, "", 0, "Increase Protection, decrease removal by factor 2, when below this row") +AddOptionRTC(tubeExtraProtectEdgePads, uint8_t, 2, "", 0, "Increase Protection, decrease removal by factor 2, when on this number of pads from the edge") + AddOptionArray(PID_remap, int8_t, 9, (0, 1, 2, 3, 4, 5, 6, 7, 8), "", 0, "Remap Ipid to PID_reamp[Ipid] (no remap if<0)") // BUG: CUDA cannot yet hand AddOptionArrayRTC AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx index 7baa3b7e11b4f..70fb9cd1a769e 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx @@ -248,7 +248,6 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_ const float invCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? (1.f / merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f; float invAvgCharge = (sumInvSqrtCharge += invSqrtCharge) / ++nAvgCharge; invAvgCharge *= invAvgCharge; - prop.GetErr2(err2Y, err2Z, param, zz, cluster.row, clusterState, cluster.sector, time, invAvgCharge, invCharge); if (rejectChi2 >= GPUTPCGMPropagator::rejectInterFill) { @@ -491,7 +490,8 @@ GPUd() float GPUTPCGMTrackParam::AttachClusters(const GPUTPCGMMerger* GPUrestric GPUd() float GPUTPCGMTrackParam::AttachClusters(const GPUTPCGMMerger* GPUrestrict() Merger, int32_t sector, int32_t iRow, int32_t iTrack, bool goodLeg, float Y, float Z) { - if (Merger->Param().rec.tpc.disableRefitAttachment & 1) { + const auto& param = Merger->Param(); + if (param.rec.tpc.disableRefitAttachment & 1) { return -1e6f; } const GPUTPCTracker& GPUrestrict() tracker = *(Merger->GetConstantMem()->tpcTrackers + sector); @@ -502,34 +502,40 @@ GPUd() float GPUTPCGMTrackParam::AttachClusters(const GPUTPCGMMerger* GPUrestric return -1e6f; } - const float zOffset = Merger->Param().par.continuousTracking ? Merger->GetConstantMem()->calibObjects.fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(sector, mTOffset, Merger->Param().continuousMaxTimeBin) : 0; + const float zOffset = param.par.continuousTracking ? Merger->GetConstantMem()->calibObjects.fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(sector, mTOffset, param.continuousMaxTimeBin) : 0; // TODO: do some validatiomns for the transform conv functions... const float y0 = row.Grid().YMin(); const float stepY = row.HstepY(); const float z0 = row.Grid().ZMin() - zOffset; // We can use our own ZOffset, since this is only used temporarily anyway const float stepZ = row.HstepZ(); int32_t bin, ny, nz; - bool protect = CAMath::Abs(GetQPt() * Merger->Param().qptB5Scaler) <= Merger->Param().rec.tpc.rejectQPtB5 && goodLeg; + float uncorrectedY, uncorrectedZ; + Merger->GetConstantMem()->calibObjects.fastTransformHelper->InverseTransformYZtoNominalYZ(sector, iRow, Y, Z, uncorrectedY, uncorrectedZ); + if (CAMath::Abs(uncorrectedY) > row.getTPCMaxY()) { + return uncorrectedY; + } + bool protect = CAMath::Abs(GetQPt() * param.qptB5Scaler) <= param.rec.tpc.rejectQPtB5 && goodLeg; float err2Y, err2Z; - Merger->Param().GetClusterErrors2(sector, iRow, Z, mP[2], mP[3], -1.f, 0.f, 0.f, err2Y, err2Z); // TODO: Use correct time/avgCharge - const float tubeMaxSize2 = protect ? Merger->Param().rec.tpc.tubeProtectMaxSize2 : Merger->Param().rec.tpc.tubeRemoveMaxSize2; - const float tubeMinSize2 = protect ? Merger->Param().rec.tpc.tubeProtectMinSize2 : 0.f; - const float tubeSigma2 = protect ? Merger->Param().rec.tpc.tubeProtectSigma2 : Merger->Param().rec.tpc.tubeRemoveSigma2; + param.GetClusterErrors2(sector, iRow, Z, mP[2], mP[3], -1.f, 0.f, 0.f, err2Y, err2Z); // TODO: Use correct time/avgCharge + const float tubeMaxSize2 = protect ? param.rec.tpc.tubeProtectMaxSize2 : param.rec.tpc.tubeRemoveMaxSize2; + const float tubeMinSize2 = protect ? param.rec.tpc.tubeProtectMinSize2 : 0.f; + float tubeSigma2 = protect ? param.rec.tpc.tubeProtectSigma2 : param.rec.tpc.tubeRemoveSigma2; + uint32_t pad = CAMath::Float2UIntRn(GPUTPCGeometry::LinearY2Pad(sector, iRow, uncorrectedY)); + float time = Merger->GetConstantMem()->calibObjects.fastTransformHelper->getCorrMap()->InverseTransformInTimeFrame(sector, uncorrectedZ + (param.par.continuousTracking ? Merger->GetConstantMem()->calibObjects.fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(sector, mTOffset, param.continuousMaxTimeBin) : 0), param.continuousMaxTimeBin); // TODO: Simplify this call in TPCFastTransform + if (iRow < param.rec.tpc.tubeExtraProtectMinRow || + pad < param.rec.tpc.tubeExtraProtectEdgePads || pad >= (uint32_t)(GPUTPCGeometry::NPads(iRow) - param.rec.tpc.tubeExtraProtectEdgePads) || + param.GetUnscaledMult(time) / GPUTPCGeometry::Row2X(iRow) > param.rec.tpc.tubeExtraProtectMinOccupancy) { + tubeSigma2 *= protect ? 2 : 0.5; + } const float sy2 = CAMath::Max(tubeMinSize2, CAMath::Min(tubeMaxSize2, tubeSigma2 * (err2Y + CAMath::Abs(mC[0])))); // Cov can be bogus when following circle const float sz2 = CAMath::Max(tubeMinSize2, CAMath::Min(tubeMaxSize2, tubeSigma2 * (err2Z + CAMath::Abs(mC[2])))); // In that case we should provide the track error externally const float tubeY = CAMath::Sqrt(sy2); const float tubeZ = CAMath::Sqrt(sz2); const float sy21 = 1.f / sy2; const float sz21 = 1.f / sz2; - float uncorrectedY, uncorrectedZ; - Merger->GetConstantMem()->calibObjects.fastTransformHelper->InverseTransformYZtoNominalYZ(sector, iRow, Y, Z, uncorrectedY, uncorrectedZ); - if (CAMath::Abs(uncorrectedY) > row.getTPCMaxY()) { - return uncorrectedY; - } row.Grid().GetBinArea(uncorrectedY, uncorrectedZ + zOffset, tubeY, tubeZ, bin, ny, nz); - const int32_t nBinsY = row.Grid().Ny(); const int32_t idOffset = tracker.Data().ClusterIdOffset(); const int32_t* ids = &(tracker.Data().ClusterDataIndex()[row.HitNumberOffset()]); diff --git a/GPU/GPUTracking/TPCConvert/GPUTPCConvertImpl.h b/GPU/GPUTracking/TPCConvert/GPUTPCConvertImpl.h index dd9a74f9b9131..b185a01a3f392 100644 --- a/GPU/GPUTracking/TPCConvert/GPUTPCConvertImpl.h +++ b/GPU/GPUTracking/TPCConvert/GPUTPCConvertImpl.h @@ -28,7 +28,7 @@ class GPUTPCConvertImpl public: GPUd() static void convert(const GPUConstantMem& GPUrestrict() cm, int32_t sector, int32_t row, float pad, float time, float& GPUrestrict() x, float& GPUrestrict() y, float& GPUrestrict() z) { - if (cm.param.par.continuousTracking) { + if (cm.param.par.continuousTracking) { // TODO: This might be wrong, don't we just need to do TransformInTimeframe always cm.calibObjects.fastTransformHelper->getCorrMap()->TransformInTimeFrame(sector, row, pad, time, x, y, z, cm.param.continuousMaxTimeBin); } else { cm.calibObjects.fastTransformHelper->Transform(sector, row, pad, time, x, y, z); diff --git a/GPU/TPCFastTransformation/TPCFastTransform.h b/GPU/TPCFastTransformation/TPCFastTransform.h index 14cd892b2554a..d9e35ba8bf405 100644 --- a/GPU/TPCFastTransformation/TPCFastTransform.h +++ b/GPU/TPCFastTransformation/TPCFastTransform.h @@ -194,6 +194,7 @@ class TPCFastTransform : public FlatObject /// Inverse transformation GPUd() void InverseTransformInTimeFrame(int32_t slice, int32_t row, float /*x*/, float y, float z, float& pad, float& time, float maxTimeBin) const; + GPUd() float InverseTransformInTimeFrame(int32_t slice, float z, float maxTimeBin) const; /// Inverse transformation: Transformed Y and Z -> transformed X GPUd() void InverseTransformYZtoX(int32_t slice, int32_t row, float y, float z, float& x, const TPCFastTransform* ref = nullptr, const TPCFastTransform* ref2 = nullptr, float scale = 0.f, float scale2 = 0.f, int32_t scaleMode = 0) const; @@ -667,6 +668,13 @@ GPUdi() void TPCFastTransform::InverseTransformInTimeFrame(int32_t slice, int32_ convUVtoPadTimeInTimeFrame(slice, row, u, v, pad, time, maxTimeBin); } +GPUdi() float TPCFastTransform::InverseTransformInTimeFrame(int32_t slice, float z, float maxTimeBin) const +{ + float pad, time; + InverseTransformInTimeFrame(slice, 0, 0, 0, z, pad, time, maxTimeBin); + return time; +} + GPUdi() void TPCFastTransform::TransformIdealZ(int32_t slice, float time, float& z, float vertexTime) const { /// _______________ The main method: cluster transformation _______________________ From f174492b3f60e109a71433fa0d557548edbd12c2 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Fri, 14 Nov 2025 15:27:10 +0100 Subject: [PATCH 2/3] GPU TPC: occuapncy Map needs bounds check --- GPU/GPUTracking/Base/GPUParam.cxx | 1 + GPU/GPUTracking/Base/GPUParam.h | 1 + GPU/GPUTracking/Base/GPUParam.inc | 2 +- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 19 ++++++++++++++----- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 +- .../Global/GPUChainTrackingSectorTracker.cxx | 6 +++--- .../Global/GPUTrackingInputProvider.cxx | 3 ++- .../Global/GPUTrackingInputProvider.h | 1 + .../Interface/GPUO2InterfaceUtils.cxx | 2 ++ .../Standalone/Benchmark/standalone.cxx | 2 +- 10 files changed, 27 insertions(+), 12 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUParam.cxx b/GPU/GPUTracking/Base/GPUParam.cxx index cc3c6a8bb9140..7095766e8512e 100644 --- a/GPU/GPUTracking/Base/GPUParam.cxx +++ b/GPU/GPUTracking/Base/GPUParam.cxx @@ -35,6 +35,7 @@ void GPUParam::SetDefaults(float solenoidBz, bool assumeConstantBz) memset((void*)this, 0, sizeof(*this)); new (&rec) GPUSettingsRec; occupancyMap = nullptr; + occupancyMapSize = 0; occupancyTotal = 0; #ifdef GPUCA_TPC_GEOMETRY_O2 diff --git a/GPU/GPUTracking/Base/GPUParam.h b/GPU/GPUTracking/Base/GPUParam.h index 2564fc9bf0462..847f3e05ea32e 100644 --- a/GPU/GPUTracking/Base/GPUParam.h +++ b/GPU/GPUTracking/Base/GPUParam.h @@ -63,6 +63,7 @@ struct GPUParam_t { GPUTPCGMPolynomialField polynomialField; // Polynomial approx. of magnetic field for TPC GM const uint32_t* occupancyMap; // Ptr to TPC occupancy map uint32_t occupancyTotal; // Total occupancy in the TPC (nCl / nHbf) + uint32_t occupancyMapSize; // Size of occupancy map GPUParamSector SectorParam[GPUCA_NSECTORS]; diff --git a/GPU/GPUTracking/Base/GPUParam.inc b/GPU/GPUTracking/Base/GPUParam.inc index a118a8f639fe9..dbccca4d7c46b 100644 --- a/GPU/GPUTracking/Base/GPUParam.inc +++ b/GPU/GPUTracking/Base/GPUParam.inc @@ -213,7 +213,7 @@ GPUdi() float GPUParam::GetUnscaledMult(float time) const if (!occupancyMap) { return 0.f; } - const uint32_t bin = CAMath::Max(0.f, time / rec.tpc.occupancyMapTimeBins); + const uint32_t bin = CAMath::Min(occupancyMapSize - 1, CAMath::Max(0.f, time / rec.tpc.occupancyMapTimeBins)); return occupancyMap[bin]; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index bdf1ade37868c..6ba77e25221ae 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -31,6 +31,7 @@ #include "GPULogging.h" #include "GPUMemorySizeScalers.h" #include "GPUReconstructionProcessingKernels.inc" +#include "GPUTPCClusterOccupancyMap.h" #include #include @@ -355,17 +356,25 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes() } } -void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream) +void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream) { + if (mapHost && mapSize != GPUTPCClusterOccupancyMapBin::getNBins(param())) { + throw std::runtime_error("Updating occupancy map with object of invalid size"); + } param().occupancyMap = mapHost; + param().occupancyMapSize = mapSize; param().occupancyTotal = occupancyTotal; if (IsGPU()) { - if (!((size_t)¶m().occupancyTotal - (size_t)¶m().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) { + if (!((size_t)¶m().occupancyMapSize - (size_t)¶m().occupancyMap == sizeof(param().occupancyMap) + sizeof(param().occupancyTotal) && sizeof(param().occupancyMap) == sizeof(void*) && sizeof(param().occupancyTotal) == sizeof(uint32_t))) { // TODO: Make static assert, and check alignment throw std::runtime_error("occupancy data not consecutive in GPUParam"); } + struct tmpOccuapncyParam { + const void* ptr; + uint32_t total; + uint32_t size; + }; + tmpOccuapncyParam tmp = {mapGPU, occupancyTotal, mapSize}; const auto holdContext = GetThreadContext(); - size_t tmp[2] = {(size_t)mapGPU, 0}; - memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal)); - WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream); + WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(tmp), stream); } } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 768c301f24327..e72a1c6686124 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -48,7 +48,7 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface int32_t RunChains() override; - void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream = -1); + void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream = -1); protected: struct GPUProcessorProcessors : public GPUProcessor { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 7ab2cfeccee80..31ceb24ecb836 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -126,11 +126,11 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() auto* ptrTmp = (GPUTPCClusterOccupancyMapBin*)mRec->AllocateVolatileMemory(GPUTPCClusterOccupancyMapBin::getTotalSize(param()), doGPU); runKernel(GetGridAutoStep(streamInitAndOccMap, RecoStep::TPCSectorTracking), ptrTmp, GPUTPCClusterOccupancyMapBin::getTotalSize(param())); runKernel(GetGridBlk(GPUCA_NSECTORS * GPUCA_ROW_COUNT, streamInitAndOccMap), ptrTmp); - runKernel(GetGridBlk(GPUTPCClusterOccupancyMapBin::getNBins(param()), streamInitAndOccMap), ptrTmp, ptr + 2); + runKernel(GetGridBlk(mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap), ptrTmp, ptr + 2); mRec->ReturnVolatileMemory(); mInputsHost->mTPCClusterOccupancyMap[1] = param().rec.tpc.occupancyMapTimeBins * 0x10000 + param().rec.tpc.occupancyMapTimeBinsAverage; if (doGPU) { - GPUMemCpy(RecoStep::TPCSectorTracking, mInputsHost->mTPCClusterOccupancyMap + 2, mInputsShadow->mTPCClusterOccupancyMap + 2, sizeof(*ptr) * GPUTPCClusterOccupancyMapBin::getNBins(mRec->GetParam()), streamInitAndOccMap, false, &mEvents->init); + GPUMemCpy(RecoStep::TPCSectorTracking, mInputsHost->mTPCClusterOccupancyMap + 2, mInputsShadow->mTPCClusterOccupancyMap + 2, sizeof(*ptr) * mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap, false, &mEvents->init); } else { TransferMemoryResourceLinkToGPU(RecoStep::TPCSectorTracking, mInputsHost->mResourceOccupancyMap, streamInitAndOccMap, &mEvents->init); } @@ -138,7 +138,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() if (param().rec.tpc.occupancyMapTimeBins || param().rec.tpc.sysClusErrorC12Norm) { uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap; occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128)); - mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, doGPU && param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, streamInitAndOccMap); + mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, doGPU && param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap); } int32_t streamMap[NSECTORS]; diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx index 7ef9baa903fbe..9bb8b230e9e0b 100644 --- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx +++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.cxx @@ -82,7 +82,8 @@ void* GPUTrackingInputProvider::SetPointersInputTRD(void* mem) void* GPUTrackingInputProvider::SetPointersTPCOccupancyMap(void* mem) { if (mHoldTPCOccupancyMap) { - computePointerWithAlignment(mem, mTPCClusterOccupancyMap, (mRec->GetParam().rec.tpc.occupancyMapTimeBins ? GPUTPCClusterOccupancyMapBin::getNBins(mRec->GetParam()) + 1 : 0) + 1); // +1 for total occupancy estimator, +1 for sanity check information + mTPCClusterOccupancyMapSize = mRec->GetParam().rec.tpc.occupancyMapTimeBins ? GPUTPCClusterOccupancyMapBin::getNBins(mRec->GetParam()) : 0; + computePointerWithAlignment(mem, mTPCClusterOccupancyMap, (mRec->GetParam().rec.tpc.occupancyMapTimeBins ? mTPCClusterOccupancyMapSize + 1 : 0) + 1); // +1 for total occupancy estimator, +1 for sanity check information } return mem; } diff --git a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h index 910e87fd02126..7aee803a03ace 100644 --- a/GPU/GPUTracking/Global/GPUTrackingInputProvider.h +++ b/GPU/GPUTracking/Global/GPUTrackingInputProvider.h @@ -81,6 +81,7 @@ class GPUTrackingInputProvider : public GPUProcessor o2::tpc::ClusterNative* mPclusterNativeOutput = nullptr; uint32_t* mTPCClusterOccupancyMap = nullptr; + uint32_t mTPCClusterOccupancyMapSize = 0; uint32_t* mErrorCodes = nullptr; }; diff --git a/GPU/GPUTracking/Interface/GPUO2InterfaceUtils.cxx b/GPU/GPUTracking/Interface/GPUO2InterfaceUtils.cxx index 9454c9a2389ae..43b8dc21eaf15 100644 --- a/GPU/GPUTracking/Interface/GPUO2InterfaceUtils.cxx +++ b/GPU/GPUTracking/Interface/GPUO2InterfaceUtils.cxx @@ -26,6 +26,7 @@ #include "TPCBase/CRU.h" #include "TPCBase/RDHUtils.h" #include "DataFormatsTPC/ZeroSuppression.h" +#include "GPUTPCClusterOccupancyMap.h" #include using namespace o2::gpu; @@ -132,6 +133,7 @@ void GPUO2InterfaceUtils::paramUseExternalOccupancyMap(GPUParam* param, uint32_t param->occupancyTotal = *occupancymap; if (param->rec.tpc.occupancyMapTimeBins) { param->occupancyMap = occupancymap + 2; + param->occupancyMapSize = GPUTPCClusterOccupancyMapBin::getNBins(*param); } } } diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index b0ea1f5b7dbcb..ca26f26d32612 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -453,7 +453,7 @@ int32_t SetupReconstruction() if (configStandalone.proc.doublePipeline) { recPipeline->SetSettings(&grp, &recSet, &procSet, &steps); } - if (configStandalone.testSyncAsync) { + if (configStandalone.testSyncAsync) { // TODO: Add --async mode / flag // Set settings for asynchronous steps.steps.setBits(GPUDataTypes::RecoStep::TPCDecompression, true); steps.steps.setBits(GPUDataTypes::RecoStep::TPCdEdx, true); From 32fcc370e8a9fd7cfe68ceff4091fccff8ff4c93 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 17 Nov 2025 14:55:34 +0100 Subject: [PATCH 3/3] GPU TPC: Fix synchronization between update of occupancy map and running sector tracker on GPU --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 4 ++-- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 +- .../Global/GPUChainTrackingSectorTracker.cxx | 11 ++++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 6ba77e25221ae..3da96654b895d 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -356,7 +356,7 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes() } } -void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream) +void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream, deviceEvent* ev) { if (mapHost && mapSize != GPUTPCClusterOccupancyMapBin::getNBins(param())) { throw std::runtime_error("Updating occupancy map with object of invalid size"); @@ -375,6 +375,6 @@ void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, cons }; tmpOccuapncyParam tmp = {mapGPU, occupancyTotal, mapSize}; const auto holdContext = GetThreadContext(); - WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(tmp), stream); + WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(tmp), stream, ev); } } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index e72a1c6686124..a78a482db4e7a 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -48,7 +48,7 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface int32_t RunChains() override; - void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream = -1); + void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream = -1, deviceEvent* ev = nullptr); protected: struct GPUProcessorProcessors : public GPUProcessor { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 31ceb24ecb836..122eb709b4356 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -100,6 +100,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() bool streamInit[GPUCA_MAX_STREAMS] = {false}; int32_t streamInitAndOccMap = mRec->NStreams() - 1; + bool initializeOccMap = param().rec.tpc.occupancyMapTimeBins || param().rec.tpc.sysClusErrorC12Norm; if (doGPU) { // Copy Tracker Object to GPU Memory if (GetProcessingSettings().debugLevel >= 3) { @@ -109,7 +110,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() return 2; } - WriteToConstantMemory(RecoStep::TPCSectorTracking, (char*)processors()->tpcTrackers - (char*)processors(), processorsShadow()->tpcTrackers, sizeof(GPUTPCTracker) * NSECTORS, streamInitAndOccMap, &mEvents->init); + WriteToConstantMemory(RecoStep::TPCSectorTracking, (char*)processors()->tpcTrackers - (char*)processors(), processorsShadow()->tpcTrackers, sizeof(GPUTPCTracker) * NSECTORS, streamInitAndOccMap, !initializeOccMap ? &mEvents->init : nullptr); std::fill(streamInit, streamInit + mRec->NStreams(), false); streamInit[streamInitAndOccMap] = true; @@ -130,15 +131,15 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() mRec->ReturnVolatileMemory(); mInputsHost->mTPCClusterOccupancyMap[1] = param().rec.tpc.occupancyMapTimeBins * 0x10000 + param().rec.tpc.occupancyMapTimeBinsAverage; if (doGPU) { - GPUMemCpy(RecoStep::TPCSectorTracking, mInputsHost->mTPCClusterOccupancyMap + 2, mInputsShadow->mTPCClusterOccupancyMap + 2, sizeof(*ptr) * mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap, false, &mEvents->init); + GPUMemCpy(RecoStep::TPCSectorTracking, mInputsHost->mTPCClusterOccupancyMap + 2, mInputsShadow->mTPCClusterOccupancyMap + 2, sizeof(*ptr) * mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap, false); } else { - TransferMemoryResourceLinkToGPU(RecoStep::TPCSectorTracking, mInputsHost->mResourceOccupancyMap, streamInitAndOccMap, &mEvents->init); + TransferMemoryResourceLinkToGPU(RecoStep::TPCSectorTracking, mInputsHost->mResourceOccupancyMap, streamInitAndOccMap); } } - if (param().rec.tpc.occupancyMapTimeBins || param().rec.tpc.sysClusErrorC12Norm) { + if (initializeOccMap) { uint32_t& occupancyTotal = *mInputsHost->mTPCClusterOccupancyMap; occupancyTotal = CAMath::Float2UIntRn(mRec->MemoryScalers()->nTPCHits / (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF ? mIOPtrs.settingsTF->nHBFPerTF : 128)); - mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, doGPU && param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap); + mRec->UpdateParamOccupancyMap(param().rec.tpc.occupancyMapTimeBins ? mInputsHost->mTPCClusterOccupancyMap + 2 : nullptr, doGPU && param().rec.tpc.occupancyMapTimeBins ? mInputsShadow->mTPCClusterOccupancyMap + 2 : nullptr, occupancyTotal, mInputsHost->mTPCClusterOccupancyMapSize, streamInitAndOccMap, &mEvents->init); } int32_t streamMap[NSECTORS];