diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
index 8cd53ec5e0609..db57e7ec06d4b 100644
--- a/GPU/Common/GPUCommonAlgorithm.h
+++ b/GPU/Common/GPUCommonAlgorithm.h
@@ -331,28 +331,28 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
 template <class T>
-GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
+GPUdi() T warp_scan_inclusive_add_FUNC(T v)
 {
   return sub_group_scan_inclusive_add(v);
 }
 template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations
-GPUdi() uint8_t work_group_scan_inclusive_add_FUNC(uint8_t v)
+GPUdi() uint8_t warp_scan_inclusive_add_FUNC(uint8_t v)
 {
   return sub_group_scan_inclusive_add((uint32_t)v);
 }
 template <class T>
-GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
+GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
 {
   return sub_group_broadcast(v, i);
 }
 template <>
-GPUdi() uint8_t work_group_broadcast_FUNC(uint8_t v, int32_t i)
+GPUdi() uint8_t warp_broadcast_FUNC(uint8_t v, int32_t i)
 {
   return sub_group_broadcast((uint32_t)v, i);
 }
 
-#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
-#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
+#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)
+#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
 
 #elif (defined(__CUDACC__) || defined(__HIPCC__))
 // CUDA and HIP work the same way using cub, need just different header
diff --git a/GPU/Common/GPUCommonDefAPI.h b/GPU/Common/GPUCommonDefAPI.h
index b029038a3b521..2494cd8dd6fe5 100644
--- a/GPU/Common/GPUCommonDefAPI.h
+++ b/GPU/Common/GPUCommonDefAPI.h
@@ -96,13 +96,13 @@
   #define GPUgeneric() __generic
   #define GPUconstexprref() GPUconstexpr()
   #if defined(__OPENCL__) && !defined(__clang__)
-    #define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local);
-    #define GPUbarrierWarp()
+    #define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local)
+    #define GPUbarrierWarp() sub_group_barrier(mem_fence::global | mem_fence::local)
     #define GPUAtomic(type) atomic<type>
     static_assert(sizeof(atomic<uint32_t>) == sizeof(uint32_t), "Invalid size of atomic type");
   #else
     #define GPUbarrier() barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
-    #define GPUbarrierWarp()
+    #define GPUbarrierWarp() sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
     #if defined(__OPENCL__) && defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)
       namespace o2 { namespace gpu { template <class T> struct oclAtomic;
diff --git a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h
index 96f4893c74af3..f9e3f7a304d84 100644
--- a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h
+++ b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h
@@ -58,10 +58,9 @@ class CfUtils
     *sum = __popc(waveMask);
     return myOffset;
 #else // CPU / OpenCL fallback
-    int32_t myOffset = warp_scan_inclusive_add(pred ? 1 : 0);
+    int32_t myOffset = warp_scan_inclusive_add(!!pred);
     *sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
-    myOffset--;
-    return myOffset;
+    return myOffset - !!pred;
 #endif
   }
 
@@ -111,8 +110,7 @@ class CfUtils
     if (sum != nullptr) {
       *sum = work_group_broadcast(lpos, BlockSize - 1);
     }
-    lpos--;
-    return lpos;
+    return lpos - !!pred;
 #endif
   }
 
@@ -149,7 +147,7 @@ class CfUtils
     return sum;
 #else // CPU / OpenCL fallback
-    return work_group_reduce_add(pred ? 1 : 0);
+    return work_group_reduce_add(!!pred);
 #endif
   }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
index 312085d2947ab..f7bb64106fe4f 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
@@ -224,7 +224,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
     return pageDigitOffset;
   }
 
-  int32_t nDecoded = 0;
+  [[maybe_unused]] int32_t nDecoded = 0;
   const auto* decHdr = ConsumeHeader<TPCZSHDRV2>(page);
   ConsumeBytes(page, decHdr->firstZSDataOffset * 16);
@@ -275,7 +275,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
 #endif
     pageDigitOffset += nAdc;
   } // for (uint32_t t = 0; t < decHdr->nTimebinHeaders; t++)
-  (void)nDecoded;
+
 #ifdef GPUCA_CHECK_TPCZS_CORRUPTION
   if (iThread == 0 && nDecoded != decHdr->nADCsamples) {
     clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, clusterer.mISector * 1000 + decHdr->cruID, decHdr->nADCsamples, nDecoded);
@@ -566,6 +566,7 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa
   positions[positionOffset] = pos;
 
   charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+
   chargeMap[pos] = PackedCharge(charge);
 }
@@ -615,6 +616,7 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, pro
   ConsumeBytes(page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader));
 
   for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) {
+
     [[maybe_unused]] ptrdiff_t sizeLeftInPage = payloadEnd - page;
     assert(sizeLeftInPage > 0);
@@ -728,8 +730,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
 
   uint16_t nSamplesInTB = 0;
 
-  GPUbarrier();
-
   // Read timebin link headers
   for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
     uint8_t timebinLinkHeaderStart = ConsumeByte(page);
@@ -777,6 +777,8 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
   } // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)
 
+  GPUbarrierWarp(); // Ensure all writes to shared memory are finished before reading it
+
   const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
   MAYBE_PAGE_OVERFLOW(page); // TODO: We don't need this check?
@@ -784,8 +786,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
     return FillWithInvalid(clusterer, iThread, NTHREADS, pageDigitOffset, nSamplesInTB);
   }
 
-  GPUbarrier();
-
   // Unpack ADC
   int32_t iLink = 0;
   for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS) {
@@ -819,6 +819,8 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
   } // for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS)
 
+  GPUbarrierWarp(); // Ensure all reads from shared memory are finished before decoding the next header into shmem
+
   assert(PayloadExtendsToNextPage || adcData <= page);
   assert(PayloadExtendsToNextPage || page <= payloadEnd);
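
Note on the CfUtils.h hunks above: the old fallback decremented the inclusive scan unconditionally, so a leading lane with an unset predicate could even see an offset of -1. Subtracting the lane's own predicate (!!pred) instead turns the inclusive scan into an exclusive prefix sum exactly for the lanes that write. A minimal host-side sketch of that identity follows; emulateWarpScan() and the four-lane "warp" are hypothetical stand-ins for the real warp_scan_inclusive_add()/warp_broadcast() primitives, not part of the patch.

// Hypothetical host-side model of the fixed CPU/OpenCL fallback in
// CfUtils.h; it illustrates the scan identity only.
#include <cassert>
#include <cstdint>
#include <vector>

// Returns the compacted write offset for `lane` and the total number of
// set predicates via `sum`, mimicking the fixed fallback path.
int32_t emulateWarpScan(const std::vector<int32_t>& pred, size_t lane, int32_t* sum)
{
  // warp_scan_inclusive_add(!!pred): number of set predicates up to and
  // including this lane.
  int32_t inclusive = 0;
  for (size_t i = 0; i <= lane; i++) {
    inclusive += !!pred[i];
  }
  // warp_broadcast(..., GPUCA_WARP_SIZE - 1): the last lane's inclusive
  // scan equals the total across the warp.
  *sum = 0;
  for (int32_t p : pred) {
    *sum += !!p;
  }
  // The fix: subtract this lane's own predicate instead of decrementing
  // unconditionally, turning the inclusive scan into an exclusive prefix
  // sum for predicated lanes (and never yielding -1 for lane 0).
  return inclusive - !!pred[lane];
}

int main()
{
  const std::vector<int32_t> pred = {1, 0, 1, 1};
  int32_t sum = 0;
  assert(emulateWarpScan(pred, 0, &sum) == 0); // 1st predicated lane -> slot 0
  assert(emulateWarpScan(pred, 2, &sum) == 1); // 2nd predicated lane -> slot 1
  assert(emulateWarpScan(pred, 3, &sum) == 2); // 3rd predicated lane -> slot 2
  assert(sum == 3);                            // three predicates set in total
  return 0;
}

The same subtraction appears in the BlockSize-wide variant (return lpos - !!pred), which uses the work_group primitives instead of the warp-level ones.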