Skip to content

Commit 161825e

Browse files
committed
GPU: Replace assertions with error counters in ZS decoding.
1 parent 21d965e commit 161825e

File tree

3 files changed

+112
-56
lines changed

3 files changed

+112
-56
lines changed

GPU/GPUTracking/Global/GPUErrorCodes.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ GPUCA_ERROR_CODE(26, ERROR_TPCZS_INVALID_ROW, SectorRow)
4747
GPUCA_ERROR_CODE(27, ERROR_TPCZS_INVALID_NADC, SectorCRU, SamplesInPage, SamplesWritten) // Invalid number of ADC samples in header, existing samples were decoded
4848
GPUCA_ERROR_CODE(28, ERROR_TPCZS_INCOMPLETE_HBF, SectorCRU, PacketCount, NextPacketCount) // Part of HBF is missing, decoding incomplete
4949
GPUCA_ERROR_CODE(29, ERROR_TPCZS_INVALID_OFFSET, SectorEndpoint, Value, Expected) // Raw page is skipped since it contains invalid payload offset
50-
GPUCA_ERROR_CODE(29, MAX_GPUCA_ERROR_NUMBER)
50+
GPUCA_ERROR_CODE(30, ERROR_TPCZS_INVALID_MAGIC_WORD, Value) // ZS header contains wrong magic word
51+
GPUCA_ERROR_CODE(31, ERROR_TPCZS_PAGE_OVERFLOW, Position, PageEnd) // Ran out of page to decode
52+
GPUCA_ERROR_CODE(32, ERROR_TPCZS_VERSION_MISMATCH, Value, Expected) // ZS decoder received page with wrong version
53+
GPUCA_ERROR_CODE(32, MAX_GPUCA_ERROR_NUMBER)
5154

5255
// #define GPUCA_CHECK_TPCZS_CORRUPTION

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx

Lines changed: 102 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -604,61 +604,104 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, pro
604604
const auto* decHeader = Peek<TPCZSHDRV2>(page, raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2));
605605
ConsumeHeader<header::RAWDataHeader>(page);
606606

607-
assert(decHeader->version >= ZSVersionDenseLinkBased);
608-
assert(decHeader->magicWord == tpc::zerosupp_link_based::CommonHeader::MagicWordLinkZSMetaHeader);
609-
610607
uint16_t nSamplesWritten = 0;
611608
const uint16_t nSamplesInPage = decHeader->nADCsamples;
612609

613610
const auto* payloadEnd = Peek(pageStart, raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2) - ((decHeader->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) ? TPCZSHDRV2::TRIGGER_WORD_SIZE : 0));
614611
const auto* nextPage = Peek(pageStart, TPCZSHDR::TPC_ZS_PAGE_SIZE);
615612

613+
const bool extendsToNextPage = decHeader->flags & TPCZSHDRV2::ZSFlags::payloadExtendsToNextPage;
614+
616615
ConsumeBytes(page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader));
617616

618-
for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) {
617+
int err = GPUErrors::ERROR_NONE;
619618

620-
[[maybe_unused]] ptrdiff_t sizeLeftInPage = payloadEnd - page;
621-
assert(sizeLeftInPage > 0);
619+
if (decHeader->version < ZSVersionDenseLinkBased) {
620+
err = GPUErrors::ERROR_TPCZS_VERSION_MISMATCH;
621+
}
622622

623-
uint16_t nSamplesWrittenTB = 0;
623+
if (decHeader->magicWord != zerosupp_link_based::CommonHeader::MagicWordLinkZSMetaHeader) {
624+
err = GPUErrors::ERROR_TPCZS_INVALID_MAGIC_WORD;
625+
}
626+
627+
for (uint16_t i = 0; i < decHeader->nTimebinHeaders && !err; i++) {
628+
629+
ptrdiff_t sizeLeftInPage = payloadEnd - page;
630+
if (sizeLeftInPage <= 0) {
631+
err = GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW;
632+
break;
633+
}
634+
635+
int16_t nSamplesWrittenTB = 0;
636+
uint16_t nSamplesLeftInPage = nSamplesInPage - nSamplesWritten;
637+
638+
if (i == decHeader->nTimebinHeaders - 1 && extendsToNextPage) {
639+
if (raw::RDHUtils::getMemorySize(*rawDataHeader) != TPCZSHDR::TPC_ZS_PAGE_SIZE) {
640+
err = GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW;
641+
break;
642+
}
624643

625-
if (i == decHeader->nTimebinHeaders - 1 && decHeader->flags & o2::tpc::TPCZSHDRV2::ZSFlags::payloadExtendsToNextPage) {
626-
assert(o2::raw::RDHUtils::getMemorySize(*rawDataHeader) == TPCZSHDR::TPC_ZS_PAGE_SIZE);
627644
if ((uint16_t)(raw::RDHUtils::getPageCounter(rawDataHeader) + 1) == raw::RDHUtils::getPageCounter(nextPage)) {
628-
nSamplesWrittenTB = DecodeTB<DecodeInParallel, true>(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, payloadEnd, nextPage);
645+
nSamplesWrittenTB = DecodeTB<DecodeInParallel, true>(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
629646
} else {
630-
nSamplesWrittenTB = FillWithInvalid(clusterer, iThread, nThreads, pageDigitOffset, nSamplesInPage - nSamplesWritten);
631-
#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
632-
if (iThread == 0) {
633-
clusterer.raiseError(GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF, clusterer.mISector * 1000 + decHeader->cruID, raw::RDHUtils::getPageCounter(rawDataHeader), raw::RDHUtils::getPageCounter(nextPage));
634-
}
635-
#endif
647+
err = GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF;
648+
break;
636649
}
637650
} else {
638-
nSamplesWrittenTB = DecodeTB<DecodeInParallel, false>(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, payloadEnd, nextPage);
651+
nSamplesWrittenTB = DecodeTB<DecodeInParallel, false>(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
652+
}
653+
654+
// Abort decoding the page if an error was detected.
655+
if (nSamplesWrittenTB < 0) {
656+
err = -nSamplesWrittenTB;
657+
break;
639658
}
640659

641-
assert(nSamplesWritten <= nSamplesInPage);
642660
nSamplesWritten += nSamplesWrittenTB;
643661
pageDigitOffset += nSamplesWrittenTB;
644662
} // for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++)
645663

646-
#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
647-
if (iThread == 0 && nSamplesWritten != nSamplesInPage) {
648-
clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, clusterer.mISector * 1000 + decHeader->cruID, nSamplesInPage, nSamplesWritten);
649-
/*#ifndef GPUCA_GPUCODE
650-
FILE* foo = fopen("dump.bin", "w+b");
651-
fwrite(pageSrc, 1, o2::raw::RDHUtils::getMemorySize(*rdHdr), foo);
652-
fclose(foo);
653-
#endif*/
664+
if (nSamplesWritten != nSamplesInPage) {
665+
if (nSamplesWritten < nSamplesInPage) {
666+
pageDigitOffset += FillWithInvalid(clusterer, iThread, nThreads, pageDigitOffset, nSamplesInPage - nSamplesWritten);
667+
}
668+
err = !err ? GPUErrors::ERROR_TPCZS_INVALID_NADC : err; // Ensure we don't overwrite any previous error
669+
}
670+
671+
if (iThread == 0 && err) {
672+
[[maybe_unused]] bool dumpPage = false;
673+
674+
if (err == GPUErrors::ERROR_TPCZS_VERSION_MISMATCH) {
675+
clusterer.raiseError(err, decHeader->version, ZSVersionDenseLinkBased);
676+
} else if (err == GPUErrors::ERROR_TPCZS_INVALID_MAGIC_WORD) {
677+
clusterer.raiseError(err, decHeader->magicWord);
678+
} else if (err == GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF) {
679+
clusterer.raiseError(err, clusterer.mISector * 1000 + decHeader->cruID, raw::RDHUtils::getPageCounter(rawDataHeader), raw::RDHUtils::getPageCounter(nextPage));
680+
} else if (err == GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW) {
681+
clusterer.raiseError(err, extendsToNextPage);
682+
dumpPage = true;
683+
} else if (err == GPUErrors::ERROR_TPCZS_INVALID_NADC) {
684+
clusterer.raiseError(err, nSamplesInPage, nSamplesWritten, extendsToNextPage);
685+
dumpPage = true;
686+
} else {
687+
// Unknown error -> forward it
688+
clusterer.raiseError(err);
689+
}
690+
691+
// #ifndef GPUCA_GPUCODE
692+
// if (dumpPage) {
693+
// FILE* foo = fopen("dump.bin", "w+b");
694+
// fwrite(pageSrc, 1, o2::raw::RDHUtils::getMemorySize(*rdHdr), foo);
695+
// fclose(foo);
696+
// }
697+
// #endif
654698
}
655-
#endif
656699

657700
return pageDigitOffset;
658701
}
659702

660703
template <bool DecodeInParallel, bool PayloadExtendsToNextPage>
661-
GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTB(
704+
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB(
662705
processorType& clusterer,
663706
[[maybe_unused]] GPUSharedMemory& smem,
664707
int32_t iThread,
@@ -667,23 +710,24 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTB(
667710
const header::RAWDataHeader* rawDataHeader,
668711
int32_t firstHBF,
669712
int32_t cru,
670-
[[maybe_unused]] const uint8_t* payloadEnd,
671-
[[maybe_unused]] const uint8_t* nextPage)
713+
uint16_t nSamplesLeftInPage,
714+
const uint8_t* payloadEnd,
715+
const uint8_t* nextPage)
672716
{
673717

674718
if constexpr (DecodeInParallel) {
675-
return DecodeTBMultiThread<PayloadExtendsToNextPage>(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, cru, payloadEnd, nextPage);
719+
return DecodeTBMultiThread<PayloadExtendsToNextPage>(clusterer, smem, iThread, page, pageDigitOffset, rawDataHeader, firstHBF, cru, nSamplesLeftInPage, payloadEnd, nextPage);
676720
} else {
677-
uint16_t nSamplesWritten = 0;
721+
int16_t nSamplesWritten = 0;
678722
if (iThread == 0) {
679-
nSamplesWritten = DecodeTBSingleThread<PayloadExtendsToNextPage>(clusterer, page, pageDigitOffset, rawDataHeader, firstHBF, cru, payloadEnd, nextPage);
723+
nSamplesWritten = DecodeTBSingleThread<PayloadExtendsToNextPage>(clusterer, page, pageDigitOffset, rawDataHeader, firstHBF, cru, nSamplesLeftInPage, payloadEnd, nextPage);
680724
}
681725
return warp_broadcast(nSamplesWritten, 0);
682726
}
683727
}
684728

685729
template <bool PayloadExtendsToNextPage>
686-
GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
730+
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
687731
processorType& clusterer,
688732
GPUSharedMemory& smem,
689733
const int32_t iThread,
@@ -692,8 +736,9 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
692736
const header::RAWDataHeader* rawDataHeader,
693737
int32_t firstHBF,
694738
int32_t cru,
695-
[[maybe_unused]] const uint8_t* payloadEnd,
696-
[[maybe_unused]] const uint8_t* nextPage)
739+
uint16_t nSamplesLeftInPage,
740+
const uint8_t* payloadEnd,
741+
const uint8_t* nextPage)
697742
{
698743
#define MAYBE_PAGE_OVERFLOW(pagePtr) \
699744
if constexpr (PayloadExtendsToNextPage) { \
@@ -703,7 +748,9 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
703748
ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff); \
704749
} \
705750
} else { \
706-
assert(pagePtr <= payloadEnd); \
751+
if (pagePtr > payloadEnd) { \
752+
return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; \
753+
} \
707754
}
708755

709756
#define PEEK_OVERFLOW(pagePtr, offset) \
@@ -728,7 +775,7 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
728775
uint16_t linkBC = (tbbHdr & 0xFFF0) >> 4;
729776
int32_t timeBin = (linkBC + (uint64_t)(raw::RDHUtils::getHeartBeatOrbit(*rawDataHeader) - firstHBF) * constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;
730777

731-
uint16_t nSamplesInTB = 0;
778+
int16_t nSamplesInTB = 0;
732779

733780
// Read timebin link headers
734781
for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
@@ -756,7 +803,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
756803
int32_t chanByteOffset = nBytesBitmask - 1 - CAMath::Popcount(bitmaskL2 >> (chanL2Idx + 1));
757804

758805
uint8_t myChannelHasData = (chan < 80 && l2 ? TEST_BIT(PEEK_OVERFLOW(page, chanByteOffset), chan % 8) : 0);
759-
assert(myChannelHasData == 0 || myChannelHasData == 1);
760806

761807
int32_t nSamplesStep;
762808
int32_t threadSampleOffset = CfUtils::warpPredicateScan(myChannelHasData, &nSamplesStep);
@@ -779,13 +825,17 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
779825

780826
GPUbarrierWarp(); // Ensure all writes to shared memory are finished, before reading it
781827

782-
const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
783-
MAYBE_PAGE_OVERFLOW(page); // TODO: We don't need this check?
828+
if (nSamplesInTB > nSamplesLeftInPage) {
829+
return -GPUErrors::ERROR_TPCZS_INVALID_NADC;
830+
}
784831

785832
if (not fragment.contains(timeBin)) {
786833
return FillWithInvalid(clusterer, iThread, NTHREADS, pageDigitOffset, nSamplesInTB);
787834
}
788835

836+
const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
837+
MAYBE_PAGE_OVERFLOW(page);
838+
789839
// Unpack ADC
790840
int32_t iLink = 0;
791841
for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS) {
@@ -821,9 +871,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
821871

822872
GPUbarrierWarp(); // Ensure all reads to shared memory are finished, before decoding next header into shmem
823873

824-
assert(PayloadExtendsToNextPage || adcData <= page);
825-
assert(PayloadExtendsToNextPage || page <= payloadEnd);
826-
827874
return nSamplesInTB;
828875

829876
#undef TEST_BIT
@@ -832,13 +879,14 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
832879
}
833880

834881
template <bool PayloadExtendsToNextPage>
835-
GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread(
882+
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread(
836883
processorType& clusterer,
837884
const uint8_t*& page,
838885
uint32_t pageDigitOffset,
839886
const header::RAWDataHeader* rawDataHeader,
840887
int32_t firstHBF,
841888
int32_t cru,
889+
uint16_t nSamplesLeftInPage,
842890
[[maybe_unused]] const uint8_t* payloadEnd,
843891
[[maybe_unused]] const uint8_t* nextPage)
844892
{
@@ -850,7 +898,9 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread(
850898
ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff); \
851899
} \
852900
} else { \
853-
assert(pagePtr <= payloadEnd); \
901+
if (pagePtr > payloadEnd) { \
902+
return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW; \
903+
} \
854904
}
855905

856906
using zerosupp_link_based::ChannelPerTBHeader;
@@ -898,14 +948,18 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread(
898948

899949
} // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)
900950

901-
const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
902-
MAYBE_PAGE_OVERFLOW(page);
951+
if (nSamplesInTB > nSamplesLeftInPage) {
952+
return -GPUErrors::ERROR_TPCZS_INVALID_NADC;
953+
}
903954

904955
if (not fragment.contains(timeBin)) {
905956
FillWithInvalid(clusterer, 0, 1, pageDigitOffset, nSamplesInTB);
906957
return nSamplesInTB;
907958
}
908959

960+
const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
961+
MAYBE_PAGE_OVERFLOW(page);
962+
909963
// Unpack ADC
910964
uint32_t byte = 0, bits = 0;
911965
uint16_t rawFECChannel = 0;
@@ -937,10 +991,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBSingleThread(
937991
} // while (bits >= DECODE_BITS)
938992
} // while (nSamplesWritten < nAdc)
939993

940-
assert(PayloadExtendsToNextPage || adcData <= page);
941-
assert(PayloadExtendsToNextPage || page <= payloadEnd);
942-
assert(nSamplesWritten == nSamplesInTB);
943-
944994
return nSamplesWritten;
945995

946996
#undef MAYBE_PAGE_OVERFLOW

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,14 +167,17 @@ class GPUTPCCFDecodeZSDenseLink : public GPUTPCCFDecodeZSLinkBase
167167

168168
GPUd() static bool ChannelIsActive(const uint8_t* chan, uint16_t chanIndex);
169169

170+
// Decode a single timebin within an 8 kB page.
171+
// Returns the number of samples decoded from the page
172+
// or a negative value to indicate an error (no samples are written in this case)
170173
template <bool DecodeInParallel, bool PayloadExtendsToNextPage>
171-
GPUd() static uint16_t DecodeTB(processorType& clusterer, GPUSharedMemory& smem, int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, const uint8_t* payloadEnd, const uint8_t* nextPage);
174+
GPUd() static int16_t DecodeTB(processorType& clusterer, GPUSharedMemory& smem, int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);
172175

173176
template <bool PayloadExtendsToNextPage>
174-
GPUd() static uint16_t DecodeTBSingleThread(processorType& clusterer, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, const uint8_t* payloadEnd, const uint8_t* nextPage);
177+
GPUd() static int16_t DecodeTBSingleThread(processorType& clusterer, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);
175178

176179
template <bool PayloadExtendsToNextPage>
177-
GPUd() static uint16_t DecodeTBMultiThread(processorType& clusterer, GPUSharedMemory& smem, const int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, const uint8_t* payloadEnd, const uint8_t* nextPage);
180+
GPUd() static int16_t DecodeTBMultiThread(processorType& clusterer, GPUSharedMemory& smem, const int32_t iThread, const uint8_t*& page, uint32_t pageDigitOffset, const header::RAWDataHeader* rawDataHeader, int32_t firstHBF, int32_t cru, uint16_t nSamplesLeftInPage, const uint8_t* payloadEnd, const uint8_t* nextPage);
178181
};
179182

180183
} // namespace o2::gpu

0 commit comments

Comments
 (0)