Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,10 @@
#include "CommonDataFormat/InteractionRecord.h"
#endif

#include "utils/VcShim.h"
#include "utils/strtag.h"
#include <fstream>

#ifndef GPUCA_NO_VC
#include <Vc/Vc>
#endif

using namespace o2::gpu;
using namespace o2::tpc;
using namespace o2::tpc::constants;
Expand Down Expand Up @@ -173,7 +170,7 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);

for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
#ifndef GPUCA_NO_VC

if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
Expand All @@ -182,7 +179,6 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
}
}
}
#endif

std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
fragments.reserve(mCFContext->nFragments);
Expand All @@ -201,12 +197,12 @@ std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint
}
nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
#ifndef GPUCA_NO_VC

if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
}
#endif

const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page;
if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
Expand Down Expand Up @@ -510,7 +506,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
return 1;
}
}
#ifndef GPUCA_NO_VC

if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
Expand All @@ -521,7 +517,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
}
}
}
#endif

const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
nDigitsFragmentMax[iSector] = x.first;
processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first;
Expand Down
30 changes: 1 addition & 29 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@
#include "clusterFinderDefs.h"

#ifndef GPUCA_GPUCODE
#ifndef GPUCA_NO_VC
#include <Vc/Vc>
#else
#include <array>
#endif
#include "utils/VcShim.h"
#endif

using namespace o2::gpu;
Expand Down Expand Up @@ -80,20 +76,13 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread

constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;

#ifndef GPUCA_NO_VC
using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;

UShort8 totalCharges{Vc::Zero};
UShort8 consecCharges{Vc::Zero};
UShort8 maxConsecCharges{Vc::Zero};
Charge8 maxCharge{Vc::Zero};
#else
std::array<uint16_t, PadsPerCacheline> totalCharges{0};
std::array<uint16_t, PadsPerCacheline> consecCharges{0};
std::array<uint16_t, PadsPerCacheline> maxConsecCharges{0};
std::array<Charge, PadsPerCacheline> maxCharge{0};
#endif

tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();

Expand All @@ -102,7 +91,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread

for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
#ifndef GPUCA_NO_VC
const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
const UShort8::mask_type isCharge = packedCharges != 0;

Expand All @@ -123,22 +111,6 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThread
} else {
consecCharges = 0;
}
#else // Vc not available
for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
const uint16_t packedCharge = packedChargeStart[PadsPerCacheline * localtime + localpad];
const bool isCharge = packedCharge != 0;
if (isCharge) {
totalCharges[localpad]++;
consecCharges[localpad]++;
maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]);

const Charge unpackedCharge = Charge(packedCharge) / Charge(1 << PackedCharge::DecimalBits);
maxCharge[localpad] = CAMath::Max<Charge>(maxCharge[localpad], unpackedCharge);
} else {
consecCharges[localpad] = 0;
}
}
#endif
}

packedChargeStart += ElemsInTileRow;
Expand Down
192 changes: 192 additions & 0 deletions GPU/GPUTracking/utils/VcShim.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
// Copyright 2020-2025 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file VcShim.h
/// \brief Provides a basic fallback implementation for Vc
///
/// \author Felix Weiglhofer

#ifndef GPU_UTILS_VCSHIM_H
#define GPU_UTILS_VCSHIM_H

#ifndef GPUCA_NO_VC

#include <Vc/Vc>

#else

#include <algorithm>
#include <array>
#include <bitset>
#include <cstddef>

namespace Vc
{

/// Empty tag type; passing the constant Zero to a fixed_size_simd constructor yields an all-zero vector.
struct VectorSpecialInitializerZero {
};
constexpr VectorSpecialInitializerZero Zero{};

/// Empty tag type accepted by the pointer-loading constructor of fixed_size_simd.
/// The shim only uses it for overload selection.
struct AlignedTag {
};
constexpr AlignedTag Aligned{};

/// Returns a mutable reference to the storage member (mData) of a shim vector.
/// \tparam Vec Type exposing a nested vector_type and an mData member of that type.
template <typename Vec>
auto internal_data(Vec& vec) -> typename Vec::vector_type&
{
  return vec.mData;
}

/// Read-only counterpart: returns a const reference to the storage member (mData).
template <typename Vec>
auto internal_data(const Vec& vec) -> const typename Vec::vector_type&
{
  return vec.mData;
}

namespace Common
{

/// Proxy returned by a masked vector access: modifications applied through it only
/// affect the lanes of the referenced vector whose mask entry is set.
/// Holds references to both the vector and the mask, so it must not outlive either.
/// \tparam V Vector type providing value_type, size() and operator[].
/// \tparam M Mask type providing operator[] convertible to bool.
template <typename V, typename M>
class WriteMaskVector
{
 private:
  const M& mMask;
  V& mVec;

 public:
  using value_type = typename V::value_type;

  WriteMaskVector(V& v, const M& m) : mMask(m), mVec(v) {}

  /// Masked post-increment: adds the mask bit (0 or 1, converted to value_type) to every lane.
  WriteMaskVector& operator++(int)
  {
    const std::size_t lanes = mVec.size();
    for (std::size_t lane = 0; lane < lanes; ++lane) {
      mVec[lane] += value_type(mMask[lane]);
    }
    return *this;
  }

  /// Masked assignment: writes v into the selected lanes and leaves the others untouched.
  WriteMaskVector& operator=(const value_type& v)
  {
    const std::size_t lanes = mVec.size();
    for (std::size_t lane = 0; lane < lanes; ++lane) {
      if (!mMask[lane]) {
        continue;
      }
      mVec[lane] = v;
    }
    return *this;
  }
};

// Software prefetch hints standing in for Vc::Common's prefetch helpers.
// Where the compiler offers __builtin_prefetch (GCC / Clang) a real read-prefetch is issued,
// so prefetch-related settings keep having an effect in builds without Vc. On other compilers
// the functions are no-ops. A prefetch is only a hint: it does not modify memory and does not
// fault on invalid addresses.
// The locality arguments are chosen to approximate Vc's documented behaviour
// (prefetchMid -> L2, prefetchFar -> L3, prefetchForOneRead -> non-temporal).

/// Hint that the cache line at \p addr will be read soon (moderate temporal locality).
inline void prefetchMid([[maybe_unused]] const void* addr)
{
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(addr, 0, 2);
#endif
}

/// Hint that the cache line at \p addr will be read later (low temporal locality).
inline void prefetchFar([[maybe_unused]] const void* addr)
{
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(addr, 0, 1);
#endif
}

/// Hint that the cache line at \p addr will be read once and need not stay cached.
inline void prefetchForOneRead([[maybe_unused]] const void* addr)
{
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(addr, 0, 0);
#endif
}

} // namespace Common

/// Fallback for Vc's fixed-size SIMD mask: one boolean per lane, stored in a std::bitset.
/// A default-constructed mask has all lanes cleared (std::bitset's default state).
/// \tparam T Element type of the associated vector; not used by the storage itself.
/// \tparam N Number of lanes.
template <typename T, size_t N>
class fixed_size_simd_mask
{
 private:
  std::bitset<N> mData;

 public:
  /// \return true if at least one lane is set.
  bool isNotEmpty() const { return mData.any(); }

  /// Writable access to lane \p i through std::bitset's proxy reference.
  // 'typename' is mandatory here before C++20 because std::bitset<N>::reference is a dependent type.
  typename std::bitset<N>::reference operator[](size_t i) { return mData[i]; }
  /// Read-only access to lane \p i.
  bool operator[](size_t i) const { return mData[i]; }

  /// \return a copy of this mask with every lane inverted.
  fixed_size_simd_mask operator!() const
  {
    auto o = *this;
    o.mData.flip();
    return o;
  }
};

/// Fallback for Vc::fixed_size_simd: a vector of N elements of type T backed by a std::array.
/// Every operation is a plain per-element loop; only the subset of the Vc interface
/// declared below is provided.
/// \tparam T Element type.
/// \tparam N Number of lanes.
template <typename T, size_t N>
class fixed_size_simd
{
 private:
  // Value-initialized so that a default-constructed vector is all zeros instead of indeterminate,
  // consistent with the zero-initializing default constructor documented for Vc's vector types.
  std::array<T, N> mData{};

 public:
  using vector_type = std::array<T, N>;
  using value_type = T;
  using mask_type = fixed_size_simd_mask<T, N>;

  /// \return the number of lanes (N).
  static constexpr size_t size() { return N; }

  /// Creates an all-zero vector.
  fixed_size_simd() = default;
  /// Creates an all-zero vector (explicit Vc::Zero spelling); the member initializer does the work.
  explicit fixed_size_simd(VectorSpecialInitializerZero) {}

  /// Element-wise converting constructor from a vector of the same width but another element type.
  /// Intentionally implicit so conversions between vector types can be written as in the call sites.
  template <typename U>
  fixed_size_simd(const fixed_size_simd<U, N>& w)
  {
    std::copy_n(internal_data(w).begin(), N, mData.begin());
  }

  /// Loads N consecutive elements starting at \p d. The tag only selects this overload;
  /// the shim performs an ordinary element copy.
  fixed_size_simd(const T* d, AlignedTag) { std::copy_n(d, N, mData.begin()); }

  T& operator[](size_t i) { return mData[i]; }
  const T& operator[](size_t i) const { return mData[i]; }

  /// Masked access: the returned proxy references both *this and \p m,
  /// so it must be used before either goes out of scope.
  Common::WriteMaskVector<fixed_size_simd, mask_type> operator()(const mask_type& m) { return {*this, m}; }

  /// Broadcast assignment: sets every lane to \p v.
  fixed_size_simd& operator=(const T& v)
  {
    for (auto& x : mData) {
      x = v;
    }
    return *this;
  }

  /// Adds the scalar \p v to every lane.
  fixed_size_simd& operator+=(const T& v)
  {
    for (auto& x : mData) {
      x += v;
    }
    return *this;
  }

  /// Divides every lane by the scalar \p v.
  fixed_size_simd& operator/=(const T& v)
  {
    for (auto& x : mData) {
      x /= v;
    }
    return *this;
  }

  /// \return a copy of this vector with every lane divided by \p v.
  fixed_size_simd operator/(const T& v) const
  {
    auto x = *this;
    x /= v;
    return x; // returning the local by name avoids the extra copy of 'return x /= v;'
  }

  /// Lane-wise comparison against a scalar; lane i of the result is set where element i equals \p v.
  mask_type operator==(const T& v) const
  {
    mask_type m;
    for (size_t i = 0; i < N; i++) {
      m[i] = mData[i] == v;
    }
    return m;
  }

  /// Lane-wise inequality against a scalar (the inverted equality mask).
  mask_type operator!=(const T& v) const { return !(*this == v); }

  // internal_data() needs access to the private storage.
  friend vector_type& internal_data<>(fixed_size_simd& x);
  friend const vector_type& internal_data<>(const fixed_size_simd& x);
};

/// Lane-wise maximum of two vectors of the same type.
/// \tparam V Default-constructible vector type providing size() and operator[].
/// \return a vector whose element i is std::max(a[i], b[i]).
template <typename V>
V max(const V& a, const V& b)
{
  V result;
  const std::size_t lanes = a.size();
  for (std::size_t lane = 0; lane != lanes; ++lane) {
    result[lane] = std::max(a[lane], b[lane]);
  }
  return result;
}

} // namespace Vc

#endif // ifndef GPUCA_NO_VC

#endif