// GPUTPCCFCheckPadBaseline.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCCFCheckPadBaseline.cxx

// NOTE(review): the scrape collapsed the original lines 12-15; the class's own
// header must be included here for the member definitions below to compile.
#include "GPUTPCCFCheckPadBaseline.h"

#include "CfArray2D.h"
#include "PackedCharge.h"
#include "clusterFinderDefs.h"

#ifndef GPUCA_GPUCODE
#include "utils/VcShim.h"
#endif

using namespace o2::gpu;
using namespace o2::gpu::tpccf;

27template <>
28GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
29{
30#ifdef GPUCA_GPUCODE
31 CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
32#else
33 CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
34#endif
35}

// Charges are stored in a 2D array (pad and time) using a tiling layout.
// Tiles are 8 pads x 4 timebins large stored in time-major layout and make up a single cacheline.
//
// This kernel processes one row per block. Threads cooperatively load chunks
// of 4 consecutive time bins for all pads into shared memory. Thread `i` then
// processes charges for pad `i` in shared memory.
// Blocks require `nextMultipleOf<64>(138 * 4) = 576` threads to process the
// largest TPC rows with 138 pads correctly.
43GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
44{
45#ifdef GPUCA_GPUCODE
46 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) == 576);
47 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
48 return;
49 }
50
51 const CfFragment& fragment = clusterer.mPmemory->fragment;
52 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
53
54 constexpr GPUTPCGeometry geo;
55
56 const auto iRow = iBlock;
57 const auto nPads = geo.NPads(iRow);
58 const CfChargePos basePos{(Row)iRow, 0, 0};
59
60 int32_t totalCharges = 0;
61 int32_t consecCharges = 0;
62 int32_t maxConsecCharges = 0;
63 Charge maxCharge = 0;
64
65 const int16_t iPadOffset = iThread % MaxNPadsPerRow;
66 const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
67 const int16_t iPadHandle = iThread;
68 const bool handlePad = iPadHandle < nPads;
69
70 const auto firstTB = fragment.firstNonOverlapTimeBin();
71 const auto lastTB = fragment.lastNonOverlapTimeBin();
72
73 for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) {
74
75 const TPCFragmentTime iTime = t + iTimeOffset;
76
77 const CfChargePos pos = basePos.delta({iPadOffset, iTime});
78
79 smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0;
80
81 GPUbarrier();
82
83 if (handlePad) {
84 for (int32_t i = 0; i < NumOfCachedTBs; i++) {
85 const Charge q = smem.charges[i][iPadHandle];
86 totalCharges += (q > 0);
87 consecCharges = (q > 0) ? consecCharges + 1 : 0;
88 maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
89 maxCharge = CAMath::Max<Charge>(q, maxCharge);
90 }
91 }
92
93 GPUbarrier();
94 }
95
96 if (handlePad) {
97 updatePadBaseline(basePos.gpad + iPadHandle, clusterer, totalCharges, maxConsecCharges, maxCharge);
98 }
99#endif
100}
101
102GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
103{
104#ifndef GPUCA_GPUCODE
105 const CfFragment& fragment = clusterer.mPmemory->fragment;
106 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
107
108 CfChargePos basePos(iBlock * PadsPerCacheline, 0);
109
110 constexpr GPUTPCGeometry geo;
111 if (basePos.pad() >= geo.NPads(basePos.row())) {
112 return;
113 }
114
115 constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
116
117 using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
118 using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
119
120 UShort8 totalCharges{Vc::Zero};
121 UShort8 consecCharges{Vc::Zero};
122 UShort8 maxConsecCharges{Vc::Zero};
123 Charge8 maxCharge{Vc::Zero};
124
125 tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
126
127 // Access packed charges as raw integers. We throw away the PackedCharge type here to simplify vectorization.
128 const uint16_t* packedChargeStart = reinterpret_cast<uint16_t*>(&chargeMap[basePos.delta({0, t})]);
129
130 for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
131 for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
132 const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
133 const UShort8::mask_type isCharge = packedCharges != 0;
134
135 if (isCharge.isNotEmpty()) {
136 totalCharges(isCharge)++;
137 consecCharges += 1;
138 consecCharges(not isCharge) = 0;
139 maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
140
141 // Manually unpack charges to float.
142 // Duplicated from PackedCharge::unpack to generate vectorized code:
143 // Charge unpack() const { return Charge(mVal & ChargeMask) / Charge(1 << DecimalBits); }
144 // Note that PackedCharge has to cut off the highest 2 bits via ChargeMask as they are used for flags by the cluster finder
145 // and are not part of the charge value. We can skip this step because the cluster finder hasn't run yet
146 // and thus these bits are guarenteed to be zero.
147 const Charge8 unpackedCharges = Charge8(packedCharges) / Charge(1 << PackedCharge::DecimalBits);
148 maxCharge = Vc::max(maxCharge, unpackedCharges);
149 } else {
150 consecCharges = 0;
151 }
152 }
153
154 packedChargeStart += ElemsInTileRow;
155 }
156
157 for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
158 updatePadBaseline(basePos.gpad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad], maxCharge[localpad]);
159 }
160#endif
161}
162
163GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge)
164{
165 const CfFragment& fragment = clusterer.mPmemory->fragment;
166 const int32_t totalChargesBaseline = clusterer.Param().rec.tpc.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
167 const int32_t consecChargesBaseline = clusterer.Param().rec.tpc.maxConsecTimeBinAboveThreshold;
168 const uint16_t saturationThreshold = clusterer.Param().rec.tpc.noisyPadSaturationThreshold;
169 const bool isNoisy = (!saturationThreshold || maxCharge < saturationThreshold) && ((totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline));
170
171 if (isNoisy) {
172 clusterer.mPpadIsNoisy[pad] = true;
173 }
174}