GPUTPCCFCheckPadBaseline.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCCFCheckPadBaseline.cxx

#include "GPUTPCCFCheckPadBaseline.h"
#include "CfArray2D.h"
#include "PackedCharge.h"
#include "clusterFinderDefs.h"

#ifndef GPUCA_GPUCODE
#include "utils/VcShim.h"
#endif

using namespace o2::gpu;
using namespace o2::gpu::tpccf;

template <>
GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
{
#ifdef GPUCA_GPUCODE
  CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
#else
  CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
#endif
}

// Charges are stored in a 2D array (pad and time) using a tiling layout.
// Tiles are 8 pads x 4 timebins large, are stored in time-major layout, and make up a single cacheline.
//
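// As a rough sketch of the index arithmetic this layout implies (illustrative only; the actual
// mapping lives in CfArray2D / TilingLayout, and `WidthInTiles` stands for the width of the
// charge map in tiles):
//   inTile    = (time % TimebinsPerCacheline) * PadsPerCacheline + (pad % PadsPerCacheline)
//   tileIndex = (time / TimebinsPerCacheline) * WidthInTiles + (pad / PadsPerCacheline)
//   flatIndex = tileIndex * (PadsPerCacheline * TimebinsPerCacheline) + inTile
//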
// This kernel processes one row per block. Threads cooperatively load chunks
// of 4 consecutive time bins for all pads into shared memory. Thread `i` then processes charges for pad `i` in shared memory.
// Blocks require `nextMultipleOf<64>(138 * 4) = 576` threads to process the largest TPC rows with 138 pads correctly.
GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
{
#ifdef GPUCA_GPUCODE
  if (iBlock >= GPUCA_ROW_COUNT) {
    return;
  }

  const CfFragment& fragment = clusterer.mPmemory->fragment;
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));

  const auto iRow = iBlock;
  const auto rowinfo = GetRowInfo(iRow);
  const CfChargePos basePos{(Row)iRow, 0, 0};

  int32_t totalCharges = 0;
  int32_t consecCharges = 0;
  int32_t maxConsecCharges = 0;
  Charge maxCharge = 0;

  const int16_t iPadOffset = iThread % MaxNPadsPerRow;
  const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
  const int16_t iPadHandle = iThread;
  const bool handlePad = iPadHandle < rowinfo.nPads;

  const auto firstTB = fragment.firstNonOverlapTimeBin();
  const auto lastTB = fragment.lastNonOverlapTimeBin();

  for (auto t = firstTB; t < lastTB; t += NumOfCachedTBs) {

    const TPCFragmentTime iTime = t + iTimeOffset;

    const CfChargePos pos = basePos.delta({iPadOffset, iTime});

    smem.charges[iTimeOffset][iPadOffset] = iTime < lastTB && iPadOffset < rowinfo.nPads ? chargeMap[pos].unpack() : 0;

    GPUbarrier();

    if (handlePad) {
      for (int32_t i = 0; i < NumOfCachedTBs; i++) {
        const Charge q = smem.charges[i][iPadHandle];
        totalCharges += (q > 0);
        consecCharges = (q > 0) ? consecCharges + 1 : 0;
        maxConsecCharges = CAMath::Max(consecCharges, maxConsecCharges);
        maxCharge = CAMath::Max<Charge>(q, maxCharge);
      }
    }

    GPUbarrier();
  }

  if (handlePad) {
    updatePadBaseline(rowinfo.globalPadOffset + iPadOffset, clusterer, totalCharges, maxConsecCharges, maxCharge);
  }
#endif
}

GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
{
#ifndef GPUCA_GPUCODE
  const CfFragment& fragment = clusterer.mPmemory->fragment;
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));

  int32_t basePad = iBlock * PadsPerCacheline;
  int32_t padsPerRow;
  CfChargePos basePos = padToCfChargePos<PadsPerCacheline>(basePad, clusterer, padsPerRow);

  if (not basePos.valid()) {
    return;
  }

  constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;

  using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
  using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;

  UShort8 totalCharges{Vc::Zero};
  UShort8 consecCharges{Vc::Zero};
  UShort8 maxConsecCharges{Vc::Zero};
  Charge8 maxCharge{Vc::Zero};

  tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();

  // Access packed charges as raw integers. We throw away the PackedCharge type here to simplify vectorization.
  const uint16_t* packedChargeStart = reinterpret_cast<uint16_t*>(&chargeMap[basePos.delta({0, t})]);

  for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
    for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
      const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
      const UShort8::mask_type isCharge = packedCharges != 0;

      if (isCharge.isNotEmpty()) {
        totalCharges(isCharge)++;
        consecCharges += 1;
        consecCharges(not isCharge) = 0;
        maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);

        // Manually unpack charges to float.
        // Duplicated from PackedCharge::unpack to generate vectorized code:
        // Charge unpack() const { return Charge(mVal & ChargeMask) / Charge(1 << DecimalBits); }
        // Note that PackedCharge has to cut off the highest 2 bits via ChargeMask as they are used for flags by the cluster finder
        // and are not part of the charge value. We can skip this step because the cluster finder hasn't run yet
        // and thus these bits are guaranteed to be zero.
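        // Worked example (the numeric value of DecimalBits is an assumption here, for illustration
        // only): with DecimalBits == 4, a raw packed value of 40 unpacks to 40 / 16 = 2.5.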
        const Charge8 unpackedCharges = Charge8(packedCharges) / Charge(1 << PackedCharge::DecimalBits);
        maxCharge = Vc::max(maxCharge, unpackedCharges);
      } else {
        consecCharges = 0;
      }
    }

    packedChargeStart += ElemsInTileRow;
  }
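
  // For reference, the masked Vc operations above implement the following per-pad scalar
  // logic (a sketch only, not used by this file):
  //   if (packedCharge != 0) {
  //     totalCharges++;
  //     consecCharges++;
  //     maxConsecCharges = max(maxConsecCharges, consecCharges);
  //     maxCharge = max(maxCharge, unpack(packedCharge));
  //   } else {
  //     consecCharges = 0;
  //   }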

  for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
    updatePadBaseline(basePad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad], maxCharge[localpad]);
  }
#endif
}

// Translates a global pad number into the charge-map position of the cacheline-aligned block of
// PadsPerBlock pads containing it: `pad` is rounded down to the start of that block and `padsPerRow`
// is set to the number of pads in the corresponding row. Returns an invalid position (and
// padsPerRow = 0) if the pad number does not belong to any row.
template <int32_t PadsPerBlock>
GPUd() CfChargePos GPUTPCCFCheckPadBaseline::padToCfChargePos(int32_t& pad, const GPUTPCClusterFinder& clusterer, int32_t& padsPerRow)
{
  constexpr GPUTPCGeometry geo;

  int32_t padOffset = 0;
  for (Row r = 0; r < GPUCA_ROW_COUNT; r++) {
    int32_t npads = geo.NPads(r);
    int32_t padInRow = pad - padOffset;
    if (0 <= padInRow && padInRow < npads) {
      int32_t cachelineOffset = padInRow % PadsPerBlock;
      pad -= cachelineOffset;
      padsPerRow = npads;
      return CfChargePos{r, Pad(padInRow - cachelineOffset), 0};
    }
    padOffset += npads;
  }

  padsPerRow = 0;
  return CfChargePos{0, 0, INVALID_TIME_BIN};
}

// Returns the global pad offset of the given row together with the row's number of pads.
GPUd() GPUTPCCFCheckPadBaseline::RowInfo GPUTPCCFCheckPadBaseline::GetRowInfo(int16_t row)
{
  constexpr GPUTPCGeometry geo;

  int16_t padOffset = 0;
  for (int16_t r = 0; r < row; r++) {
    padOffset += geo.NPads(r);
  }

  return RowInfo{padOffset, geo.NPads(row)};
}

GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge)
{
  const CfFragment& fragment = clusterer.mPmemory->fragment;
  const int32_t totalChargesBaseline = clusterer.Param().rec.tpc.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
  const int32_t consecChargesBaseline = clusterer.Param().rec.tpc.maxConsecTimeBinAboveThreshold;
  const uint16_t saturationThreshold = clusterer.Param().rec.tpc.noisyPadSaturationThreshold;
  const bool isNoisy = (!saturationThreshold || maxCharge < saturationThreshold) && ((totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline));
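  // Illustration with made-up numbers: with maxTimeBinAboveThresholdIn1000Bin = 50 and a fragment
  // length of 2000 timebins, totalChargesBaseline = 50 * 2000 / 1000 = 100, so a pad with charges
  // in at least 100 timebins (or with maxConsecTimeBinAboveThreshold consecutive occupied timebins)
  // is flagged as noisy, unless its maximum charge reaches saturationThreshold.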

  if (isNoisy) {
    clusterer.mPpadIsNoisy[pad] = true;
  }
}