Project
Loading...
Searching...
No Matches
GPUTPCCFCheckPadBaseline.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
16#include "CfArray2D.h"
17#include "PackedCharge.h"
18#include "clusterFinderDefs.h"
20#include "GPUCommonAlgorithm.h"
21
22#ifndef GPUCA_GPUCODE
23#include "MCLabelAccumulator.h"
24#include "utils/VcShim.h"
25#endif
26
27#if 0
28#define DPRINT(...) printf(__VA_ARGS__)
29#define DPRINTB(...) \
30 if (iThread == 0) \
31 printf(__VA_ARGS__)
32#define DPRINTB_IF(test, ...) \
33 if (iThread == 0 && (test)) \
34 printf(__VA_ARGS__)
35#else
36#define DPRINT(...) ((void)0)
37#define DPRINTB(...) ((void)0)
38#define DPRINTB_IF(test, ...) ((void)0)
39#endif
40
41using namespace o2::gpu;
42using namespace o2::gpu::tpccf;
43
45
46static GPUdi() HIPTailDescriptor* GetHIPTails(GPUTPCClusterFinder& clusterer, int32_t row)
47{
48 // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
49 return clusterer.mPhipTailsByRow + row * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow;
50}
51
52static GPUdi() Charge UpdateHIPTailFilter(Charge filteredCharge, Charge charge, Charge alpha)
53{
54 return filteredCharge + alpha * (charge - filteredCharge);
55}
56
57static GPUdi() float HIPTailTimeMean(const HIPTailDescriptor& tail)
58{
59 const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
60 return tail.tailStart + 0.5f * (length - 1.f);
61}
62
63static GPUdi() float HIPTailTimeVariance(const HIPTailDescriptor& tail)
64{
65 const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
66 return (length * length - 1.f) * (1.f / 12.f);
67}
68
69// Collect tails marked for closing across the workgroup using a prefix scan,
70// then cooperatively zero the charge map entries for each closed tail.
71// Caller must set acc.activeHIPTail.end before calling if the tail is open.
72static GPUdi() uint16_t CloseHIPTails(
73 Kernel::GPUSharedMemory& smem,
74 GPUTPCClusterFinder& clusterer,
75 int32_t iThread, int32_t nThreads,
76 int16_t iPadHandle,
77 CfChargePos basePos,
78 CfArray2D<PackedCharge>& chargeMap,
79 Kernel::PadChargeAccu& acc,
80 bool shouldCloseTail)
81{
82 const uint32_t row = basePos.row();
83 const uint16_t nClosedTails = work_group_count(shouldCloseTail);
84
85 auto* nHIPTails = clusterer.mPnHIPTails;
86 auto* hipTails = GetHIPTails(clusterer, row);
87
88 if (nClosedTails > 0) {
89 int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1;
90 const bool shouldStoreTail = shouldCloseTail && acc.activeHIPTail.Length() > 0;
91 uint16_t nStoredTails = work_group_count(shouldStoreTail);
92 int16_t iStoredTail = work_group_scan_inclusive_add((int16_t)shouldStoreTail) - 1;
93
94 // Use exactly one atomic add per closing call to reduce differences in
95 // tail ordering between runs.
96 if (nStoredTails > 0) {
97 if (iThread == 0) {
98 smem.tailStoreBase = CAMath::AtomicAdd(&nHIPTails[row], (uint32_t)nStoredTails);
99 }
100 GPUbarrier();
101 }
102 if (shouldCloseTail) {
103 smem.tailsClosedPad[iClosedTail] = iPadHandle;
104 smem.tailsClosed[iClosedTail] = acc.activeHIPTail;
105 smem.tailsClosedStoreIdx[iClosedTail] = GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow;
106
107 if (shouldStoreTail) {
108 const uint32_t idx = smem.tailStoreBase + iStoredTail + 1;
109 smem.tailsClosedStoreIdx[iClosedTail] = idx;
111 hipTails[idx] = {0, 0, (uint16_t)iPadHandle,
112 (uint16_t)acc.activeHIPTail.start, (uint16_t)acc.activeHIPTail.end,
113 0.f, 0.f};
114 }
115 }
116
117 acc.tailFilterCharge = 0;
118 acc.activeHIPTail.Reset();
119 }
120
121 GPUbarrier();
122 }
123
124 // TODO: performance improvement -> parallelize this loop across tails
125 for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) {
126 const auto tailPad = smem.tailsClosedPad[iTail];
127 const auto tail = smem.tailsClosed[iTail];
128 const uint32_t tailStoreIdx = smem.tailsClosedStoreIdx[iTail];
129
130 Charge qTot = 0.f;
131 Charge qMax = 0.f;
132 for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) {
133 const int16_t time = tail.start + iTime;
134 auto pos = basePos.delta({tailPad, time});
135 const Charge q = chargeMap[pos].unpack();
136 qTot += q;
137 qMax = CAMath::Max(qMax, q);
138 chargeMap[pos] = PackedCharge{0};
139 }
140
141 smem.tailQTotScratch[iThread] = qTot;
142 smem.tailQMaxScratch[iThread] = qMax;
143 GPUbarrier();
144 for (uint16_t active = nThreads; active > 1;) {
145 const uint16_t stride = (active + 1) / 2;
146 if (iThread < active - stride) {
147 smem.tailQTotScratch[iThread] += smem.tailQTotScratch[iThread + stride];
148 smem.tailQMaxScratch[iThread] = CAMath::Max(smem.tailQMaxScratch[iThread], smem.tailQMaxScratch[iThread + stride]);
149 }
150 active = stride;
151 GPUbarrier();
152 }
153
154 if (iThread == 0 && tailStoreIdx < GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow) {
155 HIPTailDescriptor& tailDescriptor = hipTails[tailStoreIdx];
156 tailDescriptor.qTot = smem.tailQTotScratch[0];
157 tailDescriptor.qMax = smem.tailQMaxScratch[0];
158 }
159 }
160
161 return nClosedTails;
162}
163
164template <bool CheckHIPTrigger, bool CheckHIPTailEnd>
165static GPUdi() void ScanCachedCharges(Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad, Charge hipTailThreshold, Charge hipTailFilterAlpha, Kernel::PadChargeAccu& acc)
166{
167 for (int32_t i = 0; i < Kernel::NumOfCachedTBs; i++) {
168 const Charge qs = smem.charges[i][pad];
169 const int16_t curTB = timeOffset + i;
170
171 acc.totalCharges += qs > 0;
172 acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0;
173 acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges);
174 acc.maxCharge = CAMath::Max<Charge>(qs, acc.maxCharge);
175
176 if (qs >= hipTailThreshold) {
177 if (acc.aboveThresholdStart < 0) {
178 acc.aboveThresholdStart = curTB;
179 }
180 } else {
181 acc.aboveThresholdStart = -1;
182 }
183
184 if constexpr (CheckHIPTrigger) {
185 if (acc.HIPtb < 0 && qs >= Charge(Kernel::MaxADC)) {
186 acc.HIPtb = acc.aboveThresholdStart; // start of rising edge, not first sat TB
187 smem.tails[pad] = {acc.HIPtb, 0}; // Broadcast HIP start TB to neighboring pads / threads
188 }
189 }
190
191 if constexpr (CheckHIPTailEnd) {
192 if (acc.activeHIPTail.IsOpen()) {
193 acc.tailFilterCharge = UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha);
194 if (acc.tailFilterCharge < hipTailThreshold) {
195 acc.activeHIPTail.end = curTB;
196 }
197 }
198 }
199 }
200}
201
202template <>
203GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
204{
205#ifdef GPUCA_GPUCODE
206 CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
207#else
208 CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
209#endif
210}
211
212// Charges are stored in a 2D array (pad and time) using a tiling layout.
213// Tiles are 8 pads x 4 timebins large stored in time-major layout and make up a single cacheline.
214//
215// This kernel processes one row per block. Threads cooperatively load chunks
216// of 4 consecutive time bins for all pads into shared memory. Thread `i` then processes charges for pad `i` in shared memory.
217// Blocks require `nextMultipleOf<64>(138 * 4) = 576` threads to process the largest TPC rows with 138 pads correctly.
218GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
219{
220#ifdef GPUCA_GPUCODE
221 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) == 576);
222 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
223 return;
224 }
225
226 const CfFragment& fragment = clusterer.mPmemory->fragment;
227 const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter;
228 const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold;
229 const Charge hipTailFilterAlpha = clusterer.Param().rec.tpc.hipTailFilterAlpha;
230 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
231
232 constexpr GPUTPCGeometry geo;
233
234 const auto iRow = iBlock;
235 const auto nPads = geo.NPads(iRow);
236 const CfChargePos basePos{(Row)iRow, 0, 0};
237
238 PadChargeAccu acc;
239
240 const int16_t iPadOffset = iThread % MaxNPadsPerRow;
241 const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
242 const int16_t iPadHandle = iThread;
243 const bool handlePad = iPadHandle < nPads;
244
245 if (iPadHandle < MaxNPadsPerRow) {
246 smem.tails[iPadHandle] = {-1, -1};
247 }
248 GPUbarrier();
249
250 // Pad filter scans the entire fragments including overlap.
251 // Minimal runtime overhead and prevents headaches later on as
252 // saturated signal in overlap region can create tails in the next fragment
253 // even when cleared in current fragment as they're decoded twice
254 const TPCFragmentTime firstTB = 0;
255 const TPCFragmentTime lastTB = fragment.length;
256
257 for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) {
258
259 bool thisThreadHasTrigger = false;
260 for (uint16_t tt = 0; tt < NumOfCachedTBs; tt += TimebinsPerCacheline) {
261 const TPCFragmentTime iTimeLoad = t + tt + iTimeOffset;
262
263 const CfChargePos pos = basePos.delta({iPadOffset, iTimeLoad});
264
265 const Charge ql = iTimeLoad < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0;
266 smem.charges[tt + iTimeOffset][iPadOffset] = ql;
267
268 thisThreadHasTrigger |= ql >= Charge(MaxADC);
269 }
270
271 bool hasHIPTrigger = false;
272 if (hipFilterOn) {
273 hasHIPTrigger = work_group_any(thisThreadHasTrigger);
274 } else {
275 // Need a barrier here even if HIP filter is disabled
276 GPUbarrier();
277 }
278
279 acc.HIPtb = -1;
280
281 if (handlePad) {
282
283 // TODO: is this really necessary?
284 // Why is the old version so much slower, when we just add short branches to the loop???
285 if (!hasHIPTrigger) [[likely]] {
286 if (!acc.activeHIPTail.IsOpen()) {
287 ScanCachedCharges<false, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
288 } else {
289 ScanCachedCharges<false, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
290 }
291 } else {
292 if (!acc.activeHIPTail.IsOpen()) {
293 ScanCachedCharges<true, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
294 } else {
295 ScanCachedCharges<true, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
296 }
297 }
298 }
299
300 GPUbarrier();
301
302 if (hasHIPTrigger) [[unlikely]] {
303
304 DPRINTB("%d: Trigger!\n", iBlock);
305
306 if (handlePad && acc.HIPtb < 0) {
307
308 // Search neighboring pads for trigger
309 for (int16_t i = -SSClusterPadWidth; i < 0; i++) {
310 const auto p = iPadHandle + i;
311 if (p > -1) {
312 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
313 }
314 }
315
316 for (int16_t i = 1; i <= SSClusterPadWidth; i++) {
317 const auto p = iPadHandle + i;
318 if (p < MaxNPadsPerRow) {
319 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
320 }
321 }
322 }
323
324 bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue();
325 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
326 DPRINT("%d: end = %d\n", iThread, acc.HIPtb);
327 acc.activeHIPTail.end = acc.HIPtb;
328 }
329
330 CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
331
332 GPUbarrier();
333
334 if (acc.HIPtb > -1) {
335 DPRINT("%d: start = %d\n", iThread, acc.HIPtb);
336 acc.activeHIPTail.SetOpen(acc.HIPtb);
337 acc.tailFilterCharge = Charge(MaxADC);
338 }
339
340 // Clear smem between iterations to prevent stale entries
341 if (handlePad) {
342 smem.tails[iPadHandle].Reset();
343 }
344
345 GPUbarrier();
346
347 } // if (hipTriggerFound)
348
349 } // for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs)
350
351 if (handlePad) {
352 updatePadBaseline(basePos.gpad + iPadHandle, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge);
353 }
354
355 // --- Close remaining tails
356 const bool shouldCloseTail = acc.activeHIPTail.HasValue();
357
358 // Call `work_group_any` here, instead of always counting.
359 // This is important as `work_group_count` is a lot slower
360 // and has a lot of overhead if no HIPs were found.
361 if (work_group_any(shouldCloseTail)) {
362 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
363 acc.activeHIPTail.end = lastTB;
364 }
365
366 [[maybe_unused]] const uint16_t nClosedTails = CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
367
368 DPRINTB_IF(nClosedTails > 0, "%d: Close remaining tails (%d)\n", iBlock, nClosedTails);
369 }
370
371#endif
372}
373
374GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
375{
376#ifndef GPUCA_GPUCODE
377 const CfFragment& fragment = clusterer.mPmemory->fragment;
378 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
379
380 CfChargePos basePos(iBlock * PadsPerCacheline, 0);
381
382 constexpr GPUTPCGeometry geo;
383 if (basePos.pad() >= geo.NPads(basePos.row())) {
384 return;
385 }
386
387 constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
388
389 using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
390 using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
391
392 UShort8 totalCharges{Vc::Zero};
393 UShort8 consecCharges{Vc::Zero};
394 UShort8 maxConsecCharges{Vc::Zero};
395 Charge8 maxCharge{Vc::Zero};
396
397 tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
398
399 // Access packed charges as raw integers. We throw away the PackedCharge type here to simplify vectorization.
400 const uint16_t* packedChargeStart = reinterpret_cast<uint16_t*>(&chargeMap[basePos.delta({0, t})]);
401
402 for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
403 for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
404 const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
405 const UShort8::mask_type isCharge = packedCharges != 0;
406
407 if (isCharge.isNotEmpty()) {
408 totalCharges(isCharge)++;
409 consecCharges += 1;
410 consecCharges(not isCharge) = 0;
411 maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
412
413 // Manually unpack charges to float.
414 // Duplicated from PackedCharge::unpack to generate vectorized code:
415 // Charge unpack() const { return Charge(mVal & ChargeMask) / Charge(1 << DecimalBits); }
416 // Note that PackedCharge has to cut off the highest 2 bits via ChargeMask as they are used for flags by the cluster finder
417 // and are not part of the charge value. We can skip this step because the cluster finder hasn't run yet
418 // and thus these bits are guarenteed to be zero.
419 const Charge8 unpackedCharges = Charge8(packedCharges) / Charge(1 << PackedCharge::DecimalBits);
420 maxCharge = Vc::max(maxCharge, unpackedCharges);
421 } else {
422 consecCharges = 0;
423 }
424 }
425
426 packedChargeStart += ElemsInTileRow;
427 }
428
429 for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
430 updatePadBaseline(basePos.gpad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad], maxCharge[localpad]);
431 }
432#endif
433}
434
435GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge)
436{
437 const CfFragment& fragment = clusterer.mPmemory->fragment;
438 const int32_t totalChargesBaseline = clusterer.Param().rec.tpc.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
439 const int32_t consecChargesBaseline = clusterer.Param().rec.tpc.maxConsecTimeBinAboveThreshold;
440 const uint16_t saturationThreshold = clusterer.Param().rec.tpc.noisyPadSaturationThreshold;
441 const bool isNoisy = (!saturationThreshold || maxCharge < saturationThreshold) && ((totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline));
442
443 if (isNoisy) {
444 clusterer.mPpadIsNoisy[pad] = true;
445 }
446}
447
448// ======== HIP Tail Connector Kernel ========
449
450template <>
451GPUd() void GPUTPCCFHIPTailConnector::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
452{
453 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
454 return;
455 }
456 const uint32_t row = iBlock;
457
458 const uint32_t nTails = CAMath::Min(clusterer.mPnHIPTails[row], (uint32_t)MaxHIPTailsPerRow - 1);
459
460 // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
461 HIPTailDescriptor* tails = GetHIPTails(clusterer, row);
462
463#ifdef GPUCA_DETERMINISTIC_MODE
464 // Races in tail comparisons and atomic swap can lead to slightly different clusters.
465 // So need a sequential fallback for deterministic mode
466 if (iThread > 0) {
467 return;
468 }
469 nThreads = 1;
470 GPUCommonAlgorithm::sortInBlock(tails + 1, tails + nTails + 1, [](auto&& t1, auto&& t2) {
471 if (t1.pad != t2.pad) {
472 return t1.pad < t2.pad;
473 }
474 return t1.tailStart < t2.tailStart;
475 });
476#endif
477
478 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
479 auto* tail = &tails[iTail];
480
481 // TODO: this is needed because tailStarts may vary due to rising edge
482 // Better approach would be to also track the triggered timebin and match that instead
483 uint16_t overlapWindowStart = tail->tailStart >= 5 ? tail->tailStart - 5 : 0;
484 uint16_t overlapWindowEnd = tail->tailStart + 5;
485
486 for (uint32_t jTail = iTail + 1; jTail <= nTails; jTail++) {
487 auto* tailNext = &tails[jTail];
488 if (tailNext->iPrev > 0) {
489 continue;
490 }
491
492 const bool overlapPad = tailNext->pad >= tail->pad - GPUTPCCFCheckPadBaseline::SSClusterPadWidth && tailNext->pad <= tail->pad + GPUTPCCFCheckPadBaseline::SSClusterPadWidth;
493 const bool overlapTime = tailNext->tailStart >= overlapWindowStart && tailNext->tailStart < overlapWindowEnd;
494
495 if (overlapPad && overlapTime) {
496 if (CAMath::AtomicCAS(&tailNext->iPrev, 0u, iTail)) {
497 tail->iNext = jTail;
498 break;
499 }
500 }
501 }
502 }
503}
504
505// ======== HIP Clusterizer Kernel ========
506
507template <>
508GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t onlyMC)
509{
510 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
511 return;
512 }
513
514 const uint32_t row = iBlock;
515 uint32_t nTails = clusterer.mPnHIPTails[row];
516 nTails = CAMath::Min(nTails, (uint32_t)MaxHIPTailsPerRow - 1);
517
518 const auto* tails = GetHIPTails(clusterer, row);
519 const auto& fragment = clusterer.mPmemory->fragment;
520
521 auto* clusterPosInRow = clusterer.mPhipClusterPosInRow
522 ? clusterer.mPhipClusterPosInRow + row * MaxHIPTailsPerRow
523 : nullptr;
524
525 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
526
527 const auto* tail = &tails[iTail];
528 if (tail->iPrev != 0) {
529 continue;
530 }
531
532 CPU_ONLY(auto labelAcc = MCLabelAccumulator{clusterer});
533
534 float qTot = 0;
535 float qMax = 0;
536 float padSum = 0;
537 float padSqSum = 0;
538 float timeSum = 0;
539 uint32_t tailStart = (uint32_t)-1;
540 uint32_t tailEnd = 0;
541
542 // Zero-th element is empty tail
543 for (; tail != tails; tail = &tails[tail->iNext]) {
544 const float tailWeight = tail->qTot;
545 const float tailPad = tail->pad;
546 const float tailTime = HIPTailTimeMean(*tail);
547 qMax = CAMath::Max(qMax, tail->qMax);
548 qTot += tail->qTot;
549 padSum += tailWeight * tailPad;
550 padSqSum += tailWeight * tailPad * tailPad;
551 timeSum += tailWeight * tailTime;
552 tailStart = CAMath::Min<uint32_t>(tailStart, tail->tailStart);
553 tailEnd = CAMath::Max<uint32_t>(tailEnd, tail->tailEnd);
554
555 CPU_ONLY(labelAcc.collectTail(row, tail->pad, tail->tailStart, tail->tailEnd));
556 }
557
558 const float weightSum = CAMath::Max(qTot, 1.f);
559 const float padMean = padSum / weightSum;
560 const float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
561 const float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
562
564 cn.qMax = qMax;
565 cn.setSaturatedQtot(qTot);
566 cn.setSaturatedTailLength(tailEnd - tailStart);
567 float clusterTime = fragment.start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
568 cn.setTimeFlags(clusterTime, 0);
569 cn.setPad(padMean);
570 cn.setSigmaPad(padSigma);
571
572 if (cn.qMax >= 1023) {
573
574 uint32_t index;
575
576 if (!onlyMC) {
577 // Cut off clusters where the tail connection failed for some reason
578 // TODO: Deduplicate with GPUTPCCFClusterizer::sortIntoBuckets (can't call cross-kernel).
579 // TODO: Add error reporting for row cluster overflow.
580 index = CAMath::AtomicAdd(&clusterer.mPclusterInRow[row], 1u);
581 if (index < clusterer.mNMaxClusterPerRow) {
582 clusterer.mPclusterByRow[clusterer.mNMaxClusterPerRow * row + index] = cn;
583 }
584 if (clusterPosInRow) {
585 clusterPosInRow[iTail] = index;
586 }
587 } else {
588 index = clusterPosInRow[iTail];
589 }
590
591 CPU_ONLY(labelAcc.commit(row, index, clusterer.mNMaxClusterPerRow));
592 }
593
594 } // for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads)
595}
Class of a TPC cluster in TPC-native coordinates (row, time)
int16_t time
Definition RawEventData.h:4
int32_t i
#define GPUbarrier()
#define GPUCA_GET_THREAD_COUNT(...)
#define DPRINT(...)
#define DPRINTB(...)
#define DPRINTB_IF(test,...)
GPUd() void GPUTPCCFCheckPadBaseline
static Charge charge
static int32_t row
uint16_t pos
Definition RawData.h:3
Provides a basic fallback implementation for Vc.
static constexpr uint32_t NROWS
#define CPU_ONLY(x)
GLfloat GLfloat GLfloat alpha
Definition glcorearb.h:279
GLuint index
Definition glcorearb.h:781
GLuint GLsizei GLsizei * length
Definition glcorearb.h:790
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLint GLenum GLboolean GLsizei stride
Definition glcorearb.h:867
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
Definition glcorearb.h:5034
GPUdi() o2
Definition TrackTRD.h:39
tpccf::TPCFragmentTime length
Definition CfFragment.h:33
tpccf::TPCTime start
Definition CfFragment.h:31