Project
Loading...
Searching...
No Matches
GPUTPCCFCheckPadBaseline.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
16#include "CfArray2D.h"
17#include "PackedCharge.h"
18#include "clusterFinderDefs.h"
20
21#ifndef GPUCA_GPUCODE
22#include "utils/VcShim.h"
23#endif
24
25#if 0
26#define DPRINT(...) printf(__VA_ARGS__)
27#define DPRINTB(...) \
28 if (iThread == 0) \
29 printf(__VA_ARGS__)
30#define DPRINTB_IF(test, ...) \
31 if (iThread == 0 && (test)) \
32 printf(__VA_ARGS__)
33#else
34#define DPRINT(...) ((void)0)
35#define DPRINTB(...) ((void)0)
36#define DPRINTB_IF(test, ...) ((void)0)
37#endif
38
39using namespace o2::gpu;
40using namespace o2::gpu::tpccf;
41
43
44static GPUdi() HIPTailDescriptor* GetHIPTails(GPUTPCClusterFinder& clusterer, int32_t row)
45{
46 // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
47 return clusterer.mPhipTailsByRow + row * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow;
48}
49
50static GPUdi() Charge UpdateHIPTailFilter(Charge filteredCharge, Charge charge, Charge alpha)
51{
52 return filteredCharge + alpha * (charge - filteredCharge);
53}
54
55static GPUdi() float HIPTailTimeMean(const HIPTailDescriptor& tail)
56{
57 const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
58 return tail.tailStart + 0.5f * (length - 1.f);
59}
60
61static GPUdi() float HIPTailTimeVariance(const HIPTailDescriptor& tail)
62{
63 const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
64 return (length * length - 1.f) * (1.f / 12.f);
65}
66
67// Collect tails marked for closing across the workgroup using a prefix scan,
68// then cooperatively zero the charge map entries for each closed tail.
69// Caller must set acc.activeHIPTail.end before calling if the tail is open.
70static GPUdi() uint16_t CloseHIPTails(
71 Kernel::GPUSharedMemory& smem,
72 GPUTPCClusterFinder& clusterer,
73 int32_t iThread, int32_t nThreads,
74 int16_t iPadHandle,
75 CfChargePos basePos,
76 CfArray2D<PackedCharge>& chargeMap,
77 Kernel::PadChargeAccu& acc,
78 bool shouldCloseTail)
79{
80 const uint32_t row = basePos.row();
81 const uint16_t nClosedTails = work_group_count(shouldCloseTail);
82
83 auto* nHIPTails = clusterer.mPnHIPTails;
84 auto* hipTails = GetHIPTails(clusterer, row);
85
86 if (nClosedTails > 0) {
87 int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1;
88 const bool shouldStoreTail = shouldCloseTail && acc.activeHIPTail.Length() > 0;
89 uint16_t nStoredTails = work_group_count(shouldStoreTail);
90 int16_t iStoredTail = work_group_scan_inclusive_add((int16_t)shouldStoreTail) - 1;
91
92 // Use exactly one atomic add per closing call to reduce differences in
93 // tail ordering between runs.
94 if (nStoredTails > 0) {
95 if (iThread == 0) {
96 smem.tailStoreBase = CAMath::AtomicAdd(&nHIPTails[row], (uint32_t)nStoredTails);
97 }
98 GPUbarrier();
99 }
100 if (shouldCloseTail) {
101 smem.tailsClosedPad[iClosedTail] = iPadHandle;
102 smem.tailsClosed[iClosedTail] = acc.activeHIPTail;
103 smem.tailsClosedStoreIdx[iClosedTail] = GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow;
104
105 if (shouldStoreTail) {
106 const uint32_t idx = smem.tailStoreBase + iStoredTail + 1;
107 smem.tailsClosedStoreIdx[iClosedTail] = idx;
109 hipTails[idx] = {0, 0, (uint16_t)iPadHandle,
110 (uint16_t)acc.activeHIPTail.start, (uint16_t)acc.activeHIPTail.end,
111 0.f, 0.f};
112 }
113 }
114
115 acc.tailFilterCharge = 0;
116 acc.activeHIPTail.Reset();
117 }
118
119 GPUbarrier();
120 }
121
122 // TODO: performance improvement -> parallelize this loop across tails
123 for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) {
124 const auto tailPad = smem.tailsClosedPad[iTail];
125 const auto tail = smem.tailsClosed[iTail];
126 const uint32_t tailStoreIdx = smem.tailsClosedStoreIdx[iTail];
127
128 Charge qTot = 0.f;
129 Charge qMax = 0.f;
130 for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) {
131 const int16_t time = tail.start + iTime;
132 auto pos = basePos.delta({tailPad, time});
133 const Charge q = chargeMap[pos].unpack();
134 qTot += q;
135 qMax = CAMath::Max(qMax, q);
136 chargeMap[pos] = PackedCharge{0};
137 }
138
139 smem.tailQTotScratch[iThread] = qTot;
140 smem.tailQMaxScratch[iThread] = qMax;
141 GPUbarrier();
142 for (uint16_t active = nThreads; active > 1;) {
143 const uint16_t stride = (active + 1) / 2;
144 if (iThread < active - stride) {
145 smem.tailQTotScratch[iThread] += smem.tailQTotScratch[iThread + stride];
146 smem.tailQMaxScratch[iThread] = CAMath::Max(smem.tailQMaxScratch[iThread], smem.tailQMaxScratch[iThread + stride]);
147 }
148 active = stride;
149 GPUbarrier();
150 }
151
152 if (iThread == 0 && tailStoreIdx < GPUTPCCFHIPTailConnector::MaxHIPTailsPerRow) {
153 HIPTailDescriptor& tailDescriptor = hipTails[tailStoreIdx];
154 tailDescriptor.qTot = smem.tailQTotScratch[0];
155 tailDescriptor.qMax = smem.tailQMaxScratch[0];
156 }
157 }
158
159 return nClosedTails;
160}
161
162template <bool CheckHIPTrigger, bool CheckHIPTailEnd>
163static GPUdi() void ScanCachedCharges(Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad, Charge hipTailThreshold, Charge hipTailFilterAlpha, Kernel::PadChargeAccu& acc)
164{
165 for (int32_t i = 0; i < Kernel::NumOfCachedTBs; i++) {
166 const Charge qs = smem.charges[i][pad];
167 const int16_t curTB = timeOffset + i;
168
169 acc.totalCharges += qs > 0;
170 acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0;
171 acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges);
172 acc.maxCharge = CAMath::Max<Charge>(qs, acc.maxCharge);
173
174 if (qs >= hipTailThreshold) {
175 if (acc.aboveThresholdStart < 0) {
176 acc.aboveThresholdStart = curTB;
177 }
178 } else {
179 acc.aboveThresholdStart = -1;
180 }
181
182 if constexpr (CheckHIPTrigger) {
183 if (acc.HIPtb < 0 && qs >= Charge(Kernel::MaxADC)) {
184 acc.HIPtb = acc.aboveThresholdStart; // start of rising edge, not first sat TB
185 smem.tails[pad] = {acc.HIPtb, 0}; // Broadcast HIP start TB to neighboring pads / threads
186 }
187 }
188
189 if constexpr (CheckHIPTailEnd) {
190 if (acc.activeHIPTail.IsOpen()) {
191 acc.tailFilterCharge = UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha);
192 if (acc.tailFilterCharge < hipTailThreshold) {
193 acc.activeHIPTail.end = curTB;
194 }
195 }
196 }
197 }
198}
199
200template <>
201GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
202{
203#ifdef GPUCA_GPUCODE
204 CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
205#else
206 CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
207#endif
208}
209
210// Charges are stored in a 2D array (pad and time) using a tiling layout.
211// Tiles are 8 pads x 4 timebins large stored in time-major layout and make up a single cacheline.
212//
213// This kernel processes one row per block. Threads cooperatively load chunks
214// of 4 consecutive time bins for all pads into shared memory. Thread `i` then processes charges for pad `i` in shared memory.
215// Blocks require `nextMultipleOf<64>(138 * 4) = 576` threads to process the largest TPC rows with 138 pads correctly.
216GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
217{
218#ifdef GPUCA_GPUCODE
219 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) == 576);
220 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
221 return;
222 }
223
224 const CfFragment& fragment = clusterer.mPmemory->fragment;
225 const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter;
226 const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold;
227 const Charge hipTailFilterAlpha = clusterer.Param().rec.tpc.hipTailFilterAlpha;
228 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
229
230 constexpr GPUTPCGeometry geo;
231
232 const auto iRow = iBlock;
233 const auto nPads = geo.NPads(iRow);
234 const CfChargePos basePos{(Row)iRow, 0, 0};
235
236 PadChargeAccu acc;
237
238 const int16_t iPadOffset = iThread % MaxNPadsPerRow;
239 const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
240 const int16_t iPadHandle = iThread;
241 const bool handlePad = iPadHandle < nPads;
242
243 if (iPadHandle < MaxNPadsPerRow) {
244 smem.tails[iPadHandle] = {-1, -1};
245 }
246 GPUbarrier();
247
248 // Pad filter scans the entire fragments including overlap.
249 // Minimal runtime overhead and prevents headaches later on as
250 // saturated signal in overlap region can create tails in the next fragment
251 // even when cleared in current fragment as they're decoded twice
252 const TPCFragmentTime firstTB = 0;
253 const TPCFragmentTime lastTB = fragment.length;
254
255 for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) {
256
257 bool thisThreadHasTrigger = false;
258 for (uint16_t tt = 0; tt < NumOfCachedTBs; tt += TimebinsPerCacheline) {
259 const TPCFragmentTime iTimeLoad = t + tt + iTimeOffset;
260
261 const CfChargePos pos = basePos.delta({iPadOffset, iTimeLoad});
262
263 const Charge ql = iTimeLoad < lastTB && iPadOffset < nPads ? chargeMap[pos].unpack() : 0;
264 smem.charges[tt + iTimeOffset][iPadOffset] = ql;
265
266 thisThreadHasTrigger |= ql >= Charge(MaxADC);
267 }
268
269 bool hasHIPTrigger = false;
270 if (hipFilterOn) {
271 hasHIPTrigger = work_group_any(thisThreadHasTrigger);
272 } else {
273 // Need a barrier here even if HIP filter is disabled
274 GPUbarrier();
275 }
276
277 acc.HIPtb = -1;
278
279 if (handlePad) {
280
281 // TODO: is this really necessary?
282 // Why is the old version so much slower, when we just add short branches to the loop???
283 if (!hasHIPTrigger) [[likely]] {
284 if (!acc.activeHIPTail.IsOpen()) {
285 ScanCachedCharges<false, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
286 } else {
287 ScanCachedCharges<false, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
288 }
289 } else {
290 if (!acc.activeHIPTail.IsOpen()) {
291 ScanCachedCharges<true, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
292 } else {
293 ScanCachedCharges<true, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
294 }
295 }
296 }
297
298 GPUbarrier();
299
300 if (hasHIPTrigger) [[unlikely]] {
301
302 DPRINTB("%d: Trigger!\n", iBlock);
303
304 if (handlePad && acc.HIPtb < 0) {
305
306 // Search neighboring pads for trigger
307 for (int16_t i = -SSClusterPadWidth; i < 0; i++) {
308 const auto p = iPadHandle + i;
309 if (p > -1) {
310 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
311 }
312 }
313
314 for (int16_t i = 1; i <= SSClusterPadWidth; i++) {
315 const auto p = iPadHandle + i;
316 if (p < MaxNPadsPerRow) {
317 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
318 }
319 }
320 }
321
322 bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue();
323 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
324 DPRINT("%d: end = %d\n", iThread, acc.HIPtb);
325 acc.activeHIPTail.end = acc.HIPtb;
326 }
327
328 CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
329
330 GPUbarrier();
331
332 if (acc.HIPtb > -1) {
333 DPRINT("%d: start = %d\n", iThread, acc.HIPtb);
334 acc.activeHIPTail.SetOpen(acc.HIPtb);
335 acc.tailFilterCharge = Charge(MaxADC);
336 }
337
338 // Clear smem between iterations to prevent stale entries
339 if (handlePad) {
340 smem.tails[iPadHandle].Reset();
341 }
342
343 GPUbarrier();
344
345 } // if (hipTriggerFound)
346
347 } // for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs)
348
349 if (handlePad) {
350 updatePadBaseline(basePos.gpad + iPadHandle, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge);
351 }
352
353 // --- Close remaining tails
354 const bool shouldCloseTail = acc.activeHIPTail.HasValue();
355
356 // Call `work_group_any` here, instead of always counting.
357 // This is important as `work_group_count` is a lot slower
358 // and has a lot of overhead if no HIPs were found.
359 if (work_group_any(shouldCloseTail)) {
360 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
361 acc.activeHIPTail.end = lastTB;
362 }
363
364 [[maybe_unused]] const uint16_t nClosedTails = CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
365
366 DPRINTB_IF(nClosedTails > 0, "%d: Close remaining tails (%d)\n", iBlock, nClosedTails);
367 }
368
369#endif
370}
371
372GPUd() void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
373{
374#ifndef GPUCA_GPUCODE
375 const CfFragment& fragment = clusterer.mPmemory->fragment;
376 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
377
378 CfChargePos basePos(iBlock * PadsPerCacheline, 0);
379
380 constexpr GPUTPCGeometry geo;
381 if (basePos.pad() >= geo.NPads(basePos.row())) {
382 return;
383 }
384
385 constexpr size_t ElemsInTileRow = (size_t)TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
386
387 using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
388 using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
389
390 UShort8 totalCharges{Vc::Zero};
391 UShort8 consecCharges{Vc::Zero};
392 UShort8 maxConsecCharges{Vc::Zero};
393 Charge8 maxCharge{Vc::Zero};
394
395 tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
396
397 // Access packed charges as raw integers. We throw away the PackedCharge type here to simplify vectorization.
398 const uint16_t* packedChargeStart = reinterpret_cast<uint16_t*>(&chargeMap[basePos.delta({0, t})]);
399
400 for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
401 for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
402 const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
403 const UShort8::mask_type isCharge = packedCharges != 0;
404
405 if (isCharge.isNotEmpty()) {
406 totalCharges(isCharge)++;
407 consecCharges += 1;
408 consecCharges(not isCharge) = 0;
409 maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
410
411 // Manually unpack charges to float.
412 // Duplicated from PackedCharge::unpack to generate vectorized code:
413 // Charge unpack() const { return Charge(mVal & ChargeMask) / Charge(1 << DecimalBits); }
414 // Note that PackedCharge has to cut off the highest 2 bits via ChargeMask as they are used for flags by the cluster finder
415 // and are not part of the charge value. We can skip this step because the cluster finder hasn't run yet
416 // and thus these bits are guarenteed to be zero.
417 const Charge8 unpackedCharges = Charge8(packedCharges) / Charge(1 << PackedCharge::DecimalBits);
418 maxCharge = Vc::max(maxCharge, unpackedCharges);
419 } else {
420 consecCharges = 0;
421 }
422 }
423
424 packedChargeStart += ElemsInTileRow;
425 }
426
427 for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
428 updatePadBaseline(basePos.gpad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad], maxCharge[localpad]);
429 }
430#endif
431}
432
433GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int32_t pad, const GPUTPCClusterFinder& clusterer, int32_t totalCharges, int32_t consecCharges, Charge maxCharge)
434{
435 const CfFragment& fragment = clusterer.mPmemory->fragment;
436 const int32_t totalChargesBaseline = clusterer.Param().rec.tpc.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
437 const int32_t consecChargesBaseline = clusterer.Param().rec.tpc.maxConsecTimeBinAboveThreshold;
438 const uint16_t saturationThreshold = clusterer.Param().rec.tpc.noisyPadSaturationThreshold;
439 const bool isNoisy = (!saturationThreshold || maxCharge < saturationThreshold) && ((totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline));
440
441 if (isNoisy) {
442 clusterer.mPpadIsNoisy[pad] = true;
443 }
444}
445
446// ======== HIP Tail Connector Kernel ========
447
448template <>
449GPUd() void GPUTPCCFHIPTailConnector::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
450{
451 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
452 return;
453 }
454 const uint32_t row = iBlock;
455
456 const uint32_t nTails = CAMath::Min(clusterer.mPnHIPTails[row], (uint32_t)MaxHIPTailsPerRow - 1);
457
458 // HIP TAILS: indexing starts at 1, so 0 index indicates no connection
459 HIPTailDescriptor* tails = GetHIPTails(clusterer, row);
460
461#ifdef GPUCA_DETERMINISTIC_MODE
462 // Races in tail comparisons and atomic swap can lead to slightly different clusters.
463 // So need a sequential fallback for deterministic mode
464 if (iThread > 0) {
465 return;
466 }
467 nThreads = 1;
468 GPUCommonAlgorithm::sortInBlock(tails + 1, tails + nTails + 1, [](auto&& t1, auto&& t2) {
469 if (t1.pad != t2.pad) {
470 return t1.pad < t2.pad;
471 }
472 return t1.tailStart < t2.tailStart;
473 });
474#endif
475
476 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
477 auto* tail = &tails[iTail];
478
479 // TODO: this is needed because tailStarts may vary due to rising edge
480 // Better approach would be to also track the triggered timebin and match that instead
481 uint16_t overlapWindowStart = tail->tailStart >= 5 ? tail->tailStart - 5 : 0;
482 uint16_t overlapWindowEnd = tail->tailStart + 5;
483
484 for (uint32_t jTail = iTail + 1; jTail <= nTails; jTail++) {
485 auto* tailNext = &tails[jTail];
486 if (tailNext->iPrev > 0) {
487 continue;
488 }
489
490 const bool overlapPad = tailNext->pad >= tail->pad - GPUTPCCFCheckPadBaseline::SSClusterPadWidth && tailNext->pad <= tail->pad + GPUTPCCFCheckPadBaseline::SSClusterPadWidth;
491 const bool overlapTime = tailNext->tailStart >= overlapWindowStart && tailNext->tailStart < overlapWindowEnd;
492
493 if (overlapPad && overlapTime) {
494 if (CAMath::AtomicCAS(&tailNext->iPrev, 0u, iTail)) {
495 tail->iNext = jTail;
496 break;
497 }
498 }
499 }
500 }
501}
502
503// ======== HIP Clusterizer Kernel ========
504
505template <>
506GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
507{
508 if (iBlock >= (int32_t)GPUTPCGeometry::NROWS) {
509 return;
510 }
511
512 const uint32_t row = iBlock;
513 uint32_t nTails = clusterer.mPnHIPTails[row];
514 nTails = CAMath::Min(nTails, (uint32_t)MaxHIPTailsPerRow - 1);
515
516 HIPTailDescriptor* tails = GetHIPTails(clusterer, row);
517 const auto& fragment = clusterer.mPmemory->fragment;
518
519 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
520
521 auto* tail = &tails[iTail];
522
523 if (tail->iPrev != 0) {
524 continue;
525 }
526
527 float qTot = tail->qTot;
528 float qMax = tail->qMax;
529 const float firstWeight = tail->qTot;
530 const float firstPad = tail->pad;
531 const float firstTime = HIPTailTimeMean(*tail);
532 float padSum = firstWeight * firstPad;
533 float padSqSum = firstWeight * firstPad * firstPad;
534 float timeSum = firstWeight * firstTime;
535
536 uint32_t tailStart = tail->tailStart;
537 uint32_t tailEnd = tail->tailEnd;
538
539 while (tail->iNext != 0) {
540
541 tail = &tails[tail->iNext];
542
543 const float tailWeight = tail->qTot;
544 const float tailPad = tail->pad;
545 const float tailTime = HIPTailTimeMean(*tail);
546 qMax = CAMath::Max(qMax, tail->qMax);
547 qTot += tail->qTot;
548 padSum += tailWeight * tailPad;
549 padSqSum += tailWeight * tailPad * tailPad;
550 timeSum += tailWeight * tailTime;
551 tailStart = CAMath::Min<uint32_t>(tailStart, tail->tailStart);
552 tailEnd = CAMath::Max<uint32_t>(tailEnd, tail->tailEnd);
553 }
554
555 const float weightSum = CAMath::Max(qTot, 1.f);
556 float padMean = padSum / weightSum;
557 float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
558 float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
559
561 cn.qMax = qMax;
562 cn.setSaturatedQtot(qTot);
563 cn.setSaturatedTailLength(tailEnd - tailStart);
564 float clusterTime = fragment.start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
565 cn.setTimeFlags(clusterTime, 0);
566 cn.setPad(padMean);
567 cn.setSigmaPad(padSigma);
568
569 if (cn.qMax >= 1023) {
570 // Cut off clusters where the tail connection failed for some reason
571 // TODO: Deduplicate with GPUTPCCFClusterizer::sortIntoBuckets (can't call cross-kernel).
572 // TODO: Add error reporting for row cluster overflow.
573 uint32_t index = CAMath::AtomicAdd(&clusterer.mPclusterInRow[row], 1u);
574 if (index < clusterer.mNMaxClusterPerRow) {
575 clusterer.mPclusterByRow[clusterer.mNMaxClusterPerRow * row + index] = cn;
576 }
577 }
578 }
579}
Class of a TPC cluster in TPC-native coordinates (row, time)
int16_t time
Definition RawEventData.h:4
int32_t i
#define GPUbarrier()
#define GPUCA_GET_THREAD_COUNT(...)
#define DPRINT(...)
#define DPRINTB(...)
#define DPRINTB_IF(test,...)
GPUd() void GPUTPCCFCheckPadBaseline
static Charge charge
static int32_t row
uint16_t pos
Definition RawData.h:3
Provides a basic fallback implementation for Vc.
static constexpr uint32_t NROWS
GLfloat GLfloat GLfloat alpha
Definition glcorearb.h:279
GLuint index
Definition glcorearb.h:781
GLuint GLsizei GLsizei * length
Definition glcorearb.h:790
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLint GLenum GLboolean GLsizei stride
Definition glcorearb.h:867
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
Definition glcorearb.h:5034
GPUdi() o2
Definition TrackTRD.h:39
tpccf::TPCFragmentTime length
Definition CfFragment.h:33
tpccf::TPCTime start
Definition CfFragment.h:31