28#define DPRINT(...) printf(__VA_ARGS__)
32#define DPRINTB_IF(test, ...) \
33 if (iThread == 0 && (test)) \
36#define DPRINT(...) ((void)0)
37#define DPRINTB(...) ((void)0)
38#define DPRINTB_IF(test, ...) ((void)0)
54 return filteredCharge +
alpha * (
charge - filteredCharge);
59 const float length = tail.
tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
60 return tail.tailStart + 0.5f * (
length - 1.f);
65 const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
72static GPUdi() uint16_t CloseHIPTails(
73 Kernel::GPUSharedMemory& smem,
75 int32_t iThread, int32_t nThreads,
79 Kernel::PadChargeAccu& acc,
82 const uint32_t
row = basePos.row();
83 const uint16_t nClosedTails = work_group_count(shouldCloseTail);
85 auto* nHIPTails = clusterer.mPnHIPTails;
86 auto* hipTails = GetHIPTails(clusterer,
row);
88 if (nClosedTails > 0) {
89 int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1;
90 const bool shouldStoreTail = shouldCloseTail && acc.activeHIPTail.Length() > 0;
91 uint16_t nStoredTails = work_group_count(shouldStoreTail);
92 int16_t iStoredTail = work_group_scan_inclusive_add((int16_t)shouldStoreTail) - 1;
96 if (nStoredTails > 0) {
98 smem.tailStoreBase = CAMath::AtomicAdd(&nHIPTails[
row], (uint32_t)nStoredTails);
102 if (shouldCloseTail) {
103 smem.tailsClosedPad[iClosedTail] = iPadHandle;
104 smem.tailsClosed[iClosedTail] = acc.activeHIPTail;
107 if (shouldStoreTail) {
108 const uint32_t
idx = smem.tailStoreBase + iStoredTail + 1;
109 smem.tailsClosedStoreIdx[iClosedTail] =
idx;
111 hipTails[
idx] = {0, 0, (uint16_t)iPadHandle,
112 (uint16_t)acc.activeHIPTail.start, (uint16_t)acc.activeHIPTail.end,
117 acc.tailFilterCharge = 0;
118 acc.activeHIPTail.Reset();
125 for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) {
126 const auto tailPad = smem.tailsClosedPad[iTail];
127 const auto tail = smem.tailsClosed[iTail];
128 const uint32_t tailStoreIdx = smem.tailsClosedStoreIdx[iTail];
132 for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) {
133 const int16_t
time = tail.start + iTime;
134 auto pos = basePos.delta({tailPad,
time});
135 const Charge q = chargeMap[
pos].unpack();
137 qMax = CAMath::Max(qMax, q);
141 smem.tailQTotScratch[iThread] = qTot;
142 smem.tailQMaxScratch[iThread] = qMax;
147 smem.tailQTotScratch[iThread] += smem.tailQTotScratch[iThread +
stride];
148 smem.tailQMaxScratch[iThread] = CAMath::Max(smem.tailQMaxScratch[iThread], smem.tailQMaxScratch[iThread +
stride]);
156 tailDescriptor.
qTot = smem.tailQTotScratch[0];
157 tailDescriptor.
qMax = smem.tailQMaxScratch[0];
164template <
bool CheckHIPTrigger,
bool CheckHIPTailEnd>
165static GPUdi()
void ScanCachedCharges(
Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad,
Charge hipTailThreshold,
Charge hipTailFilterAlpha,
Kernel::PadChargeAccu& acc)
168 const Charge qs = smem.charges[
i][pad];
169 const int16_t curTB = timeOffset +
i;
171 acc.totalCharges += qs > 0;
172 acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0;
173 acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges);
174 acc.maxCharge = CAMath::Max<Charge>(qs, acc.maxCharge);
176 if (qs >= hipTailThreshold) {
177 if (acc.aboveThresholdStart < 0) {
178 acc.aboveThresholdStart = curTB;
181 acc.aboveThresholdStart = -1;
184 if constexpr (CheckHIPTrigger) {
186 acc.HIPtb = acc.aboveThresholdStart;
187 smem.tails[pad] = {acc.HIPtb, 0};
191 if constexpr (CheckHIPTailEnd) {
192 if (acc.activeHIPTail.IsOpen()) {
193 acc.tailFilterCharge = UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha);
194 if (acc.tailFilterCharge < hipTailThreshold) {
195 acc.activeHIPTail.end = curTB;
206 CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
208 CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
218GPUd()
void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
226 const CfFragment& fragment = clusterer.mPmemory->fragment;
227 const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter;
228 const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold;
229 const Charge hipTailFilterAlpha = clusterer.Param().rec.tpc.hipTailFilterAlpha;
234 const auto iRow = iBlock;
235 const auto nPads = geo.NPads(iRow);
240 const int16_t iPadOffset = iThread % MaxNPadsPerRow;
241 const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
242 const int16_t iPadHandle = iThread;
243 const bool handlePad = iPadHandle < nPads;
245 if (iPadHandle < MaxNPadsPerRow) {
246 smem.tails[iPadHandle] = {-1, -1};
257 for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) {
259 bool thisThreadHasTrigger =
false;
260 for (uint16_t
tt = 0;
tt < NumOfCachedTBs;
tt += TimebinsPerCacheline) {
265 const Charge ql = iTimeLoad < lastTB && iPadOffset < nPads ? chargeMap[
pos].unpack() : 0;
266 smem.charges[
tt + iTimeOffset][iPadOffset] = ql;
268 thisThreadHasTrigger |= ql >=
Charge(MaxADC);
271 bool hasHIPTrigger =
false;
273 hasHIPTrigger = work_group_any(thisThreadHasTrigger);
285 if (!hasHIPTrigger) [[likely]] {
286 if (!acc.activeHIPTail.IsOpen()) {
287 ScanCachedCharges<false, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
289 ScanCachedCharges<false, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
292 if (!acc.activeHIPTail.IsOpen()) {
293 ScanCachedCharges<true, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
295 ScanCachedCharges<true, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
302 if (hasHIPTrigger) [[unlikely]] {
304 DPRINTB(
"%d: Trigger!\n", iBlock);
306 if (handlePad && acc.HIPtb < 0) {
309 for (int16_t
i = -SSClusterPadWidth;
i < 0;
i++) {
310 const auto p = iPadHandle +
i;
312 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
316 for (int16_t
i = 1;
i <= SSClusterPadWidth;
i++) {
317 const auto p = iPadHandle +
i;
318 if (p < MaxNPadsPerRow) {
319 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
324 bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue();
325 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
326 DPRINT(
"%d: end = %d\n", iThread, acc.HIPtb);
327 acc.activeHIPTail.end = acc.HIPtb;
330 CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
334 if (acc.HIPtb > -1) {
335 DPRINT(
"%d: start = %d\n", iThread, acc.HIPtb);
336 acc.activeHIPTail.SetOpen(acc.HIPtb);
337 acc.tailFilterCharge =
Charge(MaxADC);
342 smem.tails[iPadHandle].Reset();
352 updatePadBaseline(basePos.gpad + iPadHandle, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge);
356 const bool shouldCloseTail = acc.activeHIPTail.HasValue();
361 if (work_group_any(shouldCloseTail)) {
362 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
363 acc.activeHIPTail.end = lastTB;
366 [[maybe_unused]]
const uint16_t nClosedTails = CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
368 DPRINTB_IF(nClosedTails > 0,
"%d: Close remaining tails (%d)\n", iBlock, nClosedTails);
374GPUd()
void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
377 const CfFragment& fragment = clusterer.mPmemory->fragment;
383 if (basePos.pad() >= geo.NPads(basePos.row())) {
387 constexpr size_t ElemsInTileRow = (size_t)
TilingLayout<
GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
389 using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
390 using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
392 UShort8 totalCharges{Vc::Zero};
393 UShort8 consecCharges{Vc::Zero};
394 UShort8 maxConsecCharges{Vc::Zero};
395 Charge8 maxCharge{Vc::Zero};
400 const uint16_t* packedChargeStart =
reinterpret_cast<uint16_t*
>(&chargeMap[basePos.delta({0, t})]);
402 for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
404 const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
405 const UShort8::mask_type isCharge = packedCharges != 0;
407 if (isCharge.isNotEmpty()) {
408 totalCharges(isCharge)++;
410 consecCharges(not isCharge) = 0;
411 maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
420 maxCharge = Vc::max(maxCharge, unpackedCharges);
426 packedChargeStart += ElemsInTileRow;
429 for (
tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
430 updatePadBaseline(basePos.gpad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad], maxCharge[localpad]);
437 const CfFragment& fragment = clusterer.mPmemory->fragment;
438 const int32_t totalChargesBaseline = clusterer.Param().rec.tpc.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
439 const int32_t consecChargesBaseline = clusterer.Param().rec.tpc.maxConsecTimeBinAboveThreshold;
440 const uint16_t saturationThreshold = clusterer.Param().rec.tpc.noisyPadSaturationThreshold;
441 const bool isNoisy = (!saturationThreshold || maxCharge < saturationThreshold) && ((totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline));
444 clusterer.mPpadIsNoisy[pad] =
true;
456 const uint32_t
row = iBlock;
458 const uint32_t nTails = CAMath::Min(clusterer.mPnHIPTails[
row], (uint32_t)MaxHIPTailsPerRow - 1);
463#ifdef GPUCA_DETERMINISTIC_MODE
470 GPUCommonAlgorithm::sortInBlock(tails + 1, tails + nTails + 1, [](
auto&&
t1,
auto&& t2) {
471 if (
t1.pad != t2.pad) {
472 return t1.pad < t2.pad;
478 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
479 auto* tail = &tails[iTail];
483 uint16_t overlapWindowStart = tail->
tailStart >= 5 ? tail->tailStart - 5 : 0;
484 uint16_t overlapWindowEnd = tail->tailStart + 5;
486 for (uint32_t jTail = iTail + 1; jTail <= nTails; jTail++) {
487 auto* tailNext = &tails[jTail];
488 if (tailNext->iPrev > 0) {
493 const bool overlapTime = tailNext->tailStart >= overlapWindowStart && tailNext->tailStart < overlapWindowEnd;
495 if (overlapPad && overlapTime) {
496 if (CAMath::AtomicCAS(&tailNext->iPrev, 0u, iTail)) {
508GPUd()
void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t onlyMC)
514 const uint32_t
row = iBlock;
515 uint32_t nTails = clusterer.mPnHIPTails[
row];
516 nTails = CAMath::Min(nTails, (uint32_t)MaxHIPTailsPerRow - 1);
518 const auto* tails = GetHIPTails(clusterer,
row);
519 const auto& fragment = clusterer.mPmemory->fragment;
521 auto* clusterPosInRow = clusterer.mPhipClusterPosInRow
522 ? clusterer.mPhipClusterPosInRow +
row * MaxHIPTailsPerRow
525 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
527 const auto* tail = &tails[iTail];
528 if (tail->iPrev != 0) {
539 uint32_t tailStart = (uint32_t)-1;
540 uint32_t tailEnd = 0;
543 for (; tail != tails; tail = &tails[tail->iNext]) {
544 const float tailWeight = tail->qTot;
545 const float tailPad = tail->pad;
546 const float tailTime = HIPTailTimeMean(*tail);
547 qMax = CAMath::Max(qMax, tail->qMax);
549 padSum += tailWeight * tailPad;
550 padSqSum += tailWeight * tailPad * tailPad;
551 timeSum += tailWeight * tailTime;
552 tailStart = CAMath::Min<uint32_t>(tailStart, tail->tailStart);
553 tailEnd = CAMath::Max<uint32_t>(tailEnd, tail->tailEnd);
555 CPU_ONLY(labelAcc.collectTail(
row, tail->pad, tail->tailStart, tail->tailEnd));
558 const float weightSum = CAMath::Max(qTot, 1.f);
559 const float padMean = padSum / weightSum;
560 const float timeMean = timeSum / weightSum;
561 const float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
565 cn.setSaturatedQtot(qTot);
566 cn.setSaturatedTailLength(tailEnd - tailStart);
567 float clusterTime = fragment.
start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
568 cn.setTimeFlags(clusterTime, 0);
570 cn.setSigmaPad(padSigma);
572 if (cn.
qMax >= 1023) {
580 index = CAMath::AtomicAdd(&clusterer.mPclusterInRow[
row], 1u);
581 if (
index < clusterer.mNMaxClusterPerRow) {
582 clusterer.mPclusterByRow[clusterer.mNMaxClusterPerRow *
row +
index] = cn;
584 if (clusterPosInRow) {
585 clusterPosInRow[iTail] =
index;
588 index = clusterPosInRow[iTail];
Class of a TPC cluster in TPC-native coordinates (row, time)
#define GPUCA_GET_THREAD_COUNT(...)
#define DPRINTB_IF(test,...)
GPUd() void GPUTPCCFCheckPadBaseline
Provides a basic fallback implementation for Vc.
static constexpr uint32_t NROWS
GLfloat GLfloat GLfloat alpha
GLuint GLsizei GLsizei * length
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLint GLenum GLboolean GLsizei stride
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
tpccf::TPCFragmentTime length