// Debug-print helper macros. Two alternative sets of definitions are visible; the
// preprocessor conditional that selects between them is not part of this excerpt.
//  - Printing set: DPRINT forwards its arguments to printf; DPRINTB_IF additionally
//    requires iThread == 0 and `test` to hold (the rest of that macro is not visible).
//  - Disabled set: DPRINT, DPRINTB and DPRINTB_IF all expand to the no-op ((void)0).
26#define DPRINT(...) printf(__VA_ARGS__)
30#define DPRINTB_IF(test, ...) \
31 if (iThread == 0 && (test)) \
34#define DPRINT(...) ((void)0)
35#define DPRINTB(...) ((void)0)
36#define DPRINTB_IF(test, ...) ((void)0)
// Moves the filtered value towards `charge` by the fraction `alpha` of their difference:
//   result = filteredCharge + alpha * (charge - filteredCharge)
// NOTE(review): the enclosing signature is not visible in this excerpt; the call
// UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha) elsewhere in this
// file suggests this is that function's return statement - confirm.
52 return filteredCharge +
alpha * (
charge - filteredCharge);
// NOTE(review): the signatures of the helpers these statements belong to are not visible
// in this excerpt; HIPTailTimeMean(*tail) is called elsewhere in this file and presumably
// corresponds to the first fragment - confirm.
// `length` is tailEnd - tailStart as a float, falling back to 1 when tailEnd <= tailStart.
57 const float length = tail.
tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
// Returns tailStart + (length - 1) / 2, i.e. tailStart itself when length is 1.
58 return tail.tailStart + 0.5f * (
length - 1.f);
// A second helper repeats the same length computation; the rest of its body is not visible.
63 const float length = tail.tailEnd > tail.tailStart ? float(tail.tailEnd - tail.tailStart) : 1.f;
// Closes the HIP tails of all threads that pass shouldCloseTail == true, stores a
// descriptor for every closed tail with non-zero length in the per-row tail list, and
// then fills in the total and maximum charge of each closed tail.
// Declared to return uint16_t; the return statement is not visible in this excerpt.
// NOTE(review): several parameters and statements of this function are missing from the
// excerpt (the use of iPadHandle, chargeMap, shouldCloseTail below shows they are
// parameters); comments describe only what the visible lines do.
70static GPUdi() uint16_t CloseHIPTails(
71 Kernel::GPUSharedMemory& smem,
73 int32_t iThread, int32_t nThreads,
77 Kernel::PadChargeAccu& acc,
80 const uint32_t
row = basePos.row();
// Number of threads that want to close a tail (presumably counted over the whole work group).
81 const uint16_t nClosedTails = work_group_count(shouldCloseTail);
// Per-row tail counter array and the tail descriptor list for this row.
83 auto* nHIPTails = clusterer.mPnHIPTails;
84 auto* hipTails = GetHIPTails(clusterer,
row);
86 if (nClosedTails > 0) {
// Inclusive scan minus one: 0-based slot of this thread among the closing threads.
// It is used below to index smem.tailsClosedPad / tailsClosed / tailsClosedStoreIdx.
87 int16_t iClosedTail = work_group_scan_inclusive_add((int16_t)shouldCloseTail) - 1;
// Only tails with a non-zero length get a descriptor in the row's tail list.
88 const bool shouldStoreTail = shouldCloseTail && acc.activeHIPTail.Length() > 0;
89 uint16_t nStoredTails = work_group_count(shouldStoreTail);
90 int16_t iStoredTail = work_group_scan_inclusive_add((int16_t)shouldStoreTail) - 1;
94 if (nStoredTails > 0) {
// Reserve nStoredTails entries in the row counter with a single atomic add and keep the
// result in smem as the base index (which thread executes this is not visible here).
96 smem.tailStoreBase = CAMath::AtomicAdd(&nHIPTails[
row], (uint32_t)nStoredTails);
100 if (shouldCloseTail) {
// Publish pad and tail range of this thread so every thread can read them further down.
101 smem.tailsClosedPad[iClosedTail] = iPadHandle;
102 smem.tailsClosed[iClosedTail] = acc.activeHIPTail;
105 if (shouldStoreTail) {
// The +1 leaves index 0 unused; elsewhere in this file the iPrev / iNext link fields
// are compared against 0, so 0 appears to serve as the "no tail" value.
106 const uint32_t
idx = smem.tailStoreBase + iStoredTail + 1;
107 smem.tailsClosedStoreIdx[iClosedTail] =
idx;
// Descriptor starts with two zeros, then pad, tail start and tail end; the remaining
// initializer fields are not visible in this excerpt.
109 hipTails[
idx] = {0, 0, (uint16_t)iPadHandle,
110 (uint16_t)acc.activeHIPTail.start, (uint16_t)acc.activeHIPTail.end,
// Clear the per-thread tail state after closing.
115 acc.tailFilterCharge = 0;
116 acc.activeHIPTail.Reset();
// Second phase: for every closed tail, all threads cooperate to scan its time range.
123 for (uint16_t iTail = 0; iTail < nClosedTails; iTail++) {
124 const auto tailPad = smem.tailsClosedPad[iTail];
125 const auto tail = smem.tailsClosed[iTail];
126 const uint32_t tailStoreIdx = smem.tailsClosedStoreIdx[iTail];
// Thread iThread handles offsets iThread, iThread + nThreads, ... within the tail.
130 for (uint16_t iTime = iThread; iTime < tail.Length(); iTime += nThreads) {
131 const int16_t
time = tail.start + iTime;
// Charge at (tailPad, time) relative to basePos, read from the charge map.
132 auto pos = basePos.delta({tailPad,
time});
133 const Charge q = chargeMap[
pos].unpack();
// Running maximum; the qTot accumulation line is not visible in this excerpt.
135 qMax = CAMath::Max(qMax, q);
// Per-thread partial results go into smem scratch arrays for the reduction.
139 smem.tailQTotScratch[iThread] = qTot;
140 smem.tailQMaxScratch[iThread] = qMax;
// Reduction step combining element iThread with element iThread + stride: sum for
// qTot, maximum for qMax (the loop over `stride` is not visible in this excerpt).
145 smem.tailQTotScratch[iThread] += smem.tailQTotScratch[iThread +
stride];
146 smem.tailQMaxScratch[iThread] = CAMath::Max(smem.tailQMaxScratch[iThread], smem.tailQMaxScratch[iThread +
stride]);
// Element 0 of the scratch arrays holds the final values written to the descriptor.
154 tailDescriptor.
qTot = smem.tailQTotScratch[0];
155 tailDescriptor.
qMax = smem.tailQMaxScratch[0];
// Scans the charges cached in smem.charges for one pad and updates the per-pad
// accumulator `acc`.
// Template flags select optional work at compile time (via if constexpr):
//   CheckHIPTrigger - also record a HIP trigger time bin in acc.HIPtb / smem.tails[pad]
//   CheckHIPTailEnd - also run the tail filter and set the end of an open tail
// Parameters:
//   smem               - provides the cached charges (smem.charges) and smem.tails
//   timeOffset         - time bin corresponding to cache index 0
//   pad                - pad column to scan
//   hipTailThreshold   - charge threshold for the above-threshold run and the tail end
//   hipTailFilterAlpha - coefficient passed to UpdateHIPTailFilter
//   acc                - accumulator that is read and updated
// NOTE(review): the loop over `i` and some conditions are not visible in this excerpt.
162template <
bool CheckHIPTrigger,
bool CheckHIPTailEnd>
163static GPUdi()
void ScanCachedCharges(
Kernel::GPUSharedMemory& smem, uint16_t timeOffset, uint16_t pad,
Charge hipTailThreshold,
Charge hipTailFilterAlpha,
Kernel::PadChargeAccu& acc)
// Charge at cache index i for this pad and the time bin it corresponds to.
166 const Charge qs = smem.charges[
i][pad];
167 const int16_t curTB = timeOffset +
i;
// Baseline statistics: count of entries with charge > 0, current and longest run of
// consecutive entries with charge > 0, and the largest charge seen.
169 acc.totalCharges += qs > 0;
170 acc.consecCharges = qs > 0 ? acc.consecCharges + 1 : 0;
171 acc.maxConsecCharges = CAMath::Max(acc.consecCharges, acc.maxConsecCharges);
172 acc.maxCharge = CAMath::Max<Charge>(qs, acc.maxCharge);
// Remember the first time bin of the current run with charge >= hipTailThreshold;
// a negative aboveThresholdStart means "no run in progress".
174 if (qs >= hipTailThreshold) {
175 if (acc.aboveThresholdStart < 0) {
176 acc.aboveThresholdStart = curTB;
// Reset of the run start (presumably the else branch; the `else` line is not visible).
179 acc.aboveThresholdStart = -1;
182 if constexpr (CheckHIPTrigger) {
// Record the start of the above-threshold run as the trigger time bin and publish it
// in smem.tails[pad]; the condition guarding this is not visible in this excerpt.
184 acc.HIPtb = acc.aboveThresholdStart;
185 smem.tails[pad] = {acc.HIPtb, 0};
189 if constexpr (CheckHIPTailEnd) {
190 if (acc.activeHIPTail.IsOpen()) {
// Feed the charge into the tail filter; once the filtered value drops below the
// threshold the current time bin is stored as the tail end.
191 acc.tailFilterCharge = UpdateHIPTailFilter(acc.tailFilterCharge, qs, hipTailFilterAlpha);
192 if (acc.tailFilterCharge < hipTailThreshold) {
193 acc.activeHIPTail.end = curTB;
// Forwards the same arguments to either the GPU or the CPU baseline check.
// NOTE(review): the enclosing function and the condition (or preprocessor branch) that
// selects between the two calls are not visible in this excerpt.
204 CheckBaselineGPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
206 CheckBaselineCPU(nBlocks, nThreads, iBlock, iThread, smem, clusterer);
// GPU variant of the pad baseline check. Each block handles one row (iRow = iBlock) and
// each thread scans the pad with index iThread. Charges are loaded into smem.charges in
// windows of NumOfCachedTBs time bins, scanned with ScanCachedCharges, and the per-pad
// statistics are passed to updatePadBaseline. HIP tails are opened when a trigger is
// seen and closed through CloseHIPTails.
// NOTE(review): a number of statements (declarations of geo, basePos, chargeMap, acc,
// firstTB, lastTB, several conditions and closing braces) are not visible in this
// excerpt; comments describe only the visible lines.
216GPUd()
void GPUTPCCFCheckPadBaseline::CheckBaselineGPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
224 const CfFragment& fragment = clusterer.mPmemory->fragment;
// HIP tail filter settings read from the reconstruction parameters.
225 const bool hipFilterOn = clusterer.Param().rec.tpc.hipTailFilter;
226 const Charge hipTailThreshold = clusterer.Param().rec.tpc.hipTailFilterThreshold;
227 const Charge hipTailFilterAlpha = clusterer.Param().rec.tpc.hipTailFilterAlpha;
232 const auto iRow = iBlock;
233 const auto nPads = geo.NPads(iRow);
// The thread index is split into a pad / time offset pair for loading charges, and is
// also used directly as the pad this thread scans.
238 const int16_t iPadOffset = iThread % MaxNPadsPerRow;
239 const int16_t iTimeOffset = iThread / MaxNPadsPerRow;
240 const int16_t iPadHandle = iThread;
241 const bool handlePad = iPadHandle < nPads;
// Initialise this pad's entry in smem.tails to {-1, -1}.
243 if (iPadHandle < MaxNPadsPerRow) {
244 smem.tails[iPadHandle] = {-1, -1};
// Outer loop: one iteration per cached window of NumOfCachedTBs time bins.
255 for (uint16_t t = firstTB; t < lastTB; t += NumOfCachedTBs) {
257 bool thisThreadHasTrigger =
false;
// Inner loop: fill the cache in steps of TimebinsPerCacheline.
258 for (uint16_t
tt = 0;
tt < NumOfCachedTBs;
tt += TimebinsPerCacheline) {
// Loads outside the time range or beyond the row's pads are replaced by 0.
263 const Charge ql = iTimeLoad < lastTB && iPadOffset < nPads ? chargeMap[
pos].unpack() : 0;
264 smem.charges[
tt + iTimeOffset][iPadOffset] = ql;
// A loaded charge of at least Charge(MaxADC) counts as a trigger for this thread.
266 thisThreadHasTrigger |= ql >=
Charge(MaxADC);
269 bool hasHIPTrigger =
false;
// Combine the per-thread flags; the condition guarding this line is not visible here.
271 hasHIPTrigger = work_group_any(thisThreadHasTrigger);
// Select the ScanCachedCharges instantiation. From the visible calls, the first template
// flag appears to be set only when a trigger was seen in this window and the second only
// when this thread has an open tail (the `else` lines are not visible).
283 if (!hasHIPTrigger) [[likely]] {
284 if (!acc.activeHIPTail.IsOpen()) {
285 ScanCachedCharges<false, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
287 ScanCachedCharges<false, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
290 if (!acc.activeHIPTail.IsOpen()) {
291 ScanCachedCharges<true, false>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
293 ScanCachedCharges<true, true>(smem, t, iPadHandle, hipTailThreshold, hipTailFilterAlpha, acc);
300 if (hasHIPTrigger) [[unlikely]] {
302 DPRINTB(
"%d: Trigger!\n", iBlock);
// Pads without a trigger of their own take the largest tail start found among the
// neighbouring pads within SSClusterPadWidth on either side.
304 if (handlePad && acc.HIPtb < 0) {
// Left neighbours (any lower-bound check on p is not visible in this excerpt).
307 for (int16_t
i = -SSClusterPadWidth;
i < 0;
i++) {
308 const auto p = iPadHandle +
i;
310 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
// Right neighbours, limited to indices below MaxNPadsPerRow.
314 for (int16_t
i = 1;
i <= SSClusterPadWidth;
i++) {
315 const auto p = iPadHandle +
i;
316 if (p < MaxNPadsPerRow) {
317 acc.HIPtb = CAMath::Max(smem.tails[p].start, acc.HIPtb);
// A new trigger on a pad that already holds a tail closes that tail; if it is still
// open its end is set to the new trigger time bin.
322 bool shouldCloseTail = acc.HIPtb > -1 && acc.activeHIPTail.HasValue();
323 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
324 DPRINT(
"%d: end = %d\n", iThread, acc.HIPtb);
325 acc.activeHIPTail.end = acc.HIPtb;
328 CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
// Open a new tail at the trigger time bin and seed the filter with Charge(MaxADC).
332 if (acc.HIPtb > -1) {
333 DPRINT(
"%d: start = %d\n", iThread, acc.HIPtb);
334 acc.activeHIPTail.SetOpen(acc.HIPtb);
335 acc.tailFilterCharge =
Charge(MaxADC);
// Reset this pad's entry in smem.tails.
340 smem.tails[iPadHandle].Reset();
// Hand the accumulated statistics of this pad to the noisy-pad decision.
350 updatePadBaseline(basePos.gpad + iPadHandle, clusterer, acc.totalCharges, acc.maxConsecCharges, acc.maxCharge);
// Close whatever tails remain; tails that are still open end at lastTB.
354 const bool shouldCloseTail = acc.activeHIPTail.HasValue();
359 if (work_group_any(shouldCloseTail)) {
360 if (shouldCloseTail && acc.activeHIPTail.IsOpen()) {
361 acc.activeHIPTail.end = lastTB;
// [[maybe_unused]] because the value is only consumed by the DPRINTB_IF debug macro.
364 [[maybe_unused]]
const uint16_t nClosedTails = CloseHIPTails(smem, clusterer, iThread, nThreads, iPadHandle, basePos, chargeMap, acc, shouldCloseTail);
366 DPRINTB_IF(nClosedTails > 0,
"%d: Close remaining tails (%d)\n", iBlock, nClosedTails);
// CPU variant of the pad baseline check. Processes PadsPerCacheline pads at once with Vc
// SIMD vectors, accumulating per-pad statistics over the time bins of the fragment, and
// passes each pad's result to updatePadBaseline.
// NOTE(review): declarations of geo, basePos, chargeMap, t, localtime and
// unpackedCharges as well as several statements are not visible in this excerpt.
372GPUd()
void GPUTPCCFCheckPadBaseline::CheckBaselineCPU(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
375 const CfFragment& fragment = clusterer.mPmemory->fragment;
// Guard for pad positions at or beyond the number of pads in the row (body not visible).
381 if (basePos.pad() >= geo.NPads(basePos.row())) {
// Number of elements by which packedChargeStart is advanced per outer-loop iteration.
385 constexpr size_t ElemsInTileRow = (size_t)
TilingLayout<
GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
// One SIMD lane per pad.
387 using UShort8 = Vc::fixed_size_simd<uint16_t, PadsPerCacheline>;
388 using Charge8 = Vc::fixed_size_simd<float, PadsPerCacheline>;
// Per-lane accumulators, all starting at zero.
390 UShort8 totalCharges{Vc::Zero};
391 UShort8 consecCharges{Vc::Zero};
392 UShort8 maxConsecCharges{Vc::Zero};
393 Charge8 maxCharge{Vc::Zero};
// Raw uint16_t view of the packed charges starting at time bin t.
398 const uint16_t* packedChargeStart =
reinterpret_cast<uint16_t*
>(&chargeMap[basePos.delta({0, t})]);
400 for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
// Load PadsPerCacheline packed charges for one time bin (aligned load) and build a
// mask of lanes whose packed value is non-zero.
402 const UShort8 packedCharges{packedChargeStart + PadsPerCacheline * localtime, Vc::Aligned};
403 const UShort8::mask_type isCharge = packedCharges != 0;
405 if (isCharge.isNotEmpty()) {
406 totalCharges(isCharge)++;
// Lanes without charge restart their consecutive-charge counter.
408 consecCharges(not isCharge) = 0;
409 maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
418 maxCharge = Vc::max(maxCharge, unpackedCharges);
424 packedChargeStart += ElemsInTileRow;
// Report the statistics of every lane (pad) separately.
427 for (
tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
428 updatePadBaseline(basePos.gpad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad], maxCharge[localpad]);
// Noisy-pad decision for a single pad.
// NOTE(review): the enclosing signature is not visible in this excerpt; the variables
// used (pad, totalCharges, consecCharges, maxCharge) match the updatePadBaseline(...)
// calls elsewhere in this file - confirm.
435 const CfFragment& fragment = clusterer.mPmemory->fragment;
// The configured limit is given per 1000 time bins and is scaled to the fragment length
// without overlap (integer arithmetic).
436 const int32_t totalChargesBaseline = clusterer.Param().rec.tpc.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
437 const int32_t consecChargesBaseline = clusterer.Param().rec.tpc.maxConsecTimeBinAboveThreshold;
438 const uint16_t saturationThreshold = clusterer.Param().rec.tpc.noisyPadSaturationThreshold;
// Noisy when (saturationThreshold is 0 or maxCharge is below it) and at least one of the
// two limits is reached; a limit that is not > 0 is ignored.
439 const bool isNoisy = (!saturationThreshold || maxCharge < saturationThreshold) && ((totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline));
// Mark the pad as noisy (the guarding condition is not visible; presumably isNoisy).
442 clusterer.mPpadIsNoisy[pad] =
true;
// Links HIP tails of one row (row = iBlock) by claiming the iPrev field of a later tail
// whose pad and start time overlap with the current one.
// NOTE(review): the enclosing signature, the declaration of `tails`, the computation of
// overlapPad and the statements following the visible conditions are not part of this
// excerpt; the block name used for this edit is a placeholder.
454 const uint32_t
row = iBlock;
// The number of tails is clamped to MaxHIPTailsPerRow - 1; valid entries are 1..nTails.
456 const uint32_t nTails = CAMath::Min(clusterer.mPnHIPTails[
row], (uint32_t)MaxHIPTailsPerRow - 1);
// Only when GPUCA_DETERMINISTIC_MODE is defined: sort entries 1..nTails, comparing by pad
// first (the tie-break part of the comparator is not visible).
461#ifdef GPUCA_DETERMINISTIC_MODE
468 GPUCommonAlgorithm::sortInBlock(tails + 1, tails + nTails + 1, [](
auto&&
t1,
auto&& t2) {
469 if (
t1.pad != t2.pad) {
470 return t1.pad < t2.pad;
// Thread iThread handles tails iThread + 1, iThread + 1 + nThreads, ...
476 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
477 auto* tail = &tails[iTail];
// Time window [tailStart - 5, tailStart + 5); the lower bound is clamped to 0 so the
// unsigned subtraction cannot wrap.
481 uint16_t overlapWindowStart = tail->
tailStart >= 5 ? tail->tailStart - 5 : 0;
482 uint16_t overlapWindowEnd = tail->tailStart + 5;
// Compare against every later tail in the list.
484 for (uint32_t jTail = iTail + 1; jTail <= nTails; jTail++) {
485 auto* tailNext = &tails[jTail];
// A non-zero iPrev means the candidate already has a predecessor (body not visible).
486 if (tailNext->iPrev > 0) {
491 const bool overlapTime = tailNext->tailStart >= overlapWindowStart && tailNext->tailStart < overlapWindowEnd;
493 if (overlapPad && overlapTime) {
// Atomically try to replace iPrev == 0 with iTail.
// NOTE(review): the meaning of AtomicCAS's return value is not visible here - confirm.
494 if (CAMath::AtomicCAS(&tailNext->iPrev, 0u, iTail)) {
// Builds one cluster per chain of linked HIP tails in the row handled by this block
// (row = iBlock). Starting from a tail and following iNext links, it accumulates
// charge-weighted pad and time sums, derives mean pad, mean time and pad sigma, and
// appends the resulting cluster to the row's cluster list.
// NOTE(review): the declarations of `tails` and `cn`, several setter calls and the
// bodies of some conditions are not visible in this excerpt.
506GPUd()
void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
512 const uint32_t
row = iBlock;
// Number of tails recorded for the row, clamped to MaxHIPTailsPerRow - 1.
513 uint32_t nTails = clusterer.mPnHIPTails[
row];
514 nTails = CAMath::Min(nTails, (uint32_t)MaxHIPTailsPerRow - 1);
517 const auto& fragment = clusterer.mPmemory->fragment;
// Thread iThread handles tails iThread + 1, iThread + 1 + nThreads, ... (entries 1..nTails).
519 for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads) {
521 auto* tail = &tails[iTail];
// Tails that have a predecessor are handled differently (body not visible; presumably
// skipped so that each chain is processed once from its first tail).
523 if (tail->iPrev != 0) {
// Initialise the accumulators from the first tail; its qTot is used as the weight.
527 float qTot = tail->
qTot;
528 float qMax = tail->qMax;
529 const float firstWeight = tail->qTot;
530 const float firstPad = tail->pad;
531 const float firstTime = HIPTailTimeMean(*tail);
532 float padSum = firstWeight * firstPad;
533 float padSqSum = firstWeight * firstPad * firstPad;
534 float timeSum = firstWeight * firstTime;
536 uint32_t tailStart = tail->tailStart;
537 uint32_t tailEnd = tail->tailEnd;
// Follow the chain until iNext == 0 and fold every further tail into the sums.
539 while (tail->iNext != 0) {
541 tail = &tails[tail->iNext];
543 const float tailWeight = tail->
qTot;
544 const float tailPad = tail->pad;
545 const float tailTime = HIPTailTimeMean(*tail);
546 qMax = CAMath::Max(qMax, tail->qMax);
548 padSum += tailWeight * tailPad;
549 padSqSum += tailWeight * tailPad * tailPad;
550 timeSum += tailWeight * tailTime;
// Track the overall time extent of the chain.
551 tailStart = CAMath::Min<uint32_t>(tailStart, tail->tailStart);
552 tailEnd = CAMath::Max<uint32_t>(tailEnd, tail->tailEnd);
// The divisor is at least 1, so the divisions below cannot divide by zero.
555 const float weightSum = CAMath::Max(qTot, 1.f);
556 float padMean = padSum / weightSum;
557 float timeMean = timeSum / weightSum;
// Sigma from the weighted second moment; the variance is clamped at 0 before the square root.
558 float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
562 cn.setSaturatedQtot(qTot);
563 cn.setSaturatedTailLength(tailEnd - tailStart);
// Cluster time: fragment start plus mean time, minus the configured time-bin shift.
564 float clusterTime = fragment.
start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
565 cn.setTimeFlags(clusterTime, 0);
567 cn.setSigmaPad(padSigma);
// Special handling when cn.qMax is at least 1023 (body not visible).
569 if (cn.
qMax >= 1023) {
// Reserve a slot in the row's cluster list with an atomic add; the cluster is written
// only when the slot index is below mNMaxClusterPerRow.
573 uint32_t
index = CAMath::AtomicAdd(&clusterer.mPclusterInRow[
row], 1u);
574 if (
index < clusterer.mNMaxClusterPerRow) {
575 clusterer.mPclusterByRow[clusterer.mNMaxClusterPerRow *
row +
index] = cn;
Class of a TPC cluster in TPC-native coordinates (row, time)
#define GPUCA_GET_THREAD_COUNT(...)
#define DPRINTB_IF(test,...)
GPUd() void GPUTPCCFCheckPadBaseline
Provides a basic fallback implementation for Vc.
static constexpr uint32_t NROWS
GLfloat GLfloat GLfloat alpha
GLuint GLsizei GLsizei * length
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLint GLenum GLboolean GLsizei stride
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
tpccf::TPCFragmentTime length