// Fragment: counts the set predicates handled by this block and stores the count per block.
// NOTE(review): this excerpt omits lines (function header, declarations of iThreadGlobal/pred, closing braces).
// Number of input elements for this stage, as reported by CompactionElems.
27 int32_t nElems = CompactionElems(clusterer, stage);
// Per-element predicate flags and the scan buffer selected by iBuf.
29 const auto* predicate = clusterer.mPisPeak;
30 auto* scanOffset = clusterer.GetScanBuffer(iBuf);
// Only threads whose global index is below nElems read a predicate value.
34 if (iThreadGlobal < nElems) {
35 pred = predicate[iThreadGlobal];
// Combine the predicates of the block via CfUtils::blockPredicateSum, using smem as scratch.
38 int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_THREAD_COUNT_SCAN>(smem, pred);
// A single thread (index nThreads - 1) writes the block's result to scanOffset[iBlock].
40 int32_t lastThread = nThreads - 1;
41 if (iThread == lastThread) {
42 scanOffset[iBlock] = nElemsInBlock;
// Fragment: scans the values of buffer (iBuf - 1) within the work group and hands one value per block to buffer iBuf.
// NOTE(review): the definition of nElems and of iThreadGlobal is not visible in this excerpt.
49 auto* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
50 auto* scanOffsetNext = clusterer.GetScanBuffer(iBuf);
// Threads at or beyond nElems contribute 0 to work_group_scan_inclusive_add instead of reading the buffer.
53 int32_t offsetInBlock = work_group_scan_inclusive_add((iThreadGlobal < nElems) ? scanOffset[iThreadGlobal] : 0);
// Store the scanned value back in place.
// NOTE(review): whether this write is guarded by a bounds check is not visible here (lines are elided) — confirm.
56 scanOffset[iThreadGlobal] = offsetInBlock;
// The thread with index nThreads - 1 writes its scan result to scanOffsetNext[iBlock].
58 int32_t lastThread = nThreads - 1;
59 if (iThread == lastThread) {
60 scanOffsetNext[iBlock] = offsetInBlock;
// Fragment: scans the values of buffer (iBuf - 1) within the work group and stores the result in place.
// Unlike the neighbouring fragments, no write to a further buffer is visible here.
68 int32_t* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
// True when this thread's global index refers to one of the nElems entries.
70 bool inBounds = (iThreadGlobal < nElems);
// Out-of-bounds threads contribute 0 to work_group_scan_inclusive_add instead of reading the buffer.
72 int32_t offsetInBlock = work_group_scan_inclusive_add(inBounds ? scanOffset[iThreadGlobal] : 0);
// NOTE(review): whether this write is guarded by inBounds is not visible in this excerpt — confirm.
75 scanOffset[iThreadGlobal] = offsetInBlock;
// Fragment: adds one per-block value from buffer iBuf onto the entries of buffer (iBuf - 1).
// NOTE(review): the definition of nElems is not visible in this excerpt.
84 int32_t* scanOffsetPrev = clusterer.GetScanBuffer(iBuf - 1);
85 const int32_t* scanOffset = clusterer.GetScanBuffer(iBuf);
// Every thread of the block reads the same entry, indexed by iBlock.
87 int32_t shift = scanOffset[iBlock];
// Only threads with a global index below nElems update their entry.
89 if (iThreadGlobal < nElems) {
90 scanOffsetPrev[iThreadGlobal] += shift;
// Fragment: copies the elements whose predicate is set from `in` to `out` at their computed offsets
// and records the final element count.
// NOTE(review): this excerpt omits lines (function header, some conditions, closing braces).
97 uint32_t nElems = CompactionElems(clusterer, stage);
// Output capacity: mNMaxClusters when stage is non-zero, mNMaxPeaks otherwise.
98 SizeT bufferSize = (stage) ? clusterer.mNMaxClusters : clusterer.mNMaxPeaks;
102 const auto* predicate = clusterer.mPisPeak;
103 const auto* scanOffset = clusterer.GetScanBuffer(iBuf);
// Threads at or beyond nElems do not read the predicate array; they take part with pred = 0.
105 bool iAmDummy = (iThreadGlobal >= nElems);
107 int32_t pred = (iAmDummy) ? 0 : predicate[iThreadGlobal];
// Offset of this thread inside its block, from CfUtils::blockPredicateScan (smem used as scratch).
108 int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_THREAD_COUNT_SCAN>(smem, pred);
110 SizeT globalOffsetOut = offsetInBlock;
// Add the value stored for the preceding block.
// NOTE(review): presumably guarded by a check that iBlock > 0 on an elided line — confirm.
112 globalOffsetOut += scanOffset[iBlock - 1];
// Write only when the predicate is set and the target index is inside the output capacity.
115 if (pred && globalOffsetOut < bufferSize) {
116 out[globalOffsetOut] = in[iThreadGlobal];
// One designated thread (global index lastId, defined outside this excerpt) publishes the total.
120 if (iThreadGlobal == lastId) {
121 SizeT nFinal = globalOffsetOut + pred;
// Report an overflow when the total exceeds the capacity; the error code depends on stage.
// NOTE(review): whether nFinal is clamped after the error is not visible in this excerpt.
122 if (nFinal > bufferSize) {
123 clusterer.raiseError(stage ? GPUErrors::ERROR_CF_CLUSTER_OVERFLOW : GPUErrors::ERROR_CF_PEAK_OVERFLOW, clusterer.mISector, nFinal, bufferSize);
// Store the count in the matching counter.
// NOTE(review): the condition choosing between nClusters and nPeaks is elided; presumably `stage` — confirm.
127 clusterer.mPmemory->counters.nClusters = nFinal;
129 clusterer.mPmemory->counters.nPeaks = nFinal;