28 int32_t nElems = CompactionElems(clusterer, stage);
30 const auto* predicate = clusterer.mPisPeak;
31 auto* scanOffset = clusterer.GetScanBuffer(iBuf);
35 if (iThreadGlobal < nElems) {
36 pred = predicate[iThreadGlobal];
39 int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);
41 int32_t lastThread = nThreads - 1;
42 if (iThread == lastThread) {
43 scanOffset[iBlock] = nElemsInBlock;
52 auto* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
53 auto* scanOffsetNext = clusterer.GetScanBuffer(iBuf);
56 int32_t offsetInBlock = work_group_scan_inclusive_add((iThreadGlobal < nElems) ? scanOffset[iThreadGlobal] : 0);
59 scanOffset[iThreadGlobal] = offsetInBlock;
61 int32_t lastThread = nThreads - 1;
62 if (iThread == lastThread) {
63 scanOffsetNext[iBlock] = offsetInBlock;
73 int32_t* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
75 bool inBounds = (iThreadGlobal < nElems);
77 int32_t offsetInBlock = work_group_scan_inclusive_add(inBounds ? scanOffset[iThreadGlobal] : 0);
80 scanOffset[iThreadGlobal] = offsetInBlock;
91 int32_t* scanOffsetPrev = clusterer.GetScanBuffer(iBuf - 1);
92 const int32_t* scanOffset = clusterer.GetScanBuffer(iBuf);
94 int32_t shift = scanOffset[iBlock];
96 if (iThreadGlobal < nElems) {
97 scanOffsetPrev[iThreadGlobal] += shift;
106 uint32_t nElems = CompactionElems(clusterer, stage);
107 SizeT bufferSize = (stage) ? clusterer.mNMaxClusters : clusterer.mNMaxPeaks;
111 const auto* predicate = clusterer.mPisPeak;
112 const auto* scanOffset = clusterer.GetScanBuffer(iBuf);
114 bool iAmDummy = (iThreadGlobal >= nElems);
116 int32_t pred = (iAmDummy) ? 0 : predicate[iThreadGlobal];
117 int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);
119 SizeT globalOffsetOut = offsetInBlock;
121 globalOffsetOut += scanOffset[iBlock - 1];
124 if (pred && globalOffsetOut < bufferSize) {
125 out[globalOffsetOut] = in[iThreadGlobal];
129 if (iThreadGlobal == lastId) {
130 SizeT nFinal = globalOffsetOut + pred;
131 if (nFinal > bufferSize) {
132 clusterer.raiseError(stage ? GPUErrors::ERROR_CF_CLUSTER_OVERFLOW : GPUErrors::ERROR_CF_PEAK_OVERFLOW, clusterer.mISector, nFinal, bufferSize);
136 clusterer.mPmemory->counters.nClusters = nFinal;
138 clusterer.mPmemory->counters.nPeaks = nFinal;