// First scan step: reduce the per-element predicate flags to one count per block.
// nElems is the number of elements to compact for this stage (from CompactionElems).
28 int32_t nElems = CompactionElems(clusterer, stage);
// Predicate flags are read from clusterer.mPisPeak; results go to scan buffer iBuf.
30 const auto* predicate = clusterer.mPisPeak;
31 auto* scanOffset = clusterer.GetScanBuffer(iBuf);
// Only threads whose global index is inside the input load a flag.
// NOTE(review): pred is declared/initialised above this line - presumably to 0; confirm.
35 if (iThreadGlobal < nElems) {
36 pred = predicate[iThreadGlobal];
// Combine the flags of the whole work-group via CfUtils::blockPredicateSum
// (presumably the number of set flags in this block - confirm against CfUtils).
39 int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);
// The thread with the highest local index publishes the block result, one slot per block.
41 int32_t lastThread = nThreads - 1;
42 if (iThread == lastThread) {
43 scanOffset[iBlock] = nElemsInBlock;
// Scan step between two buffer levels: buffer (iBuf - 1) is scanned in place,
// buffer iBuf receives one value per block.
52 auto* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
53 auto* scanOffsetNext = clusterer.GetScanBuffer(iBuf);
// Work-group inclusive add-scan; threads past nElems contribute 0 so they do not alter the sums.
56 int32_t offsetInBlock = work_group_scan_inclusive_add((iThreadGlobal < nElems) ? scanOffset[iThreadGlobal] : 0);
// In-range threads overwrite their input slot with the scanned value.
58 if (iThreadGlobal < nElems) {
59 scanOffset[iThreadGlobal] = offsetInBlock;
// The thread with the highest local index stores its scanned value as this block's entry
// in the next buffer (with an inclusive scan this is presumably the block total - confirm).
62 int32_t lastThread = nThreads - 1;
63 if (iThread == lastThread) {
64 scanOffsetNext[iBlock] = offsetInBlock;
// Scan of buffer (iBuf - 1) in place; unlike the step that also fills buffer iBuf,
// nothing is written to another buffer in these lines.
74 int32_t* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
// Threads whose global index is past nElems take part in the scan with a 0 contribution.
76 bool inBounds = (iThreadGlobal < nElems);
78 int32_t offsetInBlock = work_group_scan_inclusive_add(inBounds ? scanOffset[iThreadGlobal] : 0);
// Write the scanned value back over the input slot.
// NOTE(review): this store should only run for inBounds threads - confirm the guard around it.
81 scanOffset[iThreadGlobal] = offsetInBlock;
// Propagation step: add the per-block value held in buffer iBuf to the entries of
// buffer (iBuf - 1).
92 int32_t* scanOffsetPrev = clusterer.GetScanBuffer(iBuf - 1);
93 const int32_t* scanOffset = clusterer.GetScanBuffer(iBuf);
// Every thread of a block reads the same shift, indexed by the block id.
95 int32_t shift = scanOffset[iBlock];
// Only entries inside the valid range are updated.
97 if (iThreadGlobal < nElems) {
98 scanOffsetPrev[iThreadGlobal] += shift;
// Compaction step: copy the elements whose predicate flag is set from `in` to `out`,
// then let one thread record the final element count.
107 uint32_t nElems = CompactionElems(clusterer, stage);
// Output capacity depends on the stage: mNMaxClusters when stage is non-zero, mNMaxPeaks otherwise.
108 SizeT bufferSize = (stage) ? clusterer.mNMaxClusters : clusterer.mNMaxPeaks;
112 const auto* predicate = clusterer.mPisPeak;
113 const auto* scanOffset = clusterer.GetScanBuffer(iBuf);
// Threads past the end of the input still take part in the block scan, with a flag of 0.
115 bool iAmDummy = (iThreadGlobal >= nElems);
117 int32_t pred = (iAmDummy) ? 0 : predicate[iThreadGlobal];
// Position of this thread's element among the flagged elements of its block.
118 int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);
120 SizeT globalOffsetOut = offsetInBlock;
// Add the offset stored for the preceding block to obtain the global output position.
// NOTE(review): index iBlock - 1 is only valid for iBlock > 0 - confirm the guard around this line.
122 globalOffsetOut += scanOffset[iBlock - 1];
// Flagged elements are written only while the position is inside the output buffer;
// positions at or beyond bufferSize are not written.
125 if (pred && globalOffsetOut < bufferSize) {
126 out[globalOffsetOut] = in[iThreadGlobal];
// A single thread (global index == lastId) derives the total count from its own position
// and flag.
130 if (iThreadGlobal == lastId) {
131 SizeT nFinal = globalOffsetOut + pred;
// More flagged elements than capacity: report a cluster or peak overflow depending on
// stage, passing the sector, the requested count and the capacity.
132 if (nFinal > bufferSize) {
133 clusterer.raiseError(stage ? GPUErrors::ERROR_CF_CLUSTER_OVERFLOW : GPUErrors::ERROR_CF_PEAK_OVERFLOW, clusterer.mISector, nFinal, bufferSize);
// Store the count in the matching counter.
// NOTE(review): presumably nClusters when stage is non-zero and nPeaks otherwise - confirm the branch.
137 clusterer.mPmemory->counters.nClusters = nFinal;
139 clusterer.mPmemory->counters.nPeaks = nFinal;