30 static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold, uint16_t
outerIdx)
32 return aboveThreshold & (1 << cfconsts::OuterToInner[
outerIdx]);
35 static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold, uint16_t
outerIdx)
37 return aboveThreshold & (1 << cfconsts::OuterToInnerInv[
outerIdx]);
40 static GPUdi() bool isPeak(uint8_t peak) {
return peak & 0x01; }
42 static GPUdi() bool isAboveThreshold(uint8_t peak) {
return peak >> 1; }
// Warp-level exclusive prefix scan of a boolean predicate.
// Returns this lane's exclusive count of lanes-below-me with pred != 0;
// *sum receives the total count of true predicates across the warp.
// NOTE(review): this chunk is missing source lines — the opening
// "#if defined(__HIPCC__)"-style guard, the returns, and the closing #endif
// are not visible here; comments below describe only the visible fragments.
44 static GPUdi() int32_t warpPredicateScan(int32_t
pred, int32_t*
sum)
// HIP path (presumably guarded on a missing line): 64-bit wavefront ballot
// collects one bit per lane with pred != 0.
47 int32_t iLane = hipThreadIdx_x % warpSize;
48 uint64_t waveMask = __ballot(
pred);
// Mask selecting all lanes strictly below this one.
49 uint64_t lowerWarpMask = (1ull << iLane) - 1ull;
// Exclusive offset = popcount of predicate bits in lower lanes.
50 int32_t
myOffset = __popcll(waveMask & lowerWarpMask);
// Warp total = popcount of the full ballot mask.
51 *
sum = __popcll(waveMask);
53#elif defined(__CUDACC__)
// CUDA path: identical scheme with a 32-bit full-warp ballot.
54 int32_t iLane = threadIdx.x % warpSize;
55 uint32_t waveMask = __ballot_sync(0xFFFFFFFF,
pred);
56 uint32_t lowerWarpMask = (1u << iLane) - 1u;
57 int32_t
myOffset = __popc(waveMask & lowerWarpMask);
58 *
sum = __popc(waveMask);
// Fallback path (other backends): OpenCL-style inclusive sub-group scan of
// the 0/1 predicate; the lines deriving the exclusive offset and *sum from
// the inclusive result are not visible in this chunk.
61 int32_t
myOffset = warp_scan_inclusive_add(
pred ? 1 : 0);
// Block-wide exclusive prefix scan of a boolean predicate across BlockSize
// threads. Returns this thread's exclusive prefix count; if sum != nullptr,
// *sum receives the block-wide total.
// NOTE(review): this chunk is missing source lines (braces, barriers,
// iThread's definition, the #else/#endif of the backend split, and the
// guard around the total-sum loop are not visible). Comments describe only
// the visible fragments.
68 template <
size_t BlockSize,
typename SharedMemory>
69 static GPUdi() int32_t blockPredicateScan(SharedMemory& smem, int32_t
pred, int32_t*
sum =
nullptr)
71#if defined(__HIPCC__) || defined(__CUDACC__)
// Decompose the block into warps; BlockSize is presumably a multiple of
// warpSize — TODO confirm.
79 int32_t iWarp = iThread / warpSize;
80 int32_t nWarps = BlockSize / warpSize;
// Per-warp scan: laneOffset is the in-warp exclusive offset, warpSum the
// warp's total (warpSum's declaration is on a line not shown here).
83 int32_t laneOffset = warpPredicateScan(
pred, &warpSum);
// Lane 0 of each warp publishes its warp's total to shared memory
// (a block barrier presumably follows on a missing line).
85 if (iThread % warpSize == 0) {
86 smem.warpPredicateSum[iWarp] = warpSum;
// Accumulate the totals of all lower warps to get this warp's base offset.
90 int32_t warpOffset = 0;
93 for (int32_t
w = 0;
w < iWarp;
w++) {
94 int32_t s = smem.warpPredicateSum[
w];
// Second loop over all warps — presumably accumulates the block-wide total
// into *sum (its guard / accumulator lines are not visible).
99 for (int32_t
w = 0;
w < nWarps;
w++) {
100 int32_t s = smem.warpPredicateSum[
w];
// Exclusive block offset = offset of lower warps + in-warp offset.
108 return warpOffset + laneOffset;
// Fallback path: OpenCL-style inclusive work-group scan of the predicate.
110 int32_t
lpos = work_group_scan_inclusive_add(
pred ? 1 : 0);
// Block total = the last thread's inclusive scan value, broadcast to all.
111 if (
sum !=
nullptr) {
112 *
sum = work_group_broadcast(
lpos, BlockSize - 1);
// Block-wide reduction: returns the number of threads in the block whose
// predicate is nonzero.
// NOTE(review): this chunk is missing source lines (iThread/warpSum/sum
// declarations, the per-backend #if split around lines 134-137, barriers,
// and the returns are not visible). Comments describe only visible fragments.
119 template <
size_t BlockSize,
typename SharedMemory>
120 static GPUdi() int32_t blockPredicateSum(SharedMemory& smem, int32_t
pred)
122#if defined(__HIPCC__) || defined(__CUDACC__)
130 int32_t iWarp = iThread / warpSize;
131 int32_t nWarps = BlockSize / warpSize;
// HIP: warp total via 64-bit ballot + popcount (the assignment's left-hand
// side, presumably warpSum, is on a missing line).
135 __popcll(__ballot(
pred));
// CUDA: warp total via 32-bit full-warp ballot + popcount.
137 __popc(__ballot_sync(0xFFFFFFFF,
pred));
// Lane 0 of each warp publishes its warp's total to shared memory.
140 if (iThread % warpSize == 0) {
141 smem.warpPredicateSum[iWarp] = warpSum;
// Every thread sums all per-warp totals (accumulator declared on a missing
// line; a barrier presumably precedes this loop).
146 for (int32_t
w = 0;
w < nWarps;
w++) {
147 sum += smem.warpPredicateSum[
w];
// Fallback path: OpenCL-style work-group add-reduction of the predicate.
152 return work_group_reduce_add(
pred ? 1 : 0);
// NOTE(review): fragment only — the function signature (original lines
// 157-161) is not visible in this chunk. The visible call performs a block
// scan of the inverted predicate (!pred, restricted to participating
// threads), writing the count of such threads into `part`; presumably this
// partitions participating threads by predicate — confirm against full file.
156 template <
size_t SCRATCH_PAD_WORK_GROUP_SIZE,
typename SharedMemory>
162 int32_t
lpos = blockPredicateScan<SCRATCH_PAD_WORK_GROUP_SIZE>(smem, int32_t(!
pred && participates), &
part);
// NOTE(review): fragment only — the signature and most of the body
// (original lines 171-205) are not visible. What is visible: a GPU branch
// and a non-GPU(?) branch that both gather values from `map` at positions
// offset by `readFrom.delta(d)` into a scratch buffer `buf`, laid out as
// N entries per element (writeTo = N * element + slot). The meanings of
// i/x/ll/d and the loop bounds on the GPU path cannot be confirmed here.
170 template <
typename T>
182#if defined(GPUCA_GPUCODE)
// GPU path: one write per thread; element index and slot derived from
// variables declared on missing lines.
190 uint32_t writeTo =
N *
i +
x;
191 buf[writeTo] = map[readFrom.delta(d)];
// Alternate path: each thread ll loops over all N slots itself.
203 for (uint32_t
i = 0;
i <
N;
i++) {
206 uint32_t writeTo =
N *
ll +
i;
207 buf[writeTo] = map[readFrom.delta(d)];
// NOTE(review): fragment only — signature and much of the body (original
// lines 215-261) are not visible. Visible behavior: same scratch-pad gather
// layout as the fragment above (writeTo = N * element + slot), but each
// write is gated on the element's aboveThreshold byte: the Inv template
// flag selects innerAboveThresholdInv vs innerAboveThreshold for the
// neighbour test at index (slot + offset). What happens when cond is false
// (presumably a default value for v) is on lines not shown.
214 template <
typename T,
bool Inv = false>
// Per-element packed neighbour-above-threshold bitmasks.
224 const uint8_t* aboveThreshold,
227#if defined(GPUCA_GPUCODE)
// GPU path: one (element i, slot x) pair per thread.
234 uint8_t above = aboveThreshold[
i];
235 uint32_t writeTo =
N *
i +
x;
// Choose the (possibly inverted) outer->inner neighbour test at compile time.
237 bool cond = (Inv) ? innerAboveThresholdInv(above,
x +
offset)
238 : innerAboveThreshold(above,
x +
offset);
240 v = map[readFrom.delta(d)];
// Alternate path: thread ll handles all N slots of its element.
251 uint8_t above = aboveThreshold[
ll];
254 for (uint32_t
i = 0;
i <
N;
i++) {
257 uint32_t writeTo =
N *
ll +
i;
259 bool cond = (Inv) ? innerAboveThresholdInv(above,
i +
offset)
260 : innerAboveThreshold(above,
i +
offset);
262 v = map[readFrom.delta(d)];