35 static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold, uint16_t
outerIdx)
37 return aboveThreshold & (1 << cfconsts::OuterToInner[
outerIdx]);
40 static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold, uint16_t
outerIdx)
42 return aboveThreshold & (1 << cfconsts::OuterToInnerInv[
outerIdx]);
45 static GPUdi() bool isPeak(uint8_t peak) {
return peak & 0x01; }
47 static GPUdi() bool isAboveThreshold(uint8_t peak) {
return peak >> 1; }
// Warp-level scan of a boolean predicate: the visible fragments compute, per
// lane, the count of lower lanes with a true predicate (exclusive offset) and
// write the warp-wide total to *sum.
// NOTE(review): this chunk is missing interleaved source lines (opening brace,
// the leading #if defined(__HIPCC__), per-branch return statements, #endif and
// closing brace). Comments below describe only the visible fragments.
49 static GPUdi() int32_t warpPredicateScan(int32_t
pred, int32_t*
sum)
// HIP fragment: wavefront ballot mask is 64 bits wide.
52 int32_t iLane = hipThreadIdx_x % warpSize;
// One bit per lane whose predicate evaluated true.
53 uint64_t waveMask = __ballot(
pred);
// Mask selecting all lanes strictly below this one.
54 uint64_t lowerWarpMask = (1ull << iLane) - 1ull;
// Popcount of lower lanes' true predicates = this lane's exclusive offset.
55 int32_t
myOffset = __popcll(waveMask & lowerWarpMask);
// Warp-wide total of true predicates.
56 *
sum = __popcll(waveMask);
// CUDA fragment: 32-bit ballot with full-warp participation mask.
58#elif defined(__CUDACC__)
59 int32_t iLane = threadIdx.x % warpSize;
60 uint32_t waveMask = __ballot_sync(0xFFFFFFFF,
pred);
61 uint32_t lowerWarpMask = (1u << iLane) - 1u;
62 int32_t
myOffset = __popc(waveMask & lowerWarpMask);
63 *
sum = __popc(waveMask);
// Fallback fragment: inclusive-scan primitive on the 0/1 predicate;
// the conversion to an exclusive offset is on lines not visible here.
66 int32_t
myOffset = warp_scan_inclusive_add(
pred ? 1 : 0);
// Block-wide scan of a boolean predicate, built from per-warp scans combined
// through a shared-memory array of per-warp sums (smem.warpPredicateSum).
// NOTE(review): many interleaved source lines are missing from this chunk
// (opening brace, iThread/warpSum declarations, barriers, loop bodies,
// #else/#endif, closing braces). Comments describe only the visible fragments.
73 template <
size_t BlockSize,
typename SharedMemory>
74 static GPUdi() int32_t blockPredicateScan(SharedMemory& smem, int32_t
pred, int32_t*
sum =
nullptr)
// GPU path: warp ballots + shared-memory combination.
76#if defined(__HIPCC__) || defined(__CUDACC__)
84 int32_t iWarp = iThread / warpSize;
85 int32_t nWarps = BlockSize / warpSize;
// In-warp exclusive offset; warpSum receives this warp's total.
88 int32_t laneOffset = warpPredicateScan(
pred, &warpSum);
// First lane of each warp publishes the warp total to shared memory.
90 if (iThread % warpSize == 0) {
91 smem.warpPredicateSum[iWarp] = warpSum;
// Sum the totals of all preceding warps -> this warp's base offset.
95 int32_t warpOffset = 0;
98 for (int32_t
w = 0;
w < iWarp;
w++) {
99 int32_t s = smem.warpPredicateSum[
w];
// Second loop runs over all warps — presumably accumulating the block-wide
// total for *sum; TODO confirm, its body is not visible in this chunk.
104 for (int32_t
w = 0;
w < nWarps;
w++) {
105 int32_t s = smem.warpPredicateSum[
w];
// Block position = offset of preceding warps + offset within this warp.
113 return warpOffset + laneOffset;
// Fallback fragment: OpenCL-style work-group primitives; inclusive scan,
// with the last work item's value broadcast into *sum when requested.
115 int32_t
lpos = work_group_scan_inclusive_add(
pred ? 1 : 0);
116 if (
sum !=
nullptr) {
117 *
sum = work_group_broadcast(
lpos, BlockSize - 1);
// Block-wide count of threads whose predicate is true.
// NOTE(review): interleaved lines are missing from this chunk (opening brace,
// iThread/warpSum/sum declarations, barriers, the #if/#elif markers around the
// two ballot variants, closing braces). Comments cover visible fragments only.
124 template <
size_t BlockSize,
typename SharedMemory>
125 static GPUdi() int32_t blockPredicateSum(SharedMemory& smem, int32_t
pred)
// GPU path: per-warp popcount of the ballot, combined via shared memory.
127#if defined(__HIPCC__) || defined(__CUDACC__)
135 int32_t iWarp = iThread / warpSize;
136 int32_t nWarps = BlockSize / warpSize;
// HIP variant fragment: 64-bit wavefront ballot.
140 __popcll(__ballot(
pred));
// CUDA variant fragment: 32-bit ballot with full-warp mask.
142 __popc(__ballot_sync(0xFFFFFFFF,
pred));
// First lane of each warp publishes its warp's count.
145 if (iThread % warpSize == 0) {
146 smem.warpPredicateSum[iWarp] = warpSum;
// Every thread sums all per-warp counts to get the block total.
151 for (int32_t
w = 0;
w < nWarps;
w++) {
152 sum += smem.warpPredicateSum[
w];
// Fallback fragment: work-group reduction over the 0/1 predicate.
157 return work_group_reduce_add(
pred ? 1 : 0);
// Fragment of a function templated on the scratch-pad work-group size; its
// signature (original lines ~162-166) is not visible in this chunk.
161 template <
size_t SCRATCH_PAD_WORK_GROUP_SIZE,
typename SharedMemory>
// Scans the negated predicate, restricted to participating threads, to obtain
// a compaction position; `part` presumably receives the block-wide count —
// TODO confirm against the missing surrounding code.
167 int32_t
lpos = blockPredicateScan<SCRATCH_PAD_WORK_GROUP_SIZE>(smem, int32_t(!
pred && participates), &
part);
// Fragment of a templated scratch-pad fill routine; the signature and most of
// the body (original lines ~176-194, 197-207, 209-210) are missing from this
// chunk. Both visible paths copy map[readFrom.delta(d)] into buf, but with
// different write indexing (N*i + x on the GPU path, N*ll + i on the other).
175 template <
typename T>
// GPU-code path fragment.
187#if defined(GPUCA_GPUCODE)
195 uint32_t writeTo =
N *
i +
x;
196 buf[writeTo] = map[readFrom.delta(d)];
// Other path fragment: loop over all N deltas for local index ll.
208 for (uint32_t
i = 0;
i <
N;
i++) {
211 uint32_t writeTo =
N *
ll +
i;
212 buf[writeTo] = map[readFrom.delta(d)];
// Fragment of the conditional scratch-pad fill: a value is read from the map
// only when the packed aboveThreshold flag byte permits it; the Inv template
// flag selects innerAboveThresholdInv over innerAboveThreshold at compile time.
// NOTE(review): the signature and surrounding control flow (original lines
// ~220-228, 230-231, 233-238, 241, 244, 246-255, 257-258, 260-261, 263, 266)
// are missing from this chunk; comments describe visible fragments only.
219 template <
typename T,
bool Inv = false>
// Visible parameter line: packed per-element threshold flags.
229 const uint8_t* aboveThreshold,
// GPU-code path fragment.
232#if defined(GPUCA_GPUCODE)
// Flag byte for index i.
239 uint8_t above = aboveThreshold[
i];
240 uint32_t writeTo =
N *
i +
x;
// Compile-time choice between inverse and direct outer->inner mapping.
242 bool cond = (Inv) ? innerAboveThresholdInv(above,
x +
offset)
243 : innerAboveThreshold(above,
x +
offset);
245 v = map[readFrom.delta(d)];
// Other path fragment: one flag byte per local index ll, looping over all N
// deltas.
256 uint8_t above = aboveThreshold[
ll];
259 for (uint32_t
i = 0;
i <
N;
i++) {
262 uint32_t writeTo =
N *
ll +
i;
264 bool cond = (Inv) ? innerAboveThresholdInv(above,
i +
offset)
265 : innerAboveThreshold(above,
i +
offset);
267 v = map[readFrom.delta(d)];