30 static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold, uint16_t
outerIdx)
32 return aboveThreshold & (1 << cfconsts::OuterToInner[
outerIdx]);
35 static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold, uint16_t
outerIdx)
37 return aboveThreshold & (1 << cfconsts::OuterToInnerInv[
outerIdx]);
40 static GPUdi() bool isPeak(uint8_t peak) {
return peak & 0x01; }
42 static GPUdi() bool isAboveThreshold(uint8_t peak) {
return peak >> 1; }
// Warp-level prefix scan of a boolean predicate.
// myOffset = number of lower-numbered lanes in this warp whose `pred` is
// non-zero (an exclusive prefix on the HIP/CUDA paths); *sum receives the
// total number of set predicates across the whole warp.
// NOTE(review): this excerpt is missing lines from the original file (the
// opening brace, the leading `#if defined(__HIPCC__)` guard, the per-branch
// `return` statements and the closing `#endif`/brace) -- consult the full
// file before editing.
44 static GPUdi() int32_t warpPredicateScan(int32_t
pred, int32_t*
sum)
// --- HIP branch (implied by hipThreadIdx_x below): 64-lane wavefronts,
// --- 64-bit ballot mask.
47 int32_t iLane = hipThreadIdx_x % warpSize;
// One bit per lane, set iff that lane's predicate is non-zero.
48 uint64_t waveMask = __ballot(
pred);
// Mask selecting all lanes strictly below this one.
49 uint64_t lowerWarpMask = (1ull << iLane) - 1ull;
// Exclusive offset: count of set predicates in lower lanes only.
50 int32_t myOffset = __popcll(waveMask & lowerWarpMask);
// Warp-wide total of set predicates.
51 *
sum = __popcll(waveMask);
53#elif defined(__CUDACC__)
// --- CUDA branch: 32-lane warps, 32-bit ballot mask; same scheme as HIP.
54 int32_t iLane = threadIdx.x % warpSize;
55 uint32_t waveMask = __ballot_sync(0xFFFFFFFF,
pred);
56 uint32_t lowerWarpMask = (1u << iLane) - 1u;
57 int32_t myOffset = __popc(waveMask & lowerWarpMask);
58 *
sum = __popc(waveMask);
// --- Generic/OpenCL-style branch: inclusive work-group scan of the
// --- 0/1 predicate; the last lane holds the warp total, broadcast into
// --- *sum. (How the inclusive result is turned into the returned offset
// --- is in elided lines -- TODO confirm against the full file.)
61 int32_t myOffset = warp_scan_inclusive_add(!!
pred);
62 *
sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
// Block-wide prefix scan of a boolean predicate, composed from per-warp
// scans. SharedMemory must expose a warpPredicateSum[] array (used below)
// for exchanging per-warp totals. `sum`, when non-null, presumably receives
// the block-wide total -- TODO confirm; the lines writing it on the
// HIP/CUDA path are elided from this excerpt.
// NOTE(review): excerpt is missing interior lines of the original file
// (opening brace, the definition of iThread/warpSum, synchronization
// barriers, loop bodies, `#else`/`#endif`, closing braces).
67 template <
size_t BlockSize,
typename SharedMemory>
68 static GPUdi() int32_t blockPredicateScan(SharedMemory& smem, int32_t
pred, int32_t*
sum =
nullptr)
70#if defined(__HIPCC__) || defined(__CUDACC__)
78 int32_t iWarp = iThread / warpSize;
79 int32_t nWarps = BlockSize / warpSize;
// Step 1: scan within the warp; warpSum gets this warp's total.
82 int32_t laneOffset = warpPredicateScan(
pred, &warpSum);
// Step 2: each warp's lane 0 publishes the warp total to shared memory.
84 if (iThread % warpSize == 0) {
85 smem.warpPredicateSum[iWarp] = warpSum;
// Step 3: accumulate totals of all preceding warps -> this warp's base
// offset within the block.
89 int32_t warpOffset = 0;
92 for (int32_t
w = 0;
w < iWarp;
w++) {
93 int32_t s = smem.warpPredicateSum[
w];
// Second pass over *all* warps -- presumably accumulates the block-wide
// total (for *sum); the loop body is elided here, TODO confirm.
98 for (int32_t
w = 0;
w < nWarps;
w++) {
99 int32_t s = smem.warpPredicateSum[
w];
// Final scan value = preceding-warps total + offset within own warp.
107 return warpOffset + laneOffset;
// Generic/OpenCL path: inclusive work-group scan of the 0/1 predicate;
// the block total sits in the last work-item and is broadcast into *sum
// only when the caller asked for it.
109 int32_t
lpos = work_group_scan_inclusive_add(
pred ? 1 : 0);
110 if (
sum !=
nullptr) {
111 *
sum = work_group_broadcast(
lpos, BlockSize - 1);
// Block-wide count of threads whose predicate is non-zero (a reduction,
// no per-thread offsets -- compare blockPredicateScan above).
// NOTE(review): excerpt is missing interior lines of the original file
// (opening brace, iThread/warpSum/sum definitions, the HIP/CUDA `#if`
// split around lines 133/135, barriers, `#else`/`#endif`, closing braces).
117 template <
size_t BlockSize,
typename SharedMemory>
118 static GPUdi() int32_t blockPredicateSum(SharedMemory& smem, int32_t
pred)
120#if defined(__HIPCC__) || defined(__CUDACC__)
128 int32_t iWarp = iThread / warpSize;
129 int32_t nWarps = BlockSize / warpSize;
// HIP flavour: 64-bit ballot + popcount gives this warp's count.
133 __popcll(__ballot(
pred));
// CUDA flavour: 32-bit ballot + popcount.
135 __popc(__ballot_sync(0xFFFFFFFF,
pred));
// Warp leaders publish warp counts to shared memory...
138 if (iThread % warpSize == 0) {
139 smem.warpPredicateSum[iWarp] = warpSum;
// ...then every thread sums the counts of all warps in the block.
144 for (int32_t
w = 0;
w < nWarps;
w++) {
145 sum += smem.warpPredicateSum[
w];
// Generic/OpenCL path: let the runtime reduce the 0/1 predicate.
150 return work_group_reduce_add(!!
pred);
// NOTE(review): fragment only -- the enclosing function's signature and
// most of its body (original lines ~155-167) are elided from this excerpt,
// so its name/contract cannot be confirmed here. What is visible: a
// block-wide scan of `!pred && participates`, i.e. it ranks the
// participating threads whose predicate is FALSE -- consistent with a
// stream-compaction/partition step where `part` receives the count of
// such threads. TODO confirm against the full file.
154 template <
size_t SCRATCH_PAD_WORK_GROUP_SIZE,
typename SharedMemory>
160 int32_t
lpos = blockPredicateScan<SCRATCH_PAD_WORK_GROUP_SIZE>(smem, int32_t(!
pred && participates), &
part);
// NOTE(review): heavily elided fragment -- signature, loop headers and
// most of the body are missing from this excerpt; comments below describe
// only the visible lines. Appears to cooperatively fill a scratch buffer
// `buf` with N values per position read from `map`, each at an offset
// `d` (readFrom.delta(d)) from a base position `readFrom`.
168 template <
typename T>
// GPU path: threads share the work; thread coordinate `x` picks the slot.
180#if defined(GPUCA_GPUCODE)
// buf is laid out as N consecutive entries per position: slot = N*i + x.
188 uint32_t writeTo =
N *
i +
x;
189 buf[writeTo] = map[readFrom.delta(d)];
// Scalar/CPU path: one thread (`ll`) loops over all N entries itself.
201 for (uint32_t
i = 0;
i <
N;
i++) {
// Same layout, indexed by the local id `ll`: slot = N*ll + i.
204 uint32_t writeTo =
N *
ll +
i;
205 buf[writeTo] = map[readFrom.delta(d)];
// NOTE(review): heavily elided fragment -- signature, the write of `v`
// into buf, the else-branch of `cond`, barriers and `#else`/`#endif`
// pairing are missing from this excerpt. Visible behaviour: like the
// unconditional load above, but the value is fetched from `map` only when
// the neighbour's above-threshold bit is set; Inv selects the inverse
// mapping table (innerAboveThresholdInv vs innerAboveThreshold). What `v`
// holds when cond is false is in elided lines -- TODO confirm.
212 template <
typename T,
bool Inv = false>
// Per-position byte of packed above-threshold flags.
222 const uint8_t* aboveThreshold,
// GPU path: threads cooperate; `x` is the per-thread slot, `i` the
// position index.
225#if defined(GPUCA_GPUCODE)
232 uint8_t above = aboveThreshold[
i];
233 uint32_t writeTo =
N *
i +
x;
// Gate the load on the neighbour's flag bit; `offset` shifts into the
// correct half of the neighbour table.
235 bool cond = (Inv) ? innerAboveThresholdInv(above,
x +
offset)
236 : innerAboveThreshold(above,
x +
offset);
238 v = map[readFrom.delta(d)];
// Scalar/CPU path: thread `ll` loops over its own N entries.
249 uint8_t above = aboveThreshold[
ll];
252 for (uint32_t
i = 0;
i <
N;
i++) {
255 uint32_t writeTo =
N *
ll +
i;
257 bool cond = (Inv) ? innerAboveThresholdInv(above,
i +
offset)
258 : innerAboveThreshold(above,
i +
offset);
260 v = map[readFrom.delta(d)];