Project
Loading...
Searching...
No Matches
CfUtils.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#ifndef O2_GPU_CF_UTILS_H
16#define O2_GPU_CF_UTILS_H
17
18#include "clusterFinderDefs.h"
19#include "GPUCommonAlgorithm.h"
20#include "Array2D.h"
21#include "CfConsts.h"
22
23namespace o2::gpu
24{
25
27{
28
29 public:
30 static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold, uint16_t outerIdx)
31 {
32 return aboveThreshold & (1 << cfconsts::OuterToInner[outerIdx]);
33 }
34
35 static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold, uint16_t outerIdx)
36 {
37 return aboveThreshold & (1 << cfconsts::OuterToInnerInv[outerIdx]);
38 }
39
40 static GPUdi() bool isPeak(uint8_t peak) { return peak & 0x01; }
41
42 static GPUdi() bool isAboveThreshold(uint8_t peak) { return peak >> 1; }
43
44 static GPUdi() int32_t warpPredicateScan(int32_t pred, int32_t* sum)
45 {
46#ifdef __HIPCC__
47 int32_t iLane = hipThreadIdx_x % warpSize;
48 uint64_t waveMask = __ballot(pred);
49 uint64_t lowerWarpMask = (1ull << iLane) - 1ull;
50 int32_t myOffset = __popcll(waveMask & lowerWarpMask);
51 *sum = __popcll(waveMask);
52 return myOffset;
53#elif defined(__CUDACC__)
54 int32_t iLane = threadIdx.x % warpSize;
55 uint32_t waveMask = __ballot_sync(0xFFFFFFFF, pred);
56 uint32_t lowerWarpMask = (1u << iLane) - 1u;
57 int32_t myOffset = __popc(waveMask & lowerWarpMask);
58 *sum = __popc(waveMask);
59 return myOffset;
60#else // CPU / OpenCL fallback
61 int32_t myOffset = warp_scan_inclusive_add(pred ? 1 : 0);
62 *sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
64 return myOffset;
65#endif
66 }
67
68 template <size_t BlockSize, typename SharedMemory>
69 static GPUdi() int32_t blockPredicateScan(SharedMemory& smem, int32_t pred, int32_t* sum = nullptr)
70 {
71#if defined(__HIPCC__) || defined(__CUDACC__)
72 int32_t iThread =
73#ifdef __HIPCC__
74 hipThreadIdx_x;
75#else
76 threadIdx.x;
77#endif
78
79 int32_t iWarp = iThread / warpSize;
80 int32_t nWarps = BlockSize / warpSize;
81
82 int32_t warpSum;
83 int32_t laneOffset = warpPredicateScan(pred, &warpSum);
84
85 if (iThread % warpSize == 0) {
86 smem.warpPredicateSum[iWarp] = warpSum;
87 }
88 __syncthreads();
89
90 int32_t warpOffset = 0;
91
92 if (sum == nullptr) {
93 for (int32_t w = 0; w < iWarp; w++) {
94 int32_t s = smem.warpPredicateSum[w];
95 warpOffset += s;
96 }
97 } else {
98 *sum = 0;
99 for (int32_t w = 0; w < nWarps; w++) {
100 int32_t s = smem.warpPredicateSum[w];
101 if (w < iWarp) {
102 warpOffset += s;
103 }
104 *sum += s;
105 }
106 }
107
108 return warpOffset + laneOffset;
109#else // CPU / OpenCL fallback
110 int32_t lpos = work_group_scan_inclusive_add(pred ? 1 : 0);
111 if (sum != nullptr) {
112 *sum = work_group_broadcast(lpos, BlockSize - 1);
113 }
114 lpos--;
115 return lpos;
116#endif
117 }
118
119 template <size_t BlockSize, typename SharedMemory>
120 static GPUdi() int32_t blockPredicateSum(SharedMemory& smem, int32_t pred)
121 {
122#if defined(__HIPCC__) || defined(__CUDACC__)
123 int32_t iThread =
124#ifdef __HIPCC__
125 hipThreadIdx_x;
126#else
127 threadIdx.x;
128#endif
129
130 int32_t iWarp = iThread / warpSize;
131 int32_t nWarps = BlockSize / warpSize;
132
133 int32_t warpSum =
134#ifdef __HIPCC__
135 __popcll(__ballot(pred));
136#else
137 __popc(__ballot_sync(0xFFFFFFFF, pred));
138#endif
139
140 if (iThread % warpSize == 0) {
141 smem.warpPredicateSum[iWarp] = warpSum;
142 }
143 __syncthreads();
144
145 int32_t sum = 0;
146 for (int32_t w = 0; w < nWarps; w++) {
147 sum += smem.warpPredicateSum[w];
148 }
149
150 return sum;
151#else // CPU / OpenCL fallback
152 return work_group_reduce_add(pred ? 1 : 0);
153#endif
154 }
155
156 template <size_t SCRATCH_PAD_WORK_GROUP_SIZE, typename SharedMemory>
157 static GPUdi() uint16_t partition(SharedMemory& smem, uint16_t ll, bool pred, uint16_t partSize, uint16_t* newPartSize)
158 {
159 bool participates = ll < partSize;
160
161 int32_t part;
162 int32_t lpos = blockPredicateScan<SCRATCH_PAD_WORK_GROUP_SIZE>(smem, int32_t(!pred && participates), &part);
163
164 uint16_t pos = (participates && !pred) ? lpos : part;
165
167 return pos;
168 }
169
170 template <typename T>
171 static GPUdi() void blockLoad(
172 const Array2D<T>& map,
173 uint32_t wgSize,
174 uint32_t elems,
175 uint16_t ll,
176 uint32_t offset,
177 uint32_t N,
178 GPUconstexprref() const tpccf::Delta2* neighbors,
179 const ChargePos* posBcast,
180 GPUgeneric() T* buf)
181 {
182#if defined(GPUCA_GPUCODE)
183 GPUbarrier();
184 uint16_t x = ll % N;
185 uint16_t y = ll / N;
186 tpccf::Delta2 d = neighbors[x + offset];
187
188 for (uint32_t i = y; i < wgSize; i += (elems / N)) {
189 ChargePos readFrom = posBcast[i];
190 uint32_t writeTo = N * i + x;
191 buf[writeTo] = map[readFrom.delta(d)];
192 }
193 GPUbarrier();
194#else
195 if (ll >= wgSize) {
196 return;
197 }
198
199 ChargePos readFrom = posBcast[ll];
200
201 GPUbarrier();
202
203 for (uint32_t i = 0; i < N; i++) {
204 tpccf::Delta2 d = neighbors[i + offset];
205
206 uint32_t writeTo = N * ll + i;
207 buf[writeTo] = map[readFrom.delta(d)];
208 }
209
210 GPUbarrier();
211#endif
212 }
213
214 template <typename T, bool Inv = false>
215 static GPUdi() void condBlockLoad(
216 const Array2D<T>& map,
217 uint16_t wgSize,
218 uint16_t elems,
219 uint16_t ll,
220 uint16_t offset,
221 uint16_t N,
222 GPUconstexprref() const tpccf::Delta2* neighbors,
223 const ChargePos* posBcast,
224 const uint8_t* aboveThreshold,
225 GPUgeneric() T* buf)
226 {
227#if defined(GPUCA_GPUCODE)
228 GPUbarrier();
229 uint16_t y = ll / N;
230 uint16_t x = ll % N;
231 tpccf::Delta2 d = neighbors[x + offset];
232 for (uint32_t i = y; i < wgSize; i += (elems / N)) {
233 ChargePos readFrom = posBcast[i];
234 uint8_t above = aboveThreshold[i];
235 uint32_t writeTo = N * i + x;
236 T v(0);
237 bool cond = (Inv) ? innerAboveThresholdInv(above, x + offset)
238 : innerAboveThreshold(above, x + offset);
239 if (cond) {
240 v = map[readFrom.delta(d)];
241 }
242 buf[writeTo] = v;
243 }
244 GPUbarrier();
245#else
246 if (ll >= wgSize) {
247 return;
248 }
249
250 ChargePos readFrom = posBcast[ll];
251 uint8_t above = aboveThreshold[ll];
252 GPUbarrier();
253
254 for (uint32_t i = 0; i < N; i++) {
255 tpccf::Delta2 d = neighbors[i + offset];
256
257 uint32_t writeTo = N * ll + i;
258 T v(0);
259 bool cond = (Inv) ? innerAboveThresholdInv(above, i + offset)
260 : innerAboveThreshold(above, i + offset);
261 if (cond) {
262 v = map[readFrom.delta(d)];
263 }
264 buf[writeTo] = v;
265 }
266
267 GPUbarrier();
268#endif
269 }
270};
271
272} // namespace o2::gpu
273
274#endif
int32_t i
#define GPUbarrier()
#define GPUgeneric()
#define GPUCA_WARP_SIZE
uint16_t pos
Definition CfUtils.h:164
static GPUdi() int32_t warpPredicateScan(int32_t pred
static GPUdi() void condBlockLoad(const Array2D< T > &map
static int32_t * sum
Definition CfUtils.h:45
static uint32_t wgSize
Definition CfUtils.h:173
static GPUdi() void blockLoad(const Array2D< T > &map
static uint16_t bool uint16_t uint16_t * newPartSize
Definition CfUtils.h:158
static GPUdi() int32_t blockPredicateScan(SharedMemory &smem
static uint32_t uint32_t uint16_t uint32_t uint32_t N
Definition CfUtils.h:177
static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold
static uint32_t uint32_t uint16_t uint32_t offset
Definition CfUtils.h:176
static GPUdi() bool isAboveThreshold(uint8_t peak)
Definition CfUtils.h:42
return myOffset
Definition CfUtils.h:64
static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold
static uint32_t uint32_t uint16_t uint32_t uint32_t GPUconstexprref() const tpccf
Definition CfUtils.h:178
static uint16_t outerIdx
Definition CfUtils.h:31
static uint16_t bool uint16_t partSize
Definition CfUtils.h:157
static int32_t int32_t static SharedMemory GPUdi() int32_t blockPredicateSum(SharedMemory &smem
static uint32_t uint32_t elems
Definition CfUtils.h:174
static GPUdi() uint16_t partition(SharedMemory &smem
static uint16_t ll
Definition CfUtils.h:157
static GPUdi() bool isPeak(uint8_t peak)
Definition CfUtils.h:40
static int32_t pred
Definition CfUtils.h:69
GLint GLenum GLint x
Definition glcorearb.h:403
const GLdouble * v
Definition glcorearb.h:832
GLintptr offset
Definition glcorearb.h:660
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition glcorearb.h:2514
GLubyte GLubyte GLubyte GLubyte w
Definition glcorearb.h:852