Project
Loading...
Searching...
No Matches
CfUtils.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#ifndef O2_GPU_CF_UTILS_H
16#define O2_GPU_CF_UTILS_H
17
18#include "clusterFinderDefs.h"
19#include "GPUCommonAlgorithm.h"
20#include "CfArray2D.h"
21#include "CfConsts.h"
22
23namespace o2::gpu
24{
25
27{
28
29 public:
30 static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold, uint16_t outerIdx)
31 {
32 return aboveThreshold & (1 << cfconsts::OuterToInner[outerIdx]);
33 }
34
35 static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold, uint16_t outerIdx)
36 {
37 return aboveThreshold & (1 << cfconsts::OuterToInnerInv[outerIdx]);
38 }
39
40 static GPUdi() bool isPeak(uint8_t peak) { return peak & 0x01; }
41
42 static GPUdi() bool isAboveThreshold(uint8_t peak) { return peak >> 1; }
43
44 static GPUdi() int32_t warpPredicateScan(int32_t pred, int32_t* sum)
45 {
46#ifdef __HIPCC__
47 int32_t iLane = hipThreadIdx_x % warpSize;
48 uint64_t waveMask = __ballot(pred);
49 uint64_t lowerWarpMask = (1ull << iLane) - 1ull;
50 int32_t myOffset = __popcll(waveMask & lowerWarpMask);
51 *sum = __popcll(waveMask);
52 return myOffset;
53#elif defined(__CUDACC__)
54 int32_t iLane = threadIdx.x % warpSize;
55 uint32_t waveMask = __ballot_sync(0xFFFFFFFF, pred);
56 uint32_t lowerWarpMask = (1u << iLane) - 1u;
57 int32_t myOffset = __popc(waveMask & lowerWarpMask);
58 *sum = __popc(waveMask);
59 return myOffset;
60#else // CPU / OpenCL fallback
61 int32_t myOffset = warp_scan_inclusive_add(!!pred);
62 *sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
63 return myOffset - !!pred;
64#endif
65 }
66
67 template <size_t BlockSize, typename SharedMemory>
68 static GPUdi() int32_t blockPredicateScan(SharedMemory& smem, int32_t pred, int32_t* sum = nullptr)
69 {
70#if defined(__HIPCC__) || defined(__CUDACC__)
71 int32_t iThread =
72#ifdef __HIPCC__
73 hipThreadIdx_x;
74#else
75 threadIdx.x;
76#endif
77
78 int32_t iWarp = iThread / warpSize;
79 int32_t nWarps = BlockSize / warpSize;
80
81 int32_t warpSum;
82 int32_t laneOffset = warpPredicateScan(pred, &warpSum);
83
84 if (iThread % warpSize == 0) {
85 smem.warpPredicateSum[iWarp] = warpSum;
86 }
87 __syncthreads();
88
89 int32_t warpOffset = 0;
90
91 if (sum == nullptr) {
92 for (int32_t w = 0; w < iWarp; w++) {
93 int32_t s = smem.warpPredicateSum[w];
94 warpOffset += s;
95 }
96 } else {
97 *sum = 0;
98 for (int32_t w = 0; w < nWarps; w++) {
99 int32_t s = smem.warpPredicateSum[w];
100 if (w < iWarp) {
101 warpOffset += s;
102 }
103 *sum += s;
104 }
105 }
106
107 return warpOffset + laneOffset;
108#else // CPU / OpenCL fallback
109 int32_t lpos = work_group_scan_inclusive_add(pred ? 1 : 0);
110 if (sum != nullptr) {
111 *sum = work_group_broadcast(lpos, BlockSize - 1);
112 }
113 return lpos - !!pred;
114#endif
115 }
116
117 template <size_t BlockSize, typename SharedMemory>
118 static GPUdi() int32_t blockPredicateSum(SharedMemory& smem, int32_t pred)
119 {
120#if defined(__HIPCC__) || defined(__CUDACC__)
121 int32_t iThread =
122#ifdef __HIPCC__
123 hipThreadIdx_x;
124#else
125 threadIdx.x;
126#endif
127
128 int32_t iWarp = iThread / warpSize;
129 int32_t nWarps = BlockSize / warpSize;
130
131 int32_t warpSum =
132#ifdef __HIPCC__
133 __popcll(__ballot(pred));
134#else
135 __popc(__ballot_sync(0xFFFFFFFF, pred));
136#endif
137
138 if (iThread % warpSize == 0) {
139 smem.warpPredicateSum[iWarp] = warpSum;
140 }
141 __syncthreads();
142
143 int32_t sum = 0;
144 for (int32_t w = 0; w < nWarps; w++) {
145 sum += smem.warpPredicateSum[w];
146 }
147
148 return sum;
149#else // CPU / OpenCL fallback
150 return work_group_reduce_add(!!pred);
151#endif
152 }
153
154 template <size_t SCRATCH_PAD_WORK_GROUP_SIZE, typename SharedMemory>
155 static GPUdi() uint16_t partition(SharedMemory& smem, uint16_t ll, bool pred, uint16_t partSize, uint16_t* newPartSize)
156 {
157 bool participates = ll < partSize;
158
159 int32_t part;
160 int32_t lpos = blockPredicateScan<SCRATCH_PAD_WORK_GROUP_SIZE>(smem, int32_t(!pred && participates), &part);
161
162 uint16_t pos = (participates && !pred) ? lpos : part;
163
165 return pos;
166 }
167
168 template <typename T>
169 static GPUdi() void blockLoad(
170 const CfArray2D<T>& map,
171 uint32_t wgSize,
172 uint32_t elems,
173 uint16_t ll,
174 uint32_t offset,
175 uint32_t N,
176 GPUconstexprref() const tpccf::Delta2* neighbors,
177 const CfChargePos* posBcast,
178 GPUgeneric() T* buf)
179 {
180#if defined(GPUCA_GPUCODE)
181 GPUbarrier();
182 uint16_t x = ll % N;
183 uint16_t y = ll / N;
184 tpccf::Delta2 d = neighbors[x + offset];
185
186 for (uint32_t i = y; i < wgSize; i += (elems / N)) {
187 CfChargePos readFrom = posBcast[i];
188 uint32_t writeTo = N * i + x;
189 buf[writeTo] = map[readFrom.delta(d)];
190 }
191 GPUbarrier();
192#else
193 if (ll >= wgSize) {
194 return;
195 }
196
197 CfChargePos readFrom = posBcast[ll];
198
199 GPUbarrier();
200
201 for (uint32_t i = 0; i < N; i++) {
202 tpccf::Delta2 d = neighbors[i + offset];
203
204 uint32_t writeTo = N * ll + i;
205 buf[writeTo] = map[readFrom.delta(d)];
206 }
207
208 GPUbarrier();
209#endif
210 }
211
212 template <typename T, bool Inv = false>
213 static GPUdi() void condBlockLoad(
214 const CfArray2D<T>& map,
215 uint16_t wgSize,
216 uint16_t elems,
217 uint16_t ll,
218 uint16_t offset,
219 uint16_t N,
220 GPUconstexprref() const tpccf::Delta2* neighbors,
221 const CfChargePos* posBcast,
222 const uint8_t* aboveThreshold,
223 GPUgeneric() T* buf)
224 {
225#if defined(GPUCA_GPUCODE)
226 GPUbarrier();
227 uint16_t y = ll / N;
228 uint16_t x = ll % N;
229 tpccf::Delta2 d = neighbors[x + offset];
230 for (uint32_t i = y; i < wgSize; i += (elems / N)) {
231 CfChargePos readFrom = posBcast[i];
232 uint8_t above = aboveThreshold[i];
233 uint32_t writeTo = N * i + x;
234 T v(0);
235 bool cond = (Inv) ? innerAboveThresholdInv(above, x + offset)
236 : innerAboveThreshold(above, x + offset);
237 if (cond) {
238 v = map[readFrom.delta(d)];
239 }
240 buf[writeTo] = v;
241 }
242 GPUbarrier();
243#else
244 if (ll >= wgSize) {
245 return;
246 }
247
248 CfChargePos readFrom = posBcast[ll];
249 uint8_t above = aboveThreshold[ll];
250 GPUbarrier();
251
252 for (uint32_t i = 0; i < N; i++) {
253 tpccf::Delta2 d = neighbors[i + offset];
254
255 uint32_t writeTo = N * ll + i;
256 T v(0);
257 bool cond = (Inv) ? innerAboveThresholdInv(above, i + offset)
258 : innerAboveThreshold(above, i + offset);
259 if (cond) {
260 v = map[readFrom.delta(d)];
261 }
262 buf[writeTo] = v;
263 }
264
265 GPUbarrier();
266#endif
267 }
268};
269
270} // namespace o2::gpu
271
272#endif
int32_t i
#define GPUbarrier()
#define GPUgeneric()
uint16_t pos
Definition CfUtils.h:162
static GPUdi() int32_t warpPredicateScan(int32_t pred
static int32_t * sum
Definition CfUtils.h:45
static uint32_t wgSize
Definition CfUtils.h:171
static uint16_t bool uint16_t uint16_t * newPartSize
Definition CfUtils.h:156
static GPUdi() int32_t blockPredicateScan(SharedMemory &smem
static uint32_t uint32_t uint16_t uint32_t uint32_t N
Definition CfUtils.h:175
static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold
static uint32_t uint32_t uint16_t uint32_t offset
Definition CfUtils.h:174
static GPUdi() bool isAboveThreshold(uint8_t peak)
Definition CfUtils.h:42
static GPUdi() void condBlockLoad(const CfArray2D< T > &map
static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold
static uint32_t uint32_t uint16_t uint32_t uint32_t GPUconstexprref() const tpccf
Definition CfUtils.h:176
static uint16_t outerIdx
Definition CfUtils.h:31
static uint16_t bool uint16_t partSize
Definition CfUtils.h:155
static int32_t int32_t static SharedMemory GPUdi() int32_t blockPredicateSum(SharedMemory &smem
static uint32_t uint32_t elems
Definition CfUtils.h:172
static GPUdi() void blockLoad(const CfArray2D< T > &map
static GPUdi() uint16_t partition(SharedMemory &smem
static uint16_t ll
Definition CfUtils.h:155
static GPUdi() bool isPeak(uint8_t peak)
Definition CfUtils.h:40
static int32_t pred
Definition CfUtils.h:68
GLint GLenum GLint x
Definition glcorearb.h:403
const GLdouble * v
Definition glcorearb.h:832
GLintptr offset
Definition glcorearb.h:660
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition glcorearb.h:2514
GLubyte GLubyte GLubyte GLubyte w
Definition glcorearb.h:852