Project
Loading...
Searching...
No Matches
CfUtils.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#ifndef O2_GPU_CF_UTILS_H
16#define O2_GPU_CF_UTILS_H
17
18#include "clusterFinderDefs.h"
19#include "GPUCommonAlgorithm.h"
20#include "Array2D.h"
21#include "CfConsts.h"
22
23namespace o2::gpu
24{
25
27{
28
29 public:
30 static GPUdi() bool isAtEdge(const ChargePos& pos, tpccf::GlobalPad padsPerRow)
31 {
32 return (pos.pad() < 2 || pos.pad() >= padsPerRow - 2);
33 }
34
35 static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold, uint16_t outerIdx)
36 {
37 return aboveThreshold & (1 << cfconsts::OuterToInner[outerIdx]);
38 }
39
40 static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold, uint16_t outerIdx)
41 {
42 return aboveThreshold & (1 << cfconsts::OuterToInnerInv[outerIdx]);
43 }
44
45 static GPUdi() bool isPeak(uint8_t peak) { return peak & 0x01; }
46
47 static GPUdi() bool isAboveThreshold(uint8_t peak) { return peak >> 1; }
48
49 static GPUdi() int32_t warpPredicateScan(int32_t pred, int32_t* sum)
50 {
51#ifdef __HIPCC__
52 int32_t iLane = hipThreadIdx_x % warpSize;
53 uint64_t waveMask = __ballot(pred);
54 uint64_t lowerWarpMask = (1ull << iLane) - 1ull;
55 int32_t myOffset = __popcll(waveMask & lowerWarpMask);
56 *sum = __popcll(waveMask);
57 return myOffset;
58#elif defined(__CUDACC__)
59 int32_t iLane = threadIdx.x % warpSize;
60 uint32_t waveMask = __ballot_sync(0xFFFFFFFF, pred);
61 uint32_t lowerWarpMask = (1u << iLane) - 1u;
62 int32_t myOffset = __popc(waveMask & lowerWarpMask);
63 *sum = __popc(waveMask);
64 return myOffset;
65#else // CPU / OpenCL fallback
66 int32_t myOffset = warp_scan_inclusive_add(pred ? 1 : 0);
67 *sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
69 return myOffset;
70#endif
71 }
72
73 template <size_t BlockSize, typename SharedMemory>
74 static GPUdi() int32_t blockPredicateScan(SharedMemory& smem, int32_t pred, int32_t* sum = nullptr)
75 {
76#if defined(__HIPCC__) || defined(__CUDACC__)
77 int32_t iThread =
78#ifdef __HIPCC__
79 hipThreadIdx_x;
80#else
81 threadIdx.x;
82#endif
83
84 int32_t iWarp = iThread / warpSize;
85 int32_t nWarps = BlockSize / warpSize;
86
87 int32_t warpSum;
88 int32_t laneOffset = warpPredicateScan(pred, &warpSum);
89
90 if (iThread % warpSize == 0) {
91 smem.warpPredicateSum[iWarp] = warpSum;
92 }
93 __syncthreads();
94
95 int32_t warpOffset = 0;
96
97 if (sum == nullptr) {
98 for (int32_t w = 0; w < iWarp; w++) {
99 int32_t s = smem.warpPredicateSum[w];
100 warpOffset += s;
101 }
102 } else {
103 *sum = 0;
104 for (int32_t w = 0; w < nWarps; w++) {
105 int32_t s = smem.warpPredicateSum[w];
106 if (w < iWarp) {
107 warpOffset += s;
108 }
109 *sum += s;
110 }
111 }
112
113 return warpOffset + laneOffset;
114#else // CPU / OpenCL fallback
115 int32_t lpos = work_group_scan_inclusive_add(pred ? 1 : 0);
116 if (sum != nullptr) {
117 *sum = work_group_broadcast(lpos, BlockSize - 1);
118 }
119 lpos--;
120 return lpos;
121#endif
122 }
123
124 template <size_t BlockSize, typename SharedMemory>
125 static GPUdi() int32_t blockPredicateSum(SharedMemory& smem, int32_t pred)
126 {
127#if defined(__HIPCC__) || defined(__CUDACC__)
128 int32_t iThread =
129#ifdef __HIPCC__
130 hipThreadIdx_x;
131#else
132 threadIdx.x;
133#endif
134
135 int32_t iWarp = iThread / warpSize;
136 int32_t nWarps = BlockSize / warpSize;
137
138 int32_t warpSum =
139#ifdef __HIPCC__
140 __popcll(__ballot(pred));
141#else
142 __popc(__ballot_sync(0xFFFFFFFF, pred));
143#endif
144
145 if (iThread % warpSize == 0) {
146 smem.warpPredicateSum[iWarp] = warpSum;
147 }
148 __syncthreads();
149
150 int32_t sum = 0;
151 for (int32_t w = 0; w < nWarps; w++) {
152 sum += smem.warpPredicateSum[w];
153 }
154
155 return sum;
156#else // CPU / OpenCL fallback
157 return work_group_reduce_add(pred ? 1 : 0);
158#endif
159 }
160
161 template <size_t SCRATCH_PAD_WORK_GROUP_SIZE, typename SharedMemory>
162 static GPUdi() uint16_t partition(SharedMemory& smem, uint16_t ll, bool pred, uint16_t partSize, uint16_t* newPartSize)
163 {
164 bool participates = ll < partSize;
165
166 int32_t part;
167 int32_t lpos = blockPredicateScan<SCRATCH_PAD_WORK_GROUP_SIZE>(smem, int32_t(!pred && participates), &part);
168
169 uint16_t pos = (participates && !pred) ? lpos : part;
170
172 return pos;
173 }
174
175 template <typename T>
176 static GPUdi() void blockLoad(
177 const Array2D<T>& map,
178 uint32_t wgSize,
179 uint32_t elems,
180 uint16_t ll,
181 uint32_t offset,
182 uint32_t N,
183 GPUconstexprref() const tpccf::Delta2* neighbors,
184 const ChargePos* posBcast,
185 GPUgeneric() T* buf)
186 {
187#if defined(GPUCA_GPUCODE)
188 GPUbarrier();
189 uint16_t x = ll % N;
190 uint16_t y = ll / N;
191 tpccf::Delta2 d = neighbors[x + offset];
192
193 for (uint32_t i = y; i < wgSize; i += (elems / N)) {
194 ChargePos readFrom = posBcast[i];
195 uint32_t writeTo = N * i + x;
196 buf[writeTo] = map[readFrom.delta(d)];
197 }
198 GPUbarrier();
199#else
200 if (ll >= wgSize) {
201 return;
202 }
203
204 ChargePos readFrom = posBcast[ll];
205
206 GPUbarrier();
207
208 for (uint32_t i = 0; i < N; i++) {
209 tpccf::Delta2 d = neighbors[i + offset];
210
211 uint32_t writeTo = N * ll + i;
212 buf[writeTo] = map[readFrom.delta(d)];
213 }
214
215 GPUbarrier();
216#endif
217 }
218
219 template <typename T, bool Inv = false>
220 static GPUdi() void condBlockLoad(
221 const Array2D<T>& map,
222 uint16_t wgSize,
223 uint16_t elems,
224 uint16_t ll,
225 uint16_t offset,
226 uint16_t N,
227 GPUconstexprref() const tpccf::Delta2* neighbors,
228 const ChargePos* posBcast,
229 const uint8_t* aboveThreshold,
230 GPUgeneric() T* buf)
231 {
232#if defined(GPUCA_GPUCODE)
233 GPUbarrier();
234 uint16_t y = ll / N;
235 uint16_t x = ll % N;
236 tpccf::Delta2 d = neighbors[x + offset];
237 for (uint32_t i = y; i < wgSize; i += (elems / N)) {
238 ChargePos readFrom = posBcast[i];
239 uint8_t above = aboveThreshold[i];
240 uint32_t writeTo = N * i + x;
241 T v(0);
242 bool cond = (Inv) ? innerAboveThresholdInv(above, x + offset)
243 : innerAboveThreshold(above, x + offset);
244 if (cond) {
245 v = map[readFrom.delta(d)];
246 }
247 buf[writeTo] = v;
248 }
249 GPUbarrier();
250#else
251 if (ll >= wgSize) {
252 return;
253 }
254
255 ChargePos readFrom = posBcast[ll];
256 uint8_t above = aboveThreshold[ll];
257 GPUbarrier();
258
259 for (uint32_t i = 0; i < N; i++) {
260 tpccf::Delta2 d = neighbors[i + offset];
261
262 uint32_t writeTo = N * ll + i;
263 T v(0);
264 bool cond = (Inv) ? innerAboveThresholdInv(above, i + offset)
265 : innerAboveThreshold(above, i + offset);
266 if (cond) {
267 v = map[readFrom.delta(d)];
268 }
269 buf[writeTo] = v;
270 }
271
272 GPUbarrier();
273#endif
274 }
275};
276
277} // namespace o2::gpu
278
279#endif
int32_t i
#define GPUbarrier()
#define GPUgeneric()
#define GPUCA_WARP_SIZE
uint16_t pos
Definition CfUtils.h:169
static GPUdi() int32_t warpPredicateScan(int32_t pred
static GPUdi() void condBlockLoad(const Array2D< T > &map
static int32_t * sum
Definition CfUtils.h:50
static uint32_t wgSize
Definition CfUtils.h:178
static GPUdi() void blockLoad(const Array2D< T > &map
static uint16_t bool uint16_t uint16_t * newPartSize
Definition CfUtils.h:163
static GPUdi() int32_t blockPredicateScan(SharedMemory &smem
static tpccf::GlobalPad padsPerRow
Definition CfUtils.h:31
static uint32_t uint32_t uint16_t uint32_t uint32_t N
Definition CfUtils.h:182
static GPUdi() bool innerAboveThreshold(uint8_t aboveThreshold
static uint32_t uint32_t uint16_t uint32_t offset
Definition CfUtils.h:181
static GPUdi() bool isAboveThreshold(uint8_t peak)
Definition CfUtils.h:47
return myOffset
Definition CfUtils.h:69
static GPUdi() bool innerAboveThresholdInv(uint8_t aboveThreshold
static uint32_t uint32_t uint16_t uint32_t uint32_t GPUconstexprref() const tpccf
Definition CfUtils.h:183
static uint16_t outerIdx
Definition CfUtils.h:36
static uint16_t bool uint16_t partSize
Definition CfUtils.h:162
static int32_t int32_t static SharedMemory GPUdi() int32_t blockPredicateSum(SharedMemory &smem
static uint32_t uint32_t elems
Definition CfUtils.h:179
static GPUdi() uint16_t partition(SharedMemory &smem
static GPUdi() bool isAtEdge(const ChargePos &pos
static uint16_t ll
Definition CfUtils.h:162
static GPUdi() bool isPeak(uint8_t peak)
Definition CfUtils.h:45
static int32_t pred
Definition CfUtils.h:74
GLint GLenum GLint x
Definition glcorearb.h:403
const GLdouble * v
Definition glcorearb.h:832
GLintptr offset
Definition glcorearb.h:660
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition glcorearb.h:2514
GLubyte GLubyte GLubyte GLubyte w
Definition glcorearb.h:852