GPUCommonAlgorithm.h
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUCommonAlgorithm.h

#ifndef GPUCOMMONALGORITHM_H
#define GPUCOMMONALGORITHM_H

#include "GPUCommonDef.h"

#if !defined(GPUCA_GPUCODE) // Could also enable custom sorting on the CPU, but it is not always faster, so we stick to std::sort
#include <algorithm>
#define GPUCA_ALGORITHM_STD
#endif

// ----------------------------- SORTING -----------------------------

namespace o2::gpu
{
class GPUCommonAlgorithm
{
 public:
  template <class T>
  GPUd() static void sort(T* begin, T* end);
  template <class T>
  GPUd() static void sortInBlock(T* begin, T* end);
  template <class T>
  GPUd() static void sortDeviceDynamic(T* begin, T* end);
  template <class T, class S>
  GPUd() static void sort(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
#ifndef __OPENCL__
  template <class T, class S>
  GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
#endif
  template <class T>
  GPUd() static void swap(T& a, T& b);

 private:
  // Quicksort implementation
  template <typename I>
  GPUd() static void QuickSort(I f, I l) noexcept;

  // Quicksort implementation
  template <typename I, typename Cmp>
  GPUd() static void QuickSort(I f, I l, Cmp cmp) noexcept;

  // Insertionsort implementation
  template <typename I, typename Cmp>
  GPUd() static void InsertionSort(I f, I l, Cmp cmp) noexcept;

  // Helper for Quicksort implementation
  template <typename I, typename Cmp>
  GPUd() static I MedianOf3Select(I f, I l, Cmp cmp) noexcept;

  // Helper for Quicksort implementation
  template <typename I, typename T, typename Cmp>
  GPUd() static I UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept;

  // Helper
  template <typename I>
  GPUd() static void IterSwap(I a, I b) noexcept;
};

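// Example (added illustration; "Hit" and "sortHitsByZ" are hypothetical):
// sorting an array with a custom comparator, usable from host and device code.
//
//   struct Hit { float z; };
//   GPUd() void sortHitsByZ(Hit* hits, int32_t n)
//   {
//     GPUCommonAlgorithm::sort(hits, hits + n,
//                              [](const Hit& a, const Hit& b) { return a.z < b.z; });
//   }
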
#ifndef GPUCA_ALGORITHM_STD
template <typename I>
GPUdi() void GPUCommonAlgorithm::IterSwap(I a, I b) noexcept
{
  auto tmp = *a;
  *a = *b;
  *b = tmp;
}

template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::InsertionSort(I f, I l, Cmp cmp) noexcept
{
  auto it0{f};
  while (it0 != l) {
    auto tmp{*it0};

    auto it1{it0};
    while (it1 != f && cmp(tmp, it1[-1])) {
      it1[0] = it1[-1];
      --it1;
    }
    it1[0] = tmp;

    ++it0;
  }
}

template <typename I, typename Cmp>
GPUdi() I GPUCommonAlgorithm::MedianOf3Select(I f, I l, Cmp cmp) noexcept
{
  auto m = f + (l - f) / 2;

  --l;

  if (cmp(*f, *m)) {
    if (cmp(*m, *l)) {
      return m;
    } else if (cmp(*f, *l)) {
      return l;
    } else {
      return f;
    }
  } else if (cmp(*f, *l)) {
    return f;
  } else if (cmp(*m, *l)) {
    return l;
  } else {
    return m;
  }
}
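// Added worked example: for *f = 5, *m = 1 and a last element of 3 under a
// less-than comparator, neither 5 < 1 nor 5 < 3 holds, but 1 < 3 does, so the
// iterator to 3 (the median of the three candidates) is returned.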

template <typename I, typename T, typename Cmp>
GPUdi() I GPUCommonAlgorithm::UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept
{
  do {
    while (cmp(*f, piv)) {
      ++f;
    }
    --l;
    while (cmp(piv, *l)) {
      --l;
    }

    if (l <= f) {
      return f;
    }
    IterSwap(f, l);
    ++f;
  } while (true);
}
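// Added commentary: the partition is "unguarded" because the pivot value comes
// from inside the range, so each inner loop is guaranteed to stop at the
// pivot's value at the latest, making explicit bounds checks unnecessary.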

template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l, Cmp cmp) noexcept
{
  if (f == l) {
    return;
  }
  using IndexType = uint16_t;

  struct pair {
    IndexType first;
    IndexType second;
  };

  struct Stack {
    pair data[11];
    uint8_t n{0};

    GPUd() void emplace(IndexType x, IndexType y)
    {
      data[n++] = {x, y};
    }
    GPUd() bool empty() const { return n == 0; }
    GPUd() pair& top() { return data[n - 1]; }
    GPUd() void pop() { --n; }
  };
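  // Added commentary: 11 slots suffice because the loop below always pushes
  // the larger partition first, so the smaller one is popped and split next;
  // with uint16_t indices (n <= 65535) and a leaf cutoff of 50, the number of
  // pending partitions therefore grows only logarithmically.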

  Stack s;
  s.emplace(0, l - f);
  while (!s.empty()) {
    const auto it0 = f + s.top().first;
    const auto it1 = f + s.top().second;
    s.pop();

    const auto piv = *MedianOf3Select(it0, it1, cmp);
    const auto pp = UnguardedPartition(it0, it1, piv, cmp);

    constexpr auto cutoff = 50u;
    const auto lsz = pp - it0;
    const auto rsz = it1 - pp;
    if (lsz < rsz) {
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
    } else {
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
    }
  }
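  // Added commentary: partitions of size <= cutoff were deliberately left
  // unsorted above; the single insertion-sort pass below finishes them
  // cheaply, since every element is already close to its final position.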
  InsertionSort(f, l, cmp);
}

template <typename I>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept
{
  QuickSort(f, l, [](auto&& x, auto&& y) { return x < y; });
}
#endif

typedef GPUCommonAlgorithm CAAlgo;

} // namespace o2::gpu

#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_HOSTONLY)

#include "GPUCommonAlgorithmThrust.h"

#else

namespace o2::gpu
{

template <class T>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortDeviceDynamic(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}

template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
{
  GPUCommonAlgorithm::sort(begin, end, comp);
}

} // namespace o2::gpu

#endif // THRUST
// sort and sortInBlock below are not taken from Thrust, since our implementations are faster

namespace o2::gpu
{

template <class T>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end);
#else
  QuickSort(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}

template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end, comp);
#else
  QuickSort(begin, end, comp);
#endif
}

template <class T>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortInBlock(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}

template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end, comp);
#else
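  // Added commentary: odd-even transposition sort across the work group; n
  // rounds alternate between even and odd pair offsets, with one barrier per
  // round, so all threads of the block must call this function together.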
  int32_t n = end - begin;
  for (int32_t i = 0; i < n; i++) {
    for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
      int32_t offset = i % 2;
      int32_t curPos = 2 * tIdx + offset;
      int32_t nextPos = curPos + 1;

      if (nextPos < n) {
        if (!comp(begin[curPos], begin[nextPos])) {
          IterSwap(&begin[curPos], &begin[nextPos]);
        }
      }
    }
    GPUbarrier();
  }
#endif
}
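
// Example (added illustration; the kernel is hypothetical): sortInBlock is a
// cooperative call, so one whole thread block sorts one array together.
//
//   __global__ void sortBlockKernel(float* data, size_t n)
//   {
//     GPUCommonAlgorithm::sortInBlock(data, data + n);
//   }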

#ifdef GPUCA_GPUCODE
template <class T>
GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
{
  auto tmp = a;
  a = b;
  b = tmp;
}
#else
template <class T>
GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
{
  std::swap(a, b);
}
#endif

} // namespace o2::gpu

// ----------------------------- WORK GROUP FUNCTIONS -----------------------------

#ifdef __OPENCL__
// Nothing to do, work_group functions available
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

template <class T>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
{
  return sub_group_scan_inclusive_add(v);
}
template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations
GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
{
  return sub_group_scan_inclusive_add((uint32_t)v);
}
template <class T>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
{
  return sub_group_broadcast(v, i);
}
template <>
GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
{
  return sub_group_broadcast((uint32_t)v, i);
}

#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
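
// Added worked example: for subgroup lane values {1, 2, 3, 4}, an inclusive
// add scan yields {1, 3, 6, 10}; warp_broadcast(v, i) returns lane i's value
// of v to every lane in the subgroup.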

#elif (defined(__CUDACC__) || defined(__HIPCC__))
// CUDA and HIP work the same way using cub, they just need different headers

#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
#if defined(__CUDACC__)
#include <cub/cub.cuh>
#elif defined(__HIPCC__)
#include <hipcub/hipcub.hpp>
#endif
#endif

#define work_group_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v, S& smem)
{
  typename S::BlockScan(smem.cubTmpMem).InclusiveSum(v, v);
  __syncthreads();
  return v;
}

#define work_group_broadcast(v, i) work_group_broadcast_FUNC(v, i, smem)
template <class T, class S>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i, S& smem)
{
  if ((int32_t)threadIdx.x == i) {
    smem.tmpBroadcast = v;
  }
  __syncthreads();
  T retVal = smem.tmpBroadcast;
  __syncthreads();
  return retVal;
}

#define work_group_reduce_add(v) work_group_reduce_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_reduce_add_FUNC(T v, S& smem)
{
  v = typename S::BlockReduce(smem.cubReduceTmpMem).Sum(v);
  __syncthreads();
  v = work_group_broadcast(v, 0);
  return v;
}

#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T warp_scan_inclusive_add_FUNC(T v, S& smem)
{
  typename S::WarpScan(smem.cubWarpTmpMem).InclusiveSum(v, v);
  return v;
}

#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
template <class T>
GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
{
#ifdef __CUDACC__
  return __shfl_sync(0xFFFFFFFF, v, i);
#else // HIP
  return __shfl(v, i);
#endif
}
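
// Example (illustrative sketch): the work_group_* / warp_* macros above assume
// a shared-memory object named "smem" in scope at the call site, providing the
// cub types and storage they reference. A compatible definition could look
// roughly like this (the block size of 256 is a hypothetical choice):
//
//   struct GPUSharedMemory {
//     typedef cub::BlockScan<int32_t, 256> BlockScan;
//     typedef cub::BlockReduce<int32_t, 256> BlockReduce;
//     typedef cub::WarpScan<int32_t> WarpScan;
//     BlockScan::TempStorage cubTmpMem;
//     BlockReduce::TempStorage cubReduceTmpMem;
//     WarpScan::TempStorage cubWarpTmpMem;
//     int32_t tmpBroadcast;
//   };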

#else
// Trivial implementation for the CPU

template <class T>
GPUdi() T work_group_scan_inclusive_add(T v)
{
  return v;
}

template <class T>
GPUdi() T work_group_reduce_add(T v)
{
  return v;
}

template <class T>
GPUdi() T work_group_broadcast(T v, int32_t i)
{
  return v;
}

template <class T>
GPUdi() T warp_scan_inclusive_add(T v)
{
  return v;
}

template <class T>
GPUdi() T warp_broadcast(T v, int32_t i)
{
  return v;
}

#endif

#ifdef GPUCA_ALGORITHM_STD
#undef GPUCA_ALGORITHM_STD
#endif

#endif