GPUCommonAlgorithm.h
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUCommonAlgorithm.h

#ifndef GPUCOMMONALGORITHM_H
#define GPUCOMMONALGORITHM_H

#include "GPUCommonDef.h"

#if !defined(GPUCA_GPUCODE) // Could also enable the custom sort on the CPU, but it is not always faster, so we stick to std::sort
#include <algorithm>
#define GPUCA_ALGORITHM_STD
#endif

// ----------------------------- SORTING -----------------------------

namespace o2
{
namespace gpu
{
class GPUCommonAlgorithm
{
 public:
  template <class T>
  GPUd() static void sort(T* begin, T* end);
  template <class T>
  GPUd() static void sortInBlock(T* begin, T* end);
  template <class T>
  GPUd() static void sortDeviceDynamic(T* begin, T* end);
  template <class T, class S>
  GPUd() static void sort(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
  template <class T>
  GPUd() static void swap(T& a, T& b);

 private:
  // Quicksort implementation
  template <typename I>
  GPUd() static void QuickSort(I f, I l) noexcept;

  // Quicksort implementation
  template <typename I, typename Cmp>
  GPUd() static void QuickSort(I f, I l, Cmp cmp) noexcept;

  // Insertionsort implementation
  template <typename I, typename Cmp>
  GPUd() static void InsertionSort(I f, I l, Cmp cmp) noexcept;

  // Helper for Quicksort implementation
  template <typename I, typename Cmp>
  GPUd() static I MedianOf3Select(I f, I l, Cmp cmp) noexcept;

  // Helper for Quicksort implementation
  template <typename I, typename T, typename Cmp>
  GPUd() static I UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept;

  // Helper to swap the values two iterators point to
  template <typename I>
  GPUd() static void IterSwap(I a, I b) noexcept;
};
} // namespace gpu
} // namespace o2

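// Illustrative usage (hypothetical call sites, not part of this header): on
// the host, o2::gpu::GPUCommonAlgorithm::sort(buf, buf + n) forwards to
// std::sort; in device code it runs the QuickSort defined below. sortInBlock()
// must be reached by all threads of the block, since it synchronizes with
// GPUbarrier() between passes.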
namespace o2
{
namespace gpu
{

#ifndef GPUCA_ALGORITHM_STD
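// Swaps the values behind two iterators; stands in for std::iter_swap, which
// is only available where <algorithm> is included (the host-only path above).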
template <typename I>
GPUdi() void GPUCommonAlgorithm::IterSwap(I a, I b) noexcept
{
  auto tmp = *a;
  *a = *b;
  *b = tmp;
}

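// Straight insertion sort; QuickSort below leaves subranges of up to 50
// elements unsorted, and this final pass sorts them cheaply since the data
// is by then almost in order.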
template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::InsertionSort(I f, I l, Cmp cmp) noexcept
{
  auto it0{f};
  while (it0 != l) {
    auto tmp{*it0};

    auto it1{it0};
    while (it1 != f && cmp(tmp, it1[-1])) {
      it1[0] = it1[-1];
      --it1;
    }
    it1[0] = tmp;

    ++it0;
  }
}

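// Pivot selection: returns an iterator to the median of the first, middle,
// and last element, which avoids the quadratic worst case on already sorted
// input and provides the sentinels that UnguardedPartition relies on.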
template <typename I, typename Cmp>
GPUdi() I GPUCommonAlgorithm::MedianOf3Select(I f, I l, Cmp cmp) noexcept
{
  auto m = f + (l - f) / 2;

  --l;

  if (cmp(*f, *m)) {
    if (cmp(*m, *l)) {
      return m;
    } else if (cmp(*f, *l)) {
      return l;
    } else {
      return f;
    }
  } else if (cmp(*f, *l)) {
    return f;
  } else if (cmp(*m, *l)) {
    return l;
  } else {
    return m;
  }
}

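// Hoare-style partition; "unguarded" because the scan loops carry no bounds
// checks and instead rely on the pivot (a median of elements of the range)
// stopping both scans before they can run past the ends.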
template <typename I, typename T, typename Cmp>
GPUdi() I GPUCommonAlgorithm::UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept
{
  do {
    while (cmp(*f, piv)) {
      ++f;
    }
    --l;
    while (cmp(piv, *l)) {
      --l;
    }

    if (l <= f) {
      return f;
    }
    IterSwap(f, l);
    ++f;
  } while (true);
}

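// Iterative quicksort with an explicit fixed-size stack, avoiding recursion
// in device code. Subranges at or below the cutoff are skipped and cleaned
// up by a single InsertionSort pass at the end.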
template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l, Cmp cmp) noexcept
{
  if (f == l) {
    return;
  }
  using IndexType = uint16_t; // limits the sortable range to 65535 elements

  struct pair {
    IndexType first;
    IndexType second;
  };

  struct Stack {
    // 11 entries suffice: subranges at or below the cutoff of 50 are never
    // pushed and the smaller side is always split first, so the depth stays
    // below log2(65536 / 50) + 1.
    pair data[11];
    uint8_t n{0};

    GPUd() void emplace(IndexType x, IndexType y)
    {
      data[n++] = {x, y};
    }
    GPUd() bool empty() const { return n == 0; }
    GPUd() pair& top() { return data[n - 1]; }
    GPUd() void pop() { --n; }
  };

  Stack s;
  s.emplace(0, l - f);
  while (!s.empty()) {
    const auto it0 = f + s.top().first;
    const auto it1 = f + s.top().second;
    s.pop();

    const auto piv = *MedianOf3Select(it0, it1, cmp);
    const auto pp = UnguardedPartition(it0, it1, piv, cmp);

    constexpr auto cutoff = 50u;
    const auto lsz = pp - it0;
    const auto rsz = it1 - pp;
    // Push the larger subrange first so that the smaller one is popped and
    // split next, keeping the stack depth logarithmic.
    if (lsz < rsz) {
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
    } else {
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
    }
  }
  InsertionSort(f, l, cmp);
}

template <typename I>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept
{
  QuickSort(f, l, [](auto&& x, auto&& y) { return x < y; });
}
#endif

typedef GPUCommonAlgorithm CAAlgo;

} // namespace gpu
} // namespace o2

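// On CUDA/HIP (outside of RTC generation and host-only compilation),
// sortDeviceDynamic() is implemented via Thrust in the separate header
// included below; otherwise it falls back to the generic sort() overloads.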
#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)

#include "GPUCommonAlgorithmThrust.h"

#else

namespace o2
{
namespace gpu
{

template <class T>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortDeviceDynamic(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}

template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
{
  GPUCommonAlgorithm::sort(begin, end, comp);
}

} // namespace gpu
} // namespace o2

#endif // THRUST
// sort and sortInBlock below are not taken from Thrust, since our implementations are faster

namespace o2
{
namespace gpu
{

template <class T>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end);
#else
  QuickSort(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}

template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end, comp);
#else
  QuickSort(begin, end, comp);
#endif
}

template <class T>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortInBlock(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}

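// On the device this is an odd-even transposition sort: n passes in which
// all threads of the block compare and conditionally swap disjoint pairs of
// neighbors, alternating the pair offset and synchronizing after each pass.
// O(n^2) comparisons overall, but simple, in-place, and block-parallel.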
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end, comp);
#else
  int32_t n = end - begin;
  for (int32_t i = 0; i < n; i++) {
    for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
      int32_t offset = i % 2;
      int32_t curPos = 2 * tIdx + offset;
      int32_t nextPos = curPos + 1;

      if (nextPos < n) {
        if (!comp(begin[curPos], begin[nextPos])) {
          IterSwap(&begin[curPos], &begin[nextPos]);
        }
      }
    }
    GPUbarrier();
  }
#endif
}

#ifdef GPUCA_GPUCODE
template <class T>
GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
{
  auto tmp = a;
  a = b;
  b = tmp;
}
#else
template <class T>
GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
{
  std::swap(a, b);
}
#endif

} // namespace gpu
} // namespace o2

// ----------------------------- WORK GROUP FUNCTIONS -----------------------------

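// Portable wrappers around block- and warp-level prefix sum, reduction, and
// broadcast: OpenCL subgroup builtins, cub/hipcub primitives on CUDA/HIP,
// and trivial single-threaded fallbacks on the CPU.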
#ifdef __OPENCL__
// Nothing to do, work_group functions available
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

template <class T>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
{
  return sub_group_scan_inclusive_add(v);
}
template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations
GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
{
  return sub_group_scan_inclusive_add((uint32_t)v);
}
template <class T>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
{
  return sub_group_broadcast(v, i);
}
template <>
GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
{
  return sub_group_broadcast((uint32_t)v, i);
}

#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)

#elif (defined(__CUDACC__) || defined(__HIPCC__))
// CUDA and HIP work the same way via cub; they just need different headers

#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
#if defined(__CUDACC__)
#include <cub/cub.cuh>
#elif defined(__HIPCC__)
#include <hipcub/hipcub.hpp>
#endif
#endif

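// The *_FUNC(..., smem) helpers expect a shared-memory object named smem in
// scope at the call site (supplied implicitly by the macros below), whose
// type S provides the cub temporary storage (cubTmpMem, cubReduceTmpMem,
// cubWarpTmpMem) and the tmpBroadcast slot.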
#define work_group_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v, S& smem)
{
  typename S::BlockScan(smem.cubTmpMem).InclusiveSum(v, v);
  __syncthreads();
  return v;
}

#define work_group_broadcast(v, i) work_group_broadcast_FUNC(v, i, smem)
template <class T, class S>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i, S& smem)
{
  if ((int32_t)threadIdx.x == i) {
    smem.tmpBroadcast = v;
  }
  __syncthreads();
  T retVal = smem.tmpBroadcast;
  __syncthreads();
  return retVal;
}

#define work_group_reduce_add(v) work_group_reduce_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_reduce_add_FUNC(T v, S& smem)
{
  v = typename S::BlockReduce(smem.cubReduceTmpMem).Sum(v);
  __syncthreads();
  v = work_group_broadcast(v, 0);
  return v;
}

#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T warp_scan_inclusive_add_FUNC(T v, S& smem)
{
  typename S::WarpScan(smem.cubWarpTmpMem).InclusiveSum(v, v);
  return v;
}

#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
template <class T>
GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
{
#ifdef __CUDACC__
  return __shfl_sync(0xFFFFFFFF, v, i);
#else // HIP
  return __shfl(v, i);
#endif
}

#else
// Trivial implementations for the CPU, where a "work group" is a single
// thread, so scans, reductions, and broadcasts reduce to the identity

template <class T>
GPUdi() T work_group_scan_inclusive_add(T v)
{
  return v;
}

template <class T>
GPUdi() T work_group_reduce_add(T v)
{
  return v;
}

template <class T>
GPUdi() T work_group_broadcast(T v, int32_t i)
{
  return v;
}

template <class T>
GPUdi() T warp_scan_inclusive_add(T v)
{
  return v;
}

template <class T>
GPUdi() T warp_broadcast(T v, int32_t i)
{
  return v;
}

#endif

#endif // GPUCOMMONALGORITHM_H