#ifndef GPUCOMMONALGORITHM_H
#define GPUCOMMONALGORITHM_H

#include "GPUCommonDef.h"

#if !defined(GPUCA_GPUCODE)
#include <algorithm> // std::sort host fallback
#define GPUCA_ALGORITHM_STD
#endif
class GPUCommonAlgorithm
{
 public:
  template <class T>
  GPUd() static void sort(T* begin, T* end);
  template <class T>
  GPUd() static void sortInBlock(T* begin, T* end);
  template <class T>
  GPUd() static void sortDeviceDynamic(T* begin, T* end);
  template <class T, class S>
  GPUd() static void sort(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);

 private:
  // Stack-based quicksort for device code, default ascending order
  template <typename I>
  GPUd() static void QuickSort(I f, I l) noexcept;

  // Stack-based quicksort for device code with custom comparator
  template <typename I, typename Cmp>
  GPUd() static void QuickSort(I f, I l, Cmp cmp) noexcept;

  // Finishing pass for QuickSort
  template <typename I, typename Cmp>
  GPUd() static void InsertionSort(I f, I l, Cmp cmp) noexcept;

  // Pivot selection helper for QuickSort
  template <typename I, typename Cmp>
  GPUd() static I MedianOf3Select(I f, I l, Cmp cmp) noexcept;

  // Hoare-style partition helper for QuickSort
  template <typename I, typename T, typename Cmp>
  GPUd() static I UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept;

  template <typename I>
  GPUd() static void IterSwap(I a, I b) noexcept;
};
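// Usage sketch (illustrative only):
//   float buf[4] = {3.f, 1.f, 4.f, 2.f};
//   GPUCommonAlgorithm::sort(buf, buf + 4);                                         // -> 1 2 3 4
//   GPUCommonAlgorithm::sort(buf, buf + 4, [](float a, float b) { return a > b; }); // -> 4 3 2 1
// sort and sortDeviceDynamic run within a single thread; sortInBlock must be
// called by all threads of a block and synchronizes between passes.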
#ifndef GPUCA_ALGORITHM_STD

template <typename I>
GPUdi() void GPUCommonAlgorithm::IterSwap(I a, I b) noexcept
{
  auto tmp = *a;
  *a = *b;
  *b = tmp;
}
template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::InsertionSort(I f, I l, Cmp cmp) noexcept
{
  auto it0 = f;
  while (it0 != l) {
    auto tmp = *it0;

    auto it1 = it0;
    while (it1 != f && cmp(tmp, it1[-1])) {
      it1[0] = it1[-1]; // shift the larger element one slot to the right
      --it1;
    }
    it1[0] = tmp;

    ++it0;
  }
}
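// Trace sketch for {3, 1, 2} with the default less-than comparator: inserting 1
// shifts 3 right -> {1, 3, 2}; inserting 2 shifts 3 right -> {1, 2, 3}. Shifting
// via it1[-1] writes tmp exactly once, at its final position, instead of
// swapping at every step.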
template <typename I, typename Cmp>
GPUdi() I GPUCommonAlgorithm::MedianOf3Select(I f, I l, Cmp cmp) noexcept
{
  auto m = f + (l - f) / 2;

  --l; // l now points to the last element of the range

  if (cmp(*f, *m)) {
    if (cmp(*m, *l)) {
      return m;
    } else if (cmp(*f, *l)) {
      return l;
    } else {
      return f;
    }
  } else if (cmp(*f, *l)) {
    return f;
  } else if (cmp(*m, *l)) {
    return l;
  } else {
    return m;
  }
}
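// Worked example (illustrative): for {7, 2, 5} and the default less-than
// comparator, f = 7, m = 2, l = 5 after --l. cmp(7, 2) and cmp(7, 5) are false,
// cmp(2, 5) is true, so the iterator to the median, 5, is returned.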
template <typename I, typename T, typename Cmp>
GPUdi() I GPUCommonAlgorithm::UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept
{
  do {
    while (cmp(*f, piv)) {
      ++f;
    }
    --l;
    while (cmp(piv, *l)) {
      --l;
    }
    if (!(f < l)) {
      return f;
    }
    IterSwap(f, l);
    ++f;
  } while (true);
}
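// Partition sketch (illustrative): for {9, 2, 7, 4} and piv = 4, the left scan
// stops at 9 and the right scan at the final 4; swapping gives {4, 2, 7, 9} and
// the returned iterator points at 7. The scans are "unguarded": because the
// pivot is the median of three elements of the range, a stopper exists on each
// side, so the inner loops need no explicit bounds checks.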
template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l, Cmp cmp) noexcept
{
  if (f == l) {
    return;
  }
  using IndexType = uint16_t;

  struct pair {
    IndexType first;
    IndexType second;
  };

  // Fixed-size explicit stack of pending subranges, stored as index pairs
  // relative to f to keep the entries small.
  struct Stack {
    pair data[11]; // 11 >= log2(65536 / cutoff): max pending subranges when the smaller side is processed first
    uint8_t n{0};

    GPUd() void emplace(IndexType x, IndexType y)
    {
      data[n].first = x;
      data[n].second = y;
      ++n;
    }
    GPUd() bool empty() const { return n == 0; }
    GPUd() pair& top() { return data[n - 1]; }
    GPUd() void pop() { --n; }
  };

  Stack s;
  s.emplace(0, l - f);
  while (!s.empty()) {
    const auto it0 = f + s.top().first;
    const auto it1 = f + s.top().second;
    s.pop();

    const auto piv = *MedianOf3Select(it0, it1, cmp);
    const auto pp = UnguardedPartition(it0, it1, piv, cmp);

    // Subranges of at most cutoff elements are left for the final InsertionSort.
    constexpr auto cutoff = 50u;
    const auto lsz = pp - it0;
    const auto rsz = it1 - pp;
    if (lsz < rsz) { // push the larger partition first, so the smaller one is popped next
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
    } else {
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
    }
  }

  InsertionSort(f, l, cmp);
}
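// Why a final InsertionSort (illustrative reasoning): subranges of at most
// cutoff elements are never pushed, so the loop above leaves runs of up to 50
// elements unsorted, but every element already sits inside its run. The single
// insertion-sort pass therefore moves each element at most ~cutoff slots and
// finishes the sort cheaply, without deep per-thread recursion on the GPU.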
template <typename I>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept
{
  QuickSort(f, l, [](auto&& x, auto&& y) { return x < y; });
}

#endif // !GPUCA_ALGORITHM_STD
#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)
#include "GPUCommonAlgorithmThrust.h" // Thrust-based sortDeviceDynamic for native CUDA / HIP compilation
#else

template <class T>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortDeviceDynamic(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
{
  GPUCommonAlgorithm::sort(begin, end, comp);
}

#endif // Thrust vs. fallback
template <class T>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end);
#else
  QuickSort(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end, comp);
#else
  QuickSort(begin, end, comp);
#endif
}
template <class T>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortInBlock(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end, comp);
#else
  // Odd-even transposition sort over the whole block
  int32_t n = end - begin;
  for (int32_t i = 0; i < n; i++) {
    for (int32_t tIdx = get_local_id(0); tIdx < n / 2; tIdx += get_local_size(0)) {
      int32_t offset = i % 2;
      int32_t curPos = 2 * tIdx + offset;
      int32_t nextPos = curPos + 1;

      if (nextPos < n) {
        if (!comp(begin[curPos], begin[nextPos])) {
          IterSwap(&begin[curPos], &begin[nextPos]);
        }
      }
    }
    GPUbarrier();
  }
#endif
}
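// Round structure sketch (illustrative, n = 4): even rounds compare and swap
// the pairs (0,1) and (2,3); odd rounds compare and swap (1,2). After n rounds
// the range is sorted; the barrier after each round makes one round's swaps
// visible to all threads of the block before the next round starts.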
#ifdef __OPENCL__
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

template <class T>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
{
  return sub_group_scan_inclusive_add(v);
}
template <>
GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
{
  return sub_group_scan_inclusive_add((uint32_t)v);
}
template <class T>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
{
  return sub_group_broadcast(v, i);
}
template <>
GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
{
  return sub_group_broadcast((uint32_t)v, i);
}

#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
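// Semantics sketch (illustrative): with 4 subgroup lanes holding v = {1, 2, 3, 4},
// sub_group_scan_inclusive_add yields {1, 3, 6, 10}: lane i receives the sum of
// lanes 0..i. The uint8_t specializations widen to uint32_t because the
// cl_khr_subgroups built-ins are not defined for 8-bit integer types.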
#elif (defined(__CUDACC__) || defined(__HIPCC__))

// CUDA and HIP use the same block-level primitives via cub / hipcub
#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
#if defined(__CUDACC__)
#include <cub/cub.cuh>
#elif defined(__HIPCC__)
#include <hipcub/hipcub.hpp>
#endif
#endif
#define work_group_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v, S& smem)
{
  typename S::BlockScan(smem.cubTmpMem).InclusiveSum(v, v);
  __syncthreads();
  return v;
}
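// Expected shape of the smem parameter (sketch; the 128-thread block size is
// only an example, member names follow the usage above):
//   struct Smem {
//     using BlockScan = cub::BlockScan<int32_t, 128>;
//     BlockScan::TempStorage cubTmpMem;
//   };
// With v = 1 on every thread of the block, thread t obtains v == t + 1.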
#define work_group_broadcast(v, i) work_group_broadcast_FUNC(v, i, smem)
template <class T, class S>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i, S& smem)
{
  if ((int32_t)threadIdx.x == i) {
    smem.tmpBroadcast = v;
  }
  __syncthreads();
  T retVal = smem.tmpBroadcast;
  __syncthreads(); // protect tmpBroadcast against reuse before all threads have read it
  return retVal;
}
#define work_group_reduce_add(v) work_group_reduce_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_reduce_add_FUNC(T v, S& smem)
{
  v = typename S::BlockReduce(smem.cubReduceTmpMem).Sum(v);
  __syncthreads();
  v = work_group_broadcast(v, 0);
  return v;
}
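// cub's BlockReduce::Sum returns the valid aggregate only on thread 0; the
// broadcast afterwards makes the total visible to every thread of the block.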
#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T warp_scan_inclusive_add_FUNC(T v, S& smem)
{
  typename S::WarpScan(smem.cubWarpTmpMem).InclusiveSum(v, v);
  return v;
}
#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
template <class T>
GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
{
#ifdef __CUDACC__
  return __shfl_sync(0xFFFFFFFF, v, i);
#else // HIP provides the unmasked variant
  return __shfl(v, i);
#endif
}
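// __shfl_sync(0xFFFFFFFF, v, i) returns lane i's value of v to every lane of
// the warp; the full mask requires all 32 lanes to participate in the call.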
#else

// Trivial CPU fallbacks: each "work group" / "warp" is a single thread, so
// scans, reductions, and broadcasts are identity operations.
template <class T>
GPUdi() T work_group_scan_inclusive_add(T v) { return v; }
template <class T>
GPUdi() T work_group_reduce_add(T v) { return v; }
template <class T>
GPUdi() T work_group_broadcast(T v, int32_t i) { return v; }
template <class T>
GPUdi() T warp_scan_inclusive_add(T v) { return v; }
template <class T>
GPUdi() T warp_broadcast(T v, int32_t i) { return v; }

#endif

#endif // GPUCOMMONALGORITHM_H