#ifndef GPUCOMMONALGORITHM_H
#define GPUCOMMONALGORITHM_H

#include "GPUCommonDef.h"
#include <cstdint> // uint8_t / uint16_t / int32_t

#if !defined(GPUCA_GPUCODE)
#include <algorithm> // std::sort is used when compiling for the host
#define GPUCA_ALGORITHM_STD
#endif
class GPUCommonAlgorithm
{
 public:
  template <class T>
  GPUd() static void sort(T* begin, T* end);
  template <class T>
  GPUd() static void sortInBlock(T* begin, T* end);
  template <class T>
  GPUd() static void sortDeviceDynamic(T* begin, T* end);
  template <class T, class S>
  GPUd() static void sort(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
  template <class T, class S>
  GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
  // Iterative quicksort for device code; sorts [f, l) in ascending order
  template <typename I>
  GPUd() static void QuickSort(I f, I l) noexcept;
  // Quicksort with a user-provided comparator
  template <typename I, typename Cmp>
  GPUd() static void QuickSort(I f, I l, Cmp cmp) noexcept;
  // Insertion sort used as the finishing pass for small partitions
  template <typename I, typename Cmp>
  GPUd() static void InsertionSort(I f, I l, Cmp cmp) noexcept;
  // Median-of-3 pivot selection helper for QuickSort
  template <typename I, typename Cmp>
  GPUd() static I MedianOf3Select(I f, I l, Cmp cmp) noexcept;
  // Hoare-style partition; the pivot guards both scan directions
  template <typename I, typename T, typename Cmp>
  GPUd() static I UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept;
  // Swaps the values pointed to by a and b
  template <typename I>
  GPUd() static void IterSwap(I a, I b) noexcept;
};
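// Usage sketch (illustrative only; kernel and data names are hypothetical):
//
//   __global__ void exampleKernel(float* data, size_t n)
//   {
//     // single-thread sort of a range:
//     if (threadIdx.x == 0) {
//       GPUCommonAlgorithm::sort(data, data + n);
//     }
//     // or: all threads of the block cooperatively sort a shared range:
//     GPUCommonAlgorithm::sortInBlock(data, data + n);
//   }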
#ifndef GPUCA_ALGORITHM_STD

template <typename I>
GPUdi() void GPUCommonAlgorithm::IterSwap(I a, I b) noexcept
{
  auto tmp = *a;
  *a = *b;
  *b = tmp;
}
template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::InsertionSort(I f, I l, Cmp cmp) noexcept
{
  // Standard insertion sort: grow a sorted prefix, shifting larger
  // elements right until the saved value finds its slot.
  for (auto it0 = f; it0 != l; ++it0) {
    auto tmp = *it0;
    auto it1 = it0;
    while (it1 != f && cmp(tmp, it1[-1])) {
      it1[0] = it1[-1];
      --it1;
    }
    it1[0] = tmp;
  }
}
template <typename I, typename Cmp>
GPUdi() I GPUCommonAlgorithm::MedianOf3Select(I f, I l, Cmp cmp) noexcept
{
  auto m = f + (l - f) / 2;

  --l; // l is one past the end; step back to the last element

  // Return the iterator pointing to the median of *f, *m, *l
  if (cmp(*f, *m)) {
    if (cmp(*m, *l)) {
      return m;
    } else if (cmp(*f, *l)) {
      return l;
    } else {
      return f;
    }
  } else if (cmp(*f, *l)) {
    return f;
  } else if (cmp(*m, *l)) {
    return l;
  } else {
    return m;
  }
}
template <typename I, typename T, typename Cmp>
GPUdi() I GPUCommonAlgorithm::UnguardedPartition(I f, I l, T piv, Cmp cmp) noexcept
{
  // Hoare-style partition: the scans are "unguarded" because the pivot is
  // known to stop them before they can run off the range.
  while (true) {
    while (cmp(*f, piv)) {
      ++f;
    }
    --l;
    while (cmp(piv, *l)) {
      --l;
    }
    if (!(f < l)) {
      return f;
    }
    IterSwap(f, l);
    ++f;
  }
}
template <typename I, typename Cmp>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l, Cmp cmp) noexcept
{
  if (f == l) {
    return;
  }
  using IndexType = uint16_t;

  // Explicit stack of sub-range [first, second) offsets, since device code
  // cannot recurse freely. The capacity is an assumption; it must cover the
  // partition depth for the uint16_t index range.
  struct Stack {
    struct Range {
      IndexType first;
      IndexType second;
    };
    Range data[32];
    int32_t n{0};
    GPUd() void emplace(IndexType x, IndexType y) { data[n++] = {x, y}; }
    GPUd() bool empty() const { return n == 0; }
    GPUd() const Range& top() const { return data[n - 1]; }
    GPUd() void pop() { --n; }
  };
  Stack s;
  s.emplace(0, l - f);

  while (!s.empty()) {
    const auto it0 = f + s.top().first;
    const auto it1 = f + s.top().second;
    s.pop();

    const auto piv = *MedianOf3Select(it0, it1, cmp);
    const auto pp = UnguardedPartition(it0, it1, piv, cmp);

    // Partitions at or below the cutoff are left for the final
    // insertion-sort pass; larger ones are pushed, smaller side last so it
    // is processed first and the stack stays shallow.
    constexpr auto cutoff = 50u;
    const auto lsz = pp - it0;
    const auto rsz = it1 - pp;
    if (lsz < rsz) {
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
    } else {
      if (lsz > cutoff) {
        s.emplace(it0 - f, pp - f);
      }
      if (rsz > cutoff) {
        s.emplace(pp - f, it1 - f);
      }
    }
  }
  // One insertion-sort pass finishes all small partitions at once
  InsertionSort(f, l, cmp);
}
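// Worked example (illustrative): for a 300-element range with cutoff 50, a
// partition step might split [0, 300) into [0, 120) and [120, 300). Both
// exceed the cutoff, so the larger right side is pushed first and the
// smaller left side last, making [0, 120) the next range popped; any
// sub-range that shrinks to 50 elements or fewer is dropped here and
// finished by the single InsertionSort pass above.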
template <typename I>
GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept
{
  QuickSort(f, l, [](auto&& x, auto&& y) { return x < y; });
}
#endif // GPUCA_ALGORITHM_STD

#if ((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__)) && !defined(GPUCA_GPUCODE_HOSTONLY)
template <class T>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end); // host fallback
#else
  GPUCommonAlgorithm::sortDeviceDynamic(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& comp)
{
  GPUCommonAlgorithm::sort(begin, end, comp);
}
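// Design note: on this backend the "dynamic" entry point forwards to the
// generic sort() below. A usage sketch from a single device thread (kernel
// and buffer names are hypothetical):
//
//   __global__ void sortResultsKernel(int32_t* results, size_t n)
//   {
//     if (threadIdx.x == 0 && blockIdx.x == 0) {
//       GPUCommonAlgorithm::sortDeviceDynamic(results, results + n);
//     }
//   }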
#endif // CUDA/HIP-only sortDeviceDynamic implementations

template <class T>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end);
#else
  QuickSort(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
{
#ifdef GPUCA_ALGORITHM_STD
  std::sort(begin, end, comp);
#else
  QuickSort(begin, end, comp);
#endif
}
template <class T>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end);
#else
  GPUCommonAlgorithm::sortInBlock(begin, end, [](auto&& x, auto&& y) { return x < y; });
#endif
}
template <class T, class S>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
{
#ifndef GPUCA_GPUCODE
  GPUCommonAlgorithm::sort(begin, end, comp);
#else
  // Odd-even transposition sort: in round i, each thread compares pairs
  // starting at even (i even) or odd (i odd) positions and swaps them if
  // out of order; n rounds guarantee a sorted range.
  const int32_t n = end - begin;
  for (int32_t i = 0; i < n; i++) {
    const int32_t offset = i % 2;
    for (int32_t tIdx = get_local_id(0); tIdx < n / 2 + 1; tIdx += get_local_size(0)) {
      const int32_t curPos = 2 * tIdx + offset;
      const int32_t nextPos = curPos + 1;
      if (nextPos < n) {
        if (!comp(begin[curPos], begin[nextPos])) {
          IterSwap(&begin[curPos], &begin[nextPos]);
        }
      }
    }
    GPUbarrier();
  }
#endif
}
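// Usage sketch (illustrative; kernel and buffer names are hypothetical).
// Every thread of the block must reach the call, because the sorting rounds
// are separated by block-wide barriers:
//
//   __global__ void sortInBlockKernel(float* data, size_t n)
//   {
//     GPUCommonAlgorithm::sortInBlock(data, data + n,
//                                     [](auto&& x, auto&& y) { return x < y; });
//   }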
#if defined(__OPENCL__)
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
template <class T>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
{
  return sub_group_scan_inclusive_add(v);
}
template <>
GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
{
  // No 8-bit overload among the sub-group built-ins; widen to 32 bit
  return sub_group_scan_inclusive_add((uint32_t)v);
}
template <class T>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
{
  return sub_group_broadcast(v, i);
}
template <>
GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
{
  return sub_group_broadcast((uint32_t)v, i);
}
#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
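// Usage sketch (illustrative): the wrappers above let kernel code compute
// sub-group prefix sums uniformly across backends, e.g. to derive a running
// offset per lane (the 32-wide sub-group in the last line is an assumption):
//
//   uint32_t pos = warp_scan_inclusive_add(myCount); // inclusive prefix sum
//   uint32_t total = warp_broadcast(pos, 31);        // total from the last lane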
#elif (defined(__CUDACC__) || defined(__HIPCC__))
#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
#if defined(__CUDACC__)
#include <cub/cub.cuh>
#elif defined(__HIPCC__)
#include <hipcub/hipcub.hpp>
#endif
#endif
#define work_group_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_scan_inclusive_add_FUNC(T v, S& smem)
{
  // CUB/hipCUB block-wide inclusive prefix sum using shared temp storage
  typename S::BlockScan(smem.cubTmpMem).InclusiveSum(v, v);
  __syncthreads();
  return v;
}
#define work_group_broadcast(v, i) work_group_broadcast_FUNC(v, i, smem)
template <class T, class S>
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i, S& smem)
{
  if ((int32_t)threadIdx.x == i) {
    smem.tmpBroadcast = v;
  }
  __syncthreads();
  T retVal = smem.tmpBroadcast;
  __syncthreads();
  return retVal;
}
#define work_group_reduce_add(v) work_group_reduce_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T work_group_reduce_add_FUNC(T v, S& smem)
{
  // CUB returns the block sum only on thread 0, so broadcast it afterwards
  v = typename S::BlockReduce(smem.cubReduceTmpMem).Sum(v);
  __syncthreads();
  v = work_group_broadcast(v, 0);
  return v;
}
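// Note: the shared-memory type S must provide the typedefs and members used
// here (BlockScan, BlockReduce, cubTmpMem, cubReduceTmpMem, tmpBroadcast).
// A minimal sketch, with the struct name and a 256-thread block size assumed
// for illustration:
//
//   struct SMem {
//     typedef cub::BlockScan<int32_t, 256> BlockScan;
//     typedef cub::BlockReduce<int32_t, 256> BlockReduce;
//     union {
//       BlockScan::TempStorage cubTmpMem;
//       BlockReduce::TempStorage cubReduceTmpMem;
//     };
//     int32_t tmpBroadcast;
//   };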
#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v, smem)
template <class T, class S>
GPUdi() T warp_scan_inclusive_add_FUNC(T v, S& smem)
{
  // Warp-level inclusive prefix sum; no block-wide synchronization needed
  typename S::WarpScan(smem.cubWarpTmpMem).InclusiveSum(v, v);
  return v;
}
#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
template <class T>
GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
{
#ifdef __CUDACC__
  return __shfl_sync(0xFFFFFFFF, v, i);
#else // HIP (assumed branch): HIP's __shfl takes no lane mask
  return __shfl(v, i);
#endif
}
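// Usage sketch (illustrative): broadcasting a value elected by lane 0 of a
// warp to all its lanes; counter and warpTotal are hypothetical:
//
//   int32_t base = 0;
//   if ((threadIdx.x & 31) == 0) {
//     base = atomicAdd(&counter, warpTotal);
//   }
//   base = warp_broadcast(base, 0);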
#else // neither OpenCL nor CUDA/HIP: trivial single-threaded fallbacks

template <class T>
GPUdi() T work_group_scan_inclusive_add(T v)
{
  return v; // a single element is its own inclusive scan, sum, and broadcast
}
template <class T>
GPUdi() T work_group_reduce_add(T v)
{
  return v;
}
template <class T>
GPUdi() T warp_scan_inclusive_add(T v)
{
  return v;
}
template <class T>
GPUdi() T warp_broadcast(T v, int32_t i)
{
  return v;
}

#endif
#ifdef GPUCA_ALGORITHM_STD
#undef GPUCA_ALGORITHM_STD
#endif

#endif // GPUCOMMONALGORITHM_H