Project
Loading...
Searching...
No Matches
GPUCommonMath.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
17
18#include "GPUCommonDef.h"
19
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
22#endif
23
24#if !defined(GPUCA_GPUCODE_DEVICE)
25#include <cmath>
26#include <algorithm>
27#include <atomic>
28#include <limits>
29#include <cstring>
30#endif
31
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
33#include <cstdint>
34#endif
35
36// GPUCA_CHOICE Syntax: GPUCA_CHOICE(Host, CUDA&HIP, OpenCL)
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__)) // clang-format off
38 #define GPUCA_CHOICE(c1, c2, c3) (c2) // Select second option for CUDA and HIP
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3) // Select third option for OpenCL
41#else
42 #define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host
43#endif // clang-format on
44
45namespace o2::gpu
46{
47
49{
50 public:
51 GPUd() static float2 MakeFloat2(float x, float y); // TODO: Find better approach that is constexpr
52
// Returns the smaller of x and y. GPUCA_CHOICE picks std::min for host code and the
// unqualified min() for CUDA/HIP and OpenCL device code.
53 template <class T>
54 GPUhd() constexpr static T Min(const T x, const T y)
55 {
56 return GPUCA_CHOICE(std::min(x, y), min(x, y), min(x, y));
57 }
// Returns the larger of x and y. GPUCA_CHOICE picks std::max for host code and the
// unqualified max() for CUDA/HIP and OpenCL device code.
58 template <class T>
59 GPUhd() constexpr static T Max(const T x, const T y)
60 {
61 return GPUCA_CHOICE(std::max(x, y), max(x, y), max(x, y));
62 }
63 template <class T, class S, class R>
64 GPUd() static T MinWithRef(T x, T y, S refX, S refY, R& r);
65 template <class T, class S, class R>
66 GPUd() static T MaxWithRef(T x, T y, S refX, S refY, R& r);
67 template <class T, class S, class R>
68 GPUd() static T MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, S refW, R& r);
69 template <class T>
70 GPUdi() constexpr static T Clamp(const T v, const T lo, const T hi)
71 {
72 return Max(lo, Min(v, hi));
73 }
74 GPUhdni() constexpr static float Sqrt(float x);
75 GPUd() static float InvSqrt(float x);
// Returns x multiplied by itself.
76 template <class T>
77 GPUdi() constexpr static T Square(T x)
78 {
79 return x * x;
80 }
81 template <class T>
82 GPUhd() constexpr static T Abs(T x);
83 GPUd() constexpr static float ASin(float x);
84 GPUd() constexpr static float ACos(float x);
85 GPUd() constexpr static float ATan(float x);
86 GPUhd() constexpr static float ATan2(float y, float x);
87 GPUd() constexpr static float Sin(float x);
88 GPUd() constexpr static float Cos(float x);
89 GPUhdni() static void SinCos(float x, float& s, float& c);
90 GPUhdni() static void SinCosd(double x, double& s, double& c);
91 GPUd() constexpr static float Tan(float x);
92 GPUd() constexpr static float Pow(float x, float y);
93 GPUd() constexpr static float Log(float x);
94 GPUd() constexpr static float Exp(float x);
// Forwards (x, y) to the backend's copysign function: std::copysignf on the host,
// copysignf on CUDA/HIP, copysign on OpenCL.
95 GPUhdni() constexpr static float Copysign(float x, float y) { return GPUCA_CHOICE(std::copysignf(x, y), copysignf(x, y), copysign(x, y)); }
// Single-precision constant 2*pi.
96 GPUd() constexpr static float TwoPi() { return 6.2831853f; }
// Single-precision constant pi.
97 GPUd() constexpr static float Pi() { return 3.1415927f; }
98 GPUd() constexpr static float Round(float x);
// Forwards x to the backend's floor function: floorf on host and CUDA/HIP, floor on OpenCL.
99 GPUd() constexpr static float Floor(float x) { return GPUCA_CHOICE(floorf(x), floorf(x), floor(x)); }
100 GPUd() static uint32_t Float2UIntReint(const float& x);
// Converts x to an unsigned integer by adding 0.5f and truncating. The truncation goes
// through int32_t before the final conversion to uint32_t.
// NOTE(review): the int32_t step limits the usable input range — confirm callers stay within it.
101 GPUd() constexpr static uint32_t Float2UIntRn(float x) { return (uint32_t)(int32_t)(x + 0.5f); }
102 GPUd() constexpr static int32_t Float2IntRn(float x);
103 GPUd() constexpr static float Modf(float x, float y);
104 GPUhdi() static float Remainderf(float x, float y);
105 GPUd() constexpr static bool Finite(float x);
106 GPUd() constexpr static bool IsNaN(float x);
// Returns a quiet NaN using the backend-specific facility. Only declared when
// __FAST_MATH__ is not defined.
107#ifndef __FAST_MATH__
108 GPUd() constexpr static float QuietNaN() { return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(""), nan(0u)); }
109#endif
110 GPUd() constexpr static uint32_t Clz(uint32_t val);
111 GPUd() constexpr static uint32_t Ctz(uint32_t val);
112 GPUd() constexpr static uint32_t Popcount(uint32_t val);
113
114 GPUd() static void memcpy(void* dst, const void* src, size_t size);
115
// Euclidean norm of 2, 3 or 4 components: Sqrt() of the sum of the squared arguments.
// The squares are summed directly, without any rescaling of the inputs.
116 GPUhdi() constexpr static float Hypot(float x, float y) { return Sqrt(x * x + y * y); }
117 GPUhdi() constexpr static float Hypot(float x, float y, float z) { return Sqrt(x * x + y * y + z * z); }
118 GPUhdi() constexpr static float Hypot(float x, float y, float z, float w) { return Sqrt(x * x + y * y + z * z + w * w); }
119
120 template <typename T>
121 GPUhd() constexpr static void Swap(T& a, T& b);
122
// Atomic exchange on a GPUglobalref()-qualified address: forwards to AtomicExchInternal
// and returns its result.
123 template <class T>
124 GPUdi() static T AtomicExch(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
125 {
126 return GPUCommonMath::AtomicExchInternal(addr, val);
127 }
128
// Atomic compare-and-swap on a GPUglobalref()-qualified address: forwards to
// AtomicCASInternal and returns its boolean result.
129 template <class T>
130 GPUdi() static bool AtomicCAS(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T cmp, T val)
131 {
132 return GPUCommonMath::AtomicCASInternal(addr, cmp, val);
133 }
134
// Atomic add on a GPUglobalref()-qualified address: forwards to AtomicAddInternal
// and returns its result.
135 template <class T>
136 GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
137 {
138 return GPUCommonMath::AtomicAddInternal(addr, val);
139 }
// Atomic maximum on a GPUglobalref()-qualified address: forwards to AtomicMaxInternal.
// No value is returned.
140 template <class T>
141 GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
142 {
143 GPUCommonMath::AtomicMaxInternal(addr, val);
144 }
// Atomic minimum on a GPUglobalref()-qualified address: forwards to AtomicMinInternal.
// No value is returned.
145 template <class T>
146 GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
147 {
148 GPUCommonMath::AtomicMinInternal(addr, val);
149 }
// Same as AtomicExch, but for a GPUsharedref()-qualified address.
150 template <class T>
151 GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
152 {
153 return GPUCommonMath::AtomicExchInternal(addr, val);
154 }
// Same as AtomicAdd, but for a GPUsharedref()-qualified address.
155 template <class T>
156 GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
157 {
158 return GPUCommonMath::AtomicAddInternal(addr, val);
159 }
// Same as AtomicMax, but for a GPUsharedref()-qualified address.
160 template <class T>
161 GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
162 {
163 GPUCommonMath::AtomicMaxInternal(addr, val);
164 }
// Same as AtomicMin, but for a GPUsharedref()-qualified address.
165 template <class T>
166 GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
167 {
168 GPUCommonMath::AtomicMinInternal(addr, val);
169 }
170 GPUd() constexpr static int32_t Mul24(int32_t a, int32_t b);
171 GPUd() constexpr static float FMulRZ(float a, float b);
172
173 template <int32_t I, class T>
174 GPUd() constexpr static T nextMultipleOf(T val);
175
176 template <typename... Args>
177 GPUhdni() constexpr static float Sum2(float w, Args... args);
178
179 private:
180 template <class S, class T>
181 GPUd() static uint32_t AtomicExchInternal(S* addr, T val);
182 template <class S, class T>
183 GPUd() static bool AtomicCASInternal(S* addr, T cmp, T val);
184 template <class S, class T>
185 GPUd() static uint32_t AtomicAddInternal(S* addr, T val);
186 template <class S, class T>
187 GPUd() static void AtomicMaxInternal(S* addr, T val);
188 template <class S, class T>
189 GPUd() static void AtomicMinInternal(S* addr, T val);
190};
191
193
194template <typename... Args>
195GPUhdi() constexpr float GPUCommonMath::Sum2(float w, Args... args)
196{
197 if constexpr (sizeof...(Args) == 0) {
198 return w * w;
199 } else {
200 return w * w + Sum2(args...);
201 }
202 return 0;
203}
204
205GPUdi() void GPUCommonMath::memcpy(void* dst, const void* src, size_t size)
206{
207#ifndef GPUCA_GPUCODE_DEVICE
208 std::memcpy(dst, src, size);
209#elif defined(__CUDACC__) || defined(__HIPCC__)
210 ::memcpy(dst, src, size);
211#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
212 __builtin_memcpy(dst, src, size);
213#else
214 char* d = (char*)dst;
215 const char* s = (const char*)src;
216 for (size_t i = 0; i < size; i++) {
217 d[i] = s[i];
218 }
219#endif
220}
221
222template <int32_t I, class T>
223GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T val)
224{
225 if constexpr (I & (I - 1)) {
226 T tmp = val % I;
227 if (tmp) {
228 val += I - tmp;
229 }
230 return val;
231 } else {
232 return (val + I - 1) & ~(T)(I - 1);
233 }
234 return 0; // BUG: Cuda complains about missing return value with constexpr if
235}
236
// Builds a float2 from two scalars. Without GPUCA_GPUCODE, or for OpenCL builds, the value
// is brace-initialized; otherwise make_float2() is used.
237GPUdi() float2 GPUCommonMath::MakeFloat2(float x, float y)
238{
239#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
240 float2 ret = {x, y};
241 return ret;
242#else
243 return make_float2(x, y);
244#endif // GPUCA_GPUCODE
245}
246
// Floating-point remainder of x / y via fmodf (host, CUDA/HIP) or fmod (OpenCL).
// Despite the name this wraps fmod, not the standard modf.
247GPUdi() constexpr float GPUCommonMath::Modf(float x, float y) { return GPUCA_CHOICE(fmodf(x, y), fmodf(x, y), fmod(x, y)); }
// Forwards (x, y) to the backend's remainder function: std::remainderf on the host,
// remainderf on CUDA/HIP, remainder on OpenCL.
248GPUhdi() float GPUCommonMath::Remainderf(float x, float y) { return GPUCA_CHOICE(std::remainderf(x, y), remainderf(x, y), remainder(x, y)); }
249
250GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const float& x)
251{
252#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
253 return __float_as_uint(x);
254#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
255 return as_uint(x);
256#else
257 return reinterpret_cast<const uint32_t&>(x);
258#endif
259}
260
261GPUCA_DETERMINISTIC_CODE( // clang-format off
262GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), roundf(x), round(x)); }
263GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return (int32_t)Round(x); }
264GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), (float)sqrt((double)x), sqrt(x)); }
265GPUdi() constexpr float GPUCommonMath::ATan(float x) { return GPUCA_CHOICE((float)atan((double)x), (float)atan((double)x), atan(x)); }
266GPUhdi() constexpr float GPUCommonMath::ATan2(float y, float x) { return GPUCA_CHOICE((float)atan2((double)y, (double)x), (float)atan2((double)y, (double)x), atan2(y, x)); }
267GPUdi() constexpr float GPUCommonMath::Sin(float x) { return GPUCA_CHOICE((float)sin((double)x), (float)sin((double)x), sin(x)); }
268GPUdi() constexpr float GPUCommonMath::Cos(float x) { return GPUCA_CHOICE((float)cos((double)x), (float)cos((double)x), cos(x)); }
269GPUdi() constexpr float GPUCommonMath::Tan(float x) { return GPUCA_CHOICE((float)tanf((double)x), (float)tanf((double)x), tan(x)); }
270GPUdi() constexpr float GPUCommonMath::Pow(float x, float y) { return GPUCA_CHOICE((float)pow((double)x, (double)y), pow((double)x, (double)y), pow(x, y)); }
271GPUdi() constexpr float GPUCommonMath::ASin(float x) { return GPUCA_CHOICE((float)asin((double)x), (float)asin((double)x), asin(x)); }
272GPUdi() constexpr float GPUCommonMath::ACos(float x) { return GPUCA_CHOICE((float)acos((double)x), (float)acos((double)x), acos(x)); }
273GPUdi() constexpr float GPUCommonMath::Log(float x) { return GPUCA_CHOICE((float)log((double)x), (float)log((double)x), log(x)); }
274GPUdi() constexpr float GPUCommonMath::Exp(float x) { return GPUCA_CHOICE((float)exp((double)x), (float)exp((double)x), exp(x)); }
275GPUdi() constexpr bool GPUCommonMath::Finite(float x) { return GPUCA_CHOICE(std::isfinite(x), isfinite(x), isfinite(x)); }
276GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return GPUCA_CHOICE(std::isnan(x), isnan(x), isnan(x)); }
277, // !GPUCA_DETERMINISTIC_CODE
278GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), rintf(x), rint(x)); }
279GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return GPUCA_CHOICE((int32_t)Round(x), __float2int_rn(x), (int32_t)Round(x)); }
280GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), sqrtf(x), sqrt(x)); }
281GPUdi() constexpr float GPUCommonMath::ATan(float x) { return GPUCA_CHOICE(atanf(x), atanf(x), atan(x)); }
282GPUhdi() constexpr float GPUCommonMath::ATan2(float y, float x) { return GPUCA_CHOICE(atan2f(y, x), atan2f(y, x), atan2(y, x)); }
283GPUdi() constexpr float GPUCommonMath::Sin(float x) { return GPUCA_CHOICE(sinf(x), sinf(x), sin(x)); }
284GPUdi() constexpr float GPUCommonMath::Cos(float x) { return GPUCA_CHOICE(cosf(x), cosf(x), cos(x)); }
285GPUdi() constexpr float GPUCommonMath::Tan(float x) { return GPUCA_CHOICE(tanf(x), tanf(x), tan(x)); }
286GPUdi() constexpr float GPUCommonMath::Pow(float x, float y) { return GPUCA_CHOICE(powf(x, y), powf(x, y), pow(x, y)); }
287GPUdi() constexpr float GPUCommonMath::ASin(float x) { return GPUCA_CHOICE(asinf(x), asinf(x), asin(x)); }
288GPUdi() constexpr float GPUCommonMath::ACos(float x) { return GPUCA_CHOICE(acosf(x), acosf(x), acos(x)); }
289GPUdi() constexpr float GPUCommonMath::Log(float x) { return GPUCA_CHOICE(logf(x), logf(x), log(x)); }
290GPUdi() constexpr float GPUCommonMath::Exp(float x) { return GPUCA_CHOICE(expf(x), expf(x), exp(x)); }
291GPUdi() constexpr bool GPUCommonMath::Finite(float x) { return true; }
292GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return false; }
293) // clang-format on
294
// Computes the sine and cosine of x and stores them in s and c.
// Deterministic variant: separate double-precision sin() and cos() calls.
// Otherwise: __sincosf on Apple hosts, sincosf on hosts where one of the GNU-source /
// GPUCA_GPUCODE macros is defined, and a GPUCA_CHOICE-selected backend call elsewhere.
295GPUhdi() void GPUCommonMath::SinCos(float x, float& s, float& c)
296{
297 GPUCA_DETERMINISTIC_CODE( // clang-format off
298 s = sin((double)x);
299 c = cos((double)x);
300 , // !GPUCA_DETERMINISTIC_CODE
301#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
302 __sincosf(x, &s, &c);
303#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
304 sincosf(x, &s, &c);
305#else
306 GPUCA_CHOICE((void)((s = sinf(x)) + (c = cosf(x))), sincosf(x, &s, &c), s = sincos(x, &c));
307#endif
308 ) // clang-format on
309}
310
// Double-precision counterpart of SinCos: stores sin(x) in s and cos(x) in c, using
// __sincos on Apple hosts, sincos on hosts where one of the GNU-source / GPUCA_GPUCODE
// macros is defined, and a GPUCA_CHOICE-selected backend call elsewhere.
311GPUhdi() void GPUCommonMath::SinCosd(double x, double& s, double& c)
312{
313#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
314 __sincos(x, &s, &c);
315#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
316 sincos(x, &s, &c);
317#else
318 GPUCA_CHOICE((void)((s = sin(x)) + (c = cos(x))), sincos(x, &s, &c), s = sincos(x, &c));
319#endif
320}
321
322GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t x)
323{
324#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
325 return x == 0 ? 32 : GPUCA_CHOICE(__builtin_clz(x), __clz(x), __builtin_clz(x)); // use builtin if available
326#else
327 for (int32_t i = 31; i >= 0; i--) {
328 if (x & (1u << i)) {
329 return (31 - i);
330 }
331 }
332 return 32;
333#endif
334}
335
336GPUdi() constexpr uint32_t GPUCommonMath::Ctz(uint32_t x)
337{
338#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
339 return x == 0 ? 32 : GPUCA_CHOICE(__builtin_ctz(x), __ffs(x) - 1, __builtin_ctz(x));
340#else
341 for (uint32_t i = 0; i < 32; ++i) {
342 if (x & (1u << i)) {
343 return i;
344 }
345 }
346 return 32;
347#endif
348}
349
350GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t x)
351{
352#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__) // TODO: remove OPENCL when reported SPIR-V bug is fixed
353 // use builtin if available
354 return GPUCA_CHOICE(__builtin_popcount(x), __popc(x), __builtin_popcount(x));
355#else
356 x = x - ((x >> 1) & 0x55555555);
357 x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
358 return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
359#endif
360}
361
362template <typename T>
363GPUhdi() constexpr void GPUCommonMath::Swap(T& a, T& b)
364{
365#ifndef GPUCA_GPUCODE_DEVICE
366 std::swap(a, b);
367#else
368 T tmp = a;
369 a = b;
370 b = tmp;
371#endif
372}
373
374template <class T, class S, class R>
375GPUdi() T GPUCommonMath::MinWithRef(T x, T y, S refX, S refY, R& r)
376{
377 if (x < y) {
378 r = refX;
379 return x;
380 }
381 r = refY;
382 return y;
383}
384
385template <class T, class S, class R>
386GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, S refX, S refY, R& r)
387{
388 if (x > y) {
389 r = refX;
390 return x;
391 }
392 r = refY;
393 return y;
394}
395
396template <class T, class S, class R>
397GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, S refW, R& r)
398{
399 T retVal = x;
400 S retRef = refX;
401 if (y > retVal) {
402 retVal = y;
403 retRef = refY;
404 }
405 if (z > retVal) {
406 retVal = z;
407 retRef = refZ;
408 }
409 if (w > retVal) {
410 retVal = w;
411 retRef = refW;
412 }
413 r = retRef;
414 return retVal;
415}
416
// Returns an approximation of 1 / sqrt(_x).
// Deterministic variant: 1.f / Sqrt(_x).
// Otherwise: __frsqrt_rn on CUDA/HIP, 1.f / sqrt or 1.f / sqrtf for the clang / fast-math
// configurations below, and as a last resort a bit-level initial estimate (constant
// 0x5f3759df) refined by a single iteration step.
417GPUdi() float GPUCommonMath::InvSqrt(float _x)
418{
419 GPUCA_DETERMINISTIC_CODE( // clang-format off
420 return 1.f / Sqrt(_x);
421 , // !GPUCA_DETERMINISTIC_CODE
422#if defined(__CUDACC__) || defined(__HIPCC__)
423 return __frsqrt_rn(_x);
424#elif defined(__OPENCL__) && defined(__clang__)
425 return 1.f / sqrt(_x);
426#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
427 return 1.f / sqrtf(_x);
428#else
// The union reinterprets the float's bits as an integer to form the initial estimate.
429 union {
430 float f;
431 int32_t i;
432 } x = {_x};
433 const float xhalf = 0.5f * x.f;
434 x.i = 0x5f3759df - (x.i >> 1);
435 x.f = x.f * (1.5f - xhalf * x.f * x.f);
436 return x.f;
437#endif
438 ) // clang-format on
439}
440
// Absolute value for float: fabsf on host and CUDA/HIP, fabs on OpenCL.
441template <>
442GPUhdi() constexpr float GPUCommonMath::Abs<float>(float x)
443{
444 return GPUCA_CHOICE(fabsf(x), fabsf(x), fabs(x));
445}
446
// Absolute value for double: fabs on every backend.
447template <>
448GPUhdi() constexpr double GPUCommonMath::Abs<double>(double x)
449{
450 return GPUCA_CHOICE(fabs(x), fabs(x), fabs(x));
451}
452
// Absolute value for int32_t: abs on every backend.
453template <>
454GPUhdi() constexpr int32_t GPUCommonMath::Abs<int32_t>(int32_t x)
455{
456 return GPUCA_CHOICE(abs(x), abs(x), abs(x));
457}
458
459template <class S, class T>
460GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(S* addr, T val)
461{
462#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
463 return ::atomic_exchange(addr, val);
464#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
465 return ::atomic_xchg(addr, val);
466#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
467 return ::atomicExch(addr, val);
468#elif defined(WITH_OPENMP)
469 uint32_t old;
470 __atomic_exchange(addr, &val, &old, __ATOMIC_SEQ_CST);
471 return old;
472#else
473 return reinterpret_cast<std::atomic<T>*>(addr)->exchange(val);
474#endif
475}
476
477template <class S, class T>
478GPUdi() bool GPUCommonMath::AtomicCASInternal(S* addr, T cmp, T val)
479{
480#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
481 return ::atomic_compare_exchange(addr, cmp, val) == cmp;
482#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
483 return ::atomic_cmpxchg(addr, cmp, val) == cmp;
484#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
485 return ::atomicCAS(addr, cmp, val) == cmp;
486#elif defined(WITH_OPENMP)
487 return __atomic_compare_exchange(addr, &cmp, &val, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
488#else
489 return reinterpret_cast<std::atomic<T>*>(addr)->compare_exchange_strong(cmp, val);
490#endif
491}
492
493template <class S, class T>
494GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(S* addr, T val)
495{
496#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
497 return ::atomic_fetch_add(addr, val);
498#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
499 return ::atomic_add(addr, val);
500#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
501 return ::atomicAdd(addr, val);
502#elif defined(WITH_OPENMP)
503 return __atomic_add_fetch(addr, val, __ATOMIC_SEQ_CST) - val;
504#else
505 return reinterpret_cast<std::atomic<T>*>(addr)->fetch_add(val);
506#endif
507}
508
// Atomically raises *addr to val if val is larger. Device builds use the backend's
// atomic-max primitive; all other builds use the compare-and-swap loop below.
509template <class S, class T>
510GPUdi() void GPUCommonMath::AtomicMaxInternal(S* addr, T val)
511{
512#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
513 ::atomic_fetch_max(addr, val);
514#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
515 ::atomic_max(addr, val);
516#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
517 ::atomicMax(addr, val);
518#else
// Re-read the current value and retry while it is still smaller than val and the
// compare-and-swap did not succeed.
519 S current;
520 while ((current = *(volatile S*)addr) < val && !AtomicCASInternal(addr, current, val)) {
521 }
522#endif // GPUCA_GPUCODE
523}
524
// Atomically lowers *addr to val if val is smaller. Device builds use the backend's
// atomic-min primitive; all other builds use the compare-and-swap loop below.
525template <class S, class T>
526GPUdi() void GPUCommonMath::AtomicMinInternal(S* addr, T val)
527{
528#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
529 ::atomic_fetch_min(addr, val);
530#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
531 ::atomic_min(addr, val);
532#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
533 ::atomicMin(addr, val);
534#else
// Re-read the current value and retry while it is still larger than val and the
// compare-and-swap did not succeed.
535 S current;
536 while ((current = *(volatile S*)addr) > val && !AtomicCASInternal(addr, current, val)) {
537 }
538#endif // GPUCA_GPUCODE
539}
540
541#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT) && !defined(__CLING__)
542#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
// Float specialization of AtomicMaxInternal, built from the integer atomics.
// -0.f is first mapped to 0.f. A non-negative val is applied as an int32_t atomic max on
// the same storage; a negative val is applied as a uint32_t atomic min.
// NOTE(review): presumably relies on the ordering of IEEE-754 bit patterns — confirm.
543template <>
544GPUdii() void GPUCommonMath::AtomicMaxInternal(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
545{
546 if (val == -0.f) {
547 val = 0.f;
548 }
549 if (val >= 0) {
550 AtomicMaxInternal((GPUAtomic(int32_t)*)addr, __float_as_int(val));
551 } else {
552 AtomicMinInternal((GPUAtomic(uint32_t)*)addr, __float_as_uint(val));
553 }
554}
// Float specialization of AtomicMinInternal, built from the integer atomics.
// -0.f is first mapped to 0.f. A non-negative val is applied as an int32_t atomic min on
// the same storage; a negative val is applied as a uint32_t atomic max.
// NOTE(review): presumably relies on the ordering of IEEE-754 bit patterns — confirm.
555template <>
556GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
557{
558 if (val == -0.f) {
559 val = 0.f;
560 }
561 if (val >= 0) {
562 AtomicMinInternal((GPUAtomic(int32_t)*)addr, __float_as_int(val));
563 } else {
564 AtomicMaxInternal((GPUAtomic(uint32_t)*)addr, __float_as_uint(val));
565 }
566}
567#endif
568
569#undef GPUCA_CHOICE
570
571} // namespace o2::gpu
572
573#endif // GPUCOMMONMATH_H
uint64_t exp(uint64_t base, uint8_t exp) noexcept
int32_t i
#define GPUsharedref()
#define GPUdii()
#define GPUAtomic(type)
#define GPUgeneric()
#define GPUglobalref()
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
int32_t retVal
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUd() const expr static float QuietNaN()
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhd() const expr static T Abs(T x)
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
const void size_t size
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() static float Remainderf(float x
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLint GLenum GLint x
Definition glcorearb.h:403
GLenum src
Definition glcorearb.h:1767
GLsizeiptr size
Definition glcorearb.h:659
const GLdouble * v
Definition glcorearb.h:832
GLdouble f
Definition glcorearb.h:310
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLenum GLenum dst
Definition glcorearb.h:1767
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLuint GLfloat * val
Definition glcorearb.h:1582
GLboolean r
Definition glcorearb.h:1233
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
GLubyte GLubyte GLubyte GLubyte w
Definition glcorearb.h:852
GLdouble GLdouble GLdouble z
Definition glcorearb.h:843
bool isnan(float f)
constexpr size_t min
constexpr size_t max