15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
24#if !defined(GPUCA_GPUCODE_DEVICE)
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
38 #define GPUCA_CHOICE(c1, c2, c3) (c2)
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3)
42 #define GPUCA_CHOICE(c1, c2, c3) (c1)
54 GPUhd() constexpr static T Min(const T
x, const T
y)
59 GPUhd() constexpr static T Max(const T
x, const T
y)
63 template <
class T,
class S,
class R>
65 template <class T, class
S, class
R>
67 template <class T, class
S, class
R>
70 GPUdi() constexpr static T Clamp(const T
v, const T
lo, const T
hi)
72 return Max(
lo, Min(
v,
hi));
// InvSqrt: reciprocal square root of x. The out-of-class definition further
// down in this file has a `1.f / Sqrt(_x)` path plus backend-specific variants.
75 GPUd() static
float InvSqrt(
float x);
// ASin: arc sine of x. The visible definitions forward to asin / asinf
// through GPUCA_CHOICE.
78 GPUd() constexpr static
float ASin(
float x);
// ACos: arc cosine of x. The visible definitions forward to acos / acosf
// through GPUCA_CHOICE.
79 GPUd() constexpr static
float ACos(
float x);
// ATan: NOTE(review): no definition is visible in this excerpt; presumably
// the arc tangent of x - confirm against the definition.
80 GPUd() constexpr static
float ATan(
float x);
// ATan2: two-argument arc tangent; note the (y, x) argument order, which the
// visible definitions pass unchanged to atan2 / atan2f.
81 GPUhd() constexpr static
float ATan2(
float y,
float x);
// SinCos: results are returned through the references s and c; the visible
// Apple host path of the definition calls __sincosf(x, &s, &c).
// NOTE(review): declared with GPUhdni() here, while the out-of-class
// definition uses GPUhdi() - confirm this mismatch is intentional.
84 GPUhdni() static
void SinCos(
float x,
float&
s,
float&
c);
// SinCosd: double-precision counterpart of SinCos with the same out-parameter
// convention. NOTE(review): only the signature of its definition is visible.
85 GPUhdni() static
void SinCosd(
double x,
double&
s,
double&
c);
// Pow: the visible definition forwards (x, y) to pow through GPUCA_CHOICE.
87 GPUd() constexpr static
float Pow(
float x,
float y);
91 GPUd() constexpr static
float TwoPi() {
return 6.2831853f; }
92 GPUd() constexpr static
float Pi() {
return 3.1415927f; }
// Float2UIntReint: returns the bit pattern of x reinterpreted as uint32_t (no
// numeric conversion). The definition further down uses __float_as_uint(x) on
// CUDA/HIP device code and a reinterpret_cast<const uint32_t&> elsewhere.
95 GPUd() static uint32_t Float2UIntReint(const
float&
x);
96 GPUd() constexpr static uint32_t Float2UIntRn(
float x) {
return (uint32_t)(int32_t)(
x + 0.5f); }
// Float2IntRn: the visible definition returns (int32_t)Round(x).
97 GPUd() constexpr static int32_t Float2IntRn(
float x);
// Modf: floating-point remainder of x / y; the definition forwards to
// fmodf(x, y) (fmod on the OpenCL device path). Despite the name it does not
// split x into integer and fractional parts like the C library's modf.
98 GPUd() constexpr static
float Modf(
float x,
float y);
// Remainderf: NOTE(review): the definition is not visible in this excerpt;
// presumably mirrors the C library remainderf - confirm.
99 GPUhdi() static
float Remainderf(
float x,
float y);
// Finite: two definitions are visible further down - one forwarding to
// isfinite and one that unconditionally returns true. NOTE(review): which one
// is compiled depends on a preprocessor branch not visible here.
100 GPUd() constexpr static
bool Finite(
float x);
// IsNaN: the only definition visible in this excerpt unconditionally returns
// false. NOTE(review): a checking variant presumably exists in the other
// preprocessor branch - confirm.
101 GPUd() constexpr static
bool IsNaN(
float x);
103 GPUd() constexpr static
float QuietNaN() {
return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(
""), nan(0u)); }
// Clz: number of leading zero bits of val. The definition further down
// returns 32 for val == 0 and otherwise uses __builtin_clz / __clz, with a
// bit-scanning loop as the fallback.
105 GPUd() constexpr static uint32_t Clz(uint32_t
val);
// Popcount: number of set bits of val. The definition further down uses
// __builtin_popcount / __popc, with a bit-twiddling fallback.
106 GPUd() constexpr static uint32_t Popcount(uint32_t
val);
112 GPUhdi() constexpr static
float Hypot(
float x,
float y,
float z,
float w) {
return Sqrt(
x *
x +
y *
y +
z *
z +
w *
w); }
// Swap: takes both operands by mutable reference.
// NOTE(review): the body of the definition is not visible in this excerpt;
// presumably it exchanges the values of a and b - confirm. The declaration
// uses GPUhd() while the out-of-class definition head uses GPUhdi().
114 template <
typename T>
115 GPUhd() constexpr static
void Swap(T&
a, T&
b);
120 return GPUCommonMath::AtomicExchInternal(addr,
val);
126 return GPUCommonMath::AtomicCASInternal(addr,
cmp,
val);
132 return GPUCommonMath::AtomicAddInternal(addr,
val);
137 GPUCommonMath::AtomicMaxInternal(addr,
val);
142 GPUCommonMath::AtomicMinInternal(addr,
val);
147 return GPUCommonMath::AtomicExchInternal(addr,
val);
152 return GPUCommonMath::AtomicAddInternal(addr,
val);
157 GPUCommonMath::AtomicMaxInternal(addr,
val);
162 GPUCommonMath::AtomicMinInternal(addr,
val);
// Mul24: NOTE(review): the definition is not visible in this excerpt;
// presumably a 24-bit integer multiply of a and b - confirm.
164 GPUd() constexpr static int32_t Mul24(int32_t
a, int32_t
b);
// FMulRZ: NOTE(review): the definition is not visible in this excerpt;
// presumably a float multiply with round-towards-zero - confirm.
165 GPUd() constexpr static
float FMulRZ(
float a,
float b);
// nextMultipleOf<I>: rounds val up to a multiple of the compile-time constant
// I. The definition further down branches on `I & (I - 1)`; the visible
// power-of-two path computes (val + I - 1) & ~(T)(I - 1).
167 template <int32_t I, class T>
168 GPUd() constexpr static T nextMultipleOf(T
val);
170 template <typename... Args>
// Backend-dispatching helpers that the atomic wrappers earlier in the class
// forward to. Each definition further down selects, via the preprocessor,
// between OpenCL (atomic_* / C11-style atomic_*), CUDA/HIP (atomicXxx) and
// host implementations.

// AtomicExchInternal: stores val at addr and returns the result of the
// underlying exchange primitive (::atomic_exchange, ::atomic_xchg,
// ::atomicExch, __atomic_exchange or std::atomic<T>::exchange).
// NOTE(review): the return type is uint32_t regardless of T.
174 template <class
S, class T>
175 GPUd() static uint32_t AtomicExchInternal(
S* addr, T
val);
// AtomicCASInternal: compare-and-swap of *addr from cmp to val. The GPU
// branches return whether the value reported by the primitive equals cmp; the
// host branches return the result of the compare-exchange call itself.
176 template <class
S, class T>
177 GPUd() static
bool AtomicCASInternal(
S* addr, T
cmp, T
val);
// AtomicAddInternal: adds val to *addr. The visible branches use fetch-add
// style primitives; the OpenMP branch computes
// __atomic_add_fetch(...) - val, i.e. the value before the addition.
178 template <class
S, class T>
179 GPUd() static uint32_t AtomicAddInternal(
S* addr, T
val);
// AtomicMaxInternal: raises *addr to val if val is larger. GPU branches call
// the native max primitive; the fallback loops on AtomicCASInternal while the
// current value is below val.
180 template <class
S, class T>
181 GPUd() static
void AtomicMaxInternal(
S* addr, T
val);
// AtomicMinInternal: lowers *addr to val if val is smaller. GPU branches call
// the native min primitive; the fallback loops on AtomicCASInternal while the
// current value is above val.
182 template <class
S, class T>
183 GPUd() static
void AtomicMinInternal(
S* addr, T
val);
188template <typename... Args>
191 if constexpr (
sizeof...(Args) == 0) {
194 return w *
w + Sum2(
args...);
201#ifndef GPUCA_GPUCODE_DEVICE
203#elif defined(__CUDACC__) || defined(__HIPCC__)
205#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
208 char* d = (
char*)
dst;
209 const char*
s = (
const char*)
src;
210 for (
size_t i = 0;
i <
size;
i++) {
216template <
int32_t I,
class T>
217GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T
val)
219 if constexpr (I & (I - 1)) {
226 return (
val + I - 1) & ~(
T)(I - 1);
233#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
237 return make_float2(
x,
y);
241GPUdi() constexpr
float GPUCommonMath::Modf(
float x,
float y) {
return GPUCA_CHOICE(fmodf(
x,
y), fmodf(
x,
y), fmod(
x,
y)); }
244GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const
float&
x)
246#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
247 return __float_as_uint(
x);
248#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
251 return reinterpret_cast<const uint32_t&
>(
x);
256GPUdi()
constexpr float GPUCommonMath::Round(
float x) {
return GPUCA_CHOICE(roundf(
x), roundf(
x), round(
x)); }
257GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(
float x) {
return (int32_t)
Round(
x); }
260GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y,
x)); }
261GPUdi() constexpr
float GPUCommonMath::
Sin(
float x) {
return GPUCA_CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
262GPUdi() constexpr
float GPUCommonMath::
Cos(
float x) {
return GPUCA_CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
263GPUdi() constexpr
float GPUCommonMath::
Tan(
float x) {
return GPUCA_CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
264GPUdi() constexpr
float GPUCommonMath::Pow(
float x,
float y) {
return GPUCA_CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x,
y)); }
265GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
266GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
267GPUdi() constexpr
float GPUCommonMath::
Log(
float x) {
return GPUCA_CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
269GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return GPUCA_CHOICE(std::isfinite(
x), isfinite(
x), isfinite(
x)); }
276GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE(atan2f(
y,
x), atan2f(
y,
x), atan2(
y,
x)); }
281GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE(asinf(
x), asinf(
x), asin(
x)); }
282GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE(acosf(
x), acosf(
x), acos(
x)); }
285GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return true; }
286GPUdi() constexpr
bool GPUCommonMath::IsNaN(
float x) {
return false; }
289GPUhdi()
void GPUCommonMath::SinCos(
float x,
float&
s,
float&
c)
295#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
296 __sincosf(
x, &
s, &
c);
297#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
305GPUhdi()
void GPUCommonMath::SinCosd(
double x,
double&
s,
double&
c)
307#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
309#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
316GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t
x)
318#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
319 return x == 0 ? 32 :
GPUCA_CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x));
321 for (int32_t
i = 31;
i >= 0;
i--) {
330GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t
x)
332#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)
334 return GPUCA_CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
336 x =
x - ((
x >> 1) & 0x55555555);
337 x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
338 return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
343GPUhdi() constexpr
void GPUCommonMath::Swap(T&
a, T&
b)
345#ifndef GPUCA_GPUCODE_DEVICE
354template <
class T,
class S,
class R>
365template <
class T,
class S,
class R>
376template <
class T,
class S,
class R>
397GPUdi() float GPUCommonMath::InvSqrt(
float _x)
400 return 1.f /
Sqrt(_x);
402#if defined(__CUDACC__) || defined(__HIPCC__)
403 return __frsqrt_rn(_x);
404#elif defined(__OPENCL__) && defined(__clang__)
405 return 1.f / sqrt(_x);
406#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
407 return 1.f / sqrtf(_x);
413 const float xhalf = 0.5f *
x.f;
414 x.i = 0x5f3759df - (
x.i >> 1);
415 x.f =
x.f * (1.5f - xhalf *
x.f *
x.f);
439template <
class S,
class T>
440GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T
val)
442#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
443 return ::atomic_exchange(addr,
val);
444#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
445 return ::atomic_xchg(addr,
val);
446#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
447 return ::atomicExch(addr,
val);
448#elif defined(WITH_OPENMP)
450 __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
453 return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
457template <
class S,
class T>
458GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T
cmp, T
val)
460#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
461 return ::atomic_compare_exchange(addr,
cmp,
val) ==
cmp;
462#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
463 return ::atomic_cmpxchg(addr,
cmp,
val) ==
cmp;
464#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
465 return ::atomicCAS(addr,
cmp,
val) ==
cmp;
466#elif defined(WITH_OPENMP)
467 return __atomic_compare_exchange(addr, &
cmp, &
val,
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
469 return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp,
val);
473template <
class S,
class T>
474GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T
val)
476#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
477 return ::atomic_fetch_add(addr,
val);
478#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
479 return ::atomic_add(addr,
val);
480#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
481 return ::atomicAdd(addr,
val);
482#elif defined(WITH_OPENMP)
483 return __atomic_add_fetch(addr,
val, __ATOMIC_SEQ_CST) -
val;
485 return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
489template <
class S,
class T>
492#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
493 ::atomic_fetch_max(addr,
val);
494#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
495 ::atomic_max(addr,
val);
496#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
497 ::atomicMax(addr,
val);
500 while ((current = *(
volatile S*)addr) <
val && !AtomicCASInternal(addr, current,
val)) {
505template <
class S,
class T>
508#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
509 ::atomic_fetch_min(addr,
val);
510#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
511 ::atomic_min(addr,
val);
512#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
513 ::atomicMin(addr,
val);
516 while ((current = *(
volatile S*)addr) >
val && !AtomicCASInternal(addr, current,
val)) {
521#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT) && !defined(__CLING__)
522#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
530 AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
532 AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
542 AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
544 AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUd() const expr static float QuietNaN()
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() static float Remainderf(float x
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
GLdouble GLdouble GLdouble z