15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
24#if !defined(GPUCA_GPUCODE_DEVICE)
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
38 #define GPUCA_CHOICE(c1, c2, c3) (c2)
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3)
42 #define GPUCA_CHOICE(c1, c2, c3) (c1)
54 GPUhd() constexpr static T Min(const T
x, const T
y)
59 GPUhd() constexpr static T Max(const T
x, const T
y)
63 template <
class T,
class S,
class R>
65 template <class T, class
S, class
R>
67 template <class T, class
S, class
R>
70 GPUdi() constexpr static T Clamp(const T
v, const T
lo, const T
hi)
72 return Max(
lo, Min(
v,
hi));
75 GPUd() static
float InvSqrt(
float x);
77 GPUdi() constexpr static T Square(T
x)
83 GPUd() constexpr static
float ASin(
float x);
84 GPUd() constexpr static
float ACos(
float x);
85 GPUd() constexpr static
float ATan(
float x);
86 GPUhd() constexpr static
float ATan2(
float y,
float x);
89 GPUhdni() static
void SinCos(
float x,
float&
s,
float&
c);
90 GPUhdni() static
void SinCosd(
double x,
double&
s,
double&
c);
92 GPUd() constexpr static
float Pow(
float x,
float y);
96 GPUd() constexpr static
float TwoPi() {
return 6.2831853f; }
97 GPUd() constexpr static
float Pi() {
return 3.1415927f; }
100 GPUd() static uint32_t Float2UIntReint(const
float&
x);
101 GPUd() constexpr static uint32_t Float2UIntRn(
float x) {
return (uint32_t)(int32_t)(
x + 0.5f); }
102 GPUd() constexpr static int32_t Float2IntRn(
float x);
103 GPUd() constexpr static
float Modf(
float x,
float y);
104 GPUhdi() static
float Remainderf(
float x,
float y);
105 GPUd() constexpr static
bool Finite(
float x);
106 GPUd() constexpr static
bool IsNaN(
float x);
108 GPUd() constexpr static
float QuietNaN() {
return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(
""), nan(0u)); }
110 GPUd() constexpr static uint32_t Clz(uint32_t
val);
111 GPUd() constexpr static uint32_t Ctz(uint32_t
val);
112 GPUd() constexpr static uint32_t Popcount(uint32_t
val);
118 GPUhdi() constexpr static
float Hypot(
float x,
float y,
float z,
float w) {
return Sqrt(
x *
x +
y *
y +
z *
z +
w *
w); }
120 template <
typename T>
121 GPUhd() constexpr static
void Swap(T&
a, T&
b);
126 return GPUCommonMath::AtomicExchInternal(addr,
val);
132 return GPUCommonMath::AtomicCASInternal(addr,
cmp,
val);
138 return GPUCommonMath::AtomicAddInternal(addr,
val);
143 GPUCommonMath::AtomicMaxInternal(addr,
val);
148 GPUCommonMath::AtomicMinInternal(addr,
val);
153 return GPUCommonMath::AtomicExchInternal(addr,
val);
158 return GPUCommonMath::AtomicAddInternal(addr,
val);
163 GPUCommonMath::AtomicMaxInternal(addr,
val);
168 GPUCommonMath::AtomicMinInternal(addr,
val);
170 GPUd() constexpr static int32_t Mul24(int32_t
a, int32_t
b);
171 GPUd() constexpr static
float FMulRZ(
float a,
float b);
173 template <int32_t I, class T>
174 GPUd() constexpr static T nextMultipleOf(T
val);
176 template <typename... Args>
180 template <class
S, class T>
181 GPUd() static uint32_t AtomicExchInternal(
S* addr, T
val);
182 template <class
S, class T>
183 GPUd() static
bool AtomicCASInternal(
S* addr, T
cmp, T
val);
184 template <class
S, class T>
185 GPUd() static uint32_t AtomicAddInternal(
S* addr, T
val);
186 template <class
S, class T>
187 GPUd() static
void AtomicMaxInternal(
S* addr, T
val);
188 template <class
S, class T>
189 GPUd() static
void AtomicMinInternal(
S* addr, T
val);
194template <typename... Args>
197 if constexpr (
sizeof...(Args) == 0) {
200 return w *
w + Sum2(
args...);
207#ifndef GPUCA_GPUCODE_DEVICE
209#elif defined(__CUDACC__) || defined(__HIPCC__)
211#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
214 char* d = (
char*)
dst;
215 const char*
s = (
const char*)
src;
216 for (
size_t i = 0;
i <
size;
i++) {
222template <
int32_t I,
class T>
223GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T
val)
225 if constexpr (I & (I - 1)) {
232 return (
val + I - 1) & ~(
T)(I - 1);
239#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
243 return make_float2(
x,
y);
247GPUdi() constexpr
float GPUCommonMath::Modf(
float x,
float y) {
return GPUCA_CHOICE(fmodf(
x,
y), fmodf(
x,
y), fmod(
x,
y)); }
250GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const
float&
x)
252#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
253 return __float_as_uint(
x);
254#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
257 return reinterpret_cast<const uint32_t&
>(
x);
262GPUdi()
constexpr float GPUCommonMath::Round(
float x) {
return GPUCA_CHOICE(roundf(
x), roundf(
x), round(
x)); }
263GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(
float x) {
return (int32_t)
Round(
x); }
266GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y,
x)); }
267GPUdi() constexpr
float GPUCommonMath::
Sin(
float x) {
return GPUCA_CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
268GPUdi() constexpr
float GPUCommonMath::
Cos(
float x) {
return GPUCA_CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
269GPUdi() constexpr
float GPUCommonMath::
Tan(
float x) {
return GPUCA_CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
270GPUdi() constexpr
float GPUCommonMath::Pow(
float x,
float y) {
return GPUCA_CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x,
y)); }
271GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
272GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
273GPUdi() constexpr
float GPUCommonMath::
Log(
float x) {
return GPUCA_CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
275GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return GPUCA_CHOICE(std::isfinite(
x), isfinite(
x), isfinite(
x)); }
282GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE(atan2f(
y,
x), atan2f(
y,
x), atan2(
y,
x)); }
287GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE(asinf(
x), asinf(
x), asin(
x)); }
288GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE(acosf(
x), acosf(
x), acos(
x)); }
291GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return true; }
292GPUdi() constexpr
bool GPUCommonMath::IsNaN(
float x) {
return false; }
295GPUhdi()
void GPUCommonMath::SinCos(
float x,
float&
s,
float&
c)
301#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
302 __sincosf(
x, &
s, &
c);
303#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
311GPUhdi()
void GPUCommonMath::SinCosd(
double x,
double&
s,
double&
c)
313#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
315#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
322GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t
x)
324#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
325 return x == 0 ? 32 :
GPUCA_CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x));
327 for (int32_t
i = 31;
i >= 0;
i--) {
336GPUdi() constexpr uint32_t GPUCommonMath::Ctz(uint32_t
x)
338#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
339 return x == 0 ? 32 :
GPUCA_CHOICE(__builtin_ctz(
x), __ffs(
x) - 1, __builtin_ctz(
x));
341 for (uint32_t
i = 0;
i < 32; ++
i) {
350GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t
x)
352#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)
354 return GPUCA_CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
356 x =
x - ((
x >> 1) & 0x55555555);
357 x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
358 return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
363GPUhdi() constexpr
void GPUCommonMath::Swap(T&
a, T&
b)
365#ifndef GPUCA_GPUCODE_DEVICE
374template <
class T,
class S,
class R>
385template <
class T,
class S,
class R>
396template <
class T,
class S,
class R>
417GPUdi() float GPUCommonMath::InvSqrt(
float _x)
420 return 1.f /
Sqrt(_x);
422#if defined(__CUDACC__) || defined(__HIPCC__)
423 return __frsqrt_rn(_x);
424#elif defined(__OPENCL__) && defined(__clang__)
425 return 1.f / sqrt(_x);
426#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
427 return 1.f / sqrtf(_x);
433 const float xhalf = 0.5f *
x.f;
434 x.i = 0x5f3759df - (
x.i >> 1);
435 x.f =
x.f * (1.5f - xhalf *
x.f *
x.f);
459template <
class S,
class T>
460GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T
val)
462#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
463 return ::atomic_exchange(addr,
val);
464#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
465 return ::atomic_xchg(addr,
val);
466#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
467 return ::atomicExch(addr,
val);
468#elif defined(WITH_OPENMP)
470 __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
473 return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
477template <
class S,
class T>
478GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T
cmp, T
val)
480#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
481 return ::atomic_compare_exchange(addr,
cmp,
val) ==
cmp;
482#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
483 return ::atomic_cmpxchg(addr,
cmp,
val) ==
cmp;
484#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
485 return ::atomicCAS(addr,
cmp,
val) ==
cmp;
486#elif defined(WITH_OPENMP)
487 return __atomic_compare_exchange(addr, &
cmp, &
val,
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
489 return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp,
val);
493template <
class S,
class T>
494GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T
val)
496#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
497 return ::atomic_fetch_add(addr,
val);
498#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
499 return ::atomic_add(addr,
val);
500#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
501 return ::atomicAdd(addr,
val);
502#elif defined(WITH_OPENMP)
503 return __atomic_add_fetch(addr,
val, __ATOMIC_SEQ_CST) -
val;
505 return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
509template <
class S,
class T>
512#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
513 ::atomic_fetch_max(addr,
val);
514#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
515 ::atomic_max(addr,
val);
516#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
517 ::atomicMax(addr,
val);
520 while ((current = *(
volatile S*)addr) <
val && !AtomicCASInternal(addr, current,
val)) {
525template <
class S,
class T>
528#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
529 ::atomic_fetch_min(addr,
val);
530#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
531 ::atomic_min(addr,
val);
532#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
533 ::atomicMin(addr,
val);
536 while ((current = *(
volatile S*)addr) >
val && !AtomicCASInternal(addr, current,
val)) {
541#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT) && !defined(__CLING__)
542#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
550 AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
552 AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
562 AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
564 AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUd() const expr static float QuietNaN()
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhd() const expr static T Abs(T x)
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() static float Remainderf(float x
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
GLdouble GLdouble GLdouble z