15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
24#if !defined(GPUCA_GPUCODE_DEVICE)
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
38 #define GPUCA_CHOICE(c1, c2, c3) (c2)
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3)
42 #define GPUCA_CHOICE(c1, c2, c3) (c1)
54 GPUhd() constexpr static T Min(const T
x, const T
y)
59 GPUhd() constexpr static T Max(const T
x, const T
y)
63 template <
class T,
class S,
class R>
65 template <class T, class
S, class
R>
67 template <class T, class
S, class
R>
70 GPUdi() constexpr static T Clamp(const T
v, const T
lo, const T
hi)
72 return Max(
lo, Min(
v,
hi));
75 GPUd() static
float InvSqrt(
float x);
78 GPUd() constexpr static
float ASin(
float x);
79 GPUd() constexpr static
float ACos(
float x);
80 GPUd() constexpr static
float ATan(
float x);
81 GPUhd() constexpr static
float ATan2(
float y,
float x);
84 GPUhdni() static
void SinCos(
float x,
float&
s,
float&
c);
85 GPUhdni() static
void SinCosd(
double x,
double&
s,
double&
c);
87 GPUd() constexpr static
float Pow(
float x,
float y);
91 GPUd() constexpr static
float TwoPi() {
return 6.2831853f; }
92 GPUd() constexpr static
float Pi() {
return 3.1415927f; }
95 GPUd() static uint32_t Float2UIntReint(const
float&
x);
96 GPUd() constexpr static uint32_t Float2UIntRn(
float x) {
return (uint32_t)(int32_t)(
x + 0.5f); }
97 GPUd() constexpr static int32_t Float2IntRn(
float x);
98 GPUd() constexpr static
float Modf(
float x,
float y);
99 GPUd() constexpr static
bool Finite(
float x);
100 GPUd() constexpr static
bool IsNaN(
float x);
101 GPUd() constexpr static
float QuietNaN() {
return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(
""), nan(0u)); }
102 GPUd() constexpr static uint32_t Clz(uint32_t
val);
103 GPUd() constexpr static uint32_t Popcount(uint32_t
val);
109 GPUhdi() constexpr static
float Hypot(
float x,
float y,
float z,
float w) {
return Sqrt(
x *
x +
y *
y +
z *
z +
w *
w); }
111 template <
typename T>
112 GPUhd() constexpr static
void Swap(T&
a, T&
b);
117 return GPUCommonMath::AtomicExchInternal(addr,
val);
123 return GPUCommonMath::AtomicCASInternal(addr,
cmp,
val);
129 return GPUCommonMath::AtomicAddInternal(addr,
val);
134 GPUCommonMath::AtomicMaxInternal(addr,
val);
139 GPUCommonMath::AtomicMinInternal(addr,
val);
144 return GPUCommonMath::AtomicExchInternal(addr,
val);
149 return GPUCommonMath::AtomicAddInternal(addr,
val);
154 GPUCommonMath::AtomicMaxInternal(addr,
val);
159 GPUCommonMath::AtomicMinInternal(addr,
val);
161 GPUd() constexpr static int32_t Mul24(int32_t
a, int32_t
b);
162 GPUd() constexpr static
float FMulRZ(
float a,
float b);
164 template <int32_t I, class T>
165 GPUd() constexpr static T nextMultipleOf(T
val);
167 template <typename... Args>
171 template <class
S, class T>
172 GPUd() static uint32_t AtomicExchInternal(
S* addr, T
val);
173 template <class
S, class T>
174 GPUd() static
bool AtomicCASInternal(
S* addr, T
cmp, T
val);
175 template <class
S, class T>
176 GPUd() static uint32_t AtomicAddInternal(
S* addr, T
val);
177 template <class
S, class T>
178 GPUd() static
void AtomicMaxInternal(
S* addr, T
val);
179 template <class
S, class T>
180 GPUd() static
void AtomicMinInternal(
S* addr, T
val);
185template <typename... Args>
188 if constexpr (
sizeof...(Args) == 0) {
191 return w *
w + Sum2(
args...);
198#ifndef GPUCA_GPUCODE_DEVICE
200#elif defined(__CUDACC__) || defined(__HIPCC__)
202#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
205 char* d = (
char*)
dst;
206 const char*
s = (
const char*)
src;
207 for (
size_t i = 0;
i <
size;
i++) {
213template <
int32_t I,
class T>
214GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T
val)
216 if constexpr (I & (I - 1)) {
223 return (
val + I - 1) & ~(
T)(I - 1);
230#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
234 return make_float2(
x,
y);
238GPUdi() constexpr
float GPUCommonMath::Modf(
float x,
float y) {
return GPUCA_CHOICE(fmodf(
x,
y), fmodf(
x,
y), fmod(
x,
y)); }
240GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const
float&
x)
242#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
243 return __float_as_uint(
x);
244#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
247 return reinterpret_cast<const uint32_t&
>(
x);
252GPUdi()
constexpr float GPUCommonMath::Round(
float x) {
return GPUCA_CHOICE(roundf(
x), roundf(
x), round(
x)); }
253GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(
float x) {
return (int32_t)
Round(
x); }
256GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y,
x)); }
257GPUdi() constexpr
float GPUCommonMath::
Sin(
float x) {
return GPUCA_CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
258GPUdi() constexpr
float GPUCommonMath::
Cos(
float x) {
return GPUCA_CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
259GPUdi() constexpr
float GPUCommonMath::
Tan(
float x) {
return GPUCA_CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
260GPUdi() constexpr
float GPUCommonMath::Pow(
float x,
float y) {
return GPUCA_CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x,
y)); }
261GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
262GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
263GPUdi() constexpr
float GPUCommonMath::
Log(
float x) {
return GPUCA_CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
265GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return GPUCA_CHOICE(std::isfinite(
x), isfinite(
x), isfinite(
x)); }
272GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE(atan2f(
y,
x), atan2f(
y,
x), atan2(
y,
x)); }
277GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE(asinf(
x), asinf(
x), asin(
x)); }
278GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE(acosf(
x), acosf(
x), acos(
x)); }
281GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return true; }
282GPUdi() constexpr
bool GPUCommonMath::IsNaN(
float x) {
return false; }
285GPUhdi()
void GPUCommonMath::SinCos(
float x,
float&
s,
float&
c)
291#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
292 __sincosf(
x, &
s, &
c);
293#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
303#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
305#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
312GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t
x)
314#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
315 return x == 0 ? 32 :
GPUCA_CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x));
317 for (int32_t
i = 31;
i >= 0;
i--) {
326GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t
x)
328#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)
330 return GPUCA_CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
332 x =
x - ((
x >> 1) & 0x55555555);
333 x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
334 return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
339GPUhdi() constexpr
void GPUCommonMath::Swap(T&
a, T&
b)
341#ifndef GPUCA_GPUCODE_DEVICE
350template <
class T,
class S,
class R>
361template <
class T,
class S,
class R>
372template <
class T,
class S,
class R>
393GPUdi() float GPUCommonMath::InvSqrt(
float _x)
396 return 1.f /
Sqrt(_x);
398#if defined(__CUDACC__) || defined(__HIPCC__)
399 return __frsqrt_rn(_x);
400#elif defined(__OPENCL__) && defined(__clang__)
401 return 1.f / sqrt(_x);
402#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
403 return 1.f / sqrtf(_x);
409 const float xhalf = 0.5f *
x.f;
410 x.i = 0x5f3759df - (
x.i >> 1);
411 x.f =
x.f * (1.5f - xhalf *
x.f *
x.f);
435template <
class S,
class T>
436GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T
val)
438#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
439 return ::atomic_exchange(addr,
val);
440#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
441 return ::atomic_xchg(addr,
val);
442#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
443 return ::atomicExch(addr,
val);
444#elif defined(WITH_OPENMP)
446 __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
449 return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
453template <
class S,
class T>
454GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T
cmp, T
val)
456#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
457 return ::atomic_compare_exchange(addr,
cmp,
val) ==
cmp;
458#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
459 return ::atomic_cmpxchg(addr,
cmp,
val) ==
cmp;
460#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
461 return ::atomicCAS(addr,
cmp,
val) ==
cmp;
462#elif defined(WITH_OPENMP)
463 return __atomic_compare_exchange(addr, &
cmp, &
val,
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
465 return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp,
val);
469template <
class S,
class T>
470GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T
val)
472#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
473 return ::atomic_fetch_add(addr,
val);
474#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
475 return ::atomic_add(addr,
val);
476#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
477 return ::atomicAdd(addr,
val);
478#elif defined(WITH_OPENMP)
479 return __atomic_add_fetch(addr,
val, __ATOMIC_SEQ_CST) -
val;
481 return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
485template <
class S,
class T>
488#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
489 ::atomic_fetch_max(addr,
val);
490#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
491 ::atomic_max(addr,
val);
492#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
493 ::atomicMax(addr,
val);
496 while ((current = *(
volatile S*)addr) <
val && !AtomicCASInternal(addr, current,
val)) {
501template <
class S,
class T>
504#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
505 ::atomic_fetch_min(addr,
val);
506#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
507 ::atomic_min(addr,
val);
508#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
509 ::atomicMin(addr,
val);
512 while ((current = *(
volatile S*)addr) >
val && !AtomicCASInternal(addr, current,
val)) {
517#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT)
518#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
526 AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
528 AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
538 AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
540 AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
GLdouble GLdouble GLdouble z