15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
24#if !defined(GPUCA_GPUCODE_DEVICE)
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
38 #define GPUCA_CHOICE(c1, c2, c3) (c2)
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3)
42 #define GPUCA_CHOICE(c1, c2, c3) (c1)
54 GPUhd() constexpr static T Min(const T
x, const T
y)
59 GPUhd() constexpr static T Max(const T
x, const T
y)
63 template <
class T,
class S,
class R>
65 template <class T, class
S, class
R>
67 template <class T, class
S, class
R>
70 GPUdi() constexpr static T Clamp(const T
v, const T
lo, const T
hi)
72 return Max(
lo, Min(
v,
hi));
75 GPUd() static
float InvSqrt(
float x);
78 GPUd() constexpr static
float ASin(
float x);
79 GPUd() constexpr static
float ACos(
float x);
80 GPUd() constexpr static
float ATan(
float x);
81 GPUhd() constexpr static
float ATan2(
float y,
float x);
84 GPUhdni() static
void SinCos(
float x,
float&
s,
float&
c);
85 GPUhdni() static
void SinCosd(
double x,
double&
s,
double&
c);
87 GPUd() constexpr static
float Pow(
float x,
float y);
91 GPUd() constexpr static
float TwoPi() {
return 6.2831853f; }
92 GPUd() constexpr static
float Pi() {
return 3.1415927f; }
95 GPUd() static uint32_t Float2UIntReint(const
float&
x);
96 GPUd() constexpr static uint32_t Float2UIntRn(
float x) {
return (uint32_t)(int32_t)(
x + 0.5f); }
97 GPUd() constexpr static int32_t Float2IntRn(
float x);
98 GPUd() constexpr static
float Modf(
float x,
float y);
99 GPUhdi() static
float Remainderf(
float x,
float y);
100 GPUd() constexpr static
bool Finite(
float x);
101 GPUd() constexpr static
bool IsNaN(
float x);
102 GPUd() constexpr static
float QuietNaN() {
return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(
""), nan(0u)); }
103 GPUd() constexpr static uint32_t Clz(uint32_t
val);
104 GPUd() constexpr static uint32_t Popcount(uint32_t
val);
110 GPUhdi() constexpr static
float Hypot(
float x,
float y,
float z,
float w) {
return Sqrt(
x *
x +
y *
y +
z *
z +
w *
w); }
112 template <
typename T>
113 GPUhd() constexpr static
void Swap(T&
a, T&
b);
118 return GPUCommonMath::AtomicExchInternal(addr,
val);
124 return GPUCommonMath::AtomicCASInternal(addr,
cmp,
val);
130 return GPUCommonMath::AtomicAddInternal(addr,
val);
135 GPUCommonMath::AtomicMaxInternal(addr,
val);
140 GPUCommonMath::AtomicMinInternal(addr,
val);
145 return GPUCommonMath::AtomicExchInternal(addr,
val);
150 return GPUCommonMath::AtomicAddInternal(addr,
val);
155 GPUCommonMath::AtomicMaxInternal(addr,
val);
160 GPUCommonMath::AtomicMinInternal(addr,
val);
162 GPUd() constexpr static int32_t Mul24(int32_t
a, int32_t
b);
163 GPUd() constexpr static
float FMulRZ(
float a,
float b);
165 template <int32_t I, class T>
166 GPUd() constexpr static T nextMultipleOf(T
val);
168 template <typename... Args>
172 template <class
S, class T>
173 GPUd() static uint32_t AtomicExchInternal(
S* addr, T
val);
174 template <class
S, class T>
175 GPUd() static
bool AtomicCASInternal(
S* addr, T
cmp, T
val);
176 template <class
S, class T>
177 GPUd() static uint32_t AtomicAddInternal(
S* addr, T
val);
178 template <class
S, class T>
179 GPUd() static
void AtomicMaxInternal(
S* addr, T
val);
180 template <class
S, class T>
181 GPUd() static
void AtomicMinInternal(
S* addr, T
val);
186template <typename... Args>
189 if constexpr (
sizeof...(Args) == 0) {
192 return w *
w + Sum2(
args...);
199#ifndef GPUCA_GPUCODE_DEVICE
201#elif defined(__CUDACC__) || defined(__HIPCC__)
203#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
206 char* d = (
char*)
dst;
207 const char*
s = (
const char*)
src;
208 for (
size_t i = 0;
i <
size;
i++) {
214template <
int32_t I,
class T>
215GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T
val)
217 if constexpr (I & (I - 1)) {
224 return (
val + I - 1) & ~(
T)(I - 1);
231#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
235 return make_float2(
x,
y);
239GPUdi() constexpr
float GPUCommonMath::Modf(
float x,
float y) {
return GPUCA_CHOICE(fmodf(
x,
y), fmodf(
x,
y), fmod(
x,
y)); }
242GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const
float&
x)
244#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
245 return __float_as_uint(
x);
246#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
249 return reinterpret_cast<const uint32_t&
>(
x);
254GPUdi()
constexpr float GPUCommonMath::Round(
float x) {
return GPUCA_CHOICE(roundf(
x), roundf(
x), round(
x)); }
255GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(
float x) {
return (int32_t)
Round(
x); }
258GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y,
x)); }
259GPUdi() constexpr
float GPUCommonMath::
Sin(
float x) {
return GPUCA_CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
260GPUdi() constexpr
float GPUCommonMath::
Cos(
float x) {
return GPUCA_CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
261GPUdi() constexpr
float GPUCommonMath::
Tan(
float x) {
return GPUCA_CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
262GPUdi() constexpr
float GPUCommonMath::Pow(
float x,
float y) {
return GPUCA_CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x,
y)); }
263GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
264GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
265GPUdi() constexpr
float GPUCommonMath::
Log(
float x) {
return GPUCA_CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
267GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return GPUCA_CHOICE(std::isfinite(
x), isfinite(
x), isfinite(
x)); }
274GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE(atan2f(
y,
x), atan2f(
y,
x), atan2(
y,
x)); }
279GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE(asinf(
x), asinf(
x), asin(
x)); }
280GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE(acosf(
x), acosf(
x), acos(
x)); }
283GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return true; }
284GPUdi() constexpr
bool GPUCommonMath::IsNaN(
float x) {
return false; }
287GPUhdi()
void GPUCommonMath::SinCos(
float x,
float&
s,
float&
c)
293#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
294 __sincosf(
x, &
s, &
c);
295#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
303GPUhdi()
void GPUCommonMath::SinCosd(
double x,
double&
s,
double&
c)
305#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
307#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
314GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t
x)
316#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
317 return x == 0 ? 32 :
GPUCA_CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x));
319 for (int32_t
i = 31;
i >= 0;
i--) {
328GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t
x)
330#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)
332 return GPUCA_CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
334 x =
x - ((
x >> 1) & 0x55555555);
335 x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
336 return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
341GPUhdi() constexpr
void GPUCommonMath::Swap(T&
a, T&
b)
343#ifndef GPUCA_GPUCODE_DEVICE
352template <
class T,
class S,
class R>
363template <
class T,
class S,
class R>
374template <
class T,
class S,
class R>
395GPUdi() float GPUCommonMath::InvSqrt(
float _x)
398 return 1.f /
Sqrt(_x);
400#if defined(__CUDACC__) || defined(__HIPCC__)
401 return __frsqrt_rn(_x);
402#elif defined(__OPENCL__) && defined(__clang__)
403 return 1.f / sqrt(_x);
404#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
405 return 1.f / sqrtf(_x);
411 const float xhalf = 0.5f *
x.f;
412 x.i = 0x5f3759df - (
x.i >> 1);
413 x.f =
x.f * (1.5f - xhalf *
x.f *
x.f);
437template <
class S,
class T>
438GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T
val)
440#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
441 return ::atomic_exchange(addr,
val);
442#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
443 return ::atomic_xchg(addr,
val);
444#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
445 return ::atomicExch(addr,
val);
446#elif defined(WITH_OPENMP)
448 __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
451 return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
455template <
class S,
class T>
456GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T
cmp, T
val)
458#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
459 return ::atomic_compare_exchange(addr,
cmp,
val) ==
cmp;
460#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
461 return ::atomic_cmpxchg(addr,
cmp,
val) ==
cmp;
462#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
463 return ::atomicCAS(addr,
cmp,
val) ==
cmp;
464#elif defined(WITH_OPENMP)
465 return __atomic_compare_exchange(addr, &
cmp, &
val,
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
467 return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp,
val);
471template <
class S,
class T>
472GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T
val)
474#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
475 return ::atomic_fetch_add(addr,
val);
476#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
477 return ::atomic_add(addr,
val);
478#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
479 return ::atomicAdd(addr,
val);
480#elif defined(WITH_OPENMP)
481 return __atomic_add_fetch(addr,
val, __ATOMIC_SEQ_CST) -
val;
483 return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
487template <
class S,
class T>
490#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
491 ::atomic_fetch_max(addr,
val);
492#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
493 ::atomic_max(addr,
val);
494#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
495 ::atomicMax(addr,
val);
498 while ((current = *(
volatile S*)addr) <
val && !AtomicCASInternal(addr, current,
val)) {
503template <
class S,
class T>
506#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
507 ::atomic_fetch_min(addr,
val);
508#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
509 ::atomic_min(addr,
val);
510#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
511 ::atomicMin(addr,
val);
514 while ((current = *(
volatile S*)addr) >
val && !AtomicCASInternal(addr, current,
val)) {
519#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT)
520#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
528 AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
530 AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
540 AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
542 AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() static float Remainderf(float x
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
GLdouble GLdouble GLdouble z