15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
24#if !defined(GPUCA_GPUCODE_DEVICE)
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
38 #define GPUCA_CHOICE(c1, c2, c3) (c2)
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3)
42 #define GPUCA_CHOICE(c1, c2, c3) (c1)
54 GPUhd() constexpr static T Min(const T
x, const T
y)
59 GPUhd() constexpr static T Max(const T
x, const T
y)
63 template <
class T,
class S,
class R>
65 template <class T, class
S, class
R>
67 template <class T, class
S, class
R>
70 GPUdi() constexpr static T Clamp(const T
v, const T
lo, const T
hi)
72 return Max(
lo, Min(
v,
hi));
75 GPUd() static
float InvSqrt(
float x);
77 GPUdi() constexpr static T Square(T
x)
83 GPUd() constexpr static
float ASin(
float x);
84 GPUd() constexpr static
float ACos(
float x);
85 GPUd() constexpr static
float ATan(
float x);
86 GPUhd() constexpr static
float ATan2(
float y,
float x);
89 GPUhdni() static
void SinCos(
float x,
float&
s,
float&
c);
90 GPUhdni() static
void SinCosd(
double x,
double&
s,
double&
c);
92 GPUd() constexpr static
float Pow(
float x,
float y);
96 GPUd() constexpr static
float TwoPi() {
return 6.2831853f; }
97 GPUd() constexpr static
float Pi() {
return 3.1415927f; }
100 GPUd() static uint32_t Float2UIntReint(const
float&
x);
101 GPUd() constexpr static uint32_t Float2UIntRn(
float x) {
return (uint32_t)(int32_t)(
x + 0.5f); }
102 GPUd() constexpr static int32_t Float2IntRn(
float x);
103 GPUd() constexpr static
float Modf(
float x,
float y);
104 GPUhdi() static
float Remainderf(
float x,
float y);
105 GPUd() constexpr static
bool Finite(
float x);
106 GPUd() constexpr static
bool IsNaN(
float x);
108 GPUd() constexpr static
float QuietNaN() {
return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(
""), nan(0u)); }
110 GPUd() constexpr static uint32_t Clz(uint32_t
val);
111 GPUd() constexpr static uint32_t Popcount(uint32_t
val);
117 GPUhdi() constexpr static
float Hypot(
float x,
float y,
float z,
float w) {
return Sqrt(
x *
x +
y *
y +
z *
z +
w *
w); }
119 template <
typename T>
120 GPUhd() constexpr static
void Swap(T&
a, T&
b);
125 return GPUCommonMath::AtomicExchInternal(addr,
val);
131 return GPUCommonMath::AtomicCASInternal(addr,
cmp,
val);
137 return GPUCommonMath::AtomicAddInternal(addr,
val);
142 GPUCommonMath::AtomicMaxInternal(addr,
val);
147 GPUCommonMath::AtomicMinInternal(addr,
val);
152 return GPUCommonMath::AtomicExchInternal(addr,
val);
157 return GPUCommonMath::AtomicAddInternal(addr,
val);
162 GPUCommonMath::AtomicMaxInternal(addr,
val);
167 GPUCommonMath::AtomicMinInternal(addr,
val);
169 GPUd() constexpr static int32_t Mul24(int32_t
a, int32_t
b);
170 GPUd() constexpr static
float FMulRZ(
float a,
float b);
172 template <int32_t I, class T>
173 GPUd() constexpr static T nextMultipleOf(T
val);
175 template <typename... Args>
179 template <class
S, class T>
180 GPUd() static uint32_t AtomicExchInternal(
S* addr, T
val);
181 template <class
S, class T>
182 GPUd() static
bool AtomicCASInternal(
S* addr, T
cmp, T
val);
183 template <class
S, class T>
184 GPUd() static uint32_t AtomicAddInternal(
S* addr, T
val);
185 template <class
S, class T>
186 GPUd() static
void AtomicMaxInternal(
S* addr, T
val);
187 template <class
S, class T>
188 GPUd() static
void AtomicMinInternal(
S* addr, T
val);
193template <typename... Args>
196 if constexpr (
sizeof...(Args) == 0) {
199 return w *
w + Sum2(
args...);
206#ifndef GPUCA_GPUCODE_DEVICE
208#elif defined(__CUDACC__) || defined(__HIPCC__)
210#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
213 char* d = (
char*)
dst;
214 const char*
s = (
const char*)
src;
215 for (
size_t i = 0;
i <
size;
i++) {
221template <
int32_t I,
class T>
222GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T
val)
224 if constexpr (I & (I - 1)) {
231 return (
val + I - 1) & ~(
T)(I - 1);
238#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
242 return make_float2(
x,
y);
246GPUdi() constexpr
float GPUCommonMath::Modf(
float x,
float y) {
return GPUCA_CHOICE(fmodf(
x,
y), fmodf(
x,
y), fmod(
x,
y)); }
249GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const
float&
x)
251#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
252 return __float_as_uint(
x);
253#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
256 return reinterpret_cast<const uint32_t&
>(
x);
261GPUdi()
constexpr float GPUCommonMath::Round(
float x) {
return GPUCA_CHOICE(roundf(
x), roundf(
x), round(
x)); }
262GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(
float x) {
return (int32_t)
Round(
x); }
265GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y,
x)); }
266GPUdi() constexpr
float GPUCommonMath::
Sin(
float x) {
return GPUCA_CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
267GPUdi() constexpr
float GPUCommonMath::
Cos(
float x) {
return GPUCA_CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
268GPUdi() constexpr
float GPUCommonMath::
Tan(
float x) {
return GPUCA_CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
269GPUdi() constexpr
float GPUCommonMath::Pow(
float x,
float y) {
return GPUCA_CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x,
y)); }
270GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
271GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
272GPUdi() constexpr
float GPUCommonMath::
Log(
float x) {
return GPUCA_CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
274GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return GPUCA_CHOICE(std::isfinite(
x), isfinite(
x), isfinite(
x)); }
281GPUhdi() constexpr
float GPUCommonMath::ATan2(
float y,
float x) {
return GPUCA_CHOICE(atan2f(
y,
x), atan2f(
y,
x), atan2(
y,
x)); }
286GPUdi() constexpr
float GPUCommonMath::ASin(
float x) {
return GPUCA_CHOICE(asinf(
x), asinf(
x), asin(
x)); }
287GPUdi() constexpr
float GPUCommonMath::ACos(
float x) {
return GPUCA_CHOICE(acosf(
x), acosf(
x), acos(
x)); }
290GPUdi() constexpr
bool GPUCommonMath::Finite(
float x) {
return true; }
291GPUdi() constexpr
bool GPUCommonMath::IsNaN(
float x) {
return false; }
294GPUhdi()
void GPUCommonMath::SinCos(
float x,
float&
s,
float&
c)
300#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
301 __sincosf(
x, &
s, &
c);
302#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
310GPUhdi()
void GPUCommonMath::SinCosd(
double x,
double&
s,
double&
c)
312#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
314#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
321GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t
x)
323#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
324 return x == 0 ? 32 :
GPUCA_CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x));
326 for (int32_t
i = 31;
i >= 0;
i--) {
335GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t
x)
337#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)
339 return GPUCA_CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
341 x =
x - ((
x >> 1) & 0x55555555);
342 x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
343 return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
348GPUhdi() constexpr
void GPUCommonMath::Swap(T&
a, T&
b)
350#ifndef GPUCA_GPUCODE_DEVICE
359template <
class T,
class S,
class R>
370template <
class T,
class S,
class R>
381template <
class T,
class S,
class R>
402GPUdi() float GPUCommonMath::InvSqrt(
float _x)
405 return 1.f /
Sqrt(_x);
407#if defined(__CUDACC__) || defined(__HIPCC__)
408 return __frsqrt_rn(_x);
409#elif defined(__OPENCL__) && defined(__clang__)
410 return 1.f / sqrt(_x);
411#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
412 return 1.f / sqrtf(_x);
418 const float xhalf = 0.5f *
x.f;
419 x.i = 0x5f3759df - (
x.i >> 1);
420 x.f =
x.f * (1.5f - xhalf *
x.f *
x.f);
444template <
class S,
class T>
445GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T
val)
447#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
448 return ::atomic_exchange(addr,
val);
449#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
450 return ::atomic_xchg(addr,
val);
451#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
452 return ::atomicExch(addr,
val);
453#elif defined(WITH_OPENMP)
455 __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
458 return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
462template <
class S,
class T>
463GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T
cmp, T
val)
465#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
466 return ::atomic_compare_exchange(addr,
cmp,
val) ==
cmp;
467#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
468 return ::atomic_cmpxchg(addr,
cmp,
val) ==
cmp;
469#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
470 return ::atomicCAS(addr,
cmp,
val) ==
cmp;
471#elif defined(WITH_OPENMP)
472 return __atomic_compare_exchange(addr, &
cmp, &
val,
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
474 return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp,
val);
478template <
class S,
class T>
479GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T
val)
481#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
482 return ::atomic_fetch_add(addr,
val);
483#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
484 return ::atomic_add(addr,
val);
485#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
486 return ::atomicAdd(addr,
val);
487#elif defined(WITH_OPENMP)
488 return __atomic_add_fetch(addr,
val, __ATOMIC_SEQ_CST) -
val;
490 return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
494template <
class S,
class T>
497#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
498 ::atomic_fetch_max(addr,
val);
499#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
500 ::atomic_max(addr,
val);
501#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
502 ::atomicMax(addr,
val);
505 while ((current = *(
volatile S*)addr) <
val && !AtomicCASInternal(addr, current,
val)) {
510template <
class S,
class T>
513#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
514 ::atomic_fetch_min(addr,
val);
515#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
516 ::atomic_min(addr,
val);
517#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
518 ::atomicMin(addr,
val);
521 while ((current = *(
volatile S*)addr) >
val && !AtomicCASInternal(addr, current,
val)) {
526#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT) && !defined(__CLING__)
527#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
535 AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
537 AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
547 AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
549 AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUd() const expr static float QuietNaN()
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhd() const expr static T Abs(T x)
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() static float Remainderf(float x
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
GLdouble GLdouble GLdouble z