15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
24#if !defined(GPUCA_GPUCODE_DEVICE)
30#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
45 GPUhd() static T Min(const T
x, const T
y);
47 GPUhd() static T Max(const T
x, const T
y);
48 template <class T, class
S, class
R>
50 template <class T, class
S, class
R>
52 template <class T, class
S, class
R>
55 GPUdi() static T Clamp(const T
v, const T
lo, const T
hi)
57 return Max(
lo, Min(
v,
hi));
60 GPUd() static
float InvSqrt(
float x);
63 GPUd() static
float ASin(
float x);
64 GPUd() static
float ACos(
float x);
65 GPUd() static
float ATan(
float x);
66 GPUhd() static
float ATan2(
float y,
float x);
69 GPUhdni() static
void SinCos(
float x,
float&
s,
float&
c);
70 GPUhdni() static
void SinCosd(
double x,
double&
s,
double&
c);
72 GPUd() static
float Pow(
float x,
float y);
76 GPUd() static constexpr
float TwoPi() {
return 6.2831853f; }
77 GPUd() static constexpr
float Pi() {
return 3.1415927f; }
79 GPUd() static
float Floor(
float x);
80 GPUd() static uint32_t Float2UIntReint(const
float&
x);
81 GPUd() static uint32_t Float2UIntRn(
float x);
82 GPUd() static int32_t Float2IntRn(
float x);
83 GPUd() static
float Modf(
float x,
float y);
84 GPUd() static
bool Finite(
float x);
86 GPUd() static uint32_t Popcount(uint32_t
val);
90 GPUhdni() static
float Hypot(
float x,
float y,
float z,
float w);
98 return GPUCommonMath::AtomicExchInternal(addr,
val);
104 return GPUCommonMath::AtomicCASInternal(addr,
cmp,
val);
110 return GPUCommonMath::AtomicAddInternal(addr,
val);
115 GPUCommonMath::AtomicMaxInternal(addr,
val);
120 GPUCommonMath::AtomicMinInternal(addr,
val);
125 return GPUCommonMath::AtomicExchInternal(addr,
val);
130 return GPUCommonMath::AtomicAddInternal(addr,
val);
135 GPUCommonMath::AtomicMaxInternal(addr,
val);
140 GPUCommonMath::AtomicMinInternal(addr,
val);
142 GPUd() static int32_t Mul24(int32_t
a, int32_t
b);
143 GPUd() static
float FMulRZ(
float a,
float b);
145 template <int32_t I, class T>
146 GPUd() constexpr static T nextMultipleOf(T
val);
148 template <typename... Args>
151 if constexpr (
sizeof...(Args) == 0) {
154 return w *
w + Sum2(
args...);
160 template <
class S,
class T>
161 GPUd() static uint32_t AtomicExchInternal(
S* addr, T
val);
162 template <class
S, class T>
163 GPUd() static
bool AtomicCASInternal(
S* addr, T
cmp, T
val);
164 template <class
S, class T>
165 GPUd() static uint32_t AtomicAddInternal(
S* addr, T
val);
166 template <class
S, class T>
167 GPUd() static
void AtomicMaxInternal(
S* addr, T
val);
168 template <class
S, class T>
169 GPUd() static
void AtomicMinInternal(
S* addr, T
val);
175#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
176 #define CHOICE(c1, c2, c3) (c2)
177#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
178 #define CHOICE(c1, c2, c3) (c3)
180 #define CHOICE(c1, c2, c3) (c1)
183template <
int32_t I,
class T>
184GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T
val)
186 if constexpr (I & (I - 1)) {
193 return (
val + I - 1) & ~(T)(I - 1);
200#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
204 return make_float2(
x,
y);
208GPUdi() float GPUCommonMath::Modf(
float x,
float y) {
return CHOICE(fmodf(
x,
y), fmodf(
x,
y), fmod(
x,
y)); }
210GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const
float&
x)
212#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
213 return __float_as_uint(
x);
214#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
217 return reinterpret_cast<const uint32_t&
>(
x);
221GPUdi() uint32_t GPUCommonMath::Float2UIntRn(
float x) {
return (uint32_t)(int32_t)(
x + 0.5f); }
222GPUdi() float GPUCommonMath::Floor(
float x) {
return CHOICE(floorf(
x), floorf(
x), floor(
x)); }
224#ifdef GPUCA_NO_FAST_MATH
226GPUdi() int32_t GPUCommonMath::Float2IntRn(
float x) {
return (int32_t)
Round(
x); }
227GPUdi() bool GPUCommonMath::Finite(
float x) {
return CHOICE(std::isfinite(
x), isfinite(
x),
true); }
228GPUhdi() float GPUCommonMath::
Sqrt(
float x) {
return CHOICE(sqrtf(
x), (
float)sqrt((
double)
x), sqrt(
x)); }
230GPUhdi() float GPUCommonMath::ATan2(
float y,
float x) {
return CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y,
x)); }
231GPUdi() float GPUCommonMath::
Sin(
float x) {
return CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
232GPUdi() float GPUCommonMath::
Cos(
float x) {
return CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
233GPUdi() float GPUCommonMath::
Tan(
float x) {
return CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
234GPUdi() float GPUCommonMath::Pow(
float x,
float y) {
return CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x,
y)); }
235GPUdi() float GPUCommonMath::ASin(
float x) {
return CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
236GPUdi() float GPUCommonMath::ACos(
float x) {
return CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
237GPUdi() float GPUCommonMath::
Log(
float x) {
return CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
241GPUdi() int32_t GPUCommonMath::Float2IntRn(
float x) {
return CHOICE((int32_t)
Round(
x), __float2int_rn(
x), (int32_t)
Round(
x)); }
242GPUdi() bool GPUCommonMath::Finite(
float x) {
return CHOICE(std::isfinite(
x),
true,
true); }
244GPUdi() float GPUCommonMath::ATan(
float x) {
return CHOICE(atanf(
x), atanf(
x), atan(
x)); }
245GPUhdi() float GPUCommonMath::ATan2(
float y,
float x) {
return CHOICE(atan2f(
y,
x), atan2f(
y,
x), atan2(
y,
x)); }
249GPUdi() float GPUCommonMath::Pow(
float x,
float y) {
return CHOICE(powf(
x,
y), powf(
x,
y), pow(
x,
y)); }
250GPUdi() float GPUCommonMath::ASin(
float x) {
return CHOICE(asinf(
x), asinf(
x), asin(
x)); }
251GPUdi() float GPUCommonMath::ACos(
float x) {
return CHOICE(acosf(
x), acosf(
x), acos(
x)); }
256GPUhdi()
void GPUCommonMath::SinCos(
float x,
float&
s,
float&
c)
258#if defined(GPUCA_NO_FAST_MATH) && !defined(__OPENCL__)
261#elif !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
262 __sincosf(
x, &
s, &
c);
263#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
266 CHOICE((
void)((
s = sinf(
x)) + (
c = cosf(
x))), sincosf(
x, &
s, &
c),
s = sincos(
x, &
c));
270GPUhdi()
void GPUCommonMath::SinCosd(
double x,
double&
s,
double&
c)
272#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
274#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
277 CHOICE((
void)((
s = sin(
x)) + (
c = cos(
x))), sincos(
x, &
s, &
c),
s = sincos(
x, &
c));
281GPUdi() uint32_t GPUCommonMath::Clz(uint32_t
x)
283#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
284 return x == 0 ? 32 :
CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x));
286 for (int32_t
i = 31;
i >= 0;
i--) {
295GPUdi() uint32_t GPUCommonMath::Popcount(uint32_t
x)
297#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)
299 return CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
301 x =
x - ((
x >> 1) & 0x55555555);
302 x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
303 return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
307GPUhdi() float GPUCommonMath::Hypot(
float x,
float y)
312GPUhdi() float GPUCommonMath::Hypot(
float x,
float y,
float z)
317GPUhdi() float GPUCommonMath::Hypot(
float x,
float y,
float z,
float w)
333 CHOICE(std::swap(
a,
b), _swap<T>(
a,
b), _swap<T>(
a,
b));
337GPUhdi() T GPUCommonMath::Min(const T
x, const T
y)
343GPUhdi() T GPUCommonMath::Max(const T
x, const T
y)
348template <
class T,
class S,
class R>
359template <
class T,
class S,
class R>
370template <
class T,
class S,
class R>
391GPUdi() float GPUCommonMath::InvSqrt(
float _x)
393#if defined(GPUCA_NO_FAST_MATH) || defined(__OPENCL__)
394 return 1.f /
Sqrt(_x);
395#elif defined(__CUDACC__) || defined(__HIPCC__)
396 return __frsqrt_rn(_x);
397#elif defined(__FAST_MATH__)
398 return 1.f / sqrtf(_x);
404 const float xhalf = 0.5f *
x.f;
405 x.i = 0x5f3759df - (
x.i >> 1);
406 x.f =
x.f * (1.5f - xhalf *
x.f *
x.f);
414 return CHOICE(fabsf(
x), fabsf(
x), fabs(
x));
417#if !defined(__OPENCL__) || defined(cl_khr_fp64)
421 return CHOICE(fabs(
x), fabs(
x), fabs(
x));
431GPUhdi() float GPUCommonMath::Copysign(
float x,
float y)
433#if defined(__OPENCL__)
434 return copysign(
x,
y);
435#elif defined(GPUCA_GPUCODE) && !defined(__OPENCL__)
436 return copysignf(
x,
y);
438 return std::copysignf(
x,
y);
442template <
class S,
class T>
443GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T
val)
445#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
446 return ::atomic_exchange(addr,
val);
447#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
448 return ::atomic_xchg(addr,
val);
449#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
450 return ::atomicExch(addr,
val);
451#elif defined(WITH_OPENMP)
453 __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
456 return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
460template <
class S,
class T>
461GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T
cmp, T
val)
463#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
464 return ::atomic_compare_exchange(addr,
cmp,
val) ==
cmp;
465#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
466 return ::atomic_cmpxchg(addr,
cmp,
val) ==
cmp;
467#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
468 return ::atomicCAS(addr,
cmp,
val) ==
cmp;
469#elif defined(WITH_OPENMP)
470 return __atomic_compare_exchange(addr, &
cmp, &
val,
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
472 return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp,
val);
476template <
class S,
class T>
477GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T
val)
479#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
480 return ::atomic_fetch_add(addr,
val);
481#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
482 return ::atomic_add(addr,
val);
483#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
484 return ::atomicAdd(addr,
val);
485#elif defined(WITH_OPENMP)
486 return __atomic_add_fetch(addr,
val, __ATOMIC_SEQ_CST) -
val;
488 return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
492template <
class S,
class T>
495#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
496 ::atomic_fetch_max(addr,
val);
497#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
498 ::atomic_max(addr,
val);
499#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
500 ::atomicMax(addr,
val);
503 while ((current = *(
volatile S*)addr) <
val && !AtomicCASInternal(addr, current,
val)) {
508template <
class S,
class T>
511#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
512 ::atomic_fetch_min(addr,
val);
513#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
514 ::atomic_min(addr,
val);
515#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
516 ::atomicMin(addr,
val);
519 while ((current = *(
volatile S*)addr) >
val && !AtomicCASInternal(addr, current,
val)) {
524#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT)
525#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
533 AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
535 AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
545 AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
547 AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define CHOICE(c1, c2, c3)
GPUhd() static T Min(const T x
GPUd() static constexpr float Pi()
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUhdni() static float Sqrt(float x)
GPUd() static float2 MakeFloat2(float x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static float Round(float x)
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static T Clamp(const T v
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
GLdouble GLdouble GLdouble z
GPUhdi() float GPUCommonMath
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...