15#ifndef GPUCOMMONMATH_H 
   16#define GPUCOMMONMATH_H 
   20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY) 
   21#include <sm_20_atomic_functions.h> 
   24#if !defined(GPUCA_GPUCODE_DEVICE) 
   32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__)) 
   37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))  
   38    #define GPUCA_CHOICE(c1, c2, c3) (c2)  
   39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__) 
   40    #define GPUCA_CHOICE(c1, c2, c3) (c3)  
   42    #define GPUCA_CHOICE(c1, c2, c3) (c1)  
   54  GPUhd() constexpr static T Min(const T 
x, const T 
y)
 
   59  GPUhd() constexpr static T Max(const T 
x, const T 
y)
 
   63  template <
class T, 
class S, 
class R>
 
   65  template <class T, class 
S, class 
R>
 
   67  template <class T, class 
S, class 
R>
 
   70  GPUdi() constexpr static T Clamp(const T 
v, const T 
lo, const T 
hi)
 
   72    return Max(
lo, Min(
v, 
hi));
 
   75  GPUd() static 
float InvSqrt(
float x);
 
   77  GPUdi() constexpr static T Square(T 
x)
 
 
   83  GPUd() constexpr static 
float ASin(
float x);
 
   84  GPUd() constexpr static 
float ACos(
float x);
 
   85  GPUd() constexpr static 
float ATan(
float x);
 
   86  GPUhd() constexpr static 
float ATan2(
float y, 
float x);
 
   89  GPUhdni() static 
void SinCos(
float x, 
float& 
s, 
float& 
c);
 
   90  GPUhdni() static 
void SinCosd(
double x, 
double& 
s, 
double& 
c);
 
   92  GPUd() constexpr static 
float Pow(
float x, 
float y);
 
   96  GPUd() constexpr static 
float TwoPi() { 
return 6.2831853f; }
 
   97  GPUd() constexpr static 
float Pi() { 
return 3.1415927f; }
 
  100  GPUd() static uint32_t Float2UIntReint(const 
float& 
x);
 
  101  GPUd() constexpr static uint32_t Float2UIntRn(
float x) { 
return (uint32_t)(int32_t)(
x + 0.5f); }
 
  102  GPUd() constexpr static int32_t Float2IntRn(
float x);
 
  103  GPUd() constexpr static 
float Modf(
float x, 
float y);
 
  104  GPUhdi() static 
float Remainderf(
float x, 
float y);
 
  105  GPUd() constexpr static 
bool Finite(
float x);
 
  106  GPUd() constexpr static 
bool IsNaN(
float x);
 
  108  GPUd() constexpr static 
float QuietNaN() { 
return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(
""), nan(0u)); }
 
  110  GPUd() constexpr static uint32_t Clz(uint32_t 
val);
 
  111  GPUd() constexpr static uint32_t Popcount(uint32_t 
val);
 
  117  GPUhdi() constexpr static 
float Hypot(
float x, 
float y, 
float z, 
float w) { 
return Sqrt(
x * 
x + 
y * 
y + 
z * 
z + 
w * 
w); }
 
  119  template <
typename T>
 
  120  GPUhd() constexpr static 
void Swap(T& 
a, T& 
b);
 
  125    return GPUCommonMath::AtomicExchInternal(addr, 
val);
 
  131    return GPUCommonMath::AtomicCASInternal(addr, 
cmp, 
val);
 
  137    return GPUCommonMath::AtomicAddInternal(addr, 
val);
 
  142    GPUCommonMath::AtomicMaxInternal(addr, 
val);
 
  147    GPUCommonMath::AtomicMinInternal(addr, 
val);
 
  152    return GPUCommonMath::AtomicExchInternal(addr, 
val);
 
  157    return GPUCommonMath::AtomicAddInternal(addr, 
val);
 
  162    GPUCommonMath::AtomicMaxInternal(addr, 
val);
 
  167    GPUCommonMath::AtomicMinInternal(addr, 
val);
 
  169  GPUd() constexpr static int32_t Mul24(int32_t 
a, int32_t 
b);
 
  170  GPUd() constexpr static 
float FMulRZ(
float a, 
float b);
 
  172  template <int32_t I, class T>
 
  173  GPUd() constexpr static T nextMultipleOf(T 
val);
 
  175  template <typename... Args>
 
  179  template <class 
S, class T>
 
  180  GPUd() static uint32_t AtomicExchInternal(
S* addr, T 
val);
 
  181  template <class 
S, class T>
 
  182  GPUd() static 
bool AtomicCASInternal(
S* addr, T 
cmp, T 
val);
 
  183  template <class 
S, class T>
 
  184  GPUd() static uint32_t AtomicAddInternal(
S* addr, T 
val);
 
  185  template <class 
S, class T>
 
  186  GPUd() static 
void AtomicMaxInternal(
S* addr, T 
val);
 
  187  template <class 
S, class T>
 
  188  GPUd() static 
void AtomicMinInternal(
S* addr, T 
val);
 
 
  193template <typename... Args>
 
  196  if constexpr (
sizeof...(Args) == 0) {
 
  199    return w * 
w + Sum2(
args...);
 
 
  206#ifndef GPUCA_GPUCODE_DEVICE 
  208#elif defined(__CUDACC__) || defined(__HIPCC__) 
  210#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) 
  213  char* d = (
char*)
dst;
 
  214  const char* 
s = (
const char*)
src;
 
  215  for (
size_t i = 0; 
i < 
size; 
i++) {
 
  221template <
int32_t I, 
class T>
 
  222GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T 
val)
 
  224  if constexpr (I & (I - 1)) {
 
  231    return (
val + I - 1) & ~(
T)(I - 1);
 
  238#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__) 
  242  return make_float2(
x, 
y);
 
  246GPUdi() constexpr 
float GPUCommonMath::Modf(
float x, 
float y) { 
return GPUCA_CHOICE(fmodf(
x, 
y), fmodf(
x, 
y), fmod(
x, 
y)); }
 
  249GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const 
float& 
x)
 
  251#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__)) 
  252  return __float_as_uint(
x);
 
  253#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__) 
  256  return reinterpret_cast<const uint32_t&
>(
x);
 
  261GPUdi() 
constexpr float GPUCommonMath::Round(
float x) { 
return GPUCA_CHOICE(roundf(
x), roundf(
x), round(
x)); }
 
  262GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(
float x) { 
return (int32_t)
Round(
x); }
 
  265GPUhdi() constexpr 
float GPUCommonMath::ATan2(
float y, 
float x) { 
return GPUCA_CHOICE((
float)atan2((
double)
y, (
double)
x), (
float)atan2((
double)
y, (
double)
x), atan2(
y, 
x)); }
 
  266GPUdi() constexpr 
float GPUCommonMath::
Sin(
float x) { 
return GPUCA_CHOICE((
float)sin((
double)
x), (
float)sin((
double)
x), sin(
x)); }
 
  267GPUdi() constexpr 
float GPUCommonMath::
Cos(
float x) { 
return GPUCA_CHOICE((
float)cos((
double)
x), (
float)cos((
double)
x), cos(
x)); }
 
  268GPUdi() constexpr 
float GPUCommonMath::
Tan(
float x) { 
return GPUCA_CHOICE((
float)tanf((
double)
x), (
float)tanf((
double)
x), tan(
x)); }
 
  269GPUdi() constexpr 
float GPUCommonMath::Pow(
float x, 
float y) { 
return GPUCA_CHOICE((
float)pow((
double)
x, (
double)
y), pow((
double)
x, (
double)
y), pow(
x, 
y)); }
 
  270GPUdi() constexpr 
float GPUCommonMath::ASin(
float x) { 
return GPUCA_CHOICE((
float)asin((
double)
x), (
float)asin((
double)
x), asin(
x)); }
 
  271GPUdi() constexpr 
float GPUCommonMath::ACos(
float x) { 
return GPUCA_CHOICE((
float)acos((
double)
x), (
float)acos((
double)
x), acos(
x)); }
 
  272GPUdi() constexpr 
float GPUCommonMath::
Log(
float x) { 
return GPUCA_CHOICE((
float)log((
double)
x), (
float)log((
double)
x), log(
x)); }
 
  274GPUdi() constexpr 
bool GPUCommonMath::Finite(
float x) { 
return GPUCA_CHOICE(std::isfinite(
x), isfinite(
x), isfinite(
x)); }
 
  281GPUhdi() constexpr 
float GPUCommonMath::ATan2(
float y, 
float x) { 
return GPUCA_CHOICE(atan2f(
y, 
x), atan2f(
y, 
x), atan2(
y, 
x)); }
 
  286GPUdi() constexpr 
float GPUCommonMath::ASin(
float x) { 
return GPUCA_CHOICE(asinf(
x), asinf(
x), asin(
x)); }
 
  287GPUdi() constexpr 
float GPUCommonMath::ACos(
float x) { 
return GPUCA_CHOICE(acosf(
x), acosf(
x), acos(
x)); }
 
  290GPUdi() constexpr 
bool GPUCommonMath::Finite(
float x) { 
return true; }
 
  291GPUdi() constexpr 
bool GPUCommonMath::IsNaN(
float x) { 
return false; }
 
  294GPUhdi() 
void GPUCommonMath::SinCos(
float x, 
float& 
s, 
float& 
c)
 
  300#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__) 
  301    __sincosf(
x, &
s, &
c);
 
  302#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE)) 
 
  310GPUhdi() 
void GPUCommonMath::SinCosd(
double x, 
double& 
s, 
double& 
c)
 
  312#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__) 
  314#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE)) 
  321GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t 
x)
 
  323#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) 
  324  return x == 0 ? 32 : 
GPUCA_CHOICE(__builtin_clz(
x), __clz(
x), __builtin_clz(
x)); 
 
  326  for (int32_t 
i = 31; 
i >= 0; 
i--) {
 
  335GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t 
x)
 
  337#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__)  
  339  return GPUCA_CHOICE(__builtin_popcount(
x), __popc(
x), __builtin_popcount(
x));
 
  341  x = 
x - ((
x >> 1) & 0x55555555);
 
  342  x = (
x & 0x33333333) + ((
x >> 2) & 0x33333333);
 
  343  return (((
x + (
x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
 
  348GPUhdi() constexpr 
void GPUCommonMath::Swap(T& 
a, T& 
b)
 
  350#ifndef GPUCA_GPUCODE_DEVICE 
  359template <
class T, 
class S, 
class R>
 
  370template <
class T, 
class S, 
class R>
 
  381template <
class T, 
class S, 
class R>
 
  402GPUdi() float GPUCommonMath::InvSqrt(
float _x)
 
  405    return 1.f / 
Sqrt(_x);
 
  407#if defined(__CUDACC__) || defined(__HIPCC__) 
  408    return __frsqrt_rn(_x);
 
  409#elif defined(__OPENCL__) && defined(__clang__) 
  410    return 1.f / sqrt(_x);
 
  411#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__)) 
  412    return 1.f / sqrtf(_x);
 
  418    const float xhalf = 0.5f * 
x.f;
 
  419    x.i = 0x5f3759df - (
x.i >> 1);
 
  420    x.f = 
x.f * (1.5f - xhalf * 
x.f * 
x.f);
 
  444template <
class S, 
class T>
 
  445GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(
S* addr, T 
val)
 
  447#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)) 
  448  return ::atomic_exchange(addr, 
val);
 
  449#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__) 
  450  return ::atomic_xchg(addr, 
val);
 
  451#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__)) 
  452  return ::atomicExch(addr, 
val);
 
  453#elif defined(WITH_OPENMP) 
  455  __atomic_exchange(addr, &
val, &old, __ATOMIC_SEQ_CST);
 
  458  return reinterpret_cast<std::atomic<T>*
>(addr)->exchange(
val);
 
  462template <
class S, 
class T>
 
  463GPUdi() bool GPUCommonMath::AtomicCASInternal(
S* addr, T 
cmp, T 
val)
 
  465#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)) 
  466  return ::atomic_compare_exchange(addr, 
cmp, 
val) == 
cmp;
 
  467#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__) 
  468  return ::atomic_cmpxchg(addr, 
cmp, 
val) == 
cmp;
 
  469#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__)) 
  470  return ::atomicCAS(addr, 
cmp, 
val) == 
cmp;
 
  471#elif defined(WITH_OPENMP) 
  472  return __atomic_compare_exchange(addr, &
cmp, &
val, 
true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
 
  474  return reinterpret_cast<std::atomic<T>*
>(addr)->compare_exchange_strong(
cmp, 
val);
 
  478template <
class S, 
class T>
 
  479GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(
S* addr, T 
val)
 
  481#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)) 
  482  return ::atomic_fetch_add(addr, 
val);
 
  483#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__) 
  484  return ::atomic_add(addr, 
val);
 
  485#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__)) 
  486  return ::atomicAdd(addr, 
val);
 
  487#elif defined(WITH_OPENMP) 
  488  return __atomic_add_fetch(addr, 
val, __ATOMIC_SEQ_CST) - 
val;
 
  490  return reinterpret_cast<std::atomic<T>*
>(addr)->fetch_add(
val);
 
  494template <
class S, 
class T>
 
  497#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)) 
  498  ::atomic_fetch_max(addr, 
val);
 
  499#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__) 
  500  ::atomic_max(addr, 
val);
 
  501#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__)) 
  502  ::atomicMax(addr, 
val);
 
  505  while ((current = *(
volatile S*)addr) < 
val && !AtomicCASInternal(addr, current, 
val)) {
 
  510template <
class S, 
class T>
 
  513#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)) 
  514  ::atomic_fetch_min(addr, 
val);
 
  515#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__) 
  516  ::atomic_min(addr, 
val);
 
  517#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__)) 
  518  ::atomicMin(addr, 
val);
 
  521  while ((current = *(
volatile S*)addr) > 
val && !AtomicCASInternal(addr, current, 
val)) {
 
  526#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT) && !defined(__CLING__) 
  527#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT 
  535    AtomicMaxInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
 
  537    AtomicMinInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
 
  547    AtomicMinInternal((
GPUAtomic(int32_t)*)addr, __float_as_int(
val));
 
  549    AtomicMaxInternal((
GPUAtomic(uint32_t)*)addr, __float_as_uint(
val));
 
uint64_t exp(uint64_t base, uint8_t exp) noexcept
 
#define GPUCA_DETERMINISTIC_CODE(det, indet)
 
#define GPUCA_CHOICE(c1, c2, c3)
 
GPUd() const expr static float Pi()
 
GPUhd() const expr static T Min(const T x
 
GPUd() const expr static float QuietNaN()
 
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
 
GPUd() const expr static int32_t Float2IntRn(float x)
 
GPUd() const expr static float Round(float x)
 
GPUhdni() const expr static float Sqrt(float x)
 
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
 
GPUdi() const expr static T Clamp(const T v
 
GPUd() static float2 MakeFloat2(float x
 
GPUhd() const expr static T Max(const T x
 
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
 
GPUhd() const expr static T Abs(T x)
 
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
 
GPUd() const expr static float TwoPi()
 
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
 
GPUd() static uint32_t Float2UIntReint(const float &x)
 
GPUd() static T MinWithRef(T x
 
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
 
GPUhdi() static float Remainderf(float x
 
GPUhdi() const expr static float Hypot(float x
 
GPUd() const expr static uint32_t Clz(uint32_t val)
 
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
 
GLboolean GLboolean GLboolean b
 
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
 
GLboolean GLboolean GLboolean GLboolean a
 
GLubyte GLubyte GLubyte GLubyte w
 
GLdouble GLdouble GLdouble z