Project
Loading...
Searching...
No Matches
GPUCommonMath.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#ifndef GPUCOMMONMATH_H
16#define GPUCOMMONMATH_H
17
18#include "GPUCommonDef.h"
19
20#if defined(__CUDACC__) && !defined(__clang__) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) && !defined(GPUCA_GPUCODE_HOSTONLY)
21#include <sm_20_atomic_functions.h>
22#endif
23
24#if !defined(GPUCA_GPUCODE_DEVICE)
25#include <cmath>
26#include <algorithm>
27#include <atomic>
28#include <limits>
29#include <cstring>
30#endif
31
32#if !defined(GPUCA_GPUCODE_COMPILEKERNELS) && (!defined(GPUCA_GPUCODE_DEVICE) || defined(__CUDACC__) || defined(__HIPCC__))
33#include <cstdint>
34#endif
35
36// GPUCA_CHOICE Syntax: GPUCA_CHOICE(Host, CUDA&HIP, OpenCL)
37#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__)) // clang-format off
38 #define GPUCA_CHOICE(c1, c2, c3) (c2) // Select second option for CUDA and HIP
39#elif defined(GPUCA_GPUCODE_DEVICE) && defined (__OPENCL__)
40 #define GPUCA_CHOICE(c1, c2, c3) (c3) // Select third option for OpenCL
41#else
42 #define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host
43#endif // clang-format on
44
45namespace o2::gpu
46{
47
49{
// Collection of static math, bit-manipulation and atomic helpers.
// Where an implementation differs per backend it is selected with
// GPUCA_CHOICE(Host, CUDA&HIP, OpenCL).
50 public:
51 GPUd() static float2 MakeFloat2(float x, float y); // TODO: Find better approach that is constexpr
52
// Smaller of x and y: std::min on the host, the backend's min() on CUDA/HIP/OpenCL.
53 template <class T>
54 GPUhd() constexpr static T Min(const T x, const T y)
55 {
56 return GPUCA_CHOICE(std::min(x, y), min(x, y), min(x, y));
57 }
// Larger of x and y: std::max on the host, the backend's max() on CUDA/HIP/OpenCL.
58 template <class T>
59 GPUhd() constexpr static T Max(const T x, const T y)
60 {
61 return GPUCA_CHOICE(std::max(x, y), max(x, y), max(x, y));
62 }
// Min/Max variants that additionally store into r the reference value
// (refX, refY, ...) that belongs to the selected input.
63 template <class T, class S, class R>
64 GPUd() static T MinWithRef(T x, T y, S refX, S refY, R& r);
65 template <class T, class S, class R>
66 GPUd() static T MaxWithRef(T x, T y, S refX, S refY, R& r);
67 template <class T, class S, class R>
68 GPUd() static T MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, S refW, R& r);
// Limits v to [lo, hi], computed as Max(lo, Min(v, hi)).
69 template <class T>
70 GPUdi() constexpr static T Clamp(const T v, const T lo, const T hi)
71 {
72 return Max(lo, Min(v, hi));
73 }
74 GPUhdni() constexpr static float Sqrt(float x);
75 GPUd() static float InvSqrt(float x);
// Returns x * x.
76 template <class T>
77 GPUdi() constexpr static T Square(T x)
78 {
79 return x * x;
80 }
// Only explicit specializations of Abs are visible in this file (float, double, int32_t).
81 template <class T>
82 GPUhd() constexpr static T Abs(T x);
83 GPUd() constexpr static float ASin(float x);
84 GPUd() constexpr static float ACos(float x);
85 GPUd() constexpr static float ATan(float x);
86 GPUhd() constexpr static float ATan2(float y, float x);
87 GPUd() constexpr static float Sin(float x);
88 GPUd() constexpr static float Cos(float x);
// Compute sine and cosine of x in one call; results are written to s and c.
89 GPUhdni() static void SinCos(float x, float& s, float& c);
90 GPUhdni() static void SinCosd(double x, double& s, double& c);
91 GPUd() constexpr static float Tan(float x);
92 GPUd() constexpr static float Pow(float x, float y);
93 GPUd() constexpr static float Log(float x);
94 GPUd() constexpr static float Exp(float x);
95 GPUhdni() constexpr static float Copysign(float x, float y) { return GPUCA_CHOICE(std::copysignf(x, y), copysignf(x, y), copysign(x, y)); }
96 GPUd() constexpr static float TwoPi() { return 6.2831853f; }
97 GPUd() constexpr static float Pi() { return 3.1415927f; }
98 GPUd() constexpr static float Round(float x);
99 GPUd() constexpr static float Floor(float x) { return GPUCA_CHOICE(floorf(x), floorf(x), floor(x)); }
// Returns the bit pattern of x as an unsigned integer (no numeric conversion).
100 GPUd() static uint32_t Float2UIntReint(const float& x);
// Adds 0.5f and truncates through int32_t before converting to uint32_t.
101 GPUd() constexpr static uint32_t Float2UIntRn(float x) { return (uint32_t)(int32_t)(x + 0.5f); }
102 GPUd() constexpr static int32_t Float2IntRn(float x);
103 GPUd() constexpr static float Modf(float x, float y);
104 GPUhdi() static float Remainderf(float x, float y);
105 GPUd() constexpr static bool Finite(float x);
106 GPUd() constexpr static bool IsNaN(float x);
// QuietNaN is only declared when the compiler does not define __FAST_MATH__.
107#ifndef __FAST_MATH__
108 GPUd() constexpr static float QuietNaN() { return GPUCA_CHOICE(std::numeric_limits<float>::quiet_NaN(), __builtin_nanf(""), nan(0u)); }
109#endif
// Count of leading zero bits of val; the definition in this file returns 32 for val == 0.
110 GPUd() constexpr static uint32_t Clz(uint32_t val);
// Number of set bits in val.
111 GPUd() constexpr static uint32_t Popcount(uint32_t val);
112
113 GPUd() static void memcpy(void* dst, const void* src, size_t size);
114
// Square root of the sum of squares of the 2, 3 or 4 arguments.
// Computed directly as Sqrt(x*x + ...), without any rescaling of the inputs.
115 GPUhdi() constexpr static float Hypot(float x, float y) { return Sqrt(x * x + y * y); }
116 GPUhdi() constexpr static float Hypot(float x, float y, float z) { return Sqrt(x * x + y * y + z * z); }
117 GPUhdi() constexpr static float Hypot(float x, float y, float z, float w) { return Sqrt(x * x + y * y + z * z + w * w); }
118
119 template <typename T>
120 GPUhd() constexpr static void Swap(T& a, T& b);
121
// Public atomic operations. Each one simply forwards to the matching private
// *Internal helper; the plain and the *Shared variants differ only in the
// address-space qualifier macros (GPUglobalref / GPUsharedref) on addr.
122 template <class T>
123 GPUdi() static T AtomicExch(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
124 {
125 return GPUCommonMath::AtomicExchInternal(addr, val);
126 }
127
128 template <class T>
129 GPUdi() static bool AtomicCAS(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T cmp, T val)
130 {
131 return GPUCommonMath::AtomicCASInternal(addr, cmp, val);
132 }
133
134 template <class T>
135 GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
136 {
137 return GPUCommonMath::AtomicAddInternal(addr, val);
138 }
139 template <class T>
140 GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
141 {
142 GPUCommonMath::AtomicMaxInternal(addr, val);
143 }
144 template <class T>
145 GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) * addr, T val)
146 {
147 GPUCommonMath::AtomicMinInternal(addr, val);
148 }
149 template <class T>
150 GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
151 {
152 return GPUCommonMath::AtomicExchInternal(addr, val);
153 }
154 template <class T>
155 GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
156 {
157 return GPUCommonMath::AtomicAddInternal(addr, val);
158 }
159 template <class T>
160 GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
161 {
162 GPUCommonMath::AtomicMaxInternal(addr, val);
163 }
164 template <class T>
165 GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) * addr, T val)
166 {
167 GPUCommonMath::AtomicMinInternal(addr, val);
168 }
169 GPUd() constexpr static int32_t Mul24(int32_t a, int32_t b);
170 GPUd() constexpr static float FMulRZ(float a, float b);
171
// Rounds val up to the next multiple of the compile-time constant I.
172 template <int32_t I, class T>
173 GPUd() constexpr static T nextMultipleOf(T val);
174
// Sum of the squares of all arguments.
175 template <typename... Args>
176 GPUhdni() constexpr static float Sum2(float w, Args... args);
177
178 private:
// Backend-specific implementations behind the public Atomic* wrappers.
// NOTE(review): AtomicExchInternal and AtomicAddInternal return uint32_t while
// the wrappers return T; for a T that is not a 32-bit integer the old value
// would be converted through uint32_t - confirm this is intended.
179 template <class S, class T>
180 GPUd() static uint32_t AtomicExchInternal(S* addr, T val);
181 template <class S, class T>
182 GPUd() static bool AtomicCASInternal(S* addr, T cmp, T val);
183 template <class S, class T>
184 GPUd() static uint32_t AtomicAddInternal(S* addr, T val);
185 template <class S, class T>
186 GPUd() static void AtomicMaxInternal(S* addr, T val);
187 template <class S, class T>
188 GPUd() static void AtomicMinInternal(S* addr, T val);
189};
190
192
193template <typename... Args>
194GPUhdi() constexpr float GPUCommonMath::Sum2(float w, Args... args)
195{
196 if constexpr (sizeof...(Args) == 0) {
197 return w * w;
198 } else {
199 return w * w + Sum2(args...);
200 }
201 return 0;
202}
203
204GPUdi() void GPUCommonMath::memcpy(void* dst, const void* src, size_t size)
205{
206#ifndef GPUCA_GPUCODE_DEVICE
207 std::memcpy(dst, src, size);
208#elif defined(__CUDACC__) || defined(__HIPCC__)
209 ::memcpy(dst, src, size);
210#elif defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
211 __builtin_memcpy(dst, src, size);
212#else
213 char* d = (char*)dst;
214 const char* s = (const char*)src;
215 for (size_t i = 0; i < size; i++) {
216 d[i] = s[i];
217 }
218#endif
219}
220
221template <int32_t I, class T>
222GPUdi() constexpr T GPUCommonMath::nextMultipleOf(T val)
223{
224 if constexpr (I & (I - 1)) {
225 T tmp = val % I;
226 if (tmp) {
227 val += I - tmp;
228 }
229 return val;
230 } else {
231 return (val + I - 1) & ~(T)(I - 1);
232 }
233 return 0; // BUG: Cuda complains about missing return value with constexpr if
234}
235
236GPUdi() float2 GPUCommonMath::MakeFloat2(float x, float y)
237{
238#if !defined(GPUCA_GPUCODE) || defined(__OPENCL__) || defined(__OPENCL_HOST__)
239 float2 ret = {x, y};
240 return ret;
241#else
242 return make_float2(x, y);
243#endif // GPUCA_GPUCODE
244}
245
246GPUdi() constexpr float GPUCommonMath::Modf(float x, float y) { return GPUCA_CHOICE(fmodf(x, y), fmodf(x, y), fmod(x, y)); }
247GPUhdi() float GPUCommonMath::Remainderf(float x, float y) { return GPUCA_CHOICE(std::remainderf(x, y), remainderf(x, y), remainder(x, y)); }
248
249GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const float& x)
250{
251#if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
252 return __float_as_uint(x);
253#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
254 return as_uint(x);
255#else
256 return reinterpret_cast<const uint32_t&>(x);
257#endif
258}
259
260GPUCA_DETERMINISTIC_CODE( // clang-format off
261GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), roundf(x), round(x)); }
262GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return (int32_t)Round(x); }
263GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), (float)sqrt((double)x), sqrt(x)); }
264GPUdi() constexpr float GPUCommonMath::ATan(float x) { return GPUCA_CHOICE((float)atan((double)x), (float)atan((double)x), atan(x)); }
265GPUhdi() constexpr float GPUCommonMath::ATan2(float y, float x) { return GPUCA_CHOICE((float)atan2((double)y, (double)x), (float)atan2((double)y, (double)x), atan2(y, x)); }
266GPUdi() constexpr float GPUCommonMath::Sin(float x) { return GPUCA_CHOICE((float)sin((double)x), (float)sin((double)x), sin(x)); }
267GPUdi() constexpr float GPUCommonMath::Cos(float x) { return GPUCA_CHOICE((float)cos((double)x), (float)cos((double)x), cos(x)); }
268GPUdi() constexpr float GPUCommonMath::Tan(float x) { return GPUCA_CHOICE((float)tanf((double)x), (float)tanf((double)x), tan(x)); }
269GPUdi() constexpr float GPUCommonMath::Pow(float x, float y) { return GPUCA_CHOICE((float)pow((double)x, (double)y), pow((double)x, (double)y), pow(x, y)); }
270GPUdi() constexpr float GPUCommonMath::ASin(float x) { return GPUCA_CHOICE((float)asin((double)x), (float)asin((double)x), asin(x)); }
271GPUdi() constexpr float GPUCommonMath::ACos(float x) { return GPUCA_CHOICE((float)acos((double)x), (float)acos((double)x), acos(x)); }
272GPUdi() constexpr float GPUCommonMath::Log(float x) { return GPUCA_CHOICE((float)log((double)x), (float)log((double)x), log(x)); }
273GPUdi() constexpr float GPUCommonMath::Exp(float x) { return GPUCA_CHOICE((float)exp((double)x), (float)exp((double)x), exp(x)); }
274GPUdi() constexpr bool GPUCommonMath::Finite(float x) { return GPUCA_CHOICE(std::isfinite(x), isfinite(x), isfinite(x)); }
275GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return GPUCA_CHOICE(std::isnan(x), isnan(x), isnan(x)); }
276, // !GPUCA_DETERMINISTIC_CODE
277GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), rintf(x), rint(x)); }
278GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return GPUCA_CHOICE((int32_t)Round(x), __float2int_rn(x), (int32_t)Round(x)); }
279GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), sqrtf(x), sqrt(x)); }
280GPUdi() constexpr float GPUCommonMath::ATan(float x) { return GPUCA_CHOICE(atanf(x), atanf(x), atan(x)); }
281GPUhdi() constexpr float GPUCommonMath::ATan2(float y, float x) { return GPUCA_CHOICE(atan2f(y, x), atan2f(y, x), atan2(y, x)); }
282GPUdi() constexpr float GPUCommonMath::Sin(float x) { return GPUCA_CHOICE(sinf(x), sinf(x), sin(x)); }
283GPUdi() constexpr float GPUCommonMath::Cos(float x) { return GPUCA_CHOICE(cosf(x), cosf(x), cos(x)); }
284GPUdi() constexpr float GPUCommonMath::Tan(float x) { return GPUCA_CHOICE(tanf(x), tanf(x), tan(x)); }
285GPUdi() constexpr float GPUCommonMath::Pow(float x, float y) { return GPUCA_CHOICE(powf(x, y), powf(x, y), pow(x, y)); }
286GPUdi() constexpr float GPUCommonMath::ASin(float x) { return GPUCA_CHOICE(asinf(x), asinf(x), asin(x)); }
287GPUdi() constexpr float GPUCommonMath::ACos(float x) { return GPUCA_CHOICE(acosf(x), acosf(x), acos(x)); }
288GPUdi() constexpr float GPUCommonMath::Log(float x) { return GPUCA_CHOICE(logf(x), logf(x), log(x)); }
289GPUdi() constexpr float GPUCommonMath::Exp(float x) { return GPUCA_CHOICE(expf(x), expf(x), exp(x)); }
290GPUdi() constexpr bool GPUCommonMath::Finite(float x) { return true; }
291GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return false; }
292) // clang-format on
293
// Computes the sine and cosine of x and stores them in s and c.
// The deterministic variant evaluates sin/cos in double precision. The regular
// variant uses a combined sincos primitive where the platform checks below find
// one, otherwise the GPUCA_CHOICE fallback (separate sinf/cosf on the host).
294GPUhdi() void GPUCommonMath::SinCos(float x, float& s, float& c)
295{
296 GPUCA_DETERMINISTIC_CODE( // clang-format off
297 s = sin((double)x);
298 c = cos((double)x);
299 , // !GPUCA_DETERMINISTIC_CODE
300#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
301 __sincosf(x, &s, &c);
302#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
303 sincosf(x, &s, &c);
304#else
// Host: two separate calls combined into one (void) expression; CUDA/HIP:
// sincosf; OpenCL: sincos, which returns the sine and writes the cosine to c.
305 GPUCA_CHOICE((void)((s = sinf(x)) + (c = cosf(x))), sincosf(x, &s, &c), s = sincos(x, &c));
306#endif
307 ) // clang-format on
308}
309
// Double-precision counterpart of SinCos: stores sin(x) in s and cos(x) in c.
// Uses a combined sincos primitive where the platform checks below find one,
// otherwise the GPUCA_CHOICE fallback (separate sin/cos calls on the host).
310GPUhdi() void GPUCommonMath::SinCosd(double x, double& s, double& c)
311{
312#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
313 __sincos(x, &s, &c);
314#elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
315 sincos(x, &s, &c);
316#else
317 GPUCA_CHOICE((void)((s = sin(x)) + (c = cos(x))), sincos(x, &s, &c), s = sincos(x, &c));
318#endif
319}
320
321GPUdi() constexpr uint32_t GPUCommonMath::Clz(uint32_t x)
322{
323#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__))
324 return x == 0 ? 32 : GPUCA_CHOICE(__builtin_clz(x), __clz(x), __builtin_clz(x)); // use builtin if available
325#else
326 for (int32_t i = 31; i >= 0; i--) {
327 if (x & (1u << i)) {
328 return (31 - i);
329 }
330 }
331 return 32;
332#endif
333}
334
335GPUdi() constexpr uint32_t GPUCommonMath::Popcount(uint32_t x)
336{
337#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL__) // TODO: remove OPENCL when reported SPIR-V bug is fixed
338 // use builtin if available
339 return GPUCA_CHOICE(__builtin_popcount(x), __popc(x), __builtin_popcount(x));
340#else
341 x = x - ((x >> 1) & 0x55555555);
342 x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
343 return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
344#endif
345}
346
347template <typename T>
348GPUhdi() constexpr void GPUCommonMath::Swap(T& a, T& b)
349{
350#ifndef GPUCA_GPUCODE_DEVICE
351 std::swap(a, b);
352#else
353 T tmp = a;
354 a = b;
355 b = tmp;
356#endif
357}
358
359template <class T, class S, class R>
360GPUdi() T GPUCommonMath::MinWithRef(T x, T y, S refX, S refY, R& r)
361{
362 if (x < y) {
363 r = refX;
364 return x;
365 }
366 r = refY;
367 return y;
368}
369
370template <class T, class S, class R>
371GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, S refX, S refY, R& r)
372{
373 if (x > y) {
374 r = refX;
375 return x;
376 }
377 r = refY;
378 return y;
379}
380
381template <class T, class S, class R>
382GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, S refW, R& r)
383{
384 T retVal = x;
385 S retRef = refX;
386 if (y > retVal) {
387 retVal = y;
388 retRef = refY;
389 }
390 if (z > retVal) {
391 retVal = z;
392 retRef = refZ;
393 }
394 if (w > retVal) {
395 retVal = w;
396 retRef = refW;
397 }
398 r = retRef;
399 return retVal;
400}
401
// Returns 1 / sqrt(_x).
// The deterministic variant divides by Sqrt(_x). The regular variant picks a
// backend-specific implementation with the preprocessor checks below.
402GPUdi() float GPUCommonMath::InvSqrt(float _x)
403{
404 GPUCA_DETERMINISTIC_CODE( // clang-format off
405 return 1.f / Sqrt(_x);
406 , // !GPUCA_DETERMINISTIC_CODE
407#if defined(__CUDACC__) || defined(__HIPCC__)
408 return __frsqrt_rn(_x);
409#elif defined(__OPENCL__) && defined(__clang__)
410 return 1.f / sqrt(_x);
411#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
412 return 1.f / sqrtf(_x);
413#else
// Fallback: integer bit trick for the initial guess (magic constant
// 0x5f3759df applied to the float's bit pattern), refined by the single
// correction step x * (1.5 - 0.5 * _x * x * x). The result is an approximation.
414 union {
415 float f;
416 int32_t i;
417 } x = {_x};
418 const float xhalf = 0.5f * x.f;
419 x.i = 0x5f3759df - (x.i >> 1);
420 x.f = x.f * (1.5f - xhalf * x.f * x.f);
421 return x.f;
422#endif
423 ) // clang-format on
424}
425
426template <>
427GPUhdi() constexpr float GPUCommonMath::Abs<float>(float x)
428{
429 return GPUCA_CHOICE(fabsf(x), fabsf(x), fabs(x));
430}
431
432template <>
433GPUhdi() constexpr double GPUCommonMath::Abs<double>(double x)
434{
435 return GPUCA_CHOICE(fabs(x), fabs(x), fabs(x));
436}
437
438template <>
439GPUhdi() constexpr int32_t GPUCommonMath::Abs<int32_t>(int32_t x)
440{
441 return GPUCA_CHOICE(abs(x), abs(x), abs(x));
442}
443
444template <class S, class T>
445GPUdi() uint32_t GPUCommonMath::AtomicExchInternal(S* addr, T val)
446{
447#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
448 return ::atomic_exchange(addr, val);
449#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
450 return ::atomic_xchg(addr, val);
451#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
452 return ::atomicExch(addr, val);
453#elif defined(WITH_OPENMP)
454 uint32_t old;
455 __atomic_exchange(addr, &val, &old, __ATOMIC_SEQ_CST);
456 return old;
457#else
458 return reinterpret_cast<std::atomic<T>*>(addr)->exchange(val);
459#endif
460}
461
462template <class S, class T>
463GPUdi() bool GPUCommonMath::AtomicCASInternal(S* addr, T cmp, T val)
464{
465#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
466 return ::atomic_compare_exchange(addr, cmp, val) == cmp;
467#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
468 return ::atomic_cmpxchg(addr, cmp, val) == cmp;
469#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
470 return ::atomicCAS(addr, cmp, val) == cmp;
471#elif defined(WITH_OPENMP)
472 return __atomic_compare_exchange(addr, &cmp, &val, true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
473#else
474 return reinterpret_cast<std::atomic<T>*>(addr)->compare_exchange_strong(cmp, val);
475#endif
476}
477
478template <class S, class T>
479GPUdi() uint32_t GPUCommonMath::AtomicAddInternal(S* addr, T val)
480{
481#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
482 return ::atomic_fetch_add(addr, val);
483#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
484 return ::atomic_add(addr, val);
485#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
486 return ::atomicAdd(addr, val);
487#elif defined(WITH_OPENMP)
488 return __atomic_add_fetch(addr, val, __ATOMIC_SEQ_CST) - val;
489#else
490 return reinterpret_cast<std::atomic<T>*>(addr)->fetch_add(val);
491#endif
492}
493
494template <class S, class T>
495GPUdi() void GPUCommonMath::AtomicMaxInternal(S* addr, T val)
496{
497#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
498 ::atomic_fetch_max(addr, val);
499#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
500 ::atomic_max(addr, val);
501#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
502 ::atomicMax(addr, val);
503#else
504 S current;
505 while ((current = *(volatile S*)addr) < val && !AtomicCASInternal(addr, current, val)) {
506 }
507#endif // GPUCA_GPUCODE
508}
509
510template <class S, class T>
511GPUdi() void GPUCommonMath::AtomicMinInternal(S* addr, T val)
512{
513#if defined(GPUCA_GPUCODE) && defined(__OPENCL__) && (!defined(__clang__) || defined(GPUCA_OPENCL_CLANG_C11_ATOMICS))
514 ::atomic_fetch_min(addr, val);
515#elif defined(GPUCA_GPUCODE) && defined(__OPENCL__)
516 ::atomic_min(addr, val);
517#elif defined(GPUCA_GPUCODE) && (defined(__CUDACC__) || defined(__HIPCC__))
518 ::atomicMin(addr, val);
519#else
520 S current;
521 while ((current = *(volatile S*)addr) > val && !AtomicCASInternal(addr, current, val)) {
522 }
523#endif // GPUCA_GPUCODE
524}
525
526#if (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(G__ROOT) && !defined(__CLING__)
527#define GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
528template <>
529GPUdii() void GPUCommonMath::AtomicMaxInternal(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
530{
531 if (val == -0.f) {
532 val = 0.f;
533 }
534 if (val >= 0) {
535 AtomicMaxInternal((GPUAtomic(int32_t)*)addr, __float_as_int(val));
536 } else {
537 AtomicMinInternal((GPUAtomic(uint32_t)*)addr, __float_as_uint(val));
538 }
539}
540template <>
541GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAtomic(float) * addr, float val)
542{
543 if (val == -0.f) {
544 val = 0.f;
545 }
546 if (val >= 0) {
547 AtomicMinInternal((GPUAtomic(int32_t)*)addr, __float_as_int(val));
548 } else {
549 AtomicMaxInternal((GPUAtomic(uint32_t)*)addr, __float_as_uint(val));
550 }
551}
552#endif
553
554#undef GPUCA_CHOICE
555
556} // namespace o2::gpu
557
558#endif // GPUCOMMONMATH_H
uint64_t exp(uint64_t base, uint8_t exp) noexcept
int32_t i
#define GPUsharedref()
#define GPUdii()
#define GPUAtomic(type)
#define GPUgeneric()
#define GPUglobalref()
#define GPUCA_DETERMINISTIC_CODE(det, indet)
#define GPUCA_CHOICE(c1, c2, c3)
int32_t retVal
GPUd() const expr static float Pi()
GPUhd() const expr static T Min(const T x
GPUd() const expr static float QuietNaN()
GPUdi() static void AtomicMax(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static int32_t Float2IntRn(float x)
GPUd() const expr static float Round(float x)
GPUhdni() const expr static float Sqrt(float x)
GPUdi() static void AtomicMin(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUdi() const expr static T Clamp(const T v
GPUd() static float2 MakeFloat2(float x
GPUhd() const expr static T Max(const T x
GPUdi() static T AtomicAddShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhd() const expr static T Abs(T x)
GPUdi() static T AtomicAdd(GPUglobalref() GPUgeneric() GPUAtomic(T) *addr
GPUd() const expr static float TwoPi()
const void size_t size
GPUdi() static T AtomicExchShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUd() static uint32_t Float2UIntReint(const float &x)
GPUd() static T MinWithRef(T x
GPUdi() static void AtomicMinShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GPUhdi() static float Remainderf(float x
GPUhdi() const expr static float Hypot(float x
GPUd() const expr static uint32_t Clz(uint32_t val)
GPUdi() static void AtomicMaxShared(GPUsharedref() GPUgeneric() GPUAtomic(T) *addr
GLint GLenum GLint x
Definition glcorearb.h:403
GLenum src
Definition glcorearb.h:1767
GLsizeiptr size
Definition glcorearb.h:659
const GLdouble * v
Definition glcorearb.h:832
GLdouble f
Definition glcorearb.h:310
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLenum GLenum dst
Definition glcorearb.h:1767
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLuint GLfloat * val
Definition glcorearb.h:1582
GLboolean r
Definition glcorearb.h:1233
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
GLubyte GLubyte GLubyte GLubyte w
Definition glcorearb.h:852
GLdouble GLdouble GLdouble z
Definition glcorearb.h:843
bool isnan(float f)
constexpr size_t min
constexpr size_t max