Project
Loading...
Searching...
No Matches
GPUORTFloat16.h
Go to the documentation of this file.
1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4// This code was created from:
5// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h
6// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h
7
8#ifndef GPUORTFLOAT16_H
9#define GPUORTFLOAT16_H
10
11#ifndef GPUCA_GPUCODE_DEVICE
12#include <stdint.h>
13#include <cmath>
14#include <cstring>
15#include <limits>
16#endif
17
18#include "GPUCommonDef.h"
19#include "GPUCommonMath.h"
20
21namespace o2
22{
23
24namespace OrtDataType
25{
26
namespace detail
{

/// Compile-time description of the target byte order.
///
/// `little` and `big` are the two supported orders and `native` aliases the one
/// used by the current target. On `_WIN32` the target is treated as little-endian;
/// on GCC/Clang the compiler-provided `__BYTE_ORDER__` macros are used. Any other
/// toolchain is rejected at preprocessing time.
enum class endian {
#if defined(_WIN32)
  little = 0,
  big = 1,
  native = little,
#elif defined(__GNUC__) || defined(__clang__)
  little = __ORDER_LITTLE_ENDIAN__,
  big = __ORDER_BIG_ENDIAN__,
  native = __BYTE_ORDER__,
#else
#error OrtDataType::detail::endian is not implemented in this environment.
#endif
};

// Mixed/PDP byte orders would make `native` match neither enumerator; refuse them.
static_assert(
  endian::native == endian::little || endian::native == endian::big,
  "Only little-endian or big-endian native byte orders are supported.");

} // namespace detail
49
53template <class Derived>
55 protected:
61 GPUd() constexpr static uint16_t ToUint16Impl(float v) noexcept;
62
67 GPUd() float ToFloatImpl() const noexcept;
68
73 GPUd() uint16_t AbsImpl() const noexcept
74 {
75 return static_cast<uint16_t>(val & ~kSignMask);
76 }
77
82 GPUd() uint16_t NegateImpl() const noexcept
83 {
84 return IsNaN() ? val : static_cast<uint16_t>(val ^ kSignMask);
85 }
86
87 public:
88 // uint16_t special values
89 static constexpr uint16_t kSignMask = 0x8000U;
90 static constexpr uint16_t kBiasedExponentMask = 0x7C00U;
91 static constexpr uint16_t kPositiveInfinityBits = 0x7C00U;
92 static constexpr uint16_t kNegativeInfinityBits = 0xFC00U;
93 static constexpr uint16_t kPositiveQNaNBits = 0x7E00U;
94 static constexpr uint16_t kNegativeQNaNBits = 0xFE00U;
95 static constexpr uint16_t kEpsilonBits = 0x4170U;
96 static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number
97 static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number
98 static constexpr uint16_t kOneBits = 0x3C00U;
99 static constexpr uint16_t kMinusOneBits = 0xBC00U;
100
101 uint16_t val{0};
102
104
109 GPUd() bool IsNegative() const noexcept
110 {
111 return static_cast<int16_t>(val) < 0;
112 }
113
118 GPUd() bool IsNaN() const noexcept
119 {
120 return AbsImpl() > kPositiveInfinityBits;
121 }
122
127 GPUd() bool IsFinite() const noexcept
128 {
129 return AbsImpl() < kPositiveInfinityBits;
130 }
131
136 GPUd() bool IsPositiveInfinity() const noexcept
137 {
138 return val == kPositiveInfinityBits;
139 }
140
145 GPUd() bool IsNegativeInfinity() const noexcept
146 {
147 return val == kNegativeInfinityBits;
148 }
149
154 GPUd() bool IsInfinity() const noexcept
155 {
156 return AbsImpl() == kPositiveInfinityBits;
157 }
158
163 GPUd() bool IsNaNOrZero() const noexcept
164 {
165 auto abs = AbsImpl();
166 return (abs == 0 || abs > kPositiveInfinityBits);
167 }
168
173 GPUd() bool IsNormal() const noexcept
174 {
175 auto abs = AbsImpl();
176 return (abs < kPositiveInfinityBits) // is finite
177 && (abs != 0) // is not zero
178 && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent)
179 }
180
185 GPUd() bool IsSubnormal() const noexcept
186 {
187 auto abs = AbsImpl();
188 return (abs < kPositiveInfinityBits) // is finite
189 && (abs != 0) // is not zero
190 && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent)
191 }
192
197 GPUd() Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); }
198
203 GPUd() Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); }
204
213 GPUd() static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept
214 {
215 return static_cast<uint16_t>((lhs.val | rhs.val) & ~kSignMask) == 0;
216 }
217
218 GPUd() bool operator==(const Float16Impl& rhs) const noexcept
219 {
220 if (IsNaN() || rhs.IsNaN()) {
221 // IEEE defines that NaN is not equal to anything, including itself.
222 return false;
223 }
224 return val == rhs.val;
225 }
226
227 GPUd() bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); }
228
229 GPUd() bool operator<(const Float16Impl& rhs) const noexcept
230 {
231 if (IsNaN() || rhs.IsNaN()) {
232 // IEEE defines that NaN is unordered with respect to everything, including itself.
233 return false;
234 }
235
236 const bool left_is_negative = IsNegative();
237 if (left_is_negative != rhs.IsNegative()) {
238 // When the signs of left and right differ, we know that left is less than right if it is
239 // the negative value. The exception to this is if both values are zero, in which case IEEE
240 // says they should be equal, even if the signs differ.
241 return left_is_negative && !AreZero(*this, rhs);
242 }
243 return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative);
244 }
245};
246
247// The following Float16_t conversions are based on the code from
248// Eigen library.
249
250// The conversion routines are Copyright (c) Fabian Giesen, 2016.
251// The original license follows:
252//
253// Copyright (c) Fabian Giesen, 2016
254// All rights reserved.
255// Redistribution and use in source and binary forms, with or without
256// modification, are permitted.
257// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
258// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
259// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
260// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
261// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
262// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
263// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
264// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
265// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
266// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
267// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
268
namespace detail
{
/// Overlay of a 32-bit float and its raw bit pattern, used by the float16
/// conversion routines. `u` is the first member so that brace-initialisation
/// (`{255 << 23}`) sets the integer view.
union float32_bits {
  unsigned int u;
  float f;
};
} // namespace detail
276
277template <class Derived>
278GPUdi() constexpr uint16_t Float16Impl<Derived>::ToUint16Impl(float v) noexcept
279{
281 f.f = v;
282
283 constexpr detail::float32_bits f32infty = {255 << 23};
284 constexpr detail::float32_bits f16max = {(127 + 16) << 23};
285 constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
286 constexpr unsigned int sign_mask = 0x80000000u;
287 uint16_t val = static_cast<uint16_t>(0x0u);
288
289 unsigned int sign = f.u & sign_mask;
290 f.u ^= sign;
291
292 // NOTE all the integer compares in this function can be safely
293 // compiled into signed compares since all operands are below
294 // 0x80000000. Important if you want fast straight SSE2 code
295 // (since there's no unsigned PCMPGTD).
296
297 if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
298 val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
299 } else { // (De)normalized number or zero
300 if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
301 // use a magic value to align our 10 mantissa bits at the bottom of
302 // the float. as long as FP addition is round-to-nearest-even this
303 // just works.
304 f.f += denorm_magic.f;
305
306 // and one integer subtract of the bias later, we have our final float!
307 val = static_cast<uint16_t>(f.u - denorm_magic.u);
308 } else {
309 unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
310
311 // update exponent, rounding bias part 1
312 // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
313 // without arithmetic overflow.
314 f.u += 0xc8000fffU;
315 // rounding bias part 2
316 f.u += mant_odd;
317 // take the bits!
318 val = static_cast<uint16_t>(f.u >> 13);
319 }
320 }
321
322 val |= static_cast<uint16_t>(sign >> 16);
323 return val;
324}
325
326template <class Derived>
327GPUdi() float Float16Impl<Derived>::ToFloatImpl() const noexcept
328{
329 constexpr detail::float32_bits magic = {113 << 23};
330 constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
331 detail::float32_bits o{};
332
333 o.u = (val & 0x7fff) << 13; // exponent/mantissa bits
334 unsigned int exp = shifted_exp & o.u; // just the exponent
335 o.u += (127 - 15) << 23; // exponent adjust
336
337 // handle exponent special cases
338 if (exp == shifted_exp) { // Inf/NaN?
339 o.u += (128 - 16) << 23; // extra exp adjust
340 } else if (exp == 0) { // Zero/Denormal?
341 o.u += 1 << 23; // extra exp adjust
342 o.f -= magic.f; // re-normalize
343 }
344
345 // Attempt to workaround the Internal Compiler Error on ARM64
346 // for bitwise | operator, including std::bitset
347#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC)
348 if (IsNegative()) {
349 return -o.f;
350 }
351#else
352 // original code:
353 o.u |= (val & 0x8000U) << 16U; // sign bit
354#endif
355 return o.f;
356}
357
359template <class Derived>
360struct BFloat16Impl {
361 protected:
367 GPUd() static uint16_t ToUint16Impl(float v) noexcept;
368
373 GPUd() float ToFloatImpl() const noexcept;
374
379 GPUd() uint16_t AbsImpl() const noexcept
380 {
381 return static_cast<uint16_t>(val & ~kSignMask);
382 }
383
388 GPUd() uint16_t NegateImpl() const noexcept
389 {
390 return IsNaN() ? val : static_cast<uint16_t>(val ^ kSignMask);
391 }
392
393 public:
394 // uint16_t special values
395 static constexpr uint16_t kSignMask = 0x8000U;
396 static constexpr uint16_t kBiasedExponentMask = 0x7F80U;
397 static constexpr uint16_t kPositiveInfinityBits = 0x7F80U;
398 static constexpr uint16_t kNegativeInfinityBits = 0xFF80U;
399 static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U;
400 static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U;
401 static constexpr uint16_t kSignaling_NaNBits = 0x7F80U;
402 static constexpr uint16_t kEpsilonBits = 0x0080U;
403 static constexpr uint16_t kMinValueBits = 0xFF7FU;
404 static constexpr uint16_t kMaxValueBits = 0x7F7FU;
405 static constexpr uint16_t kRoundToNearest = 0x7FFFU;
406 static constexpr uint16_t kOneBits = 0x3F80U;
407 static constexpr uint16_t kMinusOneBits = 0xBF80U;
408
409 uint16_t val{0};
410
411 GPUdDefault() BFloat16Impl() = default;
412
417 GPUd() bool IsNegative() const noexcept
418 {
419 return static_cast<int16_t>(val) < 0;
420 }
421
426 GPUd() bool IsNaN() const noexcept
427 {
428 return AbsImpl() > kPositiveInfinityBits;
429 }
430
435 GPUd() bool IsFinite() const noexcept
436 {
437 return AbsImpl() < kPositiveInfinityBits;
438 }
439
444 GPUd() bool IsPositiveInfinity() const noexcept
445 {
446 return val == kPositiveInfinityBits;
447 }
448
453 GPUd() bool IsNegativeInfinity() const noexcept
454 {
455 return val == kNegativeInfinityBits;
456 }
457
462 GPUd() bool IsInfinity() const noexcept
463 {
464 return AbsImpl() == kPositiveInfinityBits;
465 }
466
471 GPUd() bool IsNaNOrZero() const noexcept
472 {
473 auto abs = AbsImpl();
474 return (abs == 0 || abs > kPositiveInfinityBits);
475 }
476
481 GPUd() bool IsNormal() const noexcept
482 {
483 auto abs = AbsImpl();
484 return (abs < kPositiveInfinityBits) // is finite
485 && (abs != 0) // is not zero
486 && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent)
487 }
488
493 GPUd() bool IsSubnormal() const noexcept
494 {
495 auto abs = AbsImpl();
496 return (abs < kPositiveInfinityBits) // is finite
497 && (abs != 0) // is not zero
498 && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent)
499 }
500
505 GPUd() Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); }
506
511 GPUd() Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); }
512
521 GPUd() static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept
522 {
523 // IEEE defines that positive and negative zero are equal, this gives us a quick equality check
524 // for two values by or'ing the private bits together and stripping the sign. They are both zero,
525 // and therefore equivalent, if the resulting value is still zero.
526 return static_cast<uint16_t>((lhs.val | rhs.val) & ~kSignMask) == 0;
527 }
528};
529
530template <class Derived>
531GPUdi() uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept
532{
533 uint16_t result;
534 if (o2::gpu::CAMath::IsNaN(v)) {
535 result = kPositiveQNaNBits;
536 } else {
537 auto get_msb_half = [](float fl) {
538 uint16_t result;
539#ifdef GPUCA_GPUCODE
540 o2::gpu::CAMath::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
541#else
542#ifdef __cpp_if_constexpr
543 if constexpr (detail::endian::native == detail::endian::little)
544#else
545 if (detail::endian::native == detail::endian::little)
546#endif
547 {
548 std::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
549 } else {
550 std::memcpy(&result, &fl, sizeof(uint16_t));
551 }
552#endif
553 return result;
554 };
555
556 uint16_t upper_bits = get_msb_half(v);
557 union {
558 uint32_t U32;
559 float F32;
560 };
561 F32 = v;
562 U32 += (upper_bits & 1) + kRoundToNearest;
563 result = get_msb_half(F32);
564 }
565 return result;
566}
567
568template <class Derived>
569GPUdi() float BFloat16Impl<Derived>::ToFloatImpl() const noexcept
570{
571 if (IsNaN()) {
572 return o2::gpu::CAMath::QuietNaN();
573 }
574 float result;
575 char* const first = reinterpret_cast<char*>(&result);
576 char* const second = first + sizeof(uint16_t);
577#ifdef GPUCA_GPUCODE
578 first[0] = first[1] = 0;
579 o2::gpu::CAMath::memcpy(second, &val, sizeof(uint16_t));
580#else
581#ifdef __cpp_if_constexpr
582 if constexpr (detail::endian::native == detail::endian::little)
583#else
584 if (detail::endian::native == detail::endian::little)
585#endif
586 {
587 std::memset(first, 0, sizeof(uint16_t));
588 std::memcpy(second, &val, sizeof(uint16_t));
589 } else {
590 std::memcpy(first, &val, sizeof(uint16_t));
591 std::memset(second, 0, sizeof(uint16_t));
592 }
593#endif
594 return result;
595}
596
615struct Float16_t : OrtDataType::Float16Impl<Float16_t> {
616 private:
622 constexpr explicit Float16_t(uint16_t v) noexcept { val = v; }
623
624 public:
625 using Base = OrtDataType::Float16Impl<Float16_t>;
626
630 GPUdDefault() Float16_t() = default;
631
637 GPUd() constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); }
638
643 GPUd() explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
644
649 GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); }
650
655 using Base::IsNegative;
656
661 using Base::IsNaN;
662
667 using Base::IsFinite;
668
673 using Base::IsPositiveInfinity;
674
679 using Base::IsNegativeInfinity;
680
685 using Base::IsInfinity;
686
691 using Base::IsNaNOrZero;
692
697 using Base::IsNormal;
698
703 using Base::IsSubnormal;
704
709 using Base::Abs;
710
715 using Base::Negate;
716
725 using Base::AreZero;
726
730 GPUdi() explicit operator float() const noexcept { return ToFloat(); }
731
732 using Base::operator==;
733 using Base::operator!=;
734 using Base::operator<;
735};
736
737static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match");
738
757struct BFloat16_t : OrtDataType::BFloat16Impl<BFloat16_t> {
758 private:
766 constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; }
767
768 public:
769 using Base = OrtDataType::BFloat16Impl<BFloat16_t>;
770
771 GPUdDefault() BFloat16_t() = default;
772
778 GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); }
779
784 GPUd() explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
785
790 GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); }
791
796 using Base::IsNegative;
797
802 using Base::IsNaN;
803
808 using Base::IsFinite;
809
814 using Base::IsPositiveInfinity;
815
820 using Base::IsNegativeInfinity;
821
826 using Base::IsInfinity;
827
832 using Base::IsNaNOrZero;
833
838 using Base::IsNormal;
839
844 using Base::IsSubnormal;
845
850 using Base::Abs;
851
856 using Base::Negate;
857
866 using Base::AreZero;
867
871 GPUdi() explicit operator float() const noexcept { return ToFloat(); }
872
873 // We do not have an inherited impl for the below operators
874 // as the internal class implements them a little differently
875 bool operator==(const BFloat16_t& rhs) const noexcept;
876 bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); }
877 bool operator<(const BFloat16_t& rhs) const noexcept;
878};
879
880static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");
881
882} // namespace OrtDataType
883
884} // namespace o2
885#endif
uint64_t exp(uint64_t base, uint8_t exp) noexcept
#define GPUdi()
#define GPUdDefault()
#define GPUd()
bool o
GLuint64EXT * result
Definition glcorearb.h:5662
const GLdouble * v
Definition glcorearb.h:832
GLdouble f
Definition glcorearb.h:310
GLuint GLfloat * val
Definition glcorearb.h:1582
GPUdi() constexpr uint16_t Float16Impl< Derived > GPUdi() uint16_t BFloat16Impl< Derived >
IEEE 754 half-precision floating point data type.
D const SVectorGPU< T, D > & rhs
Definition SMatrixGPU.h:191
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
bfloat16 (Brain Floating Point) data type
GPUdDefault() BFloat16_t()=default
OrtDataType::BFloat16Impl< BFloat16_t > Base
GPUdi() explicit operator float() const noexcept
User defined conversion operator. Converts BFloat16_t to float.
GPUd() float ToFloat() const noexcept
Converts bfloat16 to float.
bool operator!=(const BFloat16_t &rhs) const noexcept
GPUd() explicit BFloat16_t(float v) noexcept
__ctor from float. Float is converted into bfloat16 16-bit representation.
bool operator==(const BFloat16_t &rhs) const noexcept
bool operator<(const BFloat16_t &rhs) const noexcept
GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept
Explicit conversion to uint16_t representation of bfloat16.
Shared implementation between public and internal classes. CRTP pattern.
GPUd() bool IsNaNOrZero() const noexcept
Tests if the value is NaN or zero. Useful for comparisons.
const Float16Impl &rhs noexcept
static constexpr uint16_t kSignMask
GPUd() bool IsPositiveInfinity() const noexcept
Tests if the value represents positive infinity.
static constexpr uint16_t kNegativeInfinityBits
GPUd() Derived Abs() const noexcept
Creates an instance that represents absolute value.
static constexpr uint16_t kNegativeQNaNBits
static constexpr uint16_t kPositiveInfinityBits
static constexpr uint16_t kMinusOneBits
GPUd() uint16_t NegateImpl() const noexcept
Creates a new instance with the sign flipped.
GPUd() bool IsNegativeInfinity() const noexcept
Tests if the value represents negative infinity.
GPUd() bool IsNaN() const noexcept
Tests if the value is NaN.
static constexpr uint16_t kBiasedExponentMask
static constexpr uint16_t kOneBits
static constexpr uint16_t kEpsilonBits
static constexpr uint16_t kMaxValueBits
static constexpr uint16_t kMinValueBits
GPUd() bool IsSubnormal() const noexcept
Tests if the value is subnormal (denormal).
GPUd() bool IsInfinity() const noexcept
Tests if the value is either positive or negative infinity.
GPUd() bool IsNormal() const noexcept
Tests if the value is normal (not zero, subnormal, infinite, or NaN).
GPUd() static bool AreZero(const Float16Impl &lhs
IEEE defines that positive and negative zero are equal, this gives us a quick equality check for two ...
GPUdDefault() Float16Impl()=default
GPUd() bool IsFinite() const noexcept
Tests if the value is finite.
GPUd() constexpr static uint16_t ToUint16Impl(float v) noexcept
Converts from float to uint16_t float16 representation.
static constexpr uint16_t kPositiveQNaNBits
GPUd() Derived Negate() const noexcept
Creates a new instance with the sign flipped.