#ifndef RANS_INTERNAL_COMMON_SIMD_H_
#define RANS_INTERNAL_COMMON_SIMD_H_

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <type_traits>
#include <utility>

#include <immintrin.h>

#include <gsl/span>

// AlignedArray, pd_t, SIMDWidth, getElementCount, pow2 and make_span are
// provided by the library's companion SIMD datatype/trait headers.
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline __m128i load(gsl::span<const T, getElementCount<T>(SIMDWidth::SSE)> v) noexcept
{
  return _mm_load_si128(reinterpret_cast<const __m128i*>(v.data()));
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline __m128i load(gsl::span<T, getElementCount<T>(SIMDWidth::SSE)> v) noexcept
{
  return _mm_load_si128(reinterpret_cast<const __m128i*>(v.data()));
}
inline __m128d load(gsl::span<const double_t, getElementCount<double_t>(SIMDWidth::SSE)> v) noexcept
{
  return _mm_load_pd(v.data());
}
inline __m128d load(gsl::span<double_t, getElementCount<double_t>(SIMDWidth::SSE)> v) noexcept
{
  return _mm_load_pd(v.data());
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline __m128i load(const AlignedArray<T, SIMDWidth::SSE>& v) noexcept
{
  return _mm_load_si128(reinterpret_cast<const __m128i*>(v.data()));
}
inline __m128d load(const pd_t<SIMDWidth::SSE>& v) noexcept
{
  return _mm_load_pd(v.data());
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline __m256i load(const AlignedArray<T, SIMDWidth::AVX>& v) noexcept
{
  return _mm256_load_si256(reinterpret_cast<const __m256i*>(v.data()));
}
inline __m256d load(const pd_t<SIMDWidth::AVX>& v) noexcept
{
  return _mm256_load_pd(v.data());
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline __m256i load(gsl::span<const T, getElementCount<T>(SIMDWidth::AVX)> v) noexcept
{
  return _mm256_load_si256(reinterpret_cast<const __m256i*>(v.data()));
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline __m256i load(gsl::span<T, getElementCount<T>(SIMDWidth::AVX)> v) noexcept
{
  return _mm256_load_si256(reinterpret_cast<const __m256i*>(v.data()));
}
inline __m256d load(gsl::span<double_t, getElementCount<double_t>(SIMDWidth::AVX)> v) noexcept
{
  return _mm256_load_pd(v.data());
}
inline __m256d load(gsl::span<const double_t, getElementCount<double_t>(SIMDWidth::AVX)> v) noexcept
{
  return _mm256_load_pd(v.data());
}
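
// Usage sketch for the load overloads above (hypothetical values; whether
// AlignedArray is list-initializable is an assumption here). All overloads
// require the 16-byte (SSE) / 32-byte (AVX) alignment demanded by
// _mm_load_si128 / _mm256_load_si256:
//
//   AlignedArray<uint32_t, SIMDWidth::SSE> a{1u, 2u, 3u, 4u};
//   __m128i v = load(a);  // aligned 128-bit load of 4 x uint32_t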
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline AlignedArray<T, SIMDWidth::SSE> store(__m128i inVec) noexcept
{
  AlignedArray<T, SIMDWidth::SSE> out;
  _mm_store_si128(reinterpret_cast<__m128i*>(out.data()), inVec);
  return out;
}
inline AlignedArray<double_t, SIMDWidth::SSE> store(__m128d inVec) noexcept
{
  AlignedArray<double_t, SIMDWidth::SSE> out;
  _mm_store_pd(out.data(), inVec);
  return out;
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline void store(__m128i inVec, gsl::span<T, getElementCount<T>(SIMDWidth::SSE)> v) noexcept
{
  _mm_store_si128(reinterpret_cast<__m128i*>(v.data()), inVec);
}
inline void store(__m128d inVec, gsl::span<double_t, getElementCount<double_t>(SIMDWidth::SSE)> v) noexcept
{
  _mm_store_pd(v.data(), inVec);
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline AlignedArray<T, SIMDWidth::AVX> store(__m256i inVec) noexcept
{
  AlignedArray<T, SIMDWidth::AVX> out;
  _mm256_store_si256(reinterpret_cast<__m256i*>(out.data()), inVec);
  return out;
}
inline AlignedArray<double_t, SIMDWidth::AVX> store(__m256d inVec) noexcept
{
  AlignedArray<double_t, SIMDWidth::AVX> out;
  _mm256_store_pd(out.data(), inVec);
  return out;
}
template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
inline void store(__m256i inVec, gsl::span<T, getElementCount<T>(SIMDWidth::AVX)> v) noexcept
{
  _mm256_store_si256(reinterpret_cast<__m256i*>(v.data()), inVec);
}
inline void store(__m256d inVec, gsl::span<double_t, getElementCount<double_t>(SIMDWidth::AVX)> v) noexcept
{
  _mm256_store_pd(v.data(), inVec);
}
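
// Round-trip sketch for the store overloads above (hypothetical values): the
// value-returning overloads materialize a register into a fresh AlignedArray,
// while the span overloads write into caller-provided aligned memory:
//
//   __m128i v = load(a);                             // a as sketched above
//   AlignedArray<uint32_t, SIMDWidth::SSE> b = store<uint32_t>(v);
//   store(v, make_span(b));                          // span variant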
template <SIMDWidth width_V>
inline auto setAll(uint64_t value) noexcept
{
  if constexpr (width_V == SIMDWidth::SSE) {
    return _mm_set1_epi64x(value);
  } else {
    return _mm256_set1_epi64x(value);
  }
}
template <SIMDWidth width_V>
inline auto setAll(uint32_t value) noexcept
{
  if constexpr (width_V == SIMDWidth::SSE) {
    return _mm_set1_epi32(value);
  } else {
    return _mm256_set1_epi32(value);
  }
}
template <SIMDWidth width_V>
inline auto setAll(uint16_t value) noexcept
{
  if constexpr (width_V == SIMDWidth::SSE) {
    return _mm_set1_epi16(value);
  } else {
    return _mm256_set1_epi16(value);
  }
}
template <SIMDWidth width_V>
inline auto setAll(uint8_t value) noexcept
{
  if constexpr (width_V == SIMDWidth::SSE) {
    return _mm_set1_epi8(value);
  } else {
    return _mm256_set1_epi8(value);
  }
}
template <SIMDWidth width_V>
inline auto setAll(double_t value) noexcept
{
  if constexpr (width_V == SIMDWidth::SSE) {
    return _mm_set1_pd(value);
  } else {
    return _mm256_set1_pd(value);
  }
}
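
// setAll broadcasts one scalar into every lane of the chosen register width,
// selecting the SSE or AVX intrinsic at compile time, e.g.:
//
//   __m128i ones = setAll<SIMDWidth::SSE>(uint32_t{1});  // 4 x 1u
//   __m256d pi   = setAll<SIMDWidth::AVX>(3.14);         // 4 x 3.14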
template <SIMDWidth width_V>
inline auto int32ToDouble(__m128i in) noexcept
{
  if constexpr (width_V == SIMDWidth::SSE) {
    return _mm_cvtepi32_pd(in);
  } else if constexpr (width_V == SIMDWidth::AVX) {
    return _mm256_cvtepi32_pd(in);
  }
}
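
// Note: both branches take a __m128i source. The SSE branch converts the two
// low 32-bit lanes to 2 x double (_mm_cvtepi32_pd); the AVX branch widens all
// four 32-bit lanes to 4 x double (_mm256_cvtepi32_pd).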
inline constexpr double AlignMantissaMagic = 0x0010000000000000;
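
// The literal above is the integer 2^52; converted to double it is
// 4503599627370496.0 with bit pattern 0x4330000000000000. For any integer
// n < 2^52, OR-ing n into the mantissa of this double yields exactly the
// double 2^52 + n, so a subsequent subtraction of 2^52 recovers n as a
// double without an explicit uint64 -> double conversion (which SSE/AVX2
// lack). Worked example: n = 5 gives bits 0x4330000000000005, i.e. the
// double 4503599627370501.0; subtracting 4503599627370496.0 leaves 5.0.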
inline __m128d uint64ToDouble(__m128i in) noexcept
{
#if !defined(NDEBUG)
  // debug-only sanity check: each lane must fit into a double's 52-bit mantissa
  auto vec = store<uint64_t>(in);
  for (auto i : make_span(vec)) {
    assert(i < pow2(52));
  }
#endif
  in = _mm_or_si128(in, _mm_castpd_si128(_mm_set1_pd(AlignMantissaMagic)));
  __m128d out = _mm_sub_pd(_mm_castsi128_pd(in), _mm_set1_pd(AlignMantissaMagic));
  return out;
}
inline __m256d uint64ToDouble(__m256i in) noexcept
{
#if !defined(NDEBUG)
  // debug-only sanity check: each lane must fit into a double's 52-bit mantissa
  auto vec = store<uint64_t>(in);
  for (auto i : make_span(vec)) {
    assert(i < pow2(52));
  }
#endif
  in = _mm256_or_si256(in, _mm256_castpd_si256(_mm256_set1_pd(AlignMantissaMagic)));
  __m256d out = _mm256_sub_pd(_mm256_castsi256_pd(in), _mm256_set1_pd(AlignMantissaMagic));
  return out;
}
inline __m128i doubleToUint64(__m128d in) noexcept
{
#if !defined(NDEBUG)
  // debug-only sanity check: each lane must fit into a double's 52-bit mantissa
  auto vec = store(in);
  for (auto i : make_span(vec)) {
    assert(i < pow2(52));
  }
#endif
  in = _mm_add_pd(in, _mm_set1_pd(AlignMantissaMagic));
  __m128i out = _mm_xor_si128(_mm_castpd_si128(in),
                              _mm_castpd_si128(_mm_set1_pd(AlignMantissaMagic)));
  return out;
}
inline __m256i doubleToUint64(__m256d in) noexcept
{
#if !defined(NDEBUG)
  // debug-only sanity check: each lane must fit into a double's 52-bit mantissa
  auto vec = store(in);
  for (auto i : make_span(vec)) {
    assert(i < pow2(52));
  }
#endif
  in = _mm256_add_pd(in, _mm256_set1_pd(AlignMantissaMagic));
  __m256i out = _mm256_xor_si256(_mm256_castpd_si256(in),
                                 _mm256_castpd_si256(_mm256_set1_pd(AlignMantissaMagic)));
  return out;
}
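
// Inverse of the trick above: for an integer-valued double 0 <= x < 2^52,
// adding 2^52 produces a double whose low mantissa bits equal the integer
// value of x; XOR-ing away the bits of 2^52 (0x4330000000000000) leaves the
// raw 64-bit integer. E.g. x = 5.0: 5.0 + 2^52 has bits 0x4330000000000005;
// XOR with 0x4330000000000000 yields 5.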
template <SIMDWidth width_V>
struct DivMod;

template <>
struct DivMod<SIMDWidth::SSE> {
  __m128d div;
  __m128d mod;
};
inline DivMod<SIMDWidth::SSE> divMod(__m128d numerator, __m128d denominator) noexcept
{
  __m128d div = _mm_floor_pd(_mm_div_pd(numerator, denominator));
#ifdef __FMA__
  __m128d mod = _mm_fnmadd_pd(div, denominator, numerator);
#else
  __m128d mod = _mm_mul_pd(div, denominator);
  mod = _mm_sub_pd(numerator, mod);
#endif
  return {div, mod};
}
template <>
struct DivMod<SIMDWidth::AVX> {
  __m256d div;
  __m256d mod;
};
inline DivMod<SIMDWidth::AVX> divMod(__m256d numerator, __m256d denominator) noexcept
{
  __m256d div = _mm256_floor_pd(_mm256_div_pd(numerator, denominator));
  __m256d mod = _mm256_fnmadd_pd(div, denominator, numerator);
  return {div, mod};
}
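
// divMod computes a floored quotient plus the matching remainder entirely in
// double precision (exact while all intermediate values fit into 52 mantissa
// bits), e.g. (hypothetical values):
//
//   auto [div, mod] = divMod(_mm_set1_pd(7.0), _mm_set1_pd(3.0));
//   // every lane: div == 2.0, mod == 1.0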
inline __m128i cmpgeq_epi64(__m128i a, __m128i b) noexcept
{
  __m128i cmpGreater = _mm_cmpgt_epi64(a, b);
  __m128i cmpEqual = _mm_cmpeq_epi64(a, b);
  return _mm_or_si128(cmpGreater, cmpEqual);
}
inline __m256i cmpgeq_epi64(__m256i a, __m256i b) noexcept
{
  __m256i cmpGreater = _mm256_cmpgt_epi64(a, b);
  __m256i cmpEqual = _mm256_cmpeq_epi64(a, b);
  return _mm256_or_si256(cmpGreater, cmpEqual);
}
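
// Neither SSE4.2 nor AVX2 offers a direct 64-bit >= comparison, so it is
// composed as (a > b) | (a == b). Note that _mm_cmpgt_epi64 and
// _mm256_cmpgt_epi64 compare lanes as *signed* 64-bit integers.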
inline std::pair<uint32_t, uint32_t> minmax(const uint32_t* begin, const uint32_t* end)
{
  constexpr size_t ElemsPerLane = 4;
  constexpr size_t nUnroll = 2 * ElemsPerLane;
  auto iter = begin;

  uint32_t min = *iter;
  uint32_t max = *iter;

  if (end - nUnroll > begin) {
    __m128i minVec[2];
    __m128i maxVec[2];
    __m128i tmpVec[2];

    minVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
    minVec[1] = minVec[0];
    maxVec[0] = minVec[0];
    maxVec[1] = minVec[0];

    for (; iter < end - nUnroll; iter += nUnroll) {
      tmpVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
      minVec[0] = _mm_min_epu32(minVec[0], tmpVec[0]);
      maxVec[0] = _mm_max_epu32(maxVec[0], tmpVec[0]);

      tmpVec[1] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter) + 1);
      minVec[1] = _mm_min_epu32(minVec[1], tmpVec[1]);
      maxVec[1] = _mm_max_epu32(maxVec[1], tmpVec[1]);

      __builtin_prefetch(iter + 512, 0);
    }

    minVec[0] = _mm_min_epu32(minVec[0], minVec[1]);
    maxVec[0] = _mm_max_epu32(maxVec[0], maxVec[1]);

    uint32_t tmpMin[ElemsPerLane];
    uint32_t tmpMax[ElemsPerLane];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMin), minVec[0]);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMax), maxVec[0]);

    for (size_t i = 0; i < ElemsPerLane; ++i) {
      min = std::min(tmpMin[i], min);
      max = std::max(tmpMax[i], max);
    }
  }

  while (iter != end) {
    min = std::min(*iter, min);
    max = std::max(*iter, max);
    ++iter;
  }

  return {min, max};
}
inline std::pair<int32_t, int32_t> minmax(const int32_t* begin, const int32_t* end)
{
  constexpr size_t ElemsPerLane = 4;
  constexpr size_t nUnroll = 2 * ElemsPerLane;
  auto iter = begin;

  int32_t min = *iter;
  int32_t max = *iter;

  if (end - nUnroll > begin) {
    __m128i minVec[2];
    __m128i maxVec[2];
    __m128i tmpVec[2];

    minVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
    minVec[1] = minVec[0];
    maxVec[0] = minVec[0];
    maxVec[1] = minVec[0];

    for (; iter < end - nUnroll; iter += nUnroll) {
      tmpVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
      minVec[0] = _mm_min_epi32(minVec[0], tmpVec[0]);
      maxVec[0] = _mm_max_epi32(maxVec[0], tmpVec[0]);

      tmpVec[1] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter) + 1);
      minVec[1] = _mm_min_epi32(minVec[1], tmpVec[1]);
      maxVec[1] = _mm_max_epi32(maxVec[1], tmpVec[1]);

      __builtin_prefetch(iter + 512, 0);
    }

    minVec[0] = _mm_min_epi32(minVec[0], minVec[1]);
    maxVec[0] = _mm_max_epi32(maxVec[0], maxVec[1]);

    int32_t tmpMin[ElemsPerLane];
    int32_t tmpMax[ElemsPerLane];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMin), minVec[0]);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMax), maxVec[0]);

    for (size_t i = 0; i < ElemsPerLane; ++i) {
      min = std::min(tmpMin[i], min);
      max = std::max(tmpMax[i], max);
    }
  }

  while (iter != end) {
    min = std::min(*iter, min);
    max = std::max(*iter, max);
    ++iter;
  }

  return {min, max};
}
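
// Usage sketch for the minmax overloads above (hypothetical data): the bulk
// of the range runs through two independent SSE accumulators per iteration,
// the remainder is handled by the scalar tail loop:
//
//   std::vector<uint32_t> data{9u, 1u, 4u, 7u, 2u};
//   auto [lo, hi] = minmax(data.data(), data.data() + data.size());
//   // lo == 1u, hi == 9u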
#endif /* RANS_INTERNAL_COMMON_SIMD_H_ */