Project
Loading...
Searching...
No Matches
simdops.h
Go to the documentation of this file.
1// Copyright 2019-2023 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
15
16#ifndef RANS_INTERNAL_COMMON_SIMD_H_
17#define RANS_INTERNAL_COMMON_SIMD_H_
18
20
21#ifdef RANS_SIMD
22
23#include <immintrin.h>
24
25#include <gsl/span>
26
30
32{
33
34template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
35inline __m128i load(gsl::span<const T, getElementCount<T>(SIMDWidth::SSE)> v) noexcept
36{
37 return _mm_load_si128(reinterpret_cast<const __m128i*>(v.data()));
38};
39
40template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
41inline __m128i load(gsl::span<T, getElementCount<T>(SIMDWidth::SSE)> v) noexcept
42{
43 return _mm_load_si128(reinterpret_cast<const __m128i*>(v.data()));
44};
45
46inline __m128d load(gsl::span<const double_t, getElementCount<double_t>(SIMDWidth::SSE)> v) noexcept
47{
48 return _mm_load_pd(v.data());
49};
50
51inline __m128d load(gsl::span<double_t, getElementCount<double_t>(SIMDWidth::SSE)> v) noexcept
52{
53 return _mm_load_pd(v.data());
54};
55
56template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
57inline __m128i load(const AlignedArray<T, SIMDWidth::SSE>& v) noexcept
58{
59 return _mm_load_si128(reinterpret_cast<const __m128i*>(v.data()));
60};
61
62inline __m128d load(const pd_t<SIMDWidth::SSE>& v) noexcept
63{
64 return _mm_load_pd(v.data());
65};
66
67#ifdef RANS_AVX2
68
69template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
70inline __m256i load(const AlignedArray<T, SIMDWidth::AVX>& v) noexcept
71{
72 return _mm256_load_si256(reinterpret_cast<const __m256i*>(v.data()));
73};
74
75inline __m256d load(const pd_t<SIMDWidth::AVX> v) noexcept
76{
77 return _mm256_load_pd(v.data());
78};
79
80template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
81inline __m256i load(gsl::span<const T, getElementCount<T>(SIMDWidth::AVX)> v) noexcept
82{
83 return _mm256_load_si256(reinterpret_cast<const __m256i*>(v.data()));
84};
85
86template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
87inline __m256i load(gsl::span<T, getElementCount<T>(SIMDWidth::AVX)> v) noexcept
88{
89 return _mm256_load_si256(reinterpret_cast<const __m256i*>(v.data()));
90};
91
92inline __m256d load(gsl::span<double_t, getElementCount<double_t>(SIMDWidth::AVX)> v) noexcept
93{
94 return _mm256_load_pd(v.data());
95};
96
97inline __m256d load(gsl::span<const double_t, getElementCount<double_t>(SIMDWidth::AVX)> v) noexcept
98{
99 return _mm256_load_pd(v.data());
100};
101
102#endif /* RANS_AVX2 */
103
104template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
105inline AlignedArray<T, SIMDWidth::SSE> store(__m128i inVec) noexcept
106{
107 AlignedArray<T, SIMDWidth::SSE> out;
108 _mm_store_si128(reinterpret_cast<__m128i*>(out.data()), inVec);
109 return out;
110};
111
112inline AlignedArray<double_t, SIMDWidth::SSE> store(__m128d inVec) noexcept
113{
114 AlignedArray<double_t, SIMDWidth::SSE> out;
115 _mm_store_pd(out.data(), inVec);
116 return out;
117};
118
119template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
120inline void store(__m128i inVec, gsl::span<T, getElementCount<T>(SIMDWidth::SSE)> v) noexcept
121{
122 _mm_store_si128(reinterpret_cast<__m128i*>(v.data()), inVec);
123};
124
125inline void store(__m128d inVec, gsl::span<double_t, getElementCount<double>(SIMDWidth::SSE)> v) noexcept
126{
127 _mm_store_pd(v.data(), inVec);
128};
129
130#ifdef RANS_AVX2
131
132template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
133inline AlignedArray<T, SIMDWidth::AVX> store(__m256i inVec) noexcept
134{
135 AlignedArray<T, SIMDWidth::AVX> out;
136 _mm256_store_si256(reinterpret_cast<__m256i*>(out.data()), inVec);
137 return out;
138};
139
140inline AlignedArray<double_t, SIMDWidth::AVX> store(__m256d inVec) noexcept
141{
142 AlignedArray<double_t, SIMDWidth::AVX> out;
143 _mm256_store_pd(out.data(), inVec);
144 return out;
145};
146
147template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
148inline void store(__m256i inVec, gsl::span<T, getElementCount<T>(SIMDWidth::AVX)> v) noexcept
149{
150 _mm256_store_si256(reinterpret_cast<__m256i*>(v.data()), inVec);
151};
152
153inline void store(__m256d inVec, gsl::span<double_t, getElementCount<double>(SIMDWidth::AVX)> v) noexcept
154{
155 _mm256_store_pd(v.data(), inVec);
156};
157
158#endif /* RANS_AVX2 */
159
160template <SIMDWidth width_V>
161inline auto setAll(uint64_t value) noexcept
162{
163 if constexpr (width_V == SIMDWidth::SSE) {
164 return _mm_set1_epi64x(value);
165 } else {
166 return _mm256_set1_epi64x(value);
167 }
168};
169
170template <SIMDWidth width_V>
171inline auto setAll(uint32_t value) noexcept
172{
173 if constexpr (width_V == SIMDWidth::SSE) {
174 return _mm_set1_epi32(value);
175 } else {
176 return _mm256_set1_epi32(value);
177 }
178};
179
180template <SIMDWidth width_V>
181inline auto setAll(uint16_t value) noexcept
182{
183 if constexpr (width_V == SIMDWidth::SSE) {
184 return _mm_set1_epi16(value);
185 } else {
186 return _mm256_set1_epi16(value);
187 }
188};
189
190template <SIMDWidth width_V>
191inline auto setAll(uint8_t value) noexcept
192{
193 if constexpr (width_V == SIMDWidth::SSE) {
194 return _mm_set1_epi8(value);
195 } else {
196 return _mm256_set1_epi8(value);
197 }
198};
199
200template <SIMDWidth width_V>
201inline auto setAll(double_t value) noexcept
202{
203 if constexpr (width_V == SIMDWidth::SSE) {
204 return _mm_set1_pd(value);
205 } else {
206 return _mm256_set1_pd(value);
207 }
208};
209
210//
211// uint32 -> double
212//
213template <SIMDWidth width_V>
214inline auto int32ToDouble(__m128i in) noexcept
215{
216 if constexpr (width_V == SIMDWidth::SSE) {
217 return _mm_cvtepi32_pd(in);
218 } else if constexpr (width_V == SIMDWidth::AVX) {
219#ifdef RANS_AVX2
220 return _mm256_cvtepi32_pd(in);
221#endif /* RANS_AVX2 */
222 }
223};
224
225//
226// uint64 -> double
227// Only works for inputs in the range: [0, 2^52)
228//
229
// Adding 2^52 to any IEEE754 double precision floating point number in the range [0, 2^52)
// leaves the exponent and sign bits fixed, and the mantissa becomes precisely the integer
// representation of the value — the basis of the uint64<->double conversions below.
inline constexpr double AlignMantissaMagic = 0x0010000000000000; // 2^52 (0x433 exponent pattern)
233
234inline __m128d uint64ToDouble(__m128i in) noexcept
235{
236#if !defined(NDEBUG)
237 auto vec = store<uint64_t>(in);
238 for (auto i : gsl::make_span(vec)) {
239 assert(i < utils::pow2(52));
240 }
241#endif
242 in = _mm_or_si128(in, _mm_castpd_si128(_mm_set1_pd(AlignMantissaMagic)));
243 __m128d out = _mm_sub_pd(_mm_castsi128_pd(in), _mm_set1_pd(AlignMantissaMagic));
244 return out;
245};
246
247#ifdef RANS_AVX2
248//
249// uint64 -> double
250// Only works for inputs in the range: [0, 2^52)
251//
252inline __m256d uint64ToDouble(__m256i in) noexcept
253{
254#if !defined(NDEBUG)
255 auto vec = store<uint64_t>(in);
256 for (auto i : gsl::make_span(vec)) {
257 assert(i < utils::pow2(52));
258 }
259#endif
260 in = _mm256_or_si256(in, _mm256_castpd_si256(_mm256_set1_pd(AlignMantissaMagic)));
261 __m256d out = _mm256_sub_pd(_mm256_castsi256_pd(in), _mm256_set1_pd(AlignMantissaMagic));
262 return out;
263};
264#endif /* RANS_AVX2 */
265
266inline __m128i doubleToUint64(__m128d in) noexcept
267{
268#if !defined(NDEBUG)
269 auto vec = store(in);
270 for (auto i : gsl::make_span(vec)) {
271 assert(i < utils::pow2(52));
272 }
273#endif
274 in = _mm_add_pd(in, _mm_set1_pd(AlignMantissaMagic));
275 __m128i out = _mm_xor_si128(_mm_castpd_si128(in),
276 _mm_castpd_si128(_mm_set1_pd(AlignMantissaMagic)));
277 return out;
278}
279
280#ifdef RANS_AVX2
281
282inline __m256i doubleToUint64(__m256d in) noexcept
283{
284#if !defined(NDEBUG)
285 auto vec = store(in);
286 for (auto i : gsl::make_span(vec)) {
287 assert(i < utils::pow2(52));
288 }
289#endif
290
291 in = _mm256_add_pd(in, _mm256_set1_pd(AlignMantissaMagic));
292 __m256i out = _mm256_xor_si256(_mm256_castpd_si256(in),
293 _mm256_castpd_si256(_mm256_set1_pd(AlignMantissaMagic)));
294 return out;
295}
296
297#endif /* RANS_AVX2 */
298
/// Result pair of a fused division/modulo operation; specialized per SIMD width.
template <SIMDWidth>
struct DivMod;

template <>
struct DivMod<SIMDWidth::SSE> {
  __m128d div; ///< element-wise floor(numerator / denominator)
  __m128d mod; ///< element-wise numerator - div * denominator
};
307
// calculate both floor(a/b) and a%b
/// Computes the element-wise quotient floor(numerator/denominator) and the matching
/// remainder in a single pass. With FMA available the remainder is one fused
/// negative-multiply-add (numerator - div*denominator); otherwise it falls back to a
/// separate multiply and subtract (which may round the intermediate product).
inline DivMod<SIMDWidth::SSE>
  divMod(__m128d numerator, __m128d denominator) noexcept
{
  __m128d div = _mm_floor_pd(_mm_div_pd(numerator, denominator));
#ifdef RANS_FMA
  __m128d mod = _mm_fnmadd_pd(div, denominator, numerator);
#else /* !defined(RANS_FMA) */
  __m128d mod = _mm_mul_pd(div, denominator);
  mod = _mm_sub_pd(numerator, mod);
#endif
  return {div, mod};
}
321
322#ifdef RANS_AVX2
323
/// AVX specialization of the division/modulo result pair.
template <>
struct DivMod<SIMDWidth::AVX> {
  __m256d div; ///< element-wise floor(numerator / denominator)
  __m256d mod; ///< element-wise numerator - div * denominator
};
329
330// calculate both floor(a/b) and a%b
331inline DivMod<SIMDWidth::AVX> divMod(__m256d numerator, __m256d denominator) noexcept
332{
333 __m256d div = _mm256_floor_pd(_mm256_div_pd(numerator, denominator));
334 __m256d mod = _mm256_fnmadd_pd(div, denominator, numerator);
335 return {div, mod};
336}
337#endif /* RANS_AVX2 */
338
339inline __m128i cmpgeq_epi64(__m128i a, __m128i b) noexcept
340{
341 __m128i cmpGreater = _mm_cmpgt_epi64(a, b);
342 __m128i cmpEqual = _mm_cmpeq_epi64(a, b);
343 return _mm_or_si128(cmpGreater, cmpEqual);
344};
345
346#ifdef RANS_AVX2
347inline __m256i cmpgeq_epi64(__m256i a, __m256i b) noexcept
348{
349 __m256i cmpGreater = _mm256_cmpgt_epi64(a, b);
350 __m256i cmpEqual = _mm256_cmpeq_epi64(a, b);
351 return _mm256_or_si256(cmpGreater, cmpEqual);
352};
353#endif /* RANS_AVX2 */
354
355inline std::pair<uint32_t, uint32_t> minmax(const uint32_t* begin, const uint32_t* end)
356{
357 constexpr size_t ElemsPerLane = 4;
358 constexpr size_t nUnroll = 2 * ElemsPerLane;
359 auto iter = begin;
360
361 uint32_t min = *iter;
362 uint32_t max = *iter;
363
364 if (end - nUnroll > begin) {
365
366 __m128i minVec[2];
367 __m128i maxVec[2];
368 __m128i tmpVec[2];
369
370 minVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
371 minVec[1] = minVec[0];
372 maxVec[0] = minVec[0];
373 maxVec[1] = minVec[0];
374
375 for (; iter < end - nUnroll; iter += nUnroll) {
376 tmpVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
377 minVec[0] = _mm_min_epu32(minVec[0], tmpVec[0]);
378 maxVec[0] = _mm_max_epu32(maxVec[0], tmpVec[0]);
379
380 tmpVec[1] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter) + 1);
381 minVec[1] = _mm_min_epu32(minVec[1], tmpVec[1]);
382 maxVec[1] = _mm_max_epu32(maxVec[1], tmpVec[1]);
383
384 __builtin_prefetch(iter + 512, 0);
385 }
386
387 minVec[0] = _mm_min_epu32(minVec[0], minVec[1]);
388 maxVec[0] = _mm_max_epu32(maxVec[0], maxVec[1]);
389
390 uint32_t tmpMin[ElemsPerLane];
391 uint32_t tmpMax[ElemsPerLane];
392 _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMin), minVec[0]);
393 _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMax), maxVec[0]);
394
395 for (size_t i = 0; i < ElemsPerLane; ++i) {
396 min = std::min(tmpMin[i], min);
397 max = std::max(tmpMax[i], max);
398 }
399 }
400
401 while (iter != end) {
402 min = std::min(*iter, min);
403 max = std::max(*iter, max);
404 ++iter;
405 }
406
407 return {min, max};
408};
409
410inline std::pair<int32_t, int32_t> minmax(const int32_t* begin, const int32_t* end)
411{
412 constexpr size_t ElemsPerLane = 4;
413 constexpr size_t nUnroll = 2 * ElemsPerLane;
414 auto iter = begin;
415
416 int32_t min = *iter;
417 int32_t max = *iter;
418
419 if (end - nUnroll > begin) {
420
421 __m128i minVec[2];
422 __m128i maxVec[2];
423 __m128i tmpVec[2];
424
425 minVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
426 minVec[1] = minVec[0];
427 maxVec[0] = minVec[0];
428 maxVec[1] = minVec[0];
429
430 for (; iter < end - nUnroll; iter += nUnroll) {
431 tmpVec[0] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter));
432 minVec[0] = _mm_min_epi32(minVec[0], tmpVec[0]);
433 maxVec[0] = _mm_max_epi32(maxVec[0], tmpVec[0]);
434
435 tmpVec[1] = _mm_loadu_si128(reinterpret_cast<const __m128i_u*>(iter) + 1);
436 minVec[1] = _mm_min_epi32(minVec[1], tmpVec[1]);
437 maxVec[1] = _mm_max_epi32(maxVec[1], tmpVec[1]);
438
439 __builtin_prefetch(iter + 512, 0);
440 }
441
442 minVec[0] = _mm_min_epi32(minVec[0], minVec[1]);
443 maxVec[0] = _mm_max_epi32(maxVec[0], maxVec[1]);
444
445 int32_t tmpMin[ElemsPerLane];
446 int32_t tmpMax[ElemsPerLane];
447 _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMin), minVec[0]);
448 _mm_storeu_si128(reinterpret_cast<__m128i*>(tmpMax), maxVec[0]);
449
450 for (size_t i = 0; i < ElemsPerLane; ++i) {
451 min = std::min(tmpMin[i], min);
452 max = std::max(tmpMax[i], max);
453 }
454 }
455
456 while (iter != end) {
457 min = std::min(*iter, min);
458 max = std::max(*iter, max);
459 ++iter;
460 }
461
462 return {min, max};
463};
464
465} // namespace o2::rans::internal::simd
466
467#endif /* RANS_SIMD */
468
469#endif /* RANS_INTERNAL_COMMON_SIMD_H_ */
Memory aligned array used for SIMD operations.
int32_t i
common helper classes and functions
preprocessor defines to enable features based on CPU architecture
GLuint GLuint end
Definition glcorearb.h:469
const GLdouble * v
Definition glcorearb.h:832
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLsizei const GLfloat * value
Definition glcorearb.h:819
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
auto make_span(const o2::rans::internal::simd::AlignedArray< T, width_V, size_V > &array)
std::pair< source_T, source_T > minmax(gsl::span< const source_T > range)
constexpr size_t pow2(size_t n) noexcept
Definition utils.h:165
Enum< T >::Iterator begin(Enum< T >)
Definition Defs.h:173
basic SIMD datatypes and traits
constexpr size_t min
constexpr size_t max
std::vector< o2::ctf::BufferType > vec