Project
Loading...
Searching...
No Matches
simdKernel.h
Go to the documentation of this file.
1// Copyright 2019-2023 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
15
16#ifndef RANS_INTERNAL_ENCODE_SIMDKERNEL_H_
17#define RANS_INTERNAL_ENCODE_SIMDKERNEL_H_
18
20
21#ifdef RANS_SIMD
22
23#include <immintrin.h>
24
25#include <array>
26
27#include <gsl/span>
28
34
36{
37//
38// rans Encode
39//
40inline __m128i ransEncode(__m128i state, __m128d frequency, __m128d cumulative, __m128d normalization) noexcept
41{
42#if !defined(NDEBUG)
43 auto vec = store<uint64_t>(state);
44 for (auto i : gsl::make_span(vec)) {
45 assert(i < utils::pow2(52));
46 }
47#endif
48
49 auto [div, mod] = divMod(uint64ToDouble(state), frequency);
50#ifdef RANS_FMA
51 auto newState = _mm_fmadd_pd(normalization, div, cumulative);
52#else /* !defined(RANS_FMA) */
53 auto newState = _mm_mul_pd(normalization, div);
54 newState = _mm_add_pd(newState, cumulative);
55#endif /* RANS_FMA */
56 newState = _mm_add_pd(newState, mod);
57
58 return doubleToUint64(newState);
59};
60#ifdef RANS_AVX2
61
62//
63// rans Encode
64//
65inline __m256i ransEncode(__m256i state, __m256d frequency, __m256d cumulative, __m256d normalization) noexcept
66{
67#if !defined(NDEBUG)
68 auto vec = store<uint64_t>(state);
69 for (auto i : gsl::make_span(vec)) {
70 assert(i < utils::pow2(52));
71 }
72#endif
73
74 auto [div, mod] = divMod(uint64ToDouble(state), frequency);
75 auto newState = _mm256_fmadd_pd(normalization, div, cumulative);
76 newState = _mm256_add_pd(newState, mod);
77
78 return doubleToUint64(newState);
79};
80
81#endif /* RANS_AVX2 */
82
83inline void aosToSoa(gsl::span<const Symbol*, 2> in, __m128i* __restrict__ frequency, __m128i* __restrict__ cumulatedFrequency) noexcept
84{
85 __m128i in0Reg = _mm_loadu_si128(reinterpret_cast<__m128i const*>(in[0]->data()));
86 __m128i in1Reg = _mm_loadu_si128(reinterpret_cast<__m128i const*>(in[1]->data()));
87
88 *frequency = _mm_unpacklo_epi32(in0Reg, in1Reg);
89 *cumulatedFrequency = _mm_shuffle_epi32(*frequency, _MM_SHUFFLE(0, 0, 3, 2));
90}
91
92inline void aosToSoa(gsl::span<const Symbol*, 4> in, __m128i* __restrict__ frequency, __m128i* __restrict__ cumulatedFrequency) noexcept
93{
94 __m128i in0Reg = _mm_loadu_si128(reinterpret_cast<__m128i const*>(in[0]->data()));
95 __m128i in1Reg = _mm_loadu_si128(reinterpret_cast<__m128i const*>(in[1]->data()));
96 __m128i in2Reg = _mm_loadu_si128(reinterpret_cast<__m128i const*>(in[2]->data()));
97 __m128i in3Reg = _mm_loadu_si128(reinterpret_cast<__m128i const*>(in[3]->data()));
98
99 __m128i merged0Reg = _mm_unpacklo_epi32(in0Reg, in1Reg);
100 __m128i merged1Reg = _mm_unpacklo_epi32(in2Reg, in3Reg);
101 *frequency = _mm_unpacklo_epi64(merged0Reg, merged1Reg);
102 *cumulatedFrequency = _mm_unpackhi_epi64(merged0Reg, merged1Reg);
103};
104
105template <SIMDWidth width_V, uint64_t lowerBound_V, uint8_t streamBits_V>
106inline auto computeMaxState(__m128i frequencyVec, uint8_t symbolTablePrecisionBits) noexcept
107{
108 const uint64_t xmax = (lowerBound_V >> symbolTablePrecisionBits) << streamBits_V;
109 const uint8_t shift = log2UIntNZ(xmax);
110 if constexpr (width_V == SIMDWidth::SSE) {
111 __m128i frequencyVecEpi64 = _mm_cvtepi32_epi64(frequencyVec);
112 return _mm_slli_epi64(frequencyVecEpi64, shift);
113 }
114 if constexpr (width_V == SIMDWidth::AVX) {
115#ifdef RANS_AVX2
116 __m256i frequencyVecEpi64 = _mm256_cvtepi32_epi64(frequencyVec);
117 return _mm256_slli_epi64(frequencyVecEpi64, shift);
118#endif /* RANS_AVX2 */
119 }
120};
121
/// Applies the renormalization shift to the selected states.
///
/// newState = cmpVec ? (stateVec >> streamBits_V) : stateVec
/// The selection is done per byte on the most significant bit of cmpVec, so cmpVec is
/// expected to be all-ones or all-zeros within each 64-bit lane.
///
/// \tparam streamBits_V number of bits every selected state is shifted right by
/// \param stateVec      two 64-bit states
/// \param cmpVec        selection mask
template <uint8_t streamBits_V>
inline __m128i computeNewState(__m128i stateVec, __m128i cmpVec) noexcept
{
  return _mm_blendv_epi8(stateVec, _mm_srli_epi64(stateVec, streamBits_V), cmpVec);
}
130
131#ifdef RANS_AVX2
/// Applies the renormalization shift to the selected states (AVX2 variant).
///
/// newState = cmpVec ? (stateVec >> streamBits_V) : stateVec
/// The selection is done per byte on the most significant bit of cmpVec, so cmpVec is
/// expected to be all-ones or all-zeros within each 64-bit lane.
///
/// \tparam streamBits_V number of bits every selected state is shifted right by
/// \param stateVec      four 64-bit states
/// \param cmpVec        selection mask
template <uint8_t streamBits_V>
inline __m256i computeNewState(__m256i stateVec, __m256i cmpVec) noexcept
{
  return _mm256_blendv_epi8(stateVec, _mm256_srli_epi64(stateVec, streamBits_V), cmpVec);
}
140
141#endif /* RANS_AVX2 */
142
// Byte-shuffle controls used by the SSE streamOut(): the table is indexed with the 4-bit
// mask obtained from _mm_movemask_ps and the selected row is passed to _mm_shuffle_epi8.
// Every group of four bytes 4*k .. 4*k+3 copies 32-bit lane k of the source; 0xFF has the
// most significant bit set, which makes _mm_shuffle_epi8 write a zero byte.
// Selected lanes are packed to the front: first the set odd lanes in descending order,
// then the set even lanes in descending order. The trailing comment of each row shows
// the mask and the resulting lane order as hex digits (F = unused slot).
143inline constexpr std::array<epi8_t<SIMDWidth::SSE>, 16>
144 SSEStreamOutLUT{{
145 {0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0000 -> 0xFFFF
146 {0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0001 -> 0x0FFF
147 {0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0010 -> 0x1FFF
148 {0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0011 -> 0x10FF
149 {0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0100 -> 0x2FFF
150 {0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0101 -> 0x20FF
151 {0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0110 -> 0x12FF
152 {0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b0111 -> 0x120F
153 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1000 -> 0x3FFF
154 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1001 -> 0x30FF
155 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1010 -> 0x31FF
156 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1011 -> 0x310F
157 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1100 -> 0x32FF
158 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1101 -> 0x320F
159 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8, 0xFF_u8}, // 0b1110 -> 0x312F
160 {0x0C_u8, 0x0D_u8, 0x0E_u8, 0x0F_u8, 0x04_u8, 0x05_u8, 0x06_u8, 0x07_u8, 0x08_u8, 0x09_u8, 0x0A_u8, 0x0B_u8, 0x00_u8, 0x01_u8, 0x02_u8, 0x03_u8} // 0b1111 -> 0x3120
161 }};
162
/// Permutation table used by the AVX2 streamOut(), indexed by the 8-bit comparison mask.
///
/// Every entry packs eight 4-bit lane indices, most significant nibble first:
/// the set bits at odd positions in descending order (7, 5, 3, 1), followed by the set
/// bits at even positions in descending order (6, 4, 2, 0). Unused nibbles stay 0xF.
///
/// Examples:
///   0b00000000 -> 0xFFFFFFFF
///   0b00010110 -> 0x142FFFFF   (odd: 1, even: 4, 2)
///   0b11111111 -> 0x75316420
///
/// The table is generated at compile time and holds the same 256 values as the
/// hand-written literal list it replaces.
inline constexpr std::array<uint32_t, 256> AVXStreamOutLUT = []() {
  std::array<uint32_t, 256> table{};
  for (uint32_t mask = 0; mask < 256u; ++mask) {
    uint32_t packed = 0xFFFFFFFFu; // every slot starts out as "unused"
    int slotShift = 28;            // bit offset of the next free nibble, top nibble first
    // parity 1 walks bits 7, 5, 3, 1; parity 0 walks bits 6, 4, 2, 0
    for (int parity = 1; parity >= 0; --parity) {
      for (int bit = 6 + parity; bit >= 0; bit -= 2) {
        if (((mask >> bit) & 1u) != 0u) {
          packed &= ~(0xFu << slotShift);
          packed |= static_cast<uint32_t>(bit) << slotShift;
          slotShift -= 4;
        }
      }
    }
    table[mask] = packed;
  }
  return table;
}();
421
// Return type of streamOut(). The primary template is only declared; one
// specialization exists per supported SIMD width.
422template <SIMDWidth>
423struct StreamOutResult;
424
// SSE specialization, filled by the __m128i overload of streamOut().
425template <>
426struct StreamOutResult<SIMDWidth::SSE> {
427 uint32_t nElemens; // number of selected lanes (streamOut sets it to the popcount of the comparison mask)
428 __m128i streamOutVec; // shuffled state words; streamOut packs the selected 32-bit lanes to the front
429};
430
431inline StreamOutResult<SIMDWidth::SSE> streamOut(const __m128i* __restrict__ stateVec, const __m128i* __restrict__ cmpVec) noexcept
432{
433 auto shifted1 = _mm_slli_epi64(stateVec[1], 32);
434
435 __m128i statesFused = _mm_blend_epi16(stateVec[0], shifted1, 0b11001100);
436 __m128i cmpFused = _mm_blend_epi16(cmpVec[0], cmpVec[1], 0b11001100);
437 const uint32_t id = _mm_movemask_ps(_mm_castsi128_ps(cmpFused));
438
439 __m128i permutationMask = load(SSEStreamOutLUT[id]);
440 __m128i streamOutVec = _mm_shuffle_epi8(statesFused, permutationMask);
441
442 return {static_cast<uint32_t>(_mm_popcnt_u32(id)), streamOutVec};
443};
444
445#ifdef RANS_AVX2
// AVX specialization, filled by the __m256i overload of streamOut().
446template <>
447struct StreamOutResult<SIMDWidth::AVX> {
448 uint32_t nElemens; // number of selected lanes (streamOut sets it to the popcount of the comparison mask)
449 __m256i streamOutVec; // permuted state words; streamOut packs the selected 32-bit lanes to the front
450};
451
452inline StreamOutResult<SIMDWidth::AVX> streamOut(const __m256i* __restrict__ stateVec, const __m256i* __restrict__ cmpVec) noexcept
453{
454 auto shifted1 = _mm256_slli_epi64(stateVec[1], 32);
455
456 __m256i statesFused = _mm256_blend_epi32(stateVec[0], shifted1, 0b10101010);
457 __m256i cmpFused = _mm256_blend_epi32(cmpVec[0], cmpVec[1], 0b10101010);
458 statesFused = _mm256_and_si256(statesFused, cmpFused);
459 const uint32_t id = _mm256_movemask_ps(_mm256_castsi256_ps(cmpFused));
460
461 __m256i permutationMask = _mm256_set1_epi32(AVXStreamOutLUT[id]);
462 constexpr epi32_t<SIMDWidth::AVX> mask{0xF0000000u, 0x0F000000u, 0x00F00000u, 0x000F0000u, 0x0000F000u, 0x00000F00u, 0x000000F0u, 0x0000000Fu};
463 permutationMask = _mm256_and_si256(permutationMask, load(mask));
464 constexpr epi32_t<SIMDWidth::AVX> shift{28u, 24u, 20u, 16u, 12u, 8u, 4u, 0u};
465 permutationMask = _mm256_srlv_epi32(permutationMask, load(shift));
466 __m256i streamOutVec = _mm256_permutevar8x32_epi32(statesFused, permutationMask);
467
468 return {static_cast<uint32_t>(_mm_popcnt_u32(id)), streamOutVec};
469};
470
471#endif /* RANS_AVX2 */
472
// Bundle of an output iterator and a state vector. The primary template is only
// declared; one specialization exists per supported SIMD width.
// NOTE(review): the ransRenorm overloads visible in this header return the iterator and
// write the state through an out-parameter instead of using this type — confirm whether
// it is still needed.
473template <SIMDWidth, typename output_IT>
474struct RenormResult;
475
// SSE specialization.
476template <typename output_IT>
477struct RenormResult<SIMDWidth::SSE, output_IT> {
478 output_IT outputIter; // output iterator
479 __m128i newState; // state vector (128 bit)
480};
481
482#ifdef RANS_AVX2
// AVX specialization of RenormResult: an output iterator plus a 256-bit state vector.
483template <typename output_IT>
484struct RenormResult<SIMDWidth::AVX, output_IT> {
485 output_IT outputIter; // output iterator
486 __m256i newState; // state vector (256 bit)
487};
488#endif /* RANS_AVX2 */
489
490template <typename output_IT, uint64_t lowerBound_V, uint8_t streamBits_V>
491inline output_IT ransRenorm(const __m128i* __restrict__ state, const __m128i* __restrict__ frequency, uint8_t symbolTablePrecisionBits, output_IT outputIter, __m128i* __restrict__ newState) noexcept
492{
493 __m128i maxState[2];
494 __m128i cmp[2];
495
496 // calculate maximum state
497 maxState[0] = computeMaxState<SIMDWidth::SSE, lowerBound_V, streamBits_V>(frequency[0], symbolTablePrecisionBits);
498 maxState[1] = computeMaxState<SIMDWidth::SSE, lowerBound_V, streamBits_V>(frequency[1], symbolTablePrecisionBits);
499 // cmp = (state >= maxState)
500 cmp[0] = cmpgeq_epi64(state[0], maxState[0]);
501 cmp[1] = cmpgeq_epi64(state[1], maxState[1]);
502 // newState = (state >= maxState) ? state >> streamBits_V : state
503 newState[0] = computeNewState<streamBits_V>(state[0], cmp[0]);
504 newState[1] = computeNewState<streamBits_V>(state[1], cmp[1]);
505
506 auto [nStreamOutWords, streamOutResult] = streamOut(state, cmp);
507 if constexpr (std::is_pointer_v<output_IT>) {
508 _mm_storeu_si128(reinterpret_cast<__m128i*>(outputIter), streamOutResult);
509 outputIter += nStreamOutWords;
510 } else {
511 auto result = store<uint32_t>(streamOutResult);
512 for (size_t i = 0; i < nStreamOutWords; ++i) {
513 *outputIter = result(i);
514 ++outputIter;
515 }
516 }
517
518 return outputIter;
519};
520
521#ifdef RANS_AVX2
522template <typename output_IT, uint64_t lowerBound_V, uint8_t streamBits_V>
523inline output_IT ransRenorm(const __m256i* state, const __m128i* __restrict__ frequency, uint8_t symbolTablePrecisionBits, output_IT outputIter, __m256i* __restrict__ newState) noexcept
524{
525 __m256i maxState[2];
526 __m256i cmp[2];
527
528 // calculate maximum state
529 maxState[0] = computeMaxState<SIMDWidth::AVX, lowerBound_V, streamBits_V>(frequency[0], symbolTablePrecisionBits);
530 maxState[1] = computeMaxState<SIMDWidth::AVX, lowerBound_V, streamBits_V>(frequency[1], symbolTablePrecisionBits);
531 // cmp = (state >= maxState)
532 cmp[0] = cmpgeq_epi64(state[0], maxState[0]);
533 cmp[1] = cmpgeq_epi64(state[1], maxState[1]);
534 // newState = (state >= maxState) ? state >> streamBits_V : state
535 newState[0] = computeNewState<streamBits_V>(state[0], cmp[0]);
536 newState[1] = computeNewState<streamBits_V>(state[1], cmp[1]);
537
538 auto [nStreamOutWords, streamOutResult] = streamOut(state, cmp);
539 if constexpr (std::is_pointer_v<output_IT>) {
540 _mm256_storeu_si256(reinterpret_cast<__m256i*>(outputIter), streamOutResult);
541 outputIter += nStreamOutWords;
542 } else {
543 auto result = store<uint32_t>(streamOutResult);
544 for (size_t i = 0; i < nStreamOutWords; ++i) {
545 *outputIter = result(i);
546 ++outputIter;
547 }
548 }
549
550 return outputIter;
551};
552#endif /* RANS_AVX2 */
553
// Symbol data in struct-of-arrays form: two SSE registers of frequencies and two of
// cumulative frequencies.
// NOTE(review): presumably filled via aosToSoa() and consumed as the `frequency`
// argument of ransRenorm(), which reads frequency[0] and frequency[1] — confirm at the
// call sites.
554struct UnrolledSymbols {
555 __m128i frequencies[2]; // two registers of packed frequencies
556 __m128i cumulativeFrequencies[2]; // two registers of packed cumulative frequencies
557};
558
559} // namespace o2::rans::internal::simd
560
561#endif /* RANS_SIMD */
562#endif /* RANS_INTERNAL_ENCODE_SIMDKERNEL_H_ */
Memory aligned array used for SIMD operations.
benchmark::State & state
int32_t i
Contains statistical information for one source symbol, required for encoding/decoding.
common helper classes and functions
preprocessor defines to enable features based on CPU architecture
GLuint64EXT * result
Definition glcorearb.h:5662
GLboolean * data
Definition glcorearb.h:298
GLint GLuint mask
Definition glcorearb.h:291
auto make_span(const o2::rans::internal::simd::AlignedArray< T, width_V, size_V > &array)
uint8_t itsSharedClusterMap uint8_t
constexpr T log2UIntNZ(T x) noexcept
Definition utils.h:90
constexpr size_t pow2(size_t n) noexcept
Definition utils.h:165
wrapper around basic SIMD operations
basic SIMD datatypes and traits
std::vector< o2::ctf::BufferType > vec
char const *restrict const cmp
Definition x9.h:96