// Decodes one set of raw inputs taken from the processing context `pc`.
// Visible stages: (1) warn about zero-size payloads, (2) group raw pages by orbit slot and FEE id,
// (3) first pass per orbit: count channels per BC for every link and unpack the TCM words,
// (4) size mVecDigits / mVecChannelData, (5) second pass: unpack channel words with AVX-512
// gathers/scatters into mVecChannelDataBuf and copy them into mVecChannelData, (6) log the elapsed time.
// NOTE(review): this listing carries fused line numbers and several original lines are absent
// (closing braces and the declarations of e.g. `payloadSize`, `src`, `dst`, `ptrDstTCM`, `verRDH`,
// `rdhPtr`, `firstBC`, `nGBTwordPrevChunk`, `maxWarn`, `dummy`); comments only describe what is visible.
28void FT0DataDecoderDPLSpec::run(ProcessingContext& pc)
// Start of the wall-clock measurement reported at the end of the function.
30 auto t1 = std::chrono::high_resolution_clock::now();
// Helper that empties both output vectors; the rest of its body is not visible here.
// Presumably used for the "sending dummy output" paths logged below - TODO confirm.
31 auto dummyOutput = [&pc,
this]() {
32 this->mVecDigits.resize(0);
34 this->mVecChannelData.resize(0);
// Function-local static: the count of zero-payload inputs persists across calls to run().
41 static size_t contDeadBeef = 0;
43 for (
const auto&
ref : InputRecordWalker(pc.inputs(), dummy)) {
44 const auto dh = o2::framework::DataRefUtils::getHeader<o2::header::DataHeader*>(
ref);
46 if (payloadSize == 0) {
// Report at most `maxWarn` times; the last report appends the "stopping reporting" suffix.
48 if (++contDeadBeef <= maxWarn) {
49 LOGP(alarm,
"Found input [{}/{}/{:#x}] TF#{} 1st_orbit:{} Payload {} : assuming no payload for all links in this TF{}",
50 dh->dataOrigin.str, dh->dataDescription.str, dh->subSpecification, dh->tfCounter, dh->firstTForbit, payloadSize,
51 contDeadBeef == maxWarn ? fmt::format(
". {} such inputs in row received, stopping reporting", contDeadBeef) :
"");
// Page-by-page iterator over the raw inputs selected by `filter`.
60 DPLRawParser parser(pc.inputs(),
filter);
// Per-orbit bookkeeping. Non-TCM links are indexed [orbit slot][FEE id]; TCM pages are kept separately.
// The *SizePages arrays accumulate payload bytes and double as "this orbit has data" flags.
62 using ArrRdhPtrPerLink = std::array<std::vector<const o2::header::RAWDataHeader*>, sNlinksMax>;
63 using ArrDataPerLink = std::array<std::vector<gsl::span<const uint8_t>>, sNlinksMax>;
64 std::array<ArrRdhPtrPerLink, sNorbits> arrRdhPtrPerOrbit{};
65 std::array<ArrDataPerLink, sNorbits> arrDataPerOrbit{};
66 std::array<std::vector<const o2::header::RAWDataHeader*>, sNorbits> arrRdhTCMperOrbit{};
67 std::array<std::vector<gsl::span<const uint8_t>>, sNorbits> arrDataTCMperOrbit{};
68 std::array<std::size_t, sNorbits> arrOrbitSizePages{};
69 std::array<std::size_t, sNorbits> arrOrbitSizePagesTCM{};
70 std::array<uint64_t, sNorbits> arrOrbit{};
72 for (
auto it = parser.begin(),
end = parser.end(); it !=
end; ++it) {
// Only RDH versions from 5 up to the highest one known to RDHUtils are accepted.
80 if (verRDH < 5 || verRDH > o2::raw::RDHUtils::getVersion<o2::header::RDHHighest>()) {
81 LOGP(alarm,
"Invalid RDH version {}, abandoning TF sending dummy output", verRDH);
85 }
catch (std::exception& e) {
86 LOG(alarm) <<
"Failed to extract RDH, abandoning TF sending dummy output, exception was: " << e.what();
// The orbit slot is the heartbeat orbit modulo 256.
// NOTE(review): the literal 256 is used here while the arrays are sized by sNorbits - confirm they agree.
90 auto orb = o2::raw::RDHUtils::getHeartBeatOrbit(rdhPtr);
91 const uint16_t orbitTF = (orb) % 256;
93 arrOrbit[orbitTF] = orb;
// FEE id combines link and end point: 12 link slots per end point.
94 const auto& linkID = o2::raw::RDHUtils::getLinkID(rdhPtr);
95 const auto& endPoint = o2::raw::RDHUtils::getEndPointID(rdhPtr);
96 const uint16_t feeID = linkID + 12 * endPoint;
// TCM pages and the other links' pages go to separate containers.
// The stores of the RDH pointers themselves are not visible in this listing.
97 if (feeID == mFEEID_TCM) {
99 arrOrbitSizePagesTCM[orbitTF] += it.size();
101 arrDataTCMperOrbit[orbitTF].emplace_back(it.data(), it.size());
103 arrOrbitSizePages[orbitTF] += it.size();
105 arrDataPerOrbit[orbitTF][feeID].emplace_back(it.data(), it.size());
// Running totals across orbits: channel-data position and event (digit) position.
108 uint64_t chPosOrbit{0};
109 uint64_t eventPosPerOrbit{0};
// Per-orbit results of the first pass, consumed by the second pass below.
111 uint64_t posChDataPerOrbit[sNorbits]{};
112 uint64_t nChDataPerOrbit[sNorbits]{};
113 NChDataBC_t posChDataPerBC[sNorbits]{};
// ---- First pass: per orbit, count channels per BC for each link and unpack the TCM words. ----
114 for (
int iOrbit = 0; iOrbit < sNorbits; iOrbit++) {
115 const auto&
orbit = arrOrbit[iOrbit];
// bufBC receives one snapshot of the per-BC counters per link; buf_nChPerBC is declared outside
// the link loop, so in the visible code it keeps accumulating over the links of this orbit.
116 NChDataOrbitBC_t bufBC{};
117 NChDataBC_t buf_nChPerBC{};
118 if (arrOrbitSizePages[iOrbit] > 0) {
119 for (
int iFeeID = 0; iFeeID < sNlinksMax; iFeeID++) {
120 if (iFeeID == mFEEID_TCM) {
123 const auto& nPages = arrRdhPtrPerOrbit[iOrbit][iFeeID].size();
125 for (
int iPage = 0; iPage < nPages; iPage++) {
126 const auto& rdhPtr = arrRdhPtrPerOrbit[iOrbit][iFeeID][iPage];
127 const auto& payload = arrDataPerOrbit[iOrbit][iFeeID][iPage].data();
128 const auto& payloadSize = arrDataPerOrbit[iOrbit][iFeeID][iPage].size();
// The payload is handled as 16-byte words, processed 16 words (256 bytes) per chunk.
// `mask` enables only the valid lanes of a partial last chunk: the top nNGBTwordsDiff bits,
// or all 16 bits when the word count is a multiple of 16.
130 const auto nNGBTwords = payloadSize / 16;
131 const int nNGBTwordsDiff = nNGBTwords % 16;
132 const int nChunks = nNGBTwords / 16 +
static_cast<int>(nNGBTwordsDiff > 0);
133 const auto lastChunk = nChunks - 1;
134 const uint16_t
mask = (0xffff << (16 - nNGBTwordsDiff)) | (0xffff * (nNGBTwordsDiff == 0));
// Byte offsets of the 16 words of a chunk. _mm512_set_epi32 lists the highest lane first, so
// lane 15 addresses offset 0 (first word) and lane 0 addresses offset 240 (last word).
// zmm_pos2 addresses the same words 6 bytes further in.
135 __m512i zmm_pos1 = _mm512_set_epi32(0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
136 __m512i zmm_pos2 = _mm512_set1_epi32(6);
137 zmm_pos2 = _mm512_add_epi32(zmm_pos1, zmm_pos2);
138 for (
int iChunk = 0; iChunk < nChunks; iChunk++) {
// All lanes are active except in the last chunk, where the tail mask applies.
139 __mmask16 mask16_MaxGBTwords = _mm512_int2mask((0xffff * (iChunk != lastChunk)) |
mask);
140 __m512i zmm_mask_zero = _mm512_setzero_epi32();
// Masked gathers of 32-bit values at byte offsets (scale 1); inactive lanes read as zero.
142 __m512i zmm_src_column0_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1,
src, 1);
143 __m512i zmm_src_column1_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos2,
src, 1);
// A lane is a header candidate when bits 28..31 of the second gather equal 0xf.
145 __m512i zmm_mask_localChID = _mm512_set1_epi32(0xf);
146 __m512i zmm_buf = _mm512_srai_epi32(zmm_src_column1_part1, 28);
147 __m512i zmm_ChID_column1 = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
149 __mmask16 mask16_header = _mm512_cmpeq_epi32_mask(zmm_ChID_column1, zmm_mask_localChID);
// Bits 24..27 give the number of words that follow the header (zeroed for non-header lanes).
151 zmm_buf = _mm512_srai_epi32(zmm_src_column1_part1, 24);
152 __m512i zmm_NGBTwords = _mm512_maskz_and_epi32(mask16_header, zmm_buf, zmm_mask_localChID);
// Only headers announcing at least one following word are kept.
155 __mmask16 mask16_header_final = _mm512_mask_cmpgt_epu32_mask(mask16_header, zmm_NGBTwords, zmm_mask_zero);
// BC of the event: low 12 bits of the first gather, for the kept header lanes only.
158 __m512i zmm_mask_time = _mm512_set1_epi32(0xfff);
159 __m512i zmm_bc = _mm512_mask_and_epi32(zmm_mask_zero, mask16_header_final, zmm_src_column0_part0, zmm_mask_time);
// Two channel slots per word, hence the initial estimate 2 * word count.
162 __m512i zmm_Nchannels = _mm512_slli_epi32(zmm_NGBTwords, 1);
// Locate the last word of each event (header offset + 16 * word count) and read 8 bytes into it.
// NOTE(review): this gather is unmasked, so every lane is read, including lanes beyond the payload
// tail - confirm the input buffer always has enough slack.
164 __m512i zmm_last_word_pos = zmm_NGBTwords;
165 zmm_last_word_pos = _mm512_slli_epi32(zmm_last_word_pos, 4);
166 zmm_last_word_pos = _mm512_add_epi32(zmm_last_word_pos, zmm_pos1);
167 zmm_buf = _mm512_i32gather_epi32(zmm_last_word_pos,
src + 8, 1);
168 zmm_buf = _mm512_srai_epi32(zmm_buf, 12);
169 __m512i zmm_ChID_last_column1 = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
// If that 4-bit field of the last word is zero, the estimate is reduced by one.
171 __mmask16 mask16_half_word = _mm512_cmpeq_epi32_mask(zmm_ChID_last_column1, zmm_mask_zero);
172 __m512i zmm_mask_one = _mm512_set1_epi32(1);
173 __m512i zmm_Nch = _mm512_mask_sub_epi32(zmm_Nchannels, mask16_half_word, zmm_Nchannels, zmm_mask_one);
// Read-modify-write of the per-BC counters: gather counter[bc], add, scatter back.
// NOTE(review): if two header lanes of one chunk carried the same BC, only one update would
// survive the scatter - presumably this cannot happen within a single link; TODO confirm.
174 __m512i zmm_nEventPerBC = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header_final, zmm_bc, buf_nChPerBC.data(), 4);
175 zmm_buf = _mm512_add_epi32(zmm_nEventPerBC, zmm_Nch);
176 _mm512_mask_i32scatter_epi32(buf_nChPerBC.data(), mask16_header_final, zmm_bc, zmm_buf, 4);
// Advance both offset vectors by one chunk (16 words * 16 bytes).
178 zmm_buf = _mm512_set1_epi32(256);
179 zmm_pos1 = _mm512_add_epi32(zmm_buf, zmm_pos1);
180 zmm_pos2 = _mm512_add_epi32(zmm_buf, zmm_pos2);
// Snapshot of the counters accumulated so far, stored as the entry of the NEXT link; the second
// pass reads it as that link's per-BC offset.
// Copies (sNBC + 4) * 4 bytes - presumably the size of one NChDataBC_t; TODO confirm.
184 if (iFeeID != sNlinksMax - 1) {
185 memcpy(bufBC[iFeeID + 1].
data(), buf_nChPerBC.data(), (sNBC + 4) * 4);
// Keep the per-link offsets of this orbit for the second pass.
190 memcpy(mPosChDataPerLinkOrbit[iOrbit].
data(), bufBC.data(), (sNBC + 4) * 4 * sNlinksMax);
// TCM part of the orbit: clear the trigger staging buffer, then unpack the TCM pages into it.
// NOTE(review): the size uses the literal 3564 where other code uses sNBC - confirm they are equal.
194 if (arrOrbitSizePagesTCM[iOrbit] > 0) {
195 memset(mVecTriggers.data(), 0, 16 * 3564);
196 const auto& nPagesTCM = arrRdhTCMperOrbit[iOrbit].size();
197 for (
int iPage = 0; iPage < nPagesTCM; iPage++) {
198 const auto& rdhPtr = arrRdhTCMperOrbit[iOrbit][iPage];
199 const auto& payload = arrDataTCMperOrbit[iOrbit][iPage].data();
200 const auto& payloadSize = arrDataTCMperOrbit[iOrbit][iPage].size();
// Same chunking and tail-mask scheme as for the other links.
202 const auto nNGBTwords = payloadSize / 16;
203 const int nNGBTwordsDiff = nNGBTwords % 16;
204 const int nChunks = nNGBTwords / 16 +
static_cast<int>(nNGBTwordsDiff > 0);
205 const auto lastChunk = nChunks - 1;
206 const uint16_t
mask = (0xffff << (16 - nNGBTwordsDiff)) | (0xffff * (nNGBTwordsDiff == 0));
207 __m512i zmm_pos1 = _mm512_set_epi32(0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
208 for (
int iChunk = 0; iChunk < nChunks; iChunk++) {
209 __mmask16 mask16_MaxGBTwords = _mm512_int2mask((0xffff * (iChunk != lastChunk)) |
mask);
210 __m512i zmm_mask_zero = _mm512_setzero_epi32();
// A word is a TCM header when the 32-bit value read 9 bytes into it equals 0xf1.
212 __m512i zmm_src_header = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1,
src + 9, 1);
214 __m512i zmm_mask_header = _mm512_set1_epi32(0xf1);
216 __mmask16 mask16_header = _mm512_cmpeq_epi32_mask(zmm_src_header, zmm_mask_header);
// BC: low 12 bits of the word, kept for header lanes only.
218 __m512i zmm_bc = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1,
src, 1);
219 __m512i zmm_mask_12bit = _mm512_set1_epi32(0xfff);
221 zmm_bc = _mm512_maskz_and_epi32(mask16_header, zmm_mask_12bit, zmm_bc);
// The payload word is the one right after the header (+16 bytes); it is read at byte offsets 0, 3 and 7.
223 __m512i zmm_pos2 = _mm512_set1_epi32(16);
224 zmm_pos2 = _mm512_maskz_add_epi32(mask16_header, zmm_pos1, zmm_pos2);
226 __m512i zmm_src_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header, zmm_pos2,
src, 1);
227 __m512i zmm_src_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header, zmm_pos2,
src + 3, 1);
228 __m512i zmm_src_part2 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header, zmm_pos2,
src + 7, 1);
// Output word 0: low 24 bits of the read at offset 0.
230 __m512i zmm_mask_3byte = _mm512_set1_epi32(0xffffff);
231 __m512i zmm_dst_part0 = _mm512_and_epi32(zmm_src_part0, zmm_mask_3byte);
// Output word 1: low 17 bits of the read at offset 3.
233 __m512i zmm_mask_17bit = _mm512_set1_epi32(0x1ffff);
234 __m512i zmm_dst_part1 = _mm512_and_epi32(zmm_src_part1, zmm_mask_17bit);
// Output word 2: bits 18.. of the offset-3 read (masked to 14 bits), with bits 11..13 taken from
// the offset-7 read shifted left by 14.
236 __m512i zmm_dst_part2 = _mm512_srai_epi32(zmm_src_part1, 18);
237 __m512i zmm_mask_14bit = _mm512_set1_epi32(0b11111111111111);
238 zmm_dst_part2 = _mm512_and_epi32(zmm_mask_14bit, zmm_dst_part2);
240 __m512i zmm_buf = _mm512_slli_epi32(zmm_src_part2, 14);
241 __m512i zmm_mask_3bit = _mm512_set1_epi32(0b11100000000000);
242 zmm_buf = _mm512_and_epi32(zmm_mask_3bit, zmm_buf);
243 zmm_dst_part2 = _mm512_or_epi32(zmm_dst_part2, zmm_buf);
// Output word 3: two 9-bit fields of the offset-7 read, placed in bits 0..8 and 16..24.
245 __m512i zmm_dst_part3 = _mm512_srai_epi32(zmm_src_part2, 4);
246 __m512i zmm_mask_9bit = _mm512_set1_epi32(0x1ff);
247 zmm_dst_part3 = _mm512_and_epi32(zmm_dst_part3, zmm_mask_9bit);
249 zmm_buf = _mm512_slli_epi32(zmm_src_part2, 2);
250 __m512i zmm_mask_9bit_2 = _mm512_set1_epi32(0x1ff0000);
251 zmm_buf = _mm512_and_epi32(zmm_buf, zmm_mask_9bit_2);
253 zmm_dst_part3 = _mm512_or_epi32(zmm_buf, zmm_dst_part3);
// Store the four words as one 16-byte record at byte offset bc * 16 of the staging buffer.
255 __m512i zmm_dst_pos = _mm512_slli_epi32(zmm_bc, 4);
257 _mm512_mask_i32scatter_epi32(ptrDstTCM, mask16_header, zmm_dst_pos, zmm_dst_part0, 1);
258 _mm512_mask_i32scatter_epi32(ptrDstTCM + 4, mask16_header, zmm_dst_pos, zmm_dst_part1, 1);
259 _mm512_mask_i32scatter_epi32(ptrDstTCM + 8, mask16_header, zmm_dst_pos, zmm_dst_part2, 1);
260 _mm512_mask_i32scatter_epi32(ptrDstTCM + 12, mask16_header, zmm_dst_pos, zmm_dst_part3, 1);
262 zmm_buf = _mm512_set1_epi32(256);
263 zmm_pos1 = _mm512_add_epi32(zmm_buf, zmm_pos1);
// Per-BC prefix sum of the channel counts, plus emission of one record per non-empty BC.
267 NChDataBC_t buf_nPosPerBC{};
268 uint64_t nChPerBC{0};
269 uint64_t nEventOrbit{0};
270 posChDataPerOrbit[iOrbit] = chPosOrbit;
// zmm_mask_seq2: lane i holds i. zmm_pos2: lane i holds 5 * i; combined with the scatter scale 8
// below this gives a 40-byte stride between records - presumably the digit size; TODO confirm.
272 __m512i zmm_mask_seq2 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
273 __m512i zmm_pos2 = _mm512_set_epi32(75, 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5, 0);
274 for (
int iBC = 0; iBC < sNBC; iBC += 16) {
275 uint32_t buf0[16], buf1[16], buf2[16], buf3[16];
// Load 16 per-BC counts; the two lines that follow are the body of a macro (its header and its
// uses are not visible here) doing one step of an exclusive prefix sum for element iBC + N.
277 __m512i zmm_nChPerBC = _mm512_loadu_si512(&buf_nChPerBC[iBC]);
280 buf_nPosPerBC[iBC + N] = nChPerBC; \
281 nChPerBC += buf_nChPerBC[iBC + N];
// Lanes (BCs) with a non-zero channel count produce a record. The expand gives the k-th such
// lane the k-th slot offset (5 * k); the popcount is the number of records in this group.
302 __m512i zmm_mask_zero = _mm512_setzero_epi32();
303 __mmask16 mask16_bc = _mm512_cmpneq_epi32_mask(zmm_nChPerBC, zmm_mask_zero);
304 __m512i zmm_pos3 = _mm512_maskz_expand_epi32(mask16_bc, zmm_pos2);
305 const auto nEvents = _mm_popcnt_u32(_cvtmask16_u32(mask16_bc));
// First-channel position = prefix sum within the orbit + channels of all previous orbits.
// Note that chPosOrbit (uint64_t) is narrowed to 32 bits by _mm512_set1_epi32.
307 __m512i zmm_pos = _mm512_loadu_si512(&buf_nPosPerBC[iBC]);
308 __m512i zmm_pos_per_orbit = _mm512_set1_epi32(chPosOrbit);
309 zmm_pos = _mm512_add_epi32(zmm_pos, zmm_pos_per_orbit);
// BC number of each lane (iBC + lane) and the orbit value broadcast to all lanes.
310 __m512i zmm_bc = _mm512_set1_epi32(iBC);
311 zmm_bc = _mm512_add_epi32(zmm_bc, zmm_mask_seq2);
312 __m512i zmm_orbit = _mm512_set1_epi32(
orbit);
// Record fields written here: +0 first-channel position, +4 channel count, +28 BC, +32 orbit.
// `dst` is defined on a line not visible in this listing.
314 _mm512_mask_i32scatter_epi32(
dst, mask16_bc, zmm_pos3, zmm_pos, 8);
315 _mm512_mask_i32scatter_epi32(
dst + 4, mask16_bc, zmm_pos3, zmm_nChPerBC, 8);
316 _mm512_mask_i32scatter_epi32(
dst + 28, mask16_bc, zmm_pos3, zmm_bc, 8);
317 _mm512_mask_i32scatter_epi32(
dst + 32, mask16_bc, zmm_pos3, zmm_orbit, 8);
// Fetch the 16-byte TCM record staged for each BC and copy it into record bytes +8 .. +23.
319 __m512i zmm_src_pos = _mm512_slli_epi32(zmm_bc, 4);
321 __m512i zmm_dst_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM, 1);
322 __m512i zmm_dst_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM + 4, 1);
323 __m512i zmm_dst_part2 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM + 8, 1);
324 __m512i zmm_dst_part3 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM + 12, 1);
326 _mm512_mask_i32scatter_epi32(
dst + 8, mask16_bc, zmm_pos3, zmm_dst_part0, 8);
327 _mm512_mask_i32scatter_epi32(
dst + 12, mask16_bc, zmm_pos3, zmm_dst_part1, 8);
328 _mm512_mask_i32scatter_epi32(
dst + 16, mask16_bc, zmm_pos3, zmm_dst_part2, 8);
329 _mm512_mask_i32scatter_epi32(
dst + 20, mask16_bc, zmm_pos3, zmm_dst_part3, 8);
// Close the orbit: advance the global channel/event positions and keep the per-BC prefix sums.
// The update of nEventOrbit is not visible in this listing.
333 const uint64_t buf_nChDataPerOrbit = nChPerBC;
334 chPosOrbit += nChPerBC;
336 nChDataPerOrbit[iOrbit] = nChPerBC;
337 eventPosPerOrbit += nEventOrbit;
338 memcpy(posChDataPerBC[iOrbit].
data(), buf_nPosPerBC.data(), (sNBC + 4) * 4);
// Totals are known now: size the output vectors.
342 mVecDigits.resize(eventPosPerOrbit);
343 mVecChannelData.resize(chPosOrbit);
// ---- Second pass: unpack the channel words of every orbit that has data. ----
345 for (
int iOrbit = 0; iOrbit < sNorbits; iOrbit++) {
346 if (!arrOrbitSizePages[iOrbit]) {
349 const auto&
orbit = arrOrbit[iOrbit];
350 const auto& posChDataOrbit = posChDataPerOrbit[iOrbit];
351 const auto& ptrPosChDataPerBC = posChDataPerBC[iOrbit].data();
// Channels of one orbit are first written to the staging buffer mVecChannelDataBuf.
352 void* ptrDst = (
void*)mVecChannelDataBuf.data();
353 for (
int iLink = 0; iLink < sNlinksMax; iLink++) {
354 if (iLink == mFEEID_TCM) {
357 const auto& nPages = arrRdhPtrPerOrbit[iOrbit][iLink].size();
358 const auto& ptrChDataPosPerLinks = mPosChDataPerLinkOrbit[iOrbit][iLink].data();
// 64 bytes (16 x 32-bit entries) of this link's row of mLUT, used to map the 4-bit local
// channel id to the value stored in the output.
359 void const* lutPerLink = &mLUT[iLink][0];
360 __m512i zmm_lut = _mm512_loadu_si512((
void const*)lutPerLink);
361 for (
int iPage = 0; iPage < nPages; iPage++) {
362 const auto& rdhPtr = arrRdhPtrPerOrbit[iOrbit][iLink][iPage];
363 const auto& payload = arrDataPerOrbit[iOrbit][iLink][iPage].data();
364 const auto& payloadSize = arrDataPerOrbit[iOrbit][iLink][iPage].size();
// Same chunking and tail-mask scheme as in the first pass.
366 const auto nNGBTwords = payloadSize / 16;
367 const int nNGBTwordsDiff = nNGBTwords % 16;
368 const int nChunks = nNGBTwords / 16 +
static_cast<int>(nNGBTwordsDiff > 0);
369 const auto lastChunk = nChunks - 1;
370 const uint16_t
mask = (0xffff << (16 - nNGBTwordsDiff)) | (0xffff * (nNGBTwordsDiff == 0));
// State carried from one chunk to the next for an event that spans a chunk boundary
// (`firstBC` and `nGBTwordPrevChunk` belong to it but are declared on lines not visible here).
374 uint8_t nGBTwordPrevChunkDiff{0};
375 bool firstWordIsNotHeader{
false};
// Lane 15 addresses the first word of the chunk (offset 0), lane 0 the last one (offset 240).
377 __m512i zmm_pos1 = _mm512_set_epi32(0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
378 for (
int iChunk = 0; iChunk < nChunks; iChunk++) {
// Field masks in OUTPUT position: charge in bits 16..28, PM bits in bits 8..15.
379 __m512i zmm_mask_charge = _mm512_set1_epi32(0x1fff0000);
380 __m512i zmm_mask_PMbits = _mm512_set1_epi32(0xff00);
381 __m512i zmm_mask_localChID = _mm512_set1_epi32(0xf);
382 __m512i zmm_mask_time = _mm512_set1_epi32(0xfff);
384 __m512i zmm_buf, zmm_buf2, zmm_buf3;
386 __mmask16 mask16_MaxGBTwords = _mm512_int2mask((0xffff * (iChunk != lastChunk)) |
mask);
387 __m512i zmm_mask_zero = _mm512_setzero_epi32();
// Three overlapping 32-bit reads of each word, at byte offsets 0, 3 and 6.
389 __m512i zmm_src_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1,
src, 1);
390 __m512i zmm_src_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1,
src + 3, 1);
391 __m512i zmm_src_part2 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1,
src + 6, 1);
// Column 0: time = low 12 bits of part0; charge = part0 shifted left by 4 into bits 16..28.
394 __m512i zmm_src_column0_time = _mm512_and_epi32(zmm_src_part0, zmm_mask_time);
396 zmm_buf = _mm512_slli_epi32(zmm_src_part0, 4);
397 __m512i zmm_src_column0_charge = _mm512_and_epi32(zmm_buf, zmm_mask_charge);
398 __m512i zmm_dst_column0_part1 = _mm512_or_epi32(zmm_src_column0_time, zmm_src_column0_charge);
// Column 0: PM bits = part1 shifted left by 7 into bits 8..15.
400 zmm_buf = _mm512_slli_epi32(zmm_src_part1, 7);
401 __m512i zmm_src_column0_PMbits = _mm512_and_epi32(zmm_buf, zmm_mask_PMbits);
// Column 0: local channel id = bits 12..15 of part1, translated through the link's LUT.
403 zmm_buf = _mm512_srai_epi32(zmm_src_part1, 12);
404 __m512i zmm_dst_column0_chID = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
405 __m512i zmm_dst_column0_globalChID = _mm512_permutexvar_epi32(zmm_dst_column0_chID, zmm_lut);
406 __m512i zmm_dst_column0_part0 = _mm512_or_epi32(zmm_dst_column0_globalChID, zmm_src_column0_PMbits);
// Column 1: time = bits 16..27 of part1.
409 zmm_buf = _mm512_srai_epi32(zmm_src_part1, 16);
410 __m512i zmm_src_column1_time = _mm512_and_epi32(zmm_buf, zmm_mask_time);
// Column 1: charge = part2 shifted left by 12 into bits 16..28.
412 zmm_buf = _mm512_slli_epi32(zmm_src_part2, 12);
413 __m512i zmm_src_column1_charge = _mm512_and_epi32(zmm_buf, zmm_mask_charge);
414 __m512i zmm_dst_column1_part1 = _mm512_or_epi32(zmm_src_column1_time, zmm_src_column1_charge);
// Column 1: PM bits = part2 shifted right by 9 into bits 8..15.
417 zmm_buf = _mm512_srai_epi32(zmm_src_part2, 9);
418 __m512i zmm_src_column1_PMbits = _mm512_and_epi32(zmm_buf, zmm_mask_PMbits);
// Column 1: local channel id = bits 28..31 of part2, translated through the link's LUT.
420 zmm_buf = _mm512_srai_epi32(zmm_src_part2, 28);
421 __m512i zmm_dst_column1_chID = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
422 __m512i zmm_dst_column1_globalChID = _mm512_permutexvar_epi32(zmm_dst_column1_chID, zmm_lut);
423 __m512i zmm_dst_column1_part0 = _mm512_or_epi32(zmm_dst_column1_globalChID, zmm_src_column1_PMbits);
// Header detection as in the first pass: column-1 id field == 0xf and a non-zero word count
// in bits 24..27 of part2. Every other lane is a data candidate.
426 __mmask16 mask16_header = _mm512_cmpeq_epi32_mask(zmm_dst_column1_chID, zmm_mask_localChID);
427 zmm_buf = _mm512_srai_epi32(zmm_src_part2, 24);
428 __m512i zmm_NGBTwords = _mm512_maskz_and_epi32(mask16_header, zmm_buf, zmm_mask_localChID);
429 __mmask16 mask16_header_final = _mm512_mask_cmpgt_epu32_mask(mask16_header, zmm_NGBTwords, zmm_mask_zero);
430 __mmask16 mask16_data = _mm512_knot(mask16_header_final);
// A column holds channel data only when its local channel id is in 1..12.
433 __m512i zmm_mask_maxLocalChID = _mm512_set1_epi32(12);
434 __mmask16 mask16_nonzeroChID = _mm512_mask_cmpneq_epi32_mask(mask16_data, zmm_dst_column1_chID, zmm_mask_zero);
435 __mmask16 mask16_column1_isData = _mm512_mask_cmple_epi32_mask(mask16_nonzeroChID, zmm_dst_column1_chID, zmm_mask_maxLocalChID);
437 mask16_nonzeroChID = _mm512_mask_cmpneq_epi32_mask(mask16_data, zmm_dst_column0_chID, zmm_mask_zero);
438 __mmask16 mask16_column0_isData = _mm512_mask_cmple_epi32_mask(mask16_nonzeroChID, zmm_dst_column0_chID, zmm_mask_maxLocalChID);
// BC for header lanes (low 12 bits of part0), zero elsewhere.
441 __m512i zmm_bc = _mm512_mask_and_epi32(zmm_mask_zero, mask16_header_final, zmm_src_part0, zmm_mask_time);
443 __m512i zmm_mask_seq2 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
// Expanding 0,1,2,... under a mask gives each set lane its rank among the set lanes.
// rank among (header | data) - rank among (data) = number of header lanes below the data lane;
// it is used further down as an index into the compressed per-header vectors.
445 __m512i zmm_column0_data_seq = _mm512_maskz_expand_epi32(mask16_column0_isData, zmm_mask_seq2);
446 __mmask16 mask16_buf = _mm512_kor(mask16_header_final, mask16_column0_isData);
447 __m512i zmm_column0_data2header = _mm512_maskz_expand_epi32(mask16_buf, zmm_mask_seq2);
449 __m512i zmm_column1_data_seq = _mm512_maskz_expand_epi32(mask16_column1_isData, zmm_mask_seq2);
450 mask16_buf = _mm512_kor(mask16_header_final, mask16_column1_isData);
451 __m512i zmm_column1_data2header = _mm512_maskz_expand_epi32(mask16_buf, zmm_mask_seq2);
453 zmm_column0_data2header = _mm512_maskz_sub_epi32(mask16_column0_isData, zmm_column0_data2header, zmm_column0_data_seq);
454 zmm_column1_data2header = _mm512_maskz_sub_epi32(mask16_column1_isData, zmm_column1_data2header, zmm_column1_data_seq);
// Carry-over from the previous chunk: when flagged, lane 15 (the first word of this chunk) is
// turned into a synthetic header that carries the previous header's BC and word counters.
456 __m512i zmm_column0_NGBTwords = _mm512_setzero_epi32();
457 __m512i zmm_column1_NGBTwords = zmm_NGBTwords;
458 __mmask16 mask16_header_first = _mm512_int2mask(firstWordIsNotHeader * 0b1000000000000000);
459 zmm_bc = _mm512_mask_set1_epi32(zmm_bc, mask16_header_first, firstBC);
461 zmm_column0_NGBTwords = _mm512_mask_set1_epi32(zmm_column0_NGBTwords, mask16_header_first, nGBTwordPrevChunkDiff);
462 zmm_column1_NGBTwords = _mm512_mask_set1_epi32(zmm_column1_NGBTwords, mask16_header_first, nGBTwordPrevChunk + nGBTwordPrevChunkDiff);
463 mask16_header_final = _mm512_kor(mask16_header_first, mask16_header_final);
464 uint32_t bufBC[16]{}, bufNGBTwords[16]{};
// Pack the header BCs into the low lanes, then look up for each header the sum of its
// per-link offset and its per-BC offset (both tables come from the first pass).
466 zmm_buf = _mm512_maskz_compress_epi32(mask16_header_final, zmm_bc);
467 zmm_buf2 = _mm512_i32gather_epi32(zmm_buf, ptrChDataPosPerLinks, 4);
468 zmm_buf3 = _mm512_i32gather_epi32(zmm_buf, ptrPosChDataPerBC, 4);
469 zmm_buf2 = _mm512_add_epi32(zmm_buf2, zmm_buf3);
// Give every data lane the base position of the header it was associated with above.
471 __m512i zmm_column0_pos = _mm512_permutexvar_epi32(zmm_column0_data2header, zmm_buf2);
472 __m512i zmm_column1_pos = _mm512_permutexvar_epi32(zmm_column1_data2header, zmm_buf2);
// Constants for the in-event position and for the byte addressing (6-byte records, +2 for part1).
475 __m512i zmm_mask_seq3 = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
476 __m512i zmm_6 = _mm512_set1_epi32(6);
477 __m512i zmm_2 = _mm512_set1_epi32(2);
// Column 0: position within the event = (header lane + column-0 word counter) - (own lane + 1),
// added to the base position; then converted to byte offsets.
479 zmm_buf2 = _mm512_add_epi32(zmm_mask_seq2, zmm_column0_NGBTwords);
480 zmm_buf = _mm512_maskz_compress_epi32(mask16_header_final, zmm_buf2);
482 zmm_buf2 = _mm512_permutexvar_epi32(zmm_column0_data2header, zmm_buf);
483 zmm_buf = _mm512_maskz_sub_epi32(mask16_column0_isData, zmm_buf2, zmm_mask_seq3);
485 zmm_column0_pos = _mm512_maskz_add_epi32(mask16_column0_isData, zmm_column0_pos, zmm_buf);
487 __m512i zmm_column0_part0_pos = _mm512_maskz_mullo_epi32(mask16_column0_isData, zmm_column0_pos, zmm_6);
488 __m512i zmm_column0_part1_pos = _mm512_maskz_add_epi32(mask16_column0_isData, zmm_column0_part0_pos, zmm_2);
// Column 1: same computation with the column-1 word counter.
491 zmm_buf2 = _mm512_add_epi32(zmm_mask_seq2, zmm_column1_NGBTwords);
492 zmm_buf = _mm512_maskz_compress_epi32(mask16_header_final, zmm_buf2);
494 zmm_buf2 = _mm512_permutexvar_epi32(zmm_column1_data2header, zmm_buf);
495 zmm_buf = _mm512_maskz_sub_epi32(mask16_column1_isData, zmm_buf2, zmm_mask_seq3);
497 zmm_column1_pos = _mm512_maskz_add_epi32(mask16_column1_isData, zmm_column1_pos, zmm_buf);
498 __m512i zmm_column1_part0_pos = _mm512_maskz_mullo_epi32(mask16_column1_isData, zmm_column1_pos, zmm_6);
499 __m512i zmm_column1_part1_pos = _mm512_maskz_add_epi32(mask16_column1_isData, zmm_column1_part0_pos, zmm_2);
// Each record is written with two 32-bit stores, at byte 0 (part0) and byte 2 (part1) of its
// 6-byte slot. The part1 store overwrites the upper half of the part0 store, so both part0
// scatters must be issued before the part1 scatters.
501 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column0_isData, zmm_column0_part0_pos, zmm_dst_column0_part0, 1);
502 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column1_isData, zmm_column1_part0_pos, zmm_dst_column1_part0, 1);
504 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column0_isData, zmm_column0_part1_pos, zmm_dst_column0_part1, 1);
505 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column1_isData, zmm_column1_part1_pos, zmm_dst_column1_part1, 1);
// Prepare the carry-over for the next chunk from the lowest set header lane, i.e. the header
// located last in memory within this chunk (lane 0 = offset 240).
// NOTE(review): __builtin_ctz(0) has an undefined result and is evaluated here even when
// header32 == 0; the multiplication only discards the value afterwards - consider a conditional.
508 _mm512_storeu_si512(bufBC, zmm_bc);
509 _mm512_storeu_si512(bufNGBTwords, zmm_NGBTwords);
511 const uint32_t header32 = _cvtmask16_u32(mask16_header_final);
512 const uint16_t lastHeaderPos = (__builtin_ctz(header32)) * (header32 > 0);
513 firstBC = bufBC[lastHeaderPos];
514 nGBTwordPrevChunk = bufNGBTwords[lastHeaderPos];
515 nGBTwordPrevChunkDiff = lastHeaderPos;
// The event continues into the next chunk unless its word count equals the number of lanes
// below the header lane.
516 firstWordIsNotHeader = nGBTwordPrevChunk != lastHeaderPos;
517 nGBTwordPrevChunkDiff++;
// Advance to the next chunk (16 words * 16 bytes).
519 zmm_buf = _mm512_set1_epi32(256);
520 zmm_pos1 = _mm512_add_epi32(zmm_buf, zmm_pos1);
// Copy this orbit's channels (6 bytes each) from the staging buffer to their final position.
524 memcpy(&mVecChannelData[posChDataOrbit], mVecChannelDataBuf.data(), 6 * nChDataPerOrbit[iOrbit]);
// Empty-TF protection: with no digits decoded, a default-constructed channel entry is appended
// (the other statements of this branch are not visible in this listing).
526 if (mEnableEmptyTFprotection && mVecDigits.size() == 0) {
529 mVecChannelData.emplace_back();
// Report the elapsed decoding time in microseconds at debug level.
533 auto t2 = std::chrono::high_resolution_clock::now();
534 auto delay = std::chrono::duration_cast<std::chrono::microseconds>(t2 -
t1);
535 LOG(
debug) <<
"Decoder delay: " << delay.count();
static const VerbosityConfig & Instance()
GLint GLint GLint GLint GLint GLint GLint GLbitfield GLenum filter
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
constexpr o2::header::DataDescription gDataDescriptionRawData
constexpr o2::header::DataOrigin gDataOriginFT0
uint8_t itsSharedClusterMap uint8_t
struct o2::upgrades_utils::@462 ft0
structure to keep V0C information
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
static o2::header::DataHeader::PayloadSizeType getPayloadSize(const DataRef &ref)
static constexpr int getVersion()
get numeric version of the RDH
LOG(info)<< "Compressed in "<< sw.CpuTime()<< " s"