Project
Loading...
Searching...
No Matches
FT0DataDecoderDPLSpec.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
13
16#include <numeric>
17#include <emmintrin.h>
18#include <immintrin.h>
19#include <algorithm>
20#include <cstdlib>
21#include <cstring>
22#include <string>
23namespace o2
24{
25namespace ft0
26{
27
// Decodes the FT0 raw-data pages found in the processing context and publishes two outputs:
// "DIGITSBC" (from mVecDigits) and "DIGITSCH" (from mVecChannelData).
// Stages visible in this function:
//  1. send empty outputs and return on a 0xDEADBEEF input with zero payload, or on an unsupported RDH version;
//  2. group all non-empty pages by orbit slot (orbit % 256) and FEE ID, keeping the TCM link separate;
//  3. per orbit slot: count channels per BC for every link, unpack the TCM words, and fill one digit entry per non-empty BC;
//  4. per orbit slot: unpack the channel words of every page and place them at their final position in mVecChannelData.
// \param pc processing context that provides the inputs and receives the output snapshots
28void FT0DataDecoderDPLSpec::run(ProcessingContext& pc)
29{
30 auto t1 = std::chrono::high_resolution_clock::now();
 // Helper that publishes both outputs with zero elements; it is called on every early-return path below.
31 auto dummyOutput = [&pc, this]() {
32 this->mVecDigits.resize(0);
33 pc.outputs().snapshot(o2::framework::Output{o2::header::gDataOriginFT0, "DIGITSBC", 0}, mVecDigits);
34 this->mVecChannelData.resize(0);
35 pc.outputs().snapshot(o2::framework::Output{o2::header::gDataOriginFT0, "DIGITSCH", 0}, mVecChannelData);
36 };
37
38 // if we see requested data type input with 0xDEADBEEF subspec and 0 payload this means that the "delayed message"
39 // mechanism created it in absence of real data from upstream. Processor should send empty output to not block the workflow
40 {
41 static size_t contDeadBeef = 0; // number of times 0xDEADBEEF was seen continuously
42 std::vector<InputSpec> dummy{InputSpec{"dummy", ConcreteDataMatcher{"FT0", o2::header::gDataDescriptionRawData, 0xDEADBEEF}}};
43 for (const auto& ref : InputRecordWalker(pc.inputs(), dummy)) {
44 const auto dh = o2::framework::DataRefUtils::getHeader<o2::header::DataHeader*>(ref);
45 auto payloadSize = DataRefUtils::getPayloadSize(ref);
46 if (payloadSize == 0) {
 // NOTE(review): maxWarn has no declaration in the visible lines (the listing jumps from 46 to 48) - confirm where it is defined.
 // The alarm is logged only for the first maxWarn consecutive occurrences; the last of them announces that reporting stops.
48 if (++contDeadBeef <= maxWarn) {
49 LOGP(alarm, "Found input [{}/{}/{:#x}] TF#{} 1st_orbit:{} Payload {} : assuming no payload for all links in this TF{}",
50 dh->dataOrigin.str, dh->dataDescription.str, dh->subSpecification, dh->tfCounter, dh->firstTForbit, payloadSize,
51 contDeadBeef == maxWarn ? fmt::format(". {} such inputs in row received, stopping reporting", contDeadBeef) : "");
52 }
53 dummyOutput();
54 return;
55 }
56 }
57 contDeadBeef = 0; // if good data, reset the counter
58 }
 // Select every FT0 raw-data input with Timeframe lifetime and walk over its pages.
59 std::vector<InputSpec> filter{InputSpec{"filter", ConcreteDataTypeMatcher{"FT0", o2::header::gDataDescriptionRawData}, Lifetime::Timeframe}};
60 DPLRawParser parser(pc.inputs(), filter);
61
 // Bookkeeping per orbit slot: RDH pointers and payload spans per link, the same for the TCM link,
 // the accumulated payload size (non-TCM and TCM) and the full orbit value seen for that slot.
62 using ArrRdhPtrPerLink = std::array<std::vector<const o2::header::RAWDataHeader*>, sNlinksMax>;
63 using ArrDataPerLink = std::array<std::vector<gsl::span<const uint8_t>>, sNlinksMax>;
64 std::array<ArrRdhPtrPerLink, sNorbits> arrRdhPtrPerOrbit{};
65 std::array<ArrDataPerLink, sNorbits> arrDataPerOrbit{};
66 std::array<std::vector<const o2::header::RAWDataHeader*>, sNorbits> arrRdhTCMperOrbit{};
67 std::array<std::vector<gsl::span<const uint8_t>>, sNorbits> arrDataTCMperOrbit{};
68 std::array<std::size_t, sNorbits> arrOrbitSizePages{};
69 std::array<std::size_t, sNorbits> arrOrbitSizePagesTCM{};
70 std::array<uint64_t, sNorbits> arrOrbit{};
71
72 for (auto it = parser.begin(), end = parser.end(); it != end; ++it) {
73 // Aggregating pages by orbit and FeeID
74 if (!it.size()) {
75 continue; // excluding pages without payload
76 }
77 auto* rdhPtr = reinterpret_cast<const o2::header::RDHAny*>(it.raw());
 // Abandon the whole TF (empty outputs) if the RDH version is below 5, above the highest known version, or cannot be read.
78 try {
79 int verRDH = o2::raw::RDHUtils::getVersion(rdhPtr);
80 if (verRDH < 5 || verRDH > o2::raw::RDHUtils::getVersion<o2::header::RDHHighest>()) {
81 LOGP(alarm, "Invalid RDH version {}, abandoning TF sending dummy output", verRDH);
82 dummyOutput();
83 return;
84 }
85 } catch (std::exception& e) {
86 LOG(alarm) << "Failed to extract RDH, abandoning TF sending dummy output, exception was: " << e.what();
87 dummyOutput();
88 return;
89 }
90 auto orb = o2::raw::RDHUtils::getHeartBeatOrbit(rdhPtr);
 // The orbit slot is the heartbeat orbit modulo 256.
 // NOTE(review): assumes sNorbits >= 256 so that orbitTF is always a valid index - confirm.
91 const uint16_t orbitTF = (orb) % 256;
92 // const uint16_t feeID=rdhPtr->feeId;
93 arrOrbit[orbitTF] = orb;
94 const auto& linkID = o2::raw::RDHUtils::getLinkID(rdhPtr);
95 const auto& endPoint = o2::raw::RDHUtils::getEndPointID(rdhPtr);
 // NOTE(review): feeID indexes the per-link arrays (size sNlinksMax) without a range check -
 // confirm that linkID + 12 * endPoint < sNlinksMax always holds for the incoming data.
96 const uint16_t feeID = linkID + 12 * endPoint;
97 if (feeID == mFEEID_TCM) {
98 // Iterator is noncopyable, preparing RDH pointers and span objects
99 arrOrbitSizePagesTCM[orbitTF] += it.size();
100 arrRdhTCMperOrbit[orbitTF].push_back(reinterpret_cast<const o2::header::RAWDataHeader*>(rdhPtr));
101 arrDataTCMperOrbit[orbitTF].emplace_back(it.data(), it.size());
102 } else {
103 arrOrbitSizePages[orbitTF] += it.size();
104 arrRdhPtrPerOrbit[orbitTF][feeID].push_back(reinterpret_cast<const o2::header::RAWDataHeader*>(rdhPtr));
105 arrDataPerOrbit[orbitTF][feeID].emplace_back(it.data(), it.size());
106 }
107 }
 // Running totals over all orbit slots: channel entries counted so far and digit entries written so far.
108 uint64_t chPosOrbit{0};
109 uint64_t eventPosPerOrbit{0};
110
111 uint64_t posChDataPerOrbit[sNorbits]{}; // position per orbit
112 uint64_t nChDataPerOrbit[sNorbits]{}; // number of events per orbit
113 NChDataBC_t posChDataPerBC[sNorbits]{};
 // First pass over the orbit slots: count channels per BC, unpack TCM words and fill the digit entries.
114 for (int iOrbit = 0; iOrbit < sNorbits; iOrbit++) {
115 const auto& orbit = arrOrbit[iOrbit];
116 NChDataOrbitBC_t bufBC{};
117 NChDataBC_t buf_nChPerBC{};
118 if (arrOrbitSizePages[iOrbit] > 0) {
119 for (int iFeeID = 0; iFeeID < sNlinksMax; iFeeID++) {
120 if (iFeeID == mFEEID_TCM) {
121 continue;
122 }
123 const auto& nPages = arrRdhPtrPerOrbit[iOrbit][iFeeID].size();
124
125 for (int iPage = 0; iPage < nPages; iPage++) {
 // rdhPtr is fetched here but not used in this loop body.
126 const auto& rdhPtr = arrRdhPtrPerOrbit[iOrbit][iFeeID][iPage];
127 const auto& payload = arrDataPerOrbit[iOrbit][iFeeID][iPage].data();
128 const auto& payloadSize = arrDataPerOrbit[iOrbit][iFeeID][iPage].size();
129 const uint8_t* src = (uint8_t*)payload;
 // The payload is handled as 16-byte words, in chunks of up to 16 words (one word per 32-bit lane).
130 const auto nNGBTwords = payloadSize / 16;
131 const int nNGBTwordsDiff = nNGBTwords % 16;
132 const int nChunks = nNGBTwords / 16 + static_cast<int>(nNGBTwordsDiff > 0);
133 const auto lastChunk = nChunks - 1;
 // Lane mask for the last chunk: _mm512_set_epi32 below puts byte offset 0 into the highest lane, so the
 // nNGBTwordsDiff remaining words occupy the upper lanes; all lanes are enabled when the count is a multiple of 16.
134 const uint16_t mask = (0xffff << (16 - nNGBTwordsDiff)) | (0xffff * (nNGBTwordsDiff == 0));
 // zmm_pos1: byte offset of every word of the chunk; zmm_pos2: the same offsets + 6.
135 __m512i zmm_pos1 = _mm512_set_epi32(0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
136 __m512i zmm_pos2 = _mm512_set1_epi32(6);
137 zmm_pos2 = _mm512_add_epi32(zmm_pos1, zmm_pos2);
138 for (int iChunk = 0; iChunk < nChunks; iChunk++) {
 // Every chunk except the last one uses all 16 lanes.
139 __mmask16 mask16_MaxGBTwords = _mm512_int2mask((0xffff * (iChunk != lastChunk)) | mask);
140 __m512i zmm_mask_zero = _mm512_setzero_epi32();
141
142 __m512i zmm_src_column0_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1, src, 1);
143 __m512i zmm_src_column1_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos2, src, 1);
144 // ChID column(contains ChID in data and descriptor in header)
145 __m512i zmm_mask_localChID = _mm512_set1_epi32(0xf);
146 __m512i zmm_buf = _mm512_srai_epi32(zmm_src_column1_part1, 28);
147 __m512i zmm_ChID_column1 = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
148 // Header
 // A word is treated as a header when this 4-bit field equals 0xf.
149 __mmask16 mask16_header = _mm512_cmpeq_epi32_mask(zmm_ChID_column1, zmm_mask_localChID);
150 // NGBTwords column
151 zmm_buf = _mm512_srai_epi32(zmm_src_column1_part1, 24);
152 __m512i zmm_NGBTwords = _mm512_maskz_and_epi32(mask16_header, zmm_buf, zmm_mask_localChID);
153
154 // Checking for empty events which contains only header(NGBTwords=0), and getting last header position within chunk
155 __mmask16 mask16_header_final = _mm512_mask_cmpgt_epu32_mask(mask16_header, zmm_NGBTwords, zmm_mask_zero);
156
157 // BC
158 __m512i zmm_mask_time = _mm512_set1_epi32(0xfff);
159 __m512i zmm_bc = _mm512_mask_and_epi32(zmm_mask_zero, mask16_header_final, zmm_src_column0_part0, zmm_mask_time);
160
161 // Estimation for number of channels
162 __m512i zmm_Nchannels = _mm512_slli_epi32(zmm_NGBTwords, 1); // multiply by 2
163
164 __m512i zmm_last_word_pos = zmm_NGBTwords;
165 zmm_last_word_pos = _mm512_slli_epi32(zmm_last_word_pos, 4); // multiply by 16, byte position for last word
166 zmm_last_word_pos = _mm512_add_epi32(zmm_last_word_pos, zmm_pos1);
 // NOTE(review): this gather is unmasked, so lanes that are not headers (and lanes beyond the payload in the
 // last chunk) are read as well - confirm the bytes after the payload are always readable.
167 zmm_buf = _mm512_i32gather_epi32(zmm_last_word_pos, src + 8, 1);
168 zmm_buf = _mm512_srai_epi32(zmm_buf, 12);
169 __m512i zmm_ChID_last_column1 = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
170
 // Subtract one channel where the 4-bit ChID field taken from the event's last word is 0.
171 __mmask16 mask16_half_word = _mm512_cmpeq_epi32_mask(zmm_ChID_last_column1, zmm_mask_zero);
172 __m512i zmm_mask_one = _mm512_set1_epi32(1);
173 __m512i zmm_Nch = _mm512_mask_sub_epi32(zmm_Nchannels, mask16_half_word, zmm_Nchannels, zmm_mask_one);
 // Read-modify-write of the per-BC channel counter, indexed by the BC of each accepted header.
174 __m512i zmm_nEventPerBC = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header_final, zmm_bc, buf_nChPerBC.data(), 4);
175 zmm_buf = _mm512_add_epi32(zmm_nEventPerBC, zmm_Nch);
176 _mm512_mask_i32scatter_epi32(buf_nChPerBC.data(), mask16_header_final, zmm_bc, zmm_buf, 4);
177
 // Advance both offset vectors to the next chunk (16 words of 16 bytes).
178 zmm_buf = _mm512_set1_epi32(256);
179 zmm_pos1 = _mm512_add_epi32(zmm_buf, zmm_pos1);
180 zmm_pos2 = _mm512_add_epi32(zmm_buf, zmm_pos2);
181
182 } // chunk
183 } // Page
 // buf_nChPerBC is not reset between links, so this snapshot of the counts accumulated so far
 // is stored as the per-BC starting offset of the next link.
184 if (iFeeID != sNlinksMax - 1) {
185 memcpy(bufBC[iFeeID + 1].data(), buf_nChPerBC.data(), (sNBC + 4) * 4);
186 }
187 } // linkID
188 }
189 // Channel data position within BC per LinkID
190 memcpy(mPosChDataPerLinkOrbit[iOrbit].data(), bufBC.data(), (sNBC + 4) * 4 * sNlinksMax);
191 // TCM processing
192
 // TCM words are unpacked into mVecTriggers at 16 bytes per BC (destination offset is BC << 4).
 // NOTE(review): the buffer is cleared only when this orbit slot has TCM pages, but it is read for every
 // non-empty BC further below - confirm that reusing content from an earlier orbit is intended.
193 uint8_t* ptrDstTCM = (uint8_t*)mVecTriggers.data();
194 if (arrOrbitSizePagesTCM[iOrbit] > 0) {
195 memset(mVecTriggers.data(), 0, 16 * 3564);
196 const auto& nPagesTCM = arrRdhTCMperOrbit[iOrbit].size();
197 for (int iPage = 0; iPage < nPagesTCM; iPage++) {
 // rdhPtr is fetched here but not used in this loop body.
198 const auto& rdhPtr = arrRdhTCMperOrbit[iOrbit][iPage];
199 const auto& payload = arrDataTCMperOrbit[iOrbit][iPage].data();
200 const auto& payloadSize = arrDataTCMperOrbit[iOrbit][iPage].size();
201 const uint8_t* src = (uint8_t*)payload;
 // Same chunking and last-chunk lane mask as in the per-link counting loop above.
202 const auto nNGBTwords = payloadSize / 16;
203 const int nNGBTwordsDiff = nNGBTwords % 16;
204 const int nChunks = nNGBTwords / 16 + static_cast<int>(nNGBTwordsDiff > 0);
205 const auto lastChunk = nChunks - 1;
206 const uint16_t mask = (0xffff << (16 - nNGBTwordsDiff)) | (0xffff * (nNGBTwordsDiff == 0));
207 __m512i zmm_pos1 = _mm512_set_epi32(0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
208 for (int iChunk = 0; iChunk < nChunks; iChunk++) {
209 __mmask16 mask16_MaxGBTwords = _mm512_int2mask((0xffff * (iChunk != lastChunk)) | mask);
210 __m512i zmm_mask_zero = _mm512_setzero_epi32();
211
212 __m512i zmm_src_header = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1, src + 9, 1);
213
214 __m512i zmm_mask_header = _mm512_set1_epi32(0xf1); // one GBT word + descriptor
215 // Header
 // The full 32-bit value read at byte offset 9 must equal 0xf1 for the word to count as a TCM header.
216 __mmask16 mask16_header = _mm512_cmpeq_epi32_mask(zmm_src_header, zmm_mask_header);
217 // BC
218 __m512i zmm_bc = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1, src, 1);
219 __m512i zmm_mask_12bit = _mm512_set1_epi32(0xfff);
220
221 zmm_bc = _mm512_maskz_and_epi32(mask16_header, zmm_mask_12bit, zmm_bc);
222 // Position of first GBT word with data
223 __m512i zmm_pos2 = _mm512_set1_epi32(16);
224 zmm_pos2 = _mm512_maskz_add_epi32(mask16_header, zmm_pos1, zmm_pos2);
225
 // Three 32-bit reads of the data word at byte offsets 0, 3 and 7.
226 __m512i zmm_src_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header, zmm_pos2, src, 1);
227 __m512i zmm_src_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header, zmm_pos2, src + 3, 1);
228 __m512i zmm_src_part2 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_header, zmm_pos2, src + 7, 1);
229 // Trigger bits + NchanA + NchanC
230 __m512i zmm_mask_3byte = _mm512_set1_epi32(0xffffff);
231 __m512i zmm_dst_part0 = _mm512_and_epi32(zmm_src_part0, zmm_mask_3byte);
232 // Sum AmpA
233 __m512i zmm_mask_17bit = _mm512_set1_epi32(0x1ffff);
234 __m512i zmm_dst_part1 = _mm512_and_epi32(zmm_src_part1, zmm_mask_17bit);
235 // Sum AmpC
236 __m512i zmm_dst_part2 = _mm512_srai_epi32(zmm_src_part1, 18);
237 __m512i zmm_mask_14bit = _mm512_set1_epi32(0b11111111111111);
238 zmm_dst_part2 = _mm512_and_epi32(zmm_mask_14bit, zmm_dst_part2);
239
 // NOTE(review): the mask below selects bits 11-13, which the 14-bit mask above already covers - confirm the intended bit range.
240 __m512i zmm_buf = _mm512_slli_epi32(zmm_src_part2, 14);
241 __m512i zmm_mask_3bit = _mm512_set1_epi32(0b11100000000000);
242 zmm_buf = _mm512_and_epi32(zmm_mask_3bit, zmm_buf);
243 zmm_dst_part2 = _mm512_or_epi32(zmm_dst_part2, zmm_buf);
244 // Average time A + C
 // Two 9-bit fields are packed into one 32-bit value: one in bits 0-8, the other in bits 16-24.
245 __m512i zmm_dst_part3 = _mm512_srai_epi32(zmm_src_part2, 4);
246 __m512i zmm_mask_9bit = _mm512_set1_epi32(0x1ff);
247 zmm_dst_part3 = _mm512_and_epi32(zmm_dst_part3, zmm_mask_9bit);
248
249 zmm_buf = _mm512_slli_epi32(zmm_src_part2, 2);
250 __m512i zmm_mask_9bit_2 = _mm512_set1_epi32(0x1ff0000);
251 zmm_buf = _mm512_and_epi32(zmm_buf, zmm_mask_9bit_2);
252
253 zmm_dst_part3 = _mm512_or_epi32(zmm_buf, zmm_dst_part3);
254 // Position
255 __m512i zmm_dst_pos = _mm512_slli_epi32(zmm_bc, 4);
256 // Pushing data to buffer
257 _mm512_mask_i32scatter_epi32(ptrDstTCM, mask16_header, zmm_dst_pos, zmm_dst_part0, 1);
258 _mm512_mask_i32scatter_epi32(ptrDstTCM + 4, mask16_header, zmm_dst_pos, zmm_dst_part1, 1);
259 _mm512_mask_i32scatter_epi32(ptrDstTCM + 8, mask16_header, zmm_dst_pos, zmm_dst_part2, 1);
260 _mm512_mask_i32scatter_epi32(ptrDstTCM + 12, mask16_header, zmm_dst_pos, zmm_dst_part3, 1);
261
262 zmm_buf = _mm512_set1_epi32(256);
263 zmm_pos1 = _mm512_add_epi32(zmm_buf, zmm_pos1);
264 } // chunk
265 } // page
266 }
 // Turn the per-BC channel counts into start positions (exclusive running sum) and emit one digit entry per non-empty BC.
267 NChDataBC_t buf_nPosPerBC{};
268 uint64_t nChPerBC{0};
269 uint64_t nEventOrbit{0};
270 posChDataPerOrbit[iOrbit] = chPosOrbit; //
271
 // zmm_pos2 holds multiples of 5; together with the scatter scale of 8 this is a 40-byte stride per digit entry.
272 __m512i zmm_mask_seq2 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
273 __m512i zmm_pos2 = _mm512_set_epi32(75, 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5, 0);
274 for (int iBC = 0; iBC < sNBC; iBC += 16) {
 // buf0..buf3 are declared but not used in this loop body.
275 uint32_t buf0[16], buf1[16], buf2[16], buf3[16];
 // NOTE(review): mVecDigits is written through this raw pointer before the resize() that follows the orbit loop -
 // presumably the vector is pre-sized elsewhere; confirm.
276 uint8_t* dst = (uint8_t*)&mVecDigits[eventPosPerOrbit + nEventOrbit];
277 __m512i zmm_nChPerBC = _mm512_loadu_si512(&buf_nChPerBC[iBC]);
278
 // SUM(N): store the running channel count as the start position of BC iBC+N, then add that BC's own count.
279#define SUM(N) \
280 buf_nPosPerBC[iBC + N] = nChPerBC; \
281 nChPerBC += buf_nChPerBC[iBC + N];
282
283 SUM(0);
284 SUM(1);
285 SUM(2);
286 SUM(3);
287
288 SUM(4);
289 SUM(5);
290 SUM(6);
291 SUM(7);
292
293 SUM(8);
294 SUM(9);
295 SUM(10);
296 SUM(11);
297
298 SUM(12);
299 SUM(13);
300 SUM(14);
301 SUM(15);
302 __m512i zmm_mask_zero = _mm512_setzero_epi32();
 // Lanes with a non-zero channel count produce a digit; the expand hands them consecutive entries of zmm_pos2.
303 __mmask16 mask16_bc = _mm512_cmpneq_epi32_mask(zmm_nChPerBC, zmm_mask_zero);
304 __m512i zmm_pos3 = _mm512_maskz_expand_epi32(mask16_bc, zmm_pos2);
305 const auto nEvents = _mm_popcnt_u32(_cvtmask16_u32(mask16_bc));
306 nEventOrbit += nEvents;
307 __m512i zmm_pos = _mm512_loadu_si512(&buf_nPosPerBC[iBC]);
308 __m512i zmm_pos_per_orbit = _mm512_set1_epi32(chPosOrbit);
309 zmm_pos = _mm512_add_epi32(zmm_pos, zmm_pos_per_orbit);
310 __m512i zmm_bc = _mm512_set1_epi32(iBC);
311 zmm_bc = _mm512_add_epi32(zmm_bc, zmm_mask_seq2);
 // Only the low 32 bits of the 64-bit orbit value are broadcast here.
312 __m512i zmm_orbit = _mm512_set1_epi32(orbit);
313
 // Byte offsets used inside one entry: +0 first channel position, +4 channel count, +8..+20 TCM values, +28 BC, +32 orbit.
 // NOTE(review): presumably mirrors the memory layout of the mVecDigits element type - confirm.
314 _mm512_mask_i32scatter_epi32(dst, mask16_bc, zmm_pos3, zmm_pos, 8);
315 _mm512_mask_i32scatter_epi32(dst + 4, mask16_bc, zmm_pos3, zmm_nChPerBC, 8);
316 _mm512_mask_i32scatter_epi32(dst + 28, mask16_bc, zmm_pos3, zmm_bc, 8);
317 _mm512_mask_i32scatter_epi32(dst + 32, mask16_bc, zmm_pos3, zmm_orbit, 8);
318 // TCM
 // Copy the four 32-bit TCM values stored for each BC (16 bytes per BC) into the digit entry.
319 __m512i zmm_src_pos = _mm512_slli_epi32(zmm_bc, 4);
320
321 __m512i zmm_dst_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM, 1);
322 __m512i zmm_dst_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM + 4, 1);
323 __m512i zmm_dst_part2 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM + 8, 1);
324 __m512i zmm_dst_part3 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_bc, zmm_src_pos, ptrDstTCM + 12, 1);
325
326 _mm512_mask_i32scatter_epi32(dst + 8, mask16_bc, zmm_pos3, zmm_dst_part0, 8);
327 _mm512_mask_i32scatter_epi32(dst + 12, mask16_bc, zmm_pos3, zmm_dst_part1, 8);
328 _mm512_mask_i32scatter_epi32(dst + 16, mask16_bc, zmm_pos3, zmm_dst_part2, 8);
329 _mm512_mask_i32scatter_epi32(dst + 20, mask16_bc, zmm_pos3, zmm_dst_part3, 8);
330
331 } // BC
332
 // buf_nChDataPerOrbit is not referenced anywhere else in this function.
333 const uint64_t buf_nChDataPerOrbit = nChPerBC;
334 chPosOrbit += nChPerBC;
335
336 nChDataPerOrbit[iOrbit] = nChPerBC;
337 eventPosPerOrbit += nEventOrbit;
338 memcpy(posChDataPerBC[iOrbit].data(), buf_nPosPerBC.data(), (sNBC + 4) * 4);
339
340 } // Orbit
341
 // Set the output containers to the final number of digit entries and channel entries.
342 mVecDigits.resize(eventPosPerOrbit);
343 mVecChannelData.resize(chPosOrbit);
344
 // Second pass over the orbit slots: unpack the channel words of every page and place them at their final position.
345 for (int iOrbit = 0; iOrbit < sNorbits; iOrbit++) {
346 if (!arrOrbitSizePages[iOrbit]) {
347 continue;
348 }
349 const auto& orbit = arrOrbit[iOrbit];
350 const auto& posChDataOrbit = posChDataPerOrbit[iOrbit];
351 const auto& ptrPosChDataPerBC = posChDataPerBC[iOrbit].data();
 // Channel entries of one orbit slot are assembled in mVecChannelDataBuf and copied to mVecChannelData at the end of the iteration.
352 void* ptrDst = (void*)mVecChannelDataBuf.data();
353 for (int iLink = 0; iLink < sNlinksMax; iLink++) {
354 if (iLink == mFEEID_TCM) {
355 continue;
356 }
357 const auto& nPages = arrRdhPtrPerOrbit[iOrbit][iLink].size();
358 const auto& ptrChDataPosPerLinks = mPosChDataPerLinkOrbit[iOrbit][iLink].data();
 // Loads 64 bytes starting at mLUT[iLink][0]; the 16 x 32-bit values are used below to translate the 4-bit local ChID via permutexvar.
359 void const* lutPerLink = &mLUT[iLink][0];
360 __m512i zmm_lut = _mm512_loadu_si512((void const*)lutPerLink);
361 for (int iPage = 0; iPage < nPages; iPage++) {
 // rdhPtr is fetched here but not used in this loop body.
362 const auto& rdhPtr = arrRdhPtrPerOrbit[iOrbit][iLink][iPage];
363 const auto& payload = arrDataPerOrbit[iOrbit][iLink][iPage].data();
364 const auto& payloadSize = arrDataPerOrbit[iOrbit][iLink][iPage].size();
365 const uint8_t* src = (uint8_t*)payload;
 // Same chunking and last-chunk lane mask as in the first pass.
366 const auto nNGBTwords = payloadSize / 16;
367 const int nNGBTwordsDiff = nNGBTwords % 16;
368 const int nChunks = nNGBTwords / 16 + static_cast<int>(nNGBTwordsDiff > 0);
369 const auto lastChunk = nChunks - 1;
370 const uint16_t mask = (0xffff << (16 - nNGBTwordsDiff)) | (0xffff * (nNGBTwordsDiff == 0));
371
 // State carried from one chunk to the next (BC and word counts of the last header seen), reset for every page.
372 uint16_t firstBC{0};
373 uint8_t nGBTwordPrevChunk{0};
374 uint8_t nGBTwordPrevChunkDiff{0};
375 bool firstWordIsNotHeader{false};
376
377 __m512i zmm_pos1 = _mm512_set_epi32(0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240);
378 for (int iChunk = 0; iChunk < nChunks; iChunk++) {
379 __m512i zmm_mask_charge = _mm512_set1_epi32(0x1fff0000);
380 __m512i zmm_mask_PMbits = _mm512_set1_epi32(0xff00);
381 __m512i zmm_mask_localChID = _mm512_set1_epi32(0xf);
382 __m512i zmm_mask_time = _mm512_set1_epi32(0xfff);
383
384 __m512i zmm_buf, zmm_buf2, zmm_buf3;
385
386 __mmask16 mask16_MaxGBTwords = _mm512_int2mask((0xffff * (iChunk != lastChunk)) | mask);
387 __m512i zmm_mask_zero = _mm512_setzero_epi32();
388 // Gathering data from page
 // Three overlapping 32-bit reads per word, at byte offsets 0, 3 and 6.
389 __m512i zmm_src_part0 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1, src, 1);
390 __m512i zmm_src_part1 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1, src + 3, 1);
391 __m512i zmm_src_part2 = _mm512_mask_i32gather_epi32(zmm_mask_zero, mask16_MaxGBTwords, zmm_pos1, src + 6, 1);
392 // Column 0
393 // Time
394 __m512i zmm_src_column0_time = _mm512_and_epi32(zmm_src_part0, zmm_mask_time);
395 // Charge
396 zmm_buf = _mm512_slli_epi32(zmm_src_part0, 4);
397 __m512i zmm_src_column0_charge = _mm512_and_epi32(zmm_buf, zmm_mask_charge);
398 __m512i zmm_dst_column0_part1 = _mm512_or_epi32(zmm_src_column0_time, zmm_src_column0_charge);
399 // PM bits
400 zmm_buf = _mm512_slli_epi32(zmm_src_part1, 7);
401 __m512i zmm_src_column0_PMbits = _mm512_and_epi32(zmm_buf, zmm_mask_PMbits);
402 // ChannelID
403 zmm_buf = _mm512_srai_epi32(zmm_src_part1, 12);
404 __m512i zmm_dst_column0_chID = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
405 __m512i zmm_dst_column0_globalChID = _mm512_permutexvar_epi32(zmm_dst_column0_chID, zmm_lut);
406 __m512i zmm_dst_column0_part0 = _mm512_or_epi32(zmm_dst_column0_globalChID, zmm_src_column0_PMbits);
407 // Column 1
408 // Time
409 zmm_buf = _mm512_srai_epi32(zmm_src_part1, 16);
410 __m512i zmm_src_column1_time = _mm512_and_epi32(zmm_buf, zmm_mask_time);
411 // Charge
412 zmm_buf = _mm512_slli_epi32(zmm_src_part2, 12);
413 __m512i zmm_src_column1_charge = _mm512_and_epi32(zmm_buf, zmm_mask_charge);
414 __m512i zmm_dst_column1_part1 = _mm512_or_epi32(zmm_src_column1_time, zmm_src_column1_charge);
415
416 // PM bits
417 zmm_buf = _mm512_srai_epi32(zmm_src_part2, 9);
418 __m512i zmm_src_column1_PMbits = _mm512_and_epi32(zmm_buf, zmm_mask_PMbits);
419 // ChannelID
420 zmm_buf = _mm512_srai_epi32(zmm_src_part2, 28);
421 __m512i zmm_dst_column1_chID = _mm512_and_epi32(zmm_buf, zmm_mask_localChID);
422 __m512i zmm_dst_column1_globalChID = _mm512_permutexvar_epi32(zmm_dst_column1_chID, zmm_lut);
423 __m512i zmm_dst_column1_part0 = _mm512_or_epi32(zmm_dst_column1_globalChID, zmm_src_column1_PMbits);
424 // Preparing masks for data
425 // getting header and nGBTword masks
426 __mmask16 mask16_header = _mm512_cmpeq_epi32_mask(zmm_dst_column1_chID, zmm_mask_localChID); // check for header
427 zmm_buf = _mm512_srai_epi32(zmm_src_part2, 24);
428 __m512i zmm_NGBTwords = _mm512_maskz_and_epi32(mask16_header, zmm_buf, zmm_mask_localChID);
429 __mmask16 mask16_header_final = _mm512_mask_cmpgt_epu32_mask(mask16_header, zmm_NGBTwords, zmm_mask_zero);
430 __mmask16 mask16_data = _mm512_knot(mask16_header_final);
431
432 // main column which contains also header descriptor 0xf
433 __m512i zmm_mask_maxLocalChID = _mm512_set1_epi32(12);
434 __mmask16 mask16_nonzeroChID = _mm512_mask_cmpneq_epi32_mask(mask16_data, zmm_dst_column1_chID, zmm_mask_zero); // check for non-zero channelIDs
435 __mmask16 mask16_column1_isData = _mm512_mask_cmple_epi32_mask(mask16_nonzeroChID, zmm_dst_column1_chID, zmm_mask_maxLocalChID); // check for max local channel ID - 12
436 // first column
437 mask16_nonzeroChID = _mm512_mask_cmpneq_epi32_mask(mask16_data, zmm_dst_column0_chID, zmm_mask_zero); // check for non-zero channelIDs
438 __mmask16 mask16_column0_isData = _mm512_mask_cmple_epi32_mask(mask16_nonzeroChID, zmm_dst_column0_chID, zmm_mask_maxLocalChID);
439
440 // BC
441 __m512i zmm_bc = _mm512_mask_and_epi32(zmm_mask_zero, mask16_header_final, zmm_src_part0, zmm_mask_time);
442 // Calculation for GBT word position
443 __m512i zmm_mask_seq2 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
444
445 __m512i zmm_column0_data_seq = _mm512_maskz_expand_epi32(mask16_column0_isData, zmm_mask_seq2);
446 __mmask16 mask16_buf = _mm512_kor(mask16_header_final, mask16_column0_isData);
447 __m512i zmm_column0_data2header = _mm512_maskz_expand_epi32(mask16_buf, zmm_mask_seq2);
448
449 __m512i zmm_column1_data_seq = _mm512_maskz_expand_epi32(mask16_column1_isData, zmm_mask_seq2);
450 mask16_buf = _mm512_kor(mask16_header_final, mask16_column1_isData);
451 __m512i zmm_column1_data2header = _mm512_maskz_expand_epi32(mask16_buf, zmm_mask_seq2);
452
 // (rank among headers+data) - (rank among data) = number of header lanes below each data lane.
453 zmm_column0_data2header = _mm512_maskz_sub_epi32(mask16_column0_isData, zmm_column0_data2header, zmm_column0_data_seq);
454 zmm_column1_data2header = _mm512_maskz_sub_epi32(mask16_column1_isData, zmm_column1_data2header, zmm_column1_data_seq);
455
456 __m512i zmm_column0_NGBTwords = _mm512_setzero_epi32();
457 __m512i zmm_column1_NGBTwords = zmm_NGBTwords;
458 __mmask16 mask16_header_first = _mm512_int2mask(firstWordIsNotHeader * 0b1000000000000000); // fake header, to put metadata(BC and nGBTwords) from previous chunk
459 zmm_bc = _mm512_mask_set1_epi32(zmm_bc, mask16_header_first, firstBC);
460
461 zmm_column0_NGBTwords = _mm512_mask_set1_epi32(zmm_column0_NGBTwords, mask16_header_first, nGBTwordPrevChunkDiff);
462 zmm_column1_NGBTwords = _mm512_mask_set1_epi32(zmm_column1_NGBTwords, mask16_header_first, nGBTwordPrevChunk + nGBTwordPrevChunkDiff);
463 mask16_header_final = _mm512_kor(mask16_header_first, mask16_header_final);
 // bufNGBTwords is filled from zmm_NGBTwords (without the fake-header values) further below.
464 uint32_t bufBC[16]{}, bufNGBTwords[16]{};
465
 // Base position per header: the link's offset within the BC plus the BC's start position within the orbit.
466 zmm_buf = _mm512_maskz_compress_epi32(mask16_header_final, zmm_bc);
467 zmm_buf2 = _mm512_i32gather_epi32(zmm_buf, ptrChDataPosPerLinks, 4);
468 zmm_buf3 = _mm512_i32gather_epi32(zmm_buf, ptrPosChDataPerBC, 4);
469 zmm_buf2 = _mm512_add_epi32(zmm_buf2, zmm_buf3);
470
471 __m512i zmm_column0_pos = _mm512_permutexvar_epi32(zmm_column0_data2header, zmm_buf2);
472 __m512i zmm_column1_pos = _mm512_permutexvar_epi32(zmm_column1_data2header, zmm_buf2);
473
474 // Column0
475 __m512i zmm_mask_seq3 = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
476 __m512i zmm_6 = _mm512_set1_epi32(6);
477 __m512i zmm_2 = _mm512_set1_epi32(2);
478
479 zmm_buf2 = _mm512_add_epi32(zmm_mask_seq2, zmm_column0_NGBTwords);
480 zmm_buf = _mm512_maskz_compress_epi32(mask16_header_final, zmm_buf2);
481
482 zmm_buf2 = _mm512_permutexvar_epi32(zmm_column0_data2header, zmm_buf);
483 zmm_buf = _mm512_maskz_sub_epi32(mask16_column0_isData, zmm_buf2, zmm_mask_seq3);
484
485 zmm_column0_pos = _mm512_maskz_add_epi32(mask16_column0_isData, zmm_column0_pos, zmm_buf);
486
 // Entry index -> byte offset: 6 bytes per channel entry, part0 at byte 0 and part1 at byte 2.
487 __m512i zmm_column0_part0_pos = _mm512_maskz_mullo_epi32(mask16_column0_isData, zmm_column0_pos, zmm_6);
488 __m512i zmm_column0_part1_pos = _mm512_maskz_add_epi32(mask16_column0_isData, zmm_column0_part0_pos, zmm_2);
489
490 // Column1
491 zmm_buf2 = _mm512_add_epi32(zmm_mask_seq2, zmm_column1_NGBTwords);
492 zmm_buf = _mm512_maskz_compress_epi32(mask16_header_final, zmm_buf2);
493
494 zmm_buf2 = _mm512_permutexvar_epi32(zmm_column1_data2header, zmm_buf);
495 zmm_buf = _mm512_maskz_sub_epi32(mask16_column1_isData, zmm_buf2, zmm_mask_seq3);
496
497 zmm_column1_pos = _mm512_maskz_add_epi32(mask16_column1_isData, zmm_column1_pos, zmm_buf);
498 __m512i zmm_column1_part0_pos = _mm512_maskz_mullo_epi32(mask16_column1_isData, zmm_column1_pos, zmm_6); // Todo: exclude multiplication, on fly calculation for byte position?
499 __m512i zmm_column1_part1_pos = _mm512_maskz_add_epi32(mask16_column1_isData, zmm_column1_part0_pos, zmm_2);
500 // Pushing data
 // part0 is stored before part1: the two 4-byte stores of one entry overlap in bytes 2-3, so part1 must land last.
501 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column0_isData, zmm_column0_part0_pos, zmm_dst_column0_part0, 1);
502 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column1_isData, zmm_column1_part0_pos, zmm_dst_column1_part0, 1);
503
504 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column0_isData, zmm_column0_part1_pos, zmm_dst_column0_part1, 1);
505 _mm512_mask_i32scatter_epi32(ptrDst, mask16_column1_isData, zmm_column1_part1_pos, zmm_dst_column1_part1, 1);
506
507 // Getting last header position
508 _mm512_storeu_si512(bufBC, zmm_bc);
509 _mm512_storeu_si512(bufNGBTwords, zmm_NGBTwords);
510
 // The lowest set lane is the header with the highest byte offset, because zmm_pos1 keeps offset 0 in the highest lane.
 // NOTE(review): __builtin_ctz is still evaluated when header32 == 0, where its result is undefined;
 // the multiplication by (header32 > 0) only masks the returned value - confirm this is acceptable.
511 const uint32_t header32 = _cvtmask16_u32(mask16_header_final);
512 const uint16_t lastHeaderPos = (__builtin_ctz(header32)) * (header32 > 0);
513 firstBC = bufBC[lastHeaderPos];
514 nGBTwordPrevChunk = bufNGBTwords[lastHeaderPos];
515 nGBTwordPrevChunkDiff = lastHeaderPos;
 // The next chunk gets a fake header unless the last header's word count equals its lane index.
516 firstWordIsNotHeader = nGBTwordPrevChunk != lastHeaderPos;
517 nGBTwordPrevChunkDiff++;
518
519 zmm_buf = _mm512_set1_epi32(256);
520 zmm_pos1 = _mm512_add_epi32(zmm_buf, zmm_pos1);
521 } // chunk
522 } // page
523 } // link
 // Copy this orbit slot's channel entries (6 bytes each) to their position in the output container.
524 memcpy(&mVecChannelData[posChDataOrbit], mVecChannelDataBuf.data(), 6 * nChDataPerOrbit[iOrbit]);
525 } // orbit
526 if (mEnableEmptyTFprotection && mVecDigits.size() == 0) {
527 // In case of empty payload within TF, there will be only a single dummy object in ChannelData container.
528 // Due to empty Digit container this dummy object will never participate in any further tasks.
529 mVecChannelData.emplace_back();
530 }
531 pc.outputs().snapshot(o2::framework::Output{o2::header::gDataOriginFT0, "DIGITSBC", 0}, mVecDigits);
532 pc.outputs().snapshot(o2::framework::Output{o2::header::gDataOriginFT0, "DIGITSCH", 0}, mVecChannelData);
 // Wall-clock duration of this call in microseconds, reported at debug level.
533 auto t2 = std::chrono::high_resolution_clock::now();
534 auto delay = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
535 LOG(debug) << "Decoder delay: " << delay.count();
536}
537} // namespace ft0
538} // namespace o2
uint64_t orbit
Definition RawEventData.h:6
#define SUM(N)
std::ostringstream debug
GLenum src
Definition glcorearb.h:1767
GLuint GLuint end
Definition glcorearb.h:469
GLenum GLenum dst
Definition glcorearb.h:1767
GLboolean * data
Definition glcorearb.h:298
GLint GLint GLint GLint GLint GLint GLint GLbitfield GLenum filter
Definition glcorearb.h:1308
GLint GLuint mask
Definition glcorearb.h:291
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
Definition glcorearb.h:5034
constexpr o2::header::DataDescription gDataDescriptionRawData
Definition DataHeader.h:597
constexpr o2::header::DataOrigin gDataOriginFT0
Definition DataHeader.h:566
uint8_t itsSharedClusterMap uint8_t
struct o2::upgrades_utils::@462 ft0
structure to keep V0C information
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
static o2::header::DataHeader::PayloadSizeType getPayloadSize(const DataRef &ref)
static constexpr int getVersion()
get numeric version of the RDH
Definition RDHUtils.h:58
const int nEvents
Definition test_Fifo.cxx:27
LOG(info)<< "Compressed in "<< sw.CpuTime()<< " s"