GPUTPCCFDecodeZS.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCCFDecodeZS.cxx

#include "GPUTPCCFDecodeZS.h"
#include "GPUCommonMath.h"
#include "GPUTPCClusterFinder.h"
#include "CfArray2D.h"
#include "PackedCharge.h"
#include "CfUtils.h"
#include "GPUCommonAlgorithm.h"
#include "TPCPadGainCalib.h"
#include "TPCZSLinkMapping.h"
#include "GPUTPCGeometry.h"
// Assumed O2 headers for the ZS format structs, the RDH accessors, and the LHC
// constants used below:
#include "DataFormatsTPC/ZeroSuppression.h"
#include "DataFormatsTPC/ZeroSuppressionLinkBased.h"
#include "DetectorsRaw/RDHUtils.h"
#include "CommonConstants/LHCConstants.h"

using namespace o2::gpu;
using namespace o2::gpu::tpccf;
using namespace o2::tpc;
using namespace o2::tpc::constants;

// ===========================================================================
// ===========================================================================
// Decode ZS Row
// ===========================================================================
// ===========================================================================

template <>
GPUdii() void GPUTPCCFDecodeZS::Thread<GPUTPCCFDecodeZS::decodeZS>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t firstHBF, int32_t tpcTimeBinCut)
{
  GPUTPCCFDecodeZS::decode(clusterer, smem, nBlocks, nThreads, iBlock, iThread, firstHBF, tpcTimeBinCut);
}

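// decode() handles the row-based ZS formats (versions 1 and 2): each GPU block
// walks one ZS page of a CRU endpoint, and its threads are divided into groups
// that each decode the ADC sequences of one pad row per time bin.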
GPUdii() void GPUTPCCFDecodeZS::decode(GPUTPCClusterFinder& clusterer, GPUSharedMemory& s, int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, int32_t firstHBF, int32_t tpcTimeBinCut)
{
  const uint32_t sector = clusterer.mISector;
#ifdef GPUCA_GPUCODE
  const uint32_t endpoint = clusterer.mPzsOffsets[iBlock].endpoint;
#else
  const uint32_t endpoint = iBlock;
#endif
  const GPUTrackingInOutZS::GPUTrackingInOutZSSector& zs = clusterer.GetConstantMem()->ioPtrs.tpcZS->sector[sector];
  if (zs.count[endpoint] == 0) {
    return;
  }
  CfChargePos* positions = clusterer.mPpositions;
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  const size_t nDigits = clusterer.mPzsOffsets[iBlock].offset;
  if (iThread == 0) {
    const int32_t region = endpoint / 2;
    s.nRowsRegion = GPUTPCGeometry::GetRegionRows(region);
    s.regionStartRow = GPUTPCGeometry::GetRegionStart(region);
    s.nThreadsPerRow = CAMath::Max(1u, nThreads / ((s.nRowsRegion + (endpoint & 1)) / 2));
    s.rowStride = nThreads / s.nThreadsPerRow;
    s.rowOffsetCounter = 0;
  }
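  // The block is split into s.rowStride groups of s.nThreadsPerRow threads;
  // group r decodes rows r, r + rowStride, ... of this endpoint's half of the
  // region, and the threads of a group share the row's ADC sequences. E.g. 64
  // threads and 9 rows per endpoint give nThreadsPerRow = 7 and rowStride = 9:
  // nine 7-thread groups, with one thread idle.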
  GPUbarrier();
  const uint32_t myRow = iThread / s.nThreadsPerRow;
  const uint32_t mySequence = iThread % s.nThreadsPerRow;
#ifdef GPUCA_GPUCODE
  const uint32_t i = 0;
  const uint32_t j = clusterer.mPzsOffsets[iBlock].num;
  {
    {
#else
  for (uint32_t i = clusterer.mMinMaxCN[endpoint].zsPtrFirst; i < clusterer.mMinMaxCN[endpoint].zsPtrLast; i++) {
    const uint32_t minJ = (i == clusterer.mMinMaxCN[endpoint].zsPtrFirst) ? clusterer.mMinMaxCN[endpoint].zsPageFirst : 0;
    const uint32_t maxJ = (i + 1 == clusterer.mMinMaxCN[endpoint].zsPtrLast) ? clusterer.mMinMaxCN[endpoint].zsPageLast : zs.nZSPtr[endpoint][i];
    for (uint32_t j = minJ; j < maxJ; j++) {
#endif
      const uint32_t* pageSrc = (const uint32_t*)(((const uint8_t*)zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE);
      CA_SHARED_CACHE_REF(&s.ZSPage[0], pageSrc, TPCZSHDR::TPC_ZS_PAGE_SIZE, uint32_t, pageCache);
      GPUbarrier();
      const uint8_t* page = (const uint8_t*)pageCache;
      const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page; // restored: rdh is read below but its definition was dropped from the listing
      if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
#ifdef GPUCA_GPUCODE
        return;
#else
        continue;
#endif
      }
      const uint8_t* pagePtr = page + sizeof(o2::header::RAWDataHeader);
      const TPCZSHDR* hdr = reinterpret_cast<const TPCZSHDR*>(pagePtr);
      pagePtr += sizeof(*hdr);
      const bool decode12bit = hdr->version == 2;
      const uint32_t decodeBits = decode12bit ? TPCZSHDR::TPC_ZS_NBITS_V2 : TPCZSHDR::TPC_ZS_NBITS_V1;
      const float decodeBitsFactor = 1.f / (1 << (decodeBits - 10)); // normalize 12-bit samples to the 10-bit ADC scale
      uint32_t mask = (1 << decodeBits) - 1;
      int32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;
      const int32_t rowOffset = s.regionStartRow + ((endpoint & 1) ? (s.nRowsRegion / 2) : 0); // odd endpoints serve the upper half of the region's rows
      const int32_t nRows = (endpoint & 1) ? (s.nRowsRegion - s.nRowsRegion / 2) : (s.nRowsRegion / 2);

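      // One iteration per time bin: a 16-bit header with a row mask, followed
      // by per-row offsets into the page and the per-row ADC sequences.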
      for (int32_t l = 0; l < hdr->nTimeBinSpan; l++) { // TODO: Parallelize over time bins
        pagePtr += (pagePtr - page) & 1; // Ensure 16 bit alignment
        const TPCZSTBHDR* tbHdr = reinterpret_cast<const TPCZSTBHDR*>(pagePtr);
        if ((tbHdr->rowMask & 0x7FFF) == 0) {
          pagePtr += 2;
          continue;
        }
        const int32_t nRowsUsed = CAMath::Popcount((uint32_t)(tbHdr->rowMask & 0x7FFF));
        pagePtr += 2 * nRowsUsed;

        GPUbarrier();
        for (int32_t n = iThread; n < nRowsUsed; n += nThreads) {
          const uint8_t* rowData = n == 0 ? pagePtr : (page + tbHdr->rowAddr1()[n - 1]);
          s.RowClusterOffset[n] = CAMath::AtomicAddShared<uint32_t>(&s.rowOffsetCounter, rowData[2 * *rowData]);
        }
        /*if (iThread < GPUCA_WARP_SIZE) { // TODO: Seems to miscompile with HIP, CUDA performance doesn't really change, for now sticking to the AtomicAdd
          GPUSharedMemory& smem = s;
          int32_t o;
          if (iThread < nRowsUsed) {
            const uint8_t* rowData = iThread == 0 ? pagePtr : (page + tbHdr->rowAddr1()[iThread - 1]);
            o = rowData[2 * *rowData];
          } else {
            o = 0;
          }
          int32_t x = warp_scan_inclusive_add(o);
          if (iThread < nRowsUsed) {
            s.RowClusterOffset[iThread] = s.rowOffsetCounter + x - o;
          } else if (iThread == GPUCA_WARP_SIZE - 1) {
            s.rowOffsetCounter += x;
          }
        }*/
        GPUbarrier();

        if (myRow < s.rowStride) {
          for (int32_t m = myRow; m < nRows; m += s.rowStride) {
            if ((tbHdr->rowMask & (1 << m)) == 0) {
              continue;
            }
            const int32_t rowPos = CAMath::Popcount((uint32_t)(tbHdr->rowMask & ((1 << m) - 1)));
            size_t nDigitsTmp = nDigits + s.RowClusterOffset[rowPos];
            const uint8_t* rowData = rowPos == 0 ? pagePtr : (page + tbHdr->rowAddr1()[rowPos - 1]);
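            // Row data layout: rowData[0] holds the number of sequences N;
            // N pairs (first pad, cumulative sample count) follow at
            // rowData[2k+1] / rowData[2k+2]; the packed ADC stream starts at
            // rowData + 2N + 1.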
            const int32_t nSeqRead = *rowData;
            const int32_t nSeqPerThread = (nSeqRead + s.nThreadsPerRow - 1) / s.nThreadsPerRow;
            const int32_t mySequenceStart = mySequence * nSeqPerThread;
            const int32_t mySequenceEnd = CAMath::Min(mySequenceStart + nSeqPerThread, nSeqRead);
            if (mySequenceEnd > mySequenceStart) {
              const uint8_t* adcData = rowData + 2 * nSeqRead + 1;
              const uint32_t nSamplesStart = mySequenceStart ? rowData[2 * mySequenceStart] : 0;
              nDigitsTmp += nSamplesStart;
              uint32_t nADCStartBits = nSamplesStart * decodeBits;
              const uint32_t nADCStart = (nADCStartBits + 7) / 8;
              const int32_t nADC = (rowData[2 * mySequenceEnd] * decodeBits + 7) / 8;
              adcData += nADCStart;
              nADCStartBits &= 0x7;
              uint32_t byte = 0, bits = 0;
              if (nADCStartBits) { // % 8 != 0
                bits = 8 - nADCStartBits;
                byte = ((*(adcData - 1) & (0xFF ^ ((1 << nADCStartBits) - 1)))) >> nADCStartBits;
              }
              int32_t nSeq = mySequenceStart;
              int32_t seqLen = nSeq ? (rowData[(nSeq + 1) * 2] - rowData[nSeq * 2]) : rowData[2];
              Pad pad = rowData[nSeq++ * 2 + 1];
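              // Classic bit-stream accumulator: bytes are shifted into 'byte'
              // until at least decodeBits bits are buffered, then one sample is
              // extracted and the buffer shifted down. E.g. at 10 bit: after
              // reading 0x34 and 0x92 the buffer holds 0x9234 (16 bits); the
              // sample is 0x9234 & 0x3FF = 0x234 and 6 bits (0x24) remain.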
              for (int32_t n = nADCStart; n < nADC; n++) {
                byte |= *(adcData++) << bits;
                bits += 8;
                while (bits >= decodeBits) {
                  if (seqLen == 0) {
                    seqLen = rowData[(nSeq + 1) * 2] - rowData[nSeq * 2];
                    pad = rowData[nSeq++ * 2 + 1];
                  }
                  const CfFragment& fragment = clusterer.mPmemory->fragment;
                  TPCTime globalTime = timeBin + l;
                  bool discardTimeBin = not fragment.contains(globalTime);
                  discardTimeBin |= (tpcTimeBinCut > 0 && globalTime > tpcTimeBinCut);

                  Row row = rowOffset + m;
                  CfChargePos pos(row, Pad(pad), discardTimeBin ? INVALID_TIME_BIN : fragment.toLocal(globalTime));
                  positions[nDigitsTmp++] = pos;

                  if (!discardTimeBin) {
                    float q = float(byte & mask) * decodeBitsFactor;
                    q *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, row, pad);
                    chargeMap[pos] = PackedCharge(q);
                  }
                  pad++;
                  byte = byte >> decodeBits;
                  bits -= decodeBits;
                  seqLen--;
                }
              }
            }
          }
        }
        if (nRowsUsed > 1) {
          pagePtr = page + tbHdr->rowAddr1()[nRowsUsed - 2];
        }
        pagePtr += 2 * *pagePtr;                        // Go to entry for last sequence length
        pagePtr += 1 + (*pagePtr * decodeBits + 7) / 8; // Go to beginning of next time bin
      }
    }
  }
}

// ===========================================================================
// ===========================================================================
// Decode ZS Link
// ===========================================================================
// ===========================================================================

template <>
GPUdii() void GPUTPCCFDecodeZSLink::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t firstHBF, int32_t tpcTimeBinCut)
{
  Decode<GPUTPCCFDecodeZSLink>(nBlocks, nThreads, iBlock, iThread, smem, clusterer, firstHBF, tpcTimeBinCut);
}

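// Link-based ZS (ZSVersionLinkBasedWithMeta): a page carries an RDH, a
// TPCZSHDRV2 header, and one zerosupp_link_based::CommonHeader per link and
// time bin, each carrying an 80-bit channel bitmask, followed by the packed
// ADC payload of that link.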
GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& ctx)
{
  const CfFragment& fragment = ctx.clusterer.mPmemory->fragment;

  const auto* rdHdr = ConsumeHeader<header::RAWDataHeader>(ctx.page);

  if (o2::raw::RDHUtils::getMemorySize(*rdHdr) == sizeof(o2::header::RAWDataHeader)) {
    // An empty page contains only the raw data header
    return ctx.pageDigitOffset;
  }

  [[maybe_unused]] int32_t nDecoded = 0;
  const auto* decHdr = ConsumeHeader<TPCZSHDRV2>(ctx.page);
  ConsumeBytes(ctx.page, decHdr->firstZSDataOffset * 16); // firstZSDataOffset counts 16-byte (128-bit) words

  assert(decHdr->version == ZSVersionLinkBasedWithMeta);

  for (uint32_t t = 0; t < decHdr->nTimebinHeaders; t++) {
    const auto* tbHdr = ConsumeHeader<zerosupp_link_based::CommonHeader>(ctx.page);
    const auto* adcData = ConsumeBytes(ctx.page, tbHdr->numWordsPayload * 16); // Page now points to next timebin or past the page

    int32_t timeBin = (decHdr->timeOffset + tbHdr->bunchCrossing + (uint64_t)(o2::raw::RDHUtils::getHeartBeatOrbit(*rdHdr) - ctx.firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;

    uint32_t channelMask[3];
    GetChannelBitmask(*tbHdr, channelMask);
    uint32_t nAdc = CAMath::Popcount(channelMask[0]) + CAMath::Popcount(channelMask[1]) + CAMath::Popcount(channelMask[2]);

    nDecoded += nAdc;

    bool discardTimeBin = not fragment.contains(timeBin);
    discardTimeBin |= (ctx.tpcTimeBinCut > 0 && timeBin > ctx.tpcTimeBinCut);

    if (discardTimeBin) {
      FillWithInvalid(ctx.clusterer, ctx.iThread, ctx.nThreads, ctx.pageDigitOffset, nAdc);
    } else {
      DecodeTB(
        smem,
        ctx,
        adcData,
        nAdc,
        channelMask,
        timeBin,
        decHdr->cruID,
        tbHdr->fecInPartition);
    }

    ctx.pageDigitOffset += nAdc;
  } // for (uint32_t t = 0; t < decHdr->nTimebinHeaders; t++)

#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
  if (ctx.iThread == 0 && nDecoded != decHdr->nADCsamples) {
    ctx.clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, ctx.clusterer.mISector * 1000 + decHdr->cruID, decHdr->nADCsamples, nDecoded);
    /*#ifndef GPUCA_GPUCODE
        FILE* foo = fopen("dump.bin", "w+b");
        fwrite(pageSrc, 1, o2::raw::RDHUtils::getMemorySize(*rdHdr), foo);
        fclose(foo);
    #endif*/
  }
#endif

  return ctx.pageDigitOffset;
}

GPUd() void GPUTPCCFDecodeZSLink::DecodeTB(
  GPUSharedMemory& smem,
  DecodeCtx& ctx,
  const uint8_t* adcData,
  uint32_t nAdc,
  const uint32_t* channelMask,
  int32_t timeBin,
  int32_t cru,
  int32_t fecInPartition)
{
  constexpr int32_t NTHREADS = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFDecodeZSLink);
  static_assert(NTHREADS == GPUCA_WARP_SIZE, "Decoding TB Headers in parallel assumes block size is a single warp.");

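  // Warp-cooperative compaction: each round, every thread tests one raw FEC
  // channel against the bitmask; an inclusive warp scan over the active flags
  // gives every hit channel its dense index into the ADC stream, and the last
  // lane's value advances blockOffset to the next round of NTHREADS channels.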
  uint8_t blockOffset = 0;
  for (uint8_t i = ctx.iThread; blockOffset < nAdc; i += NTHREADS) {

    uint8_t rawFECChannel = i;

    uint8_t myChannelActive = ChannelIsActive(channelMask, rawFECChannel);

    uint8_t myOffset = warp_scan_inclusive_add(myChannelActive) - 1 + blockOffset;
    blockOffset = warp_broadcast(myOffset, NTHREADS - 1) + 1;

    if (not myChannelActive) {
      continue;
    }
    assert(myOffset < nAdc);

    uint32_t adc = 0;

    if constexpr (TPCZSHDRV2::TIGHTLY_PACKED_V3) {
      // Samples are packed back to back; gather all bytes spanning this sample
      uint32_t adcBitOffset = myOffset * DECODE_BITS;
      uint32_t adcByteOffset = adcBitOffset / CHAR_BIT;
      uint32_t adcOffsetInByte = adcBitOffset - adcByteOffset * CHAR_BIT;

      uint32_t byte = 0, bits = 0;

      while (bits < DECODE_BITS) {
        byte |= ((uint32_t)adcData[adcByteOffset]) << bits;
        adcByteOffset++;
        bits += CHAR_BIT;
      }
      adc = byte >> adcOffsetInByte;

    } else { // ! TPCZSHDRV2::TIGHTLY_PACKED_V3
      // Samples are stored SAMPLESPER64BIT to a 64-bit word, so none straddles a word boundary
      const uint64_t* adcData64 = (const uint64_t*)adcData;
      adc = (adcData64[myOffset / TPCZSHDRV2::SAMPLESPER64BIT] >> ((myOffset % TPCZSHDRV2::SAMPLESPER64BIT) * DECODE_BITS)) & DECODE_MASK;
    }

    o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannel, fecInPartition);
    const CfFragment& fragment = ctx.clusterer.mPmemory->fragment;
    float charge = ADCToFloat(adc, DECODE_MASK, DECODE_BITS_FACTOR);
    WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + myOffset);

  } // for (uint8_t i = iThread; blockOffset < nAdc; i += NThreads)
}

GPUd() void GPUTPCCFDecodeZSLink::GetChannelBitmask(const zerosupp_link_based::CommonHeader& tbHdr, uint32_t* chan)
{
  chan[0] = tbHdr.bitMaskLow & 0xfffffffful;
  chan[1] = tbHdr.bitMaskLow >> (sizeof(uint32_t) * CHAR_BIT);
  chan[2] = tbHdr.bitMaskHigh;
}

GPUd() bool GPUTPCCFDecodeZSLink::ChannelIsActive(const uint32_t* chan, uint8_t chanIndex)
{
  if (chanIndex >= zerosupp_link_based::ChannelPerTBHeader) {
    return false;
  }
  constexpr uint8_t N_BITS_PER_ENTRY = sizeof(*chan) * CHAR_BIT;
  const uint8_t entryIndex = chanIndex / N_BITS_PER_ENTRY;
  const uint8_t bitInEntry = chanIndex % N_BITS_PER_ENTRY;
  return chan[entryIndex] & (1 << bitInEntry);
}

// ===========================================================================
// ===========================================================================
// Decode ZS Link Base
// ===========================================================================
// ===========================================================================

template <class Decoder>
GPUd() void GPUTPCCFDecodeZSLinkBase::Decode(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, typename Decoder::GPUSharedMemory& smem, processorType& clusterer, int32_t firstHBF, int32_t tpcTimeBinCut)
{
  const uint32_t sector = clusterer.mISector;

#ifdef GPUCA_GPUCODE
  const uint32_t endpoint = clusterer.mPzsOffsets[iBlock].endpoint;
#else // CPU
  const uint32_t endpoint = iBlock;
#endif

  const GPUTrackingInOutZS::GPUTrackingInOutZSSector& zs = clusterer.GetConstantMem()->ioPtrs.tpcZS->sector[sector];
  if (zs.count[endpoint] == 0) {
    return;
  }

  uint32_t pageDigitOffset = clusterer.mPzsOffsets[iBlock].offset;

#ifdef GPUCA_GPUCODE
  const uint32_t i = 0;
  const uint32_t j = clusterer.mPzsOffsets[iBlock].num;
  {
    {
#else // CPU
  for (uint32_t i = clusterer.mMinMaxCN[endpoint].zsPtrFirst; i < clusterer.mMinMaxCN[endpoint].zsPtrLast; i++) {
    const uint32_t minJ = (i == clusterer.mMinMaxCN[endpoint].zsPtrFirst) ? clusterer.mMinMaxCN[endpoint].zsPageFirst : 0;
    const uint32_t maxJ = (i + 1 == clusterer.mMinMaxCN[endpoint].zsPtrLast) ? clusterer.mMinMaxCN[endpoint].zsPageLast : zs.nZSPtr[endpoint][i];
    for (uint32_t j = minJ; j < maxJ; j++) {
#endif
      const uint32_t* pageSrc = (const uint32_t*)(((const uint8_t*)zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE);
      // Cache zs page in shared memory. Curiously this actually degrades performance...
      // CA_SHARED_CACHE_REF(&smem.ZSPage[0], pageSrc, TPCZSHDR::TPC_ZS_PAGE_SIZE, uint32_t, pageCache);
      // GPUbarrier();
      // const uint8_t* page = (const uint8_t*)pageCache;
      const uint8_t* page = (const uint8_t*)pageSrc;

      const auto* rdHdr = Peek<header::RAWDataHeader>(page);

      if (o2::raw::RDHUtils::getMemorySize(*rdHdr) == sizeof(o2::header::RAWDataHeader)) {
#ifdef GPUCA_GPUCODE
        return;
#else
        continue;
#endif
      }

      DecodeCtx ctx{
        .clusterer = clusterer,
        .page = page,
        .iBlock = iBlock,
        .nThreads = nThreads,
        .iThread = iThread,
        .pageDigitOffset = pageDigitOffset,
        .firstHBF = firstHBF,
        .tpcTimeBinCut = tpcTimeBinCut,
      };

      pageDigitOffset = Decoder::DecodePage(smem, ctx);
    } // [CPU] for (uint32_t j = minJ; j < maxJ; j++)
  }   // [CPU] for (uint32_t i = clusterer.mMinMaxCN[endpoint].zsPtrFirst; i < clusterer.mMinMaxCN[endpoint].zsPtrLast; i++)

#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
  if (iThread == 0 && iBlock < nBlocks - 1) {
    uint32_t maxOffset = clusterer.mPzsOffsets[iBlock + 1].offset;
    if (pageDigitOffset != maxOffset) {
      clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_OFFSET, clusterer.mISector * 1000 + endpoint, pageDigitOffset, maxOffset);
    }
  }
#endif
}

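// Maps (CRU, raw FEC channel, FEC index in partition) to a pad position via the
// precomputed TPCZSLinkMapping table: the indices are packed into a global
// SAMPA channel id (fecInSector << 8 | sampaOnFEC << 5 | channelOnSAMPA) that
// indexes FECIDToPadPos.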
GPUd() o2::tpc::PadPos GPUTPCCFDecodeZSLinkBase::GetPadAndRowFromFEC(processorType& clusterer, int32_t cru, int32_t rawFECChannel, int32_t fecInPartition)
{
#ifdef GPUCA_TPC_GEOMETRY_O2
  // Ported from tpc::Mapper (Not available on GPU...)
  constexpr GPUTPCGeometry geo;

  const int32_t regionIter = cru % 2;
  const int32_t istreamm = ((rawFECChannel % 10) / 2);
  const int32_t partitionStream = istreamm + regionIter * 5;
  const int32_t sampaOnFEC = geo.GetSampaMapping(partitionStream);
  const int32_t channel = (rawFECChannel % 2) + 2 * (rawFECChannel / 10);
  const int32_t channelOnSAMPA = channel + geo.GetChannelOffset(partitionStream);

  const int32_t partition = (cru % 10) / 2;
  const int32_t fecInSector = geo.GetSectorFECOffset(partition) + fecInPartition;

  const TPCZSLinkMapping* gpuMapping = clusterer.GetConstantMem()->calibObjects.tpcZSLinkMapping;
  assert(gpuMapping != nullptr);

  uint16_t globalSAMPAId = (static_cast<uint16_t>(fecInSector) << 8) + (static_cast<uint16_t>(sampaOnFEC) << 5) + static_cast<uint16_t>(channelOnSAMPA);
  const o2::tpc::PadPos pos = gpuMapping->FECIDToPadPos[globalSAMPAId];

  return pos;
#else
  return o2::tpc::PadPos{};
#endif
}

GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, float charge, PadPos padAndRow, TPCFragmentTime localTime, size_t positionOffset)
{
  const uint32_t sector = clusterer.mISector;
  CfChargePos* positions = clusterer.mPpositions;
#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
  if (padAndRow.getRow() >= GPUCA_ROW_COUNT) {
    positions[positionOffset] = INVALID_CHARGE_POS; // assumed: this line was dropped from the listing; the sample must still be flagged as invalid
    clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_ROW, clusterer.mISector * 1000 + padAndRow.getRow());
    return;
  }
#endif
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));

  CfChargePos pos(padAndRow.getRow(), padAndRow.getPad(), localTime);
  positions[positionOffset] = pos;

  charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());

  chargeMap[pos] = PackedCharge(charge);
}

GPUd() uint16_t GPUTPCCFDecodeZSLinkBase::FillWithInvalid(processorType& clusterer, int32_t iThread, int32_t nThreads, uint32_t pageDigitOffset, uint16_t nSamples)
{
  for (uint16_t i = iThread; i < nSamples; i += nThreads) {
    clusterer.mPpositions[pageDigitOffset + i] = INVALID_CHARGE_POS; // assumed: loop body was dropped from the listing; writes the invalid-position marker
  }
  return nSamples;
}

// ===========================================================================
// ===========================================================================
// Decode ZS Dense Link
// ===========================================================================
// ===========================================================================

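// Dense link-based ZS: time bin block headers are shared by all links of an
// endpoint, channel bitmasks are stored in a compressed two-level encoding, and
// the payload of the last time bin may extend onto the following ZS page.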
template <>
GPUd() void GPUTPCCFDecodeZSDenseLink::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t firstHBF, int32_t tpcTimeBinCut)
{
  Decode<GPUTPCCFDecodeZSDenseLink>(nBlocks, nThreads, iBlock, iThread, smem, clusterer, firstHBF, tpcTimeBinCut);
}

GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, DecodeCtx& ctx)
{
  const uint8_t* const pageStart = ctx.page;

  const auto* rawDataHeader = Peek<header::RAWDataHeader>(ctx.page);
  const auto* decHeader = Peek<TPCZSHDRV2>(ctx.page, raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); // the meta header sits at the end of the payload
  ConsumeHeader<header::RAWDataHeader>(ctx.page);

  uint16_t nSamplesWritten = 0;
  const uint16_t nSamplesInPage = decHeader->nADCsamples;

  const auto* payloadEnd = Peek(pageStart, raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2) - ((decHeader->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) ? TPCZSHDRV2::TRIGGER_WORD_SIZE : 0));
  const auto* nextPage = Peek(pageStart, TPCZSHDR::TPC_ZS_PAGE_SIZE);

  const bool extendsToNextPage = decHeader->flags & TPCZSHDRV2::ZSFlags::payloadExtendsToNextPage;

  ConsumeBytes(ctx.page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader));

  int err = GPUErrors::ERROR_NONE;

  if (decHeader->version < ZSVersionDenseLinkBased) {
    err = GPUErrors::ERROR_TPCZS_VERSION_MISMATCH;
  }

  if (decHeader->magicWord != TPCZSHDRV2::MAGIC_WORD) { // NOTE: condition reconstructed; the constant compared against decHeader->magicWord was lost from the listing
    err = GPUErrors::ERROR_TPCZS_INVALID_MAGIC_WORD;
  }

  for (uint16_t i = 0; i < decHeader->nTimebinHeaders && !err; i++) {

    ptrdiff_t sizeLeftInPage = payloadEnd - ctx.page;
    if (sizeLeftInPage <= 0) {
      err = GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW;
      break;
    }

    int16_t nSamplesWrittenTB = 0;
    uint16_t nSamplesLeftInPage = nSamplesInPage - nSamplesWritten;

    if (i == decHeader->nTimebinHeaders - 1 && extendsToNextPage) {
      if (raw::RDHUtils::getMemorySize(*rawDataHeader) != TPCZSHDR::TPC_ZS_PAGE_SIZE) {
        err = GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW;
        break;
      }

      if ((uint16_t)(raw::RDHUtils::getPageCounter(rawDataHeader) + 1) == raw::RDHUtils::getPageCounter(nextPage)) {
        nSamplesWrittenTB = DecodeTB<true>(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
      } else {
        err = GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF;
        break;
      }
    } else {
      nSamplesWrittenTB = DecodeTB<false>(smem, ctx, rawDataHeader, decHeader->cruID, nSamplesLeftInPage, payloadEnd, nextPage);
    }

    // Abort decoding the page if an error was detected.
    if (nSamplesWrittenTB < 0) {
      err = -nSamplesWrittenTB;
      break;
    }

    nSamplesWritten += nSamplesWrittenTB;
    ctx.pageDigitOffset += nSamplesWrittenTB;
  } // for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++)

  if (nSamplesWritten != nSamplesInPage) {
    if (nSamplesWritten < nSamplesInPage) {
      ctx.pageDigitOffset += FillWithInvalid(ctx.clusterer, ctx.iThread, ctx.nThreads, ctx.pageDigitOffset, nSamplesInPage - nSamplesWritten);
    }
    err = !err ? GPUErrors::ERROR_TPCZS_INVALID_NADC : err; // Ensure we don't overwrite any previous error
  }

  if (ctx.iThread == 0 && err) {
    [[maybe_unused]] bool dumpPage = false;

    if (err == GPUErrors::ERROR_TPCZS_VERSION_MISMATCH) {
      ctx.clusterer.raiseError(err, decHeader->version, ZSVersionDenseLinkBased);
    } else if (err == GPUErrors::ERROR_TPCZS_INVALID_MAGIC_WORD) {
      ctx.clusterer.raiseError(err, decHeader->magicWord);
    } else if (err == GPUErrors::ERROR_TPCZS_INCOMPLETE_HBF) {
      ctx.clusterer.raiseError(err, ctx.clusterer.mISector * 1000 + decHeader->cruID, raw::RDHUtils::getPageCounter(rawDataHeader), raw::RDHUtils::getPageCounter(nextPage));
    } else if (err == GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW) {
      ctx.clusterer.raiseError(err, extendsToNextPage);
      dumpPage = true;
    } else if (err == GPUErrors::ERROR_TPCZS_INVALID_NADC) {
      ctx.clusterer.raiseError(err, nSamplesInPage, nSamplesWritten, extendsToNextPage);
      dumpPage = true;
    } else {
      ctx.clusterer.raiseError(GPUErrors::ERROR_TPCZS_UNKNOWN, err);
    }

#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
#ifndef GPUCA_GPUCODE
    if (dumpPage) {
      // allocate more space on the stack for fname, so it can be overwritten by hand in a debugger.
      const char fname[64] = "dump00.bin";
      FILE* foo = fopen(fname, "w+b");
      fwrite(pageStart, 1, TPCZSHDR::TPC_ZS_PAGE_SIZE, foo);
      fclose(foo);
    }
#endif
#endif
  }

  return ctx.pageDigitOffset;
}

template <bool PayloadExtendsToNextPage>
GPUd() int16_t GPUTPCCFDecodeZSDenseLink::DecodeTB(
  GPUSharedMemory& smem,
  DecodeCtx& ctx,
  const header::RAWDataHeader* rawDataHeader,
  int32_t cru,
  uint16_t nSamplesLeftInPage,
  const uint8_t* payloadEnd,
  const uint8_t* nextPage)
{
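// MAYBE_PAGE_OVERFLOW advances the read cursor onto the next page (skipping its
// raw data header) once it has run past payloadEnd; PEEK_OVERFLOW reads one byte
// applying the same page-boundary wrap. When the payload does not extend to the
// next page, overrunning payloadEnd is reported as a page overflow instead.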
#define MAYBE_PAGE_OVERFLOW(pagePtr)                                 \
  if constexpr (PayloadExtendsToNextPage) {                          \
    if (pagePtr >= payloadEnd && pagePtr < nextPage) {               \
      ptrdiff_t diff = pagePtr - payloadEnd;                         \
      pagePtr = nextPage;                                            \
      ConsumeBytes(pagePtr, sizeof(header::RAWDataHeader) + diff);   \
    }                                                                \
  } else {                                                           \
    if (pagePtr > payloadEnd) {                                      \
      return -GPUErrors::ERROR_TPCZS_PAGE_OVERFLOW;                  \
    }                                                                \
  }

#define PEEK_OVERFLOW(pagePtr, offset)                                                      \
  (*(PayloadExtendsToNextPage && (pagePtr) < nextPage && (pagePtr) + (offset) >= payloadEnd \
       ? nextPage + sizeof(header::RAWDataHeader) + ((pagePtr) + (offset) - payloadEnd)     \
       : (pagePtr) + (offset)))

#define TEST_BIT(x, bit) static_cast<bool>((x) & (1 << (bit)))

  constexpr int32_t NTHREADS = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFDecodeZSDenseLink);
  static_assert(NTHREADS == GPUCA_WARP_SIZE, "Decoding TB Headers in parallel assumes block size is a single warp.");

  const CfFragment& fragment = ctx.clusterer.mPmemory->fragment;

  // Read timebin block header
  uint16_t tbbHdr = ConsumeByte(ctx.page);
  MAYBE_PAGE_OVERFLOW(ctx.page);
  tbbHdr |= static_cast<uint16_t>(ConsumeByte(ctx.page)) << CHAR_BIT;
  MAYBE_PAGE_OVERFLOW(ctx.page);

  uint8_t nLinksInTimebin = tbbHdr & 0x000F;
  uint16_t linkBC = (tbbHdr & 0xFFF0) >> 4;
  int32_t timeBin = (linkBC + (uint64_t)(raw::RDHUtils::getHeartBeatOrbit(*rawDataHeader) - ctx.firstHBF) * constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;

  int16_t nSamplesInTB = 0;

  // Read timebin link headers
  for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
    uint8_t timebinLinkHeaderStart = ConsumeByte(ctx.page);
    MAYBE_PAGE_OVERFLOW(ctx.page);

    if (ctx.iThread == 0) {
      smem.linkIds[iLink] = timebinLinkHeaderStart & 0b00011111;
    }
    bool bitmaskIsFlat = timebinLinkHeaderStart & 0b00100000;

    uint16_t bitmaskL2 = 0x03FF;
    if (not bitmaskIsFlat) {
      bitmaskL2 = static_cast<uint16_t>(timebinLinkHeaderStart & 0b11000000) << 2 | static_cast<uint16_t>(ConsumeByte(ctx.page));
      MAYBE_PAGE_OVERFLOW(ctx.page);
    }

    int32_t nBytesBitmask = CAMath::Popcount(bitmaskL2);

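    // Two-level bitmask: bit k of bitmaskL2 marks whether byte k of the 80-bit
    // channel mask is present in the stream. Each thread tests one of the 80
    // channels; a warp-wide predicate scan packs the indices of active channels
    // into smem.rawFECChannels in order.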
    for (int32_t chan = ctx.iThread; chan < CAMath::nextMultipleOf<NTHREADS>(80); chan += NTHREADS) {
      int32_t chanL2Idx = chan / 8;
      bool l2 = TEST_BIT(bitmaskL2, chanL2Idx);

      int32_t chanByteOffset = nBytesBitmask - 1 - CAMath::Popcount(bitmaskL2 >> (chanL2Idx + 1));

      uint8_t myChannelHasData = (chan < 80 && l2 ? TEST_BIT(PEEK_OVERFLOW(ctx.page, chanByteOffset), chan % 8) : 0);

      int32_t nSamplesStep;
      int32_t threadSampleOffset = CfUtils::warpPredicateScan(myChannelHasData, &nSamplesStep);

      if (myChannelHasData) {
        smem.rawFECChannels[nSamplesInTB + threadSampleOffset] = chan;
      }

      nSamplesInTB += nSamplesStep;
    }

    ConsumeBytes(ctx.page, nBytesBitmask);
    MAYBE_PAGE_OVERFLOW(ctx.page);

    if (ctx.iThread == 0) {
      smem.samplesPerLinkEnd[iLink] = nSamplesInTB;
    }

  } // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)

  GPUbarrierWarp(); // Ensure all writes to shared memory are finished, before reading it

  if (nSamplesInTB > nSamplesLeftInPage) {
    return -GPUErrors::ERROR_TPCZS_INVALID_NADC;
  }

  // This needs to happen BEFORE checking if the timebin is in fragment
  // to ensure ADC bytes are always consumed, even if data isn't decoded
  const uint8_t* adcData = ConsumeBytes(ctx.page, (nSamplesInTB * DECODE_BITS + 7) / 8);
  MAYBE_PAGE_OVERFLOW(ctx.page);

  bool discardTimeBin = not fragment.contains(timeBin);
  discardTimeBin |= (ctx.tpcTimeBinCut > 0 && timeBin > ctx.tpcTimeBinCut);

  if (discardTimeBin) {
    return FillWithInvalid(ctx.clusterer, ctx.iThread, NTHREADS, ctx.pageDigitOffset, nSamplesInTB);
  }

  // Unpack ADC
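  // With DECODE_BITS = 12 the in-byte offset of a sample is always 0 or 4
  // (12 * n mod 8 alternates between them), so the two bytes read below always
  // cover the full value: e.g. sample 3 starts at bit 36 = byte 4, bit 4, and
  // adc = ((b4 | b5 << 8) >> 4) & DECODE_MASK.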
  int32_t iLink = 0;
  for (uint16_t sample = ctx.iThread; sample < nSamplesInTB; sample += NTHREADS) {
    const uint16_t adcBitOffset = sample * DECODE_BITS;
    uint16_t adcByteOffset = adcBitOffset / CHAR_BIT;
    const uint8_t adcOffsetInByte = adcBitOffset - adcByteOffset * CHAR_BIT;

    static_assert(DECODE_BITS <= sizeof(uint16_t) * CHAR_BIT);

    uint16_t adc = 0;
    for (uint8_t bits = 0; bits < DECODE_BITS; bits += CHAR_BIT) {
      adc |= static_cast<uint16_t>(PEEK_OVERFLOW(adcData, adcByteOffset)) << bits;
      adcByteOffset++;
    }
    adc >>= adcOffsetInByte;

    // Find the link this sample belongs to
    while (smem.samplesPerLinkEnd[iLink] <= sample) {
      iLink++;
    }

    int32_t rawFECChannelLink = smem.rawFECChannels[sample];

    // Unpack data for cluster finder
    o2::tpc::PadPos padAndRow = GetPadAndRowFromFEC(ctx.clusterer, cru, rawFECChannelLink, smem.linkIds[iLink]);

    float charge = ADCToFloat(adc, DECODE_MASK, DECODE_BITS_FACTOR);
    WriteCharge(ctx.clusterer, charge, padAndRow, fragment.toLocal(timeBin), ctx.pageDigitOffset + sample);

  } // for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS)

  GPUbarrierWarp(); // Ensure all reads to shared memory are finished, before decoding next header into shmem

  return nSamplesInTB;

#undef TEST_BIT
#undef PEEK_OVERFLOW
#undef MAYBE_PAGE_OVERFLOW
}

GPUd() bool GPUTPCCFDecodeZSDenseLink::ChannelIsActive(const uint8_t* chan, uint16_t chanIndex)
{
  constexpr uint8_t N_BITS_PER_ENTRY = sizeof(*chan) * CHAR_BIT;
  const uint8_t entryIndex = chanIndex / N_BITS_PER_ENTRY;
  const uint8_t bitInEntry = chanIndex % N_BITS_PER_ENTRY;
  return chan[entryIndex] & (1 << bitInEntry);
}