GPUChainTrackingClusterizer.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "GPUChainTracking.h"
17#include "GPULogging.h"
18#include "GPUO2DataTypes.h"
21#include "GPUNewCalibValues.h"
22#include "GPUConstantMem.h"
23#include "CfChargePos.h"
24#include "CfArray2D.h"
25#include "GPUGeneralKernels.h"
26#include "GPUDefParametersRuntime.h"
29#include "GPUTPCCFDecodeZS.h"
31#include "GPUTPCCFPeakFinder.h"
34#include "GPUTPCCFClusterizer.h"
35#include "GPUTPCCFGather.h"
37#include "GPUTriggerOutputs.h"
38#include "GPUHostDataTypes.h"
44#include "TPCBase/RDHUtils.h"
45#include "GPULogging.h"
46
47#ifdef GPUCA_HAS_ONNX
50#endif
51
52#ifdef GPUCA_O2_LIB
54#endif
55
56#include "utils/strtag.h"
57#include <fstream>
58
59#ifndef GPUCA_NO_VC
60#include <Vc/Vc>
61#endif
62
63using namespace o2::gpu;
64using namespace o2::tpc;
65using namespace o2::tpc::constants;
66using namespace o2::dataformats;
67
68#ifdef GPUCA_TPC_GEOMETRY_O2
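// Per-fragment bookkeeping helper: refreshes the per-endpoint min/max ZS page ranges cached by the
// page scan for this fragment and, in the GPU path, rebuilds the per-page ZSOffset table
// (clusterer.mPzsOffsets) consumed by the decoding kernels.
// Returns {number of ADC samples, number of pages} selected for the fragment.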
69std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment)
70{
71 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
72 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
73 GPUTPCClusterFinder::ZSOffset* o = clusterer.mPzsOffsets;
74 uint32_t digits = 0;
75 uint32_t pages = 0;
76 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
77 clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j];
78 if (doGPU) {
79 uint16_t posInEndpoint = 0;
80 uint16_t pagesEndpoint = 0;
81 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
82 const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
83 const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
84 for (uint32_t l = pageFirst; l < pageLast; l++) {
85 uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++];
86 if (pageDigits) {
87 *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint};
88 digits += pageDigits;
89 }
90 pagesEndpoint++;
91 }
92 }
93 if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) {
94 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
95 GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
96 return {0, 0};
97 } else {
98 GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
99 }
100 }
101 } else {
103 digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j];
104 pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j];
105 }
106 }
107 if (doGPU) {
108 pages = o - processors()->tpcClusterer[iSector].mPzsOffsets;
109 }
110 if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
111 TPCClusterizerEnsureZSOffsets(iSector, fragment);
112 }
113 return {digits, pages};
114}
115
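// Debug-only cross-check (see the debugLevel >= 4 call above): re-walks the raw ZS pages of each
// endpoint, recounts pages and ADC samples from the RDH / TPCZSHDRV2 headers, and compares them with
// the counts and offsets accumulated during the scan. Any mismatch is treated as fatal.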
116void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment)
117{
118 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
119 uint32_t nAdcs = 0;
120 for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) {
121 const auto& data = mCFContext->fragmentData[fragment.index];
122 uint32_t pagesEndpoint = 0;
123 const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint];
124 const uint32_t nPagesExpected = data.nPages[iSector][endpoint];
125
126 uint32_t nAdcDecoded = 0;
127 const auto& zs = mIOPtrs.tpcZS->sector[iSector];
128 for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) {
129 const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0;
130 const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i];
131 for (uint32_t j = pageFirst; j < pageLast; j++) {
132 const uint8_t* page = static_cast<const uint8_t*>(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE;
133 const header::RAWDataHeader* rawDataHeader = reinterpret_cast<const header::RAWDataHeader*>(page);
134 const TPCZSHDRV2* decHdr = reinterpret_cast<const TPCZSHDRV2*>(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2));
135 const uint16_t nSamplesInPage = decHdr->nADCsamples;
136
137 nAdcDecoded += nSamplesInPage;
138 pagesEndpoint++;
139 }
140 }
141
142 if (pagesEndpoint != nPagesExpected) {
143 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %u", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected);
144 }
145
146 if (nAdcDecoded != nAdcsExpected) {
147 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded);
148 }
149
150 if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) {
151 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset);
152 }
153
154 nAdcs += nAdcsExpected;
155 }
156}
157
158namespace
159{
160struct TPCCFDecodeScanTmp {
161 int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter;
162};
163} // namespace
164
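// Full page scan for one sector: walks all endpoints and 8kB ZS pages, validates the ZS version,
// extracts trigger words (dense-link format), tracks the maximum time bin, and assigns each page to
// the time fragments it overlaps. The per-fragment page/digit counts filled here drive the later
// buffer allocation and the per-fragment decoding.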
165std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment)
166{
167 mRec->getGeneralStepTimer(GeneralStep::Prepare).Start();
168 uint32_t nDigits = 0;
169 uint32_t nPages = 0;
170 uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS];
171 memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples));
173 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
174
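// firstHBF anchors the conversion from heartbeat orbit to TPC time bin used below:
//   timeBin = (hdr->timeOffset + (orbit - firstHBF) * LHCMaxBunches) / LHCBCPERTIMEBIN
// so all fragments are expressed relative to the start of the timeframe.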
175 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
176#ifndef GPUCA_NO_VC
177 if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
178 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
179 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
180 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
181 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
182 }
183 }
184 }
185#endif
186
187 std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
188 fragments.reserve(mCFContext->nFragments);
189 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragment, {0, 0, 0, 0, 0, -1}});
190 for (uint32_t i = 1; i < mCFContext->nFragments; i++) {
191 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}});
192 }
193 std::vector<bool> fragmentExtends(mCFContext->nFragments, false);
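// One scan state per fragment: each CfFragment is paired with a TPCCFDecodeScanTmp accumulator
// holding the first/last ZS pointer and page seen for this endpoint, whether the fragment has data,
// and the page counter at the last hit. fragmentExtends marks fragments whose last page spills over
// into the following 8kB page (dense-link format only).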
194
195 uint32_t firstPossibleFragment = 0;
196 uint32_t pageCounter = 0;
197 uint32_t emptyPages = 0;
198 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
199 if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) {
200 break;
201 }
202 nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
203 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
204#ifndef GPUCA_NO_VC
205 if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
206 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
207 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
208 }
209#endif
210 const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
211 const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page;
212 if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
213 emptyPages++;
214 continue;
215 }
216 pageCounter++;
217 const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader)));
218 if (mCFContext->zsVersion == -1) {
219 mCFContext->zsVersion = hdr->version;
220 if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask
221 static bool errorShown = false;
222 if (errorShown == false) {
223 GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion);
224 }
225 errorShown = true;
226 }
227 } else if (mCFContext->zsVersion != (int32_t)hdr->version) {
228 GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh));
229 constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1;
230 char dumpBuffer[bufferSize];
231 for (size_t i = 0; i < sizeof(*rdh); i++) {
232 // "%02X " guaranteed to be 3 chars + ending 0.
233 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]);
234 }
235 GPUAlarm("RDH of page: %s", dumpBuffer);
236 for (size_t i = 0; i < sizeof(*hdr); i++) {
237 // "%02X " guaranteed to be 3 chars + ending 0.
238 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]);
239 }
240 GPUAlarm("Metainfo of page: %s", dumpBuffer);
241 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
242 mCFContext->abandonTimeframe = true;
243 return {0, 0};
244 } else {
245 GPUFatal("Cannot process with invalid TPC ZS data, exiting");
246 }
247 }
248 if (GetProcessingSettings().param.tpcTriggerHandling) {
249 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
250 if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) {
251 const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE;
252 o2::tpc::TriggerInfoDLBZS tmp; // trigger record type matching mTriggerBuffer->triggers
253 memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE);
254 tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh);
255 if (tmp.triggerWord.isValid(0)) {
256 mTriggerBuffer->triggers.emplace(tmp);
257 }
258 }
259 }
260 nDigits += hdr->nADCsamples;
261 endpointAdcSamples[j] += hdr->nADCsamples;
262 uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;
263 uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan;
264 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
265 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
266 if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) {
267 maxTimeBin += 256;
268 }
269 }
270 if (maxTimeBin > mCFContext->tpcMaxTimeBin) {
271 mCFContext->tpcMaxTimeBin = maxTimeBin;
272 }
273 bool extendsInNextPage = false;
274 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
275 if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) {
276 const o2::header::RAWDataHeader* nextrdh = (const o2::header::RAWDataHeader*)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE);
277 extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader);
278 }
279 }
280 while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) {
281 firstPossibleFragment--;
282 }
283 auto handleExtends = [&](uint32_t ff) {
284 if (fragmentExtends[ff]) {
285 if (doGPU) {
286 // Only add extended page on GPU. On CPU the pages are in consecutive memory anyway.
287 // Not adding the page prevents an issue where a page is decoded twice on CPU, when only the extend should be decoded.
288 fragments[ff].second.zsPageLast++;
289 mCFContext->fragmentData[ff].nPages[iSector][j]++;
290 mCFContext->fragmentData[ff].pageDigits[iSector][j].emplace_back(0);
291 }
292 fragmentExtends[ff] = false;
293 }
294 };
295 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
296 for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) {
297 handleExtends(ff);
298 }
299 }
300 for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) {
301 if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) {
302 if (!fragments[f].second.hasData) {
303 fragments[f].second.hasData = 1;
304 fragments[f].second.zsPtrFirst = k;
305 fragments[f].second.zsPageFirst = l;
306 } else {
307 if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) {
308 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1;
309 for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) {
310 for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) {
311 if (doGPU) {
312 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
313 } else {
314 // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters
315 const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE;
316 const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp;
317 if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) {
318 const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader)));
319 mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples;
320 }
321 }
322 }
323 }
324 } else if (emptyPages) {
325 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages;
326 if (doGPU) {
327 for (uint32_t m = 0; m < emptyPages; m++) {
328 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
329 }
330 }
331 }
332 }
333 fragments[f].second.zsPtrLast = k + 1;
334 fragments[f].second.zsPageLast = l + 1;
335 fragments[f].second.pageCounter = pageCounter;
336 mCFContext->fragmentData[f].nPages[iSector][j]++;
337 mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples;
338 if (doGPU) {
339 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples);
340 }
341 fragmentExtends[f] = extendsInNextPage;
342 } else {
343 handleExtends(f);
344 if (timeBin < (uint32_t)fragments[f].first.last()) {
345 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
346 for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) {
347 handleExtends(ff);
348 }
349 }
350 break;
351 } else {
352 firstPossibleFragment = f + 1;
353 }
354 }
355 }
356 emptyPages = 0;
357 }
358 }
359 for (uint32_t f = 0; f < mCFContext->nFragments; f++) {
360 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast;
361 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst;
362 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast;
363 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageFirst = fragments[f].second.zsPageFirst;
364 }
365 }
366 mCFContext->nPagesTotal += nPages;
367 mCFContext->nPagesSector[iSector] = nPages;
368
369 mCFContext->nDigitsEndpointMax[iSector] = 0;
370 for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) {
371 if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) {
372 mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i];
373 }
374 }
375 uint32_t nDigitsFragmentMax = 0;
376 for (uint32_t i = 0; i < mCFContext->nFragments; i++) {
377 uint32_t pagesInFragment = 0;
378 uint32_t digitsInFragment = 0;
379 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
380 pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j];
381 digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j];
382 }
383 mCFContext->nPagesFragmentMax = std::max(mCFContext->nPagesFragmentMax, pagesInFragment);
384 nDigitsFragmentMax = std::max(nDigitsFragmentMax, digitsInFragment);
385 }
386 mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
387 return {nDigits, nDigitsFragmentMax};
388}
389
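// Stream compaction of peak candidates. On GPU this is a multi-level parallel prefix sum
// (scanStart / scanUp / scanTop / scanDown) followed by compactDigits; on CPU it degenerates to a
// simple filter copy over the mPisPeak flags. stage selects peaks (0) or filtered peaks (1).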
390void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane)
391{
392 auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions;
393 auto& out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions;
394 if (doGPU) {
395 const uint32_t iSector = clusterer.mISector;
396 auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
397
398 std::vector<size_t> counts;
399
400 uint32_t nSteps = clusterer.getNSteps(count);
401 if (nSteps > clusterer.mNBufs) {
402 GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs);
403 exit(1);
404 }
405
406 int32_t scanWorkgroupSize = mRec->getGPUParameters(doGPU).par_CF_SCAN_WORKGROUP_SIZE;
407 size_t tmpCount = count;
408 if (nSteps > 1) {
409 for (uint32_t i = 1; i < nSteps; i++) {
410 counts.push_back(tmpCount);
411 if (i == 1) {
412 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, stage);
413 } else {
414 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, tmpCount);
415 }
416 tmpCount = (tmpCount + scanWorkgroupSize - 1) / scanWorkgroupSize;
417 }
418
419 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, nSteps, tmpCount);
420
421 for (uint32_t i = nSteps - 1; i > 1; i--) {
422 tmpCount = counts[i - 1];
423 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({GetGrid(tmpCount - scanWorkgroupSize, scanWorkgroupSize, lane), {iSector}}, i, scanWorkgroupSize, tmpCount);
424 }
425 }
426
427 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({GetGrid(count, scanWorkgroupSize, lane), {iSector}}, 1, stage, in, out);
428 } else {
429 auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks;
430 auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
431 size_t count = 0;
432 for (size_t i = 0; i < nIn; i++) {
433 if (clusterer.mPisPeak[i]) {
434 out[count++] = in[i];
435 }
436 }
437 nOut = count;
438 }
439}
440
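// Copies only the ZS page ranges selected for the given fragment to the GPU (one contiguous memcpy
// per endpoint range) and points the shadow ZS metadata (mPzsMeta / mPzsPtrs / mPzsSizes) at the
// device buffers, so the decoding kernels see a compact page list.
// Returns the {digits, pages} pair from TPCClusterizerDecodeZSCountUpdate.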
441std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane)
442{
443 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
444 if (mCFContext->abandonTimeframe) {
445 return {0, 0};
446 }
447 const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
448 if (doGPU) {
449 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
450 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
451 uint32_t nPagesSector = 0;
452 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
453 uint32_t nPages = 0;
454 mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
455 mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
456 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
457 const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
458 const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
459 if (max > min) {
460 char* src = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE;
461 char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
462 size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast);
463 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true);
464 }
465 nPages += max - min;
466 }
467 mInputsHost->mPzsMeta->sector[iSector].nZSPtr[j] = &mInputsShadow->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
468 mInputsHost->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = nPages;
469 mInputsHost->mPzsMeta->sector[iSector].count[j] = 1;
470 nPagesSector += nPages;
471 }
472 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzsOffsets, clusterer.mPzsOffsets, clusterer.mNMaxPages * sizeof(*clusterer.mPzsOffsets), lane, true);
473 }
474 return retVal;
475}
476
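// Prepares the clusterizer pass: with restorePointers it only re-attaches the buffers saved by the
// other pipeline instance; otherwise it scans the ZS input of every sector
// (TPCClusterizerDecodeZSCount), derives the fragment layout and maximum time bin, sizes the digit
// buffers, and pre-stages the first fragment transfer for each lane.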
477int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
478{
479 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
480 if (restorePointers) {
481 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
482 processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost;
483 processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice;
484 processorsShadow()->tpcClusterer[iSector].mPzs = mCFContext->ptrSave[iSector].zsDevice;
485 }
486 processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave;
487 return 0;
488 }
489 const auto& threadContext = GetThreadContext();
491 if (mCFContext == nullptr) {
492 mCFContext = std::make_unique<GPUTPCCFChainContext>();
493 }
494 const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
495 const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED;
496 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
497 const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen};
498 mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax);
499 if (GetProcessingSettings().param.tpcTriggerHandling) {
500 mTriggerBuffer->triggers.clear();
501 }
502 if (mIOPtrs.tpcZS) {
503 uint32_t nDigitsFragmentMax[NSECTORS];
504 mCFContext->zsVersion = -1;
505 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
506 if (mIOPtrs.tpcZS->sector[iSector].count[0]) {
507 const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0];
508 if (rdh && o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeaderV6>() > o2::raw::RDHUtils::getVersion(rdh)) {
509 GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeader>());
510 return 1;
511 }
512 }
513#ifndef GPUCA_NO_VC
514 if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
515 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
516 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
517 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
518 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
519 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
520 }
521 }
522 }
523 }
524#endif
525 const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
526 nDigitsFragmentMax[iSector] = x.first;
527 processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first;
528 mRec->MemoryScalers()->nTPCdigits += x.first;
529 }
530 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
531 uint32_t nDigitsBase = nDigitsFragmentMax[iSector];
532 uint32_t threshold = 40000000;
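// Heuristic head-room for the digit buffers: below the 40M threshold the allocation is grown to at
// most 2x the counted digits, capped at the midpoint towards the threshold; above it the count is
// used as-is.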
533 uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);
534 processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
535 if (doGPU) {
536 processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
537 }
538 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
539 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec);
540 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec);
541 } else {
542 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId);
543 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId);
544 }
545 }
546 } else {
547 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
548 uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector];
549 mRec->MemoryScalers()->nTPCdigits += nDigits;
550 processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0);
551 }
552 }
553
554 if (mIOPtrs.tpcZS) {
555 GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits);
556 } else {
557 GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits);
558 }
559
560 if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) {
561 GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin);
562 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
563 mCFContext->abandonTimeframe = true;
564 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
565 } else {
566 return 1;
567 }
568 }
569
570 mCFContext->fragmentFirst = CfFragment{std::max<int32_t>(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen};
571 for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) {
572 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
573 mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector);
574 }
575 }
576
577 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
578 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
579 mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets;
580 mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets;
581 mCFContext->ptrSave[iSector].zsDevice = processorsShadow()->tpcClusterer[iSector].mPzs;
582 }
583 }
584 return 0;
585}
586#endif
587
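// Main clusterizer driver: after RunTPCClusterizer_prepare, sectors are processed in groups of
// nTPCClustererLanes. For every time fragment each lane decodes ZS pages (or fills digits), builds
// the charge map, finds and filters peaks, and runs either the default GPUTPCCFClusterizer or the
// ONNX-based NN clusterizer, before the per-sector results are gathered into the ClusterNative
// output.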
588int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
589{
590 if (param().rec.fwdTPCDigitsAsClusters) {
591 return ForwardTPCDigits();
592 }
593#ifdef GPUCA_TPC_GEOMETRY_O2
594 int32_t tpcTimeBinCut = mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin;
596 const auto& threadContext = GetThreadContext();
597 const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
598 if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
599 return 1;
600 }
601 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
603 }
604
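// For sparse timeframes the cluster buffers are enlarged (up to 3.5x, bounded by a threshold that
// scales with nHBFPerTF) so that noisy pads cannot overflow the output; the same factor later scales
// the ClusterNative allocation.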
606 float tpcHitLowOccupancyScalingFactor = 1.f;
607 if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) {
608 uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits;
609 uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF;
610 if (mIOPtrs.settingsTF->nHBFPerTF < 64) {
611 threshold *= 2;
612 }
613 mRec->MemoryScalers()->nTPCHits = std::max<uint32_t>(nHitsBase, std::min<uint32_t>(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating excessive clusters
614 if (nHitsBase < threshold) {
615 float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 2.25f : 1.75f);
616 mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase);
617 tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
618 }
619 }
620 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
621 processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes
622 }
623 mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers
624 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
625 SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate
626 }
627 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
628 RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
629 }
630
631 if (doGPU && mIOPtrs.tpcZS) {
633 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
634 }
635 if (doGPU) {
636 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
637 }
638
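// Optional ONNX-based NN clusterizer: one GPUTPCNNClusterizerHost per lane initializes up to three
// models (classification plus one or two regression heads), binds their ONNX sessions to the lane's
// GPU stream, and wires the per-sector GPUTPCNNClusterizer processors used further down.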
639#ifdef GPUCA_HAS_ONNX
640 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
641 GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
642
643 if (GetProcessingSettings().nn.applyNNclusterizer) {
644 int32_t deviceId = -1;
645 int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
646 int32_t maxThreads = mRec->getNKernelHostThreads(true);
647 // bool recreateMemoryAllocator = false;
648 mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
649 nnApplications[lane].init(nn_settings);
650 if (nnApplications[lane].mModelsUsed[0]) {
651 SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
652 (nnApplications[lane].mModelClass).setDeviceId(deviceId);
653 if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
654 nnApplications[lane].mModelClass.setIntraOpNumThreads(maxThreads);
655 }
656 (nnApplications[lane].mModelClass).initEnvironment();
657 // Registering this once seems to be enough, even with different environments / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation.
658 // But environment must be valid, so we init the model environment first and use it here afterwards.
659 // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
660 // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
661 // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
662 // recreateMemoryAllocator = true;
663 (nnApplications[lane].mModelClass).initSession();
664 }
665 if (nnApplications[lane].mModelsUsed[1]) {
666 SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
667 (nnApplications[lane].mModelReg1).setDeviceId(deviceId);
668 if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
669 nnApplications[lane].mModelReg1.setIntraOpNumThreads(maxThreads);
670 }
671 // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
672 (nnApplications[lane].mModelReg1).initEnvironment();
673 // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
674 (nnApplications[lane].mModelReg1).initSession();
675 }
676 if (nnApplications[lane].mModelsUsed[2]) {
677 SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
678 (nnApplications[lane].mModelReg2).setDeviceId(deviceId);
679 if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
680 nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
681 }
682 (nnApplications[lane].mModelReg2).initEnvironment();
683 // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
684 (nnApplications[lane].mModelReg2).initSession();
685 }
686 if (nn_settings.nnClusterizerVerbosity < 3) {
687 LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
688 }
689 });
690 mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) {
691 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
692 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
693 int32_t lane = sector % numLanes;
694 clustererNN.mDeviceId = deviceId;
695 clustererNN.mISector = sector;
697 nnApplications[lane].initClusterizer(nn_settings, clustererNN);
698 if (doGPU) {
699 clustererNNShadow.mDeviceId = deviceId;
700 clustererNNShadow.mISector = sector;
702 nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
703 }
705 });
706 if (doGPU) {
707 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
708 }
709 LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
710 LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
711 }
712#endif
713
714 size_t nClsTotal = 0;
715 ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get();
716 ClusterNative* tmpNativeClusters = nullptr;
717 std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
718
719 // setup MC Labels
720 bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
721
722 auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
723
724 bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
725 bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
726
727 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
728 if (buildNativeGPU) {
729 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
730 }
731 if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) {
732 if (mWaitForFinalInputs) {
733 GPUFatal("Cannot use waitForFinalInput callback without delayed output");
734 }
735 if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
737 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
738 } else {
739 tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
740 tmpNativeClusters = tmpNativeClusterBuffer.get();
741 }
742 }
743
744 GPUTPCLinearLabels mcLinearLabels;
745 if (propagateMCLabels) {
746 // No need to overallocate here, nTPCHits is anyway an upper bound used for the GPU cluster buffer, and we can always enlarge the buffer anyway
747 mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2);
748 mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits);
749 }
750
751 int8_t transferRunning[NSECTORS] = {0};
752 uint32_t outputQueueStart = mOutputQueue.size();
753
754 auto notifyForeignChainFinished = [this]() {
755 if (mPipelineNotifyCtx) {
756 SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory
757 {
758 std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
759 mPipelineNotifyCtx->ready = true;
760 }
761 mPipelineNotifyCtx->cond.notify_one();
762 }
763 };
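// notifyForeignChainFinished signals the other pipeline instance once the output stream has been
// synchronized. The loop below processes the sectors in groups of nTPCClustererLanes, overlapping
// the ZS transfer of the next fragment with the kernels of the current one.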
764 bool synchronizeCalibUpdate = false;
765
766 for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) {
767 std::vector<bool> laneHasData(GetProcessingSettings().nTPCClustererLanes, false);
768 static_assert(NSECTORS <= GPUCA_MAX_STREAMS, "Stream events must be able to hold all sectors");
769 const int32_t maxLane = std::min<int32_t>(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase);
770 for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
771 if (GetProcessingSettings().debugLevel >= 3) {
772 GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
773 }
774 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
775 if (doGPU && fragment.index != 0) {
776 SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished
777 }
778
779 uint32_t iSector = iSectorBase + lane;
780 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
781 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
782 clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0;
783 clusterer.mPmemory->fragment = fragment;
784
786 bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS;
787 bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
788 auto* inDigits = mIOPtrs.tpcPackedDigits;
789 size_t numDigits = inDigits->nTPCDigits[iSector];
790 if (setDigitsOnGPU) {
791 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
792 }
793 if (setDigitsOnHost) {
794 clusterer.mPdigits = const_cast<o2::tpc::Digit*>(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast)
795 }
796 clusterer.mPmemory->counters.nDigits = numDigits;
797 }
798
799 if (mIOPtrs.tpcZS) {
800 if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
801 clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first;
802 clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second;
803 } else {
804 clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0;
805 }
806 }
807 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
808
809 using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
810 using PeakMapType = decltype(*clustererShadow.mPpeakMap);
811 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout<ChargeMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType));
812 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout<PeakMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType));
813 if (fragment.index == 0) {
814 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy));
815 }
816 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges");
817
818 if (doGPU) {
819 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
820 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane);
821 SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane);
822 }
823 SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory
824 }
825
826 if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) {
827 clusterer.mPmemory->counters.nPositions = 0;
828 return;
829 }
830 if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) {
831 clusterer.mPmemory->counters.nPositions = 0;
832 return;
833 }
834
835 if (propagateMCLabels && fragment.index == 0) {
836 clusterer.PrepareMC();
837 clusterer.mPinputLabels = digitsMC->v[iSector];
838 if (clusterer.mPinputLabels == nullptr) {
839 GPUFatal("MC label container missing, sector %d", iSector);
840 }
841 if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) {
842 GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize());
843 }
844 }
845
846 if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) {
847 if (not mIOPtrs.tpcZS) {
848 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
849 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
850 } else if (propagateMCLabels) {
851 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr);
852 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
853 }
854 }
855
856 if (mIOPtrs.tpcZS) {
857 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
858 uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS;
859
860 (void)tpcTimeBinCut; // TODO: To be used in decoding kernels
861 switch (mCFContext->zsVersion) {
862 default:
863 GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
864 break;
865 case ZSVersion::ZSVersionRowBased10BitADC:
866 case ZSVersion::ZSVersionRowBased12BitADC:
867 runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
868 break;
869 case ZSVersion::ZSVersionLinkBasedAligned:
870 runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
871 break;
872 case ZSVersion::ZSVersionDenseLinkBased:
873 runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
874 break;
875 }
876 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
877 } // clang-format off
878 });
879 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
880 uint32_t iSector = iSectorBase + lane;
881 if (doGPU) {
882 SynchronizeStream(lane);
883 }
884 if (mIOPtrs.tpcZS) {
885 CfFragment f = fragment.next();
886 int32_t nextSector = iSector;
887 if (f.isEnd()) {
888 nextSector += GetProcessingSettings().nTPCClustererLanes;
889 f = mCFContext->fragmentFirst;
890 }
891 if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
892 mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane);
893 }
894 }
895 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
896 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
897 if (clusterer.mPmemory->counters.nPositions == 0) {
898 return;
899 }
900 if (!mIOPtrs.tpcZS) {
901 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
902 }
903 if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) {
904 clusterer.DumpChargeMap(*mDebugFile, "Charges");
905 }
906
907 if (propagateMCLabels) {
908 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillIndexMap>({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
909 }
910
911 bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
912 checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
913 checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
914
915 if (checkForNoisyPads) {
916 int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
917
918 runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
919 }
920
921 runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
922 if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) {
923 clusterer.DumpPeakMap(*mDebugFile, "Peaks");
924 }
925
926 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
927 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
928 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off
929 });
930 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
931 uint32_t iSector = iSectorBase + lane;
932 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
933 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
934 if (doGPU) {
935 SynchronizeStream(lane);
936 }
937 if (clusterer.mPmemory->counters.nPeaks == 0) {
938 return;
939 }
940 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
941 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
942 if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
943 clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
944 }
945
946 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
947 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
948 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); // clang-format off
949 });
950 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
951 uint32_t iSector = iSectorBase + lane;
952 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
953 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
954
955 if (doGPU) {
956 SynchronizeStream(lane);
957 }
958
959 if (fragment.index == 0) {
960 deviceEvent* waitEvent = nullptr;
961 if (transferRunning[lane] == 1) {
962 waitEvent = &mEvents->stream[lane];
963 transferRunning[lane] = 2;
964 }
965 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
966 }
967
968 if (clusterer.mPmemory->counters.nClusters == 0) {
969 return;
970 }
971
972 if (GetProcessingSettings().nn.applyNNclusterizer) {
973#ifdef GPUCA_HAS_ONNX
974 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane];
975 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
976 GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
977
978 int withMC = (doGPU && propagateMCLabels);
979
980 if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
981 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
982 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
983 }
984
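// The NN models are evaluated in batches of mNnClusterizerBatchedMode cluster candidates: a kernel
// fills the input tensor, the classification (and optionally regression) networks run on the batch,
// and publishing kernels convert the outputs back into native clusters.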
985 // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
986 for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
987 uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
988 size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
989
990 // auto start0 = std::chrono::high_resolution_clock::now();
991 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data
992
993 // auto stop0 = std::chrono::high_resolution_clock::now();
994 // auto start1 = std::chrono::high_resolution_clock::now();
995
996 // NN evaluations
997 if (clustererNNShadow.mNnInferenceInputDType == 0) {
998 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
999 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
1000 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1001 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
1002 }
1003 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1004 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1005 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
1006 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1007 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
1008 }
1009 }
1010 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1011 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1012 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1013 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
1014 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1015 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
1016 }
1017 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1018 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1019 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
1020 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1021 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
1022 }
1023 }
1024 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1025 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1026 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1027 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
1028 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1029 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
1030 }
1031 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1032 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1033 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
1034 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1035 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
1036 }
1037 }
1038 }
1039 }
1040
1041 // auto stopNNs = std::chrono::high_resolution_clock::now();
1042
1043 // Publishing kernels
1044 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
1045 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels
1046 } else {
1047 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels
1048 }
1049 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1050 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results
1051 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1052 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
1053 }
1054 }
1055
1056 // for(int i = 0; i < iSize; ++i) {
1057 // if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) {
1058 // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.mModelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.mOutputDataClass[i + batchStart] << " thresh " << clustererNNShadow.mNnClassThreshold << " instead of 0 or 1. Please check the model and the input data.";
1059 // // std::string input = "[";
1060 // // for(int j = 0; j < clustererNNShadow.mNnClusterizerElementSize; j++){
1061 // // input += std::to_string(clustererNNShadow.mInputData_16[i * clustererNNShadow.mNnClusterizerElementSize + j].ToFloat()) + ", ";
1062 // // }
1063 // // input += "]";
1064 // // LOG(info) << "Input is: " << input;
1065 // }
1066 // }
1067
1068 // auto stop1 = std::chrono::high_resolution_clock::now();
1069
1070 // time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
1071 // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
1072 // time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
1073 }
1074 if (clustererNNShadow.mNnClusterizerUseCfRegression) {
1075 // auto start1 = std::chrono::high_resolution_clock::now();
1076 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1077 // auto stop1 = std::chrono::high_resolution_clock::now();
1078 // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
1079 }
1080 // if (clustererNNShadow.mNnClusterizerVerbosity < 3) {
1081 // int acceptedClusters = 0;
1082 // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
1083 // if(clustererNNShadow.mOutputDataClass[i] > 1 || clustererNNShadow.mOutputDataClass[i] < 0) {
1084 // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.mOutputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters;
1085 // }
1086 // acceptedClusters += clustererNNShadow.mOutputDataClass[i];
1087 // }
1088 // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
1089 // }
1090#else
1091 GPUFatal("Project not compiled with neural network clusterization. Aborting.");
1092#endif
1093 } else {
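      // Standard CF path without the neural-network clusterizer: deconvolute overlapping charges, then run the native clusterizer kernel.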
1094 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
1095 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
1096 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
1097 }
1098
1099 if (doGPU && propagateMCLabels) {
1100 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
1101 if (doGPU) {
1102 SynchronizeStream(lane);
1103 }
1104 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
1105 }
1106
1107 if (GetProcessingSettings().debugLevel >= 3) {
1108 GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSector, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
1109 }
1110
1111 TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane);
1112 laneHasData[lane] = true;
1113 // Include clusters in default debug mask, exclude other debug output by default
1114 DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
1115 });
1117 }
1118
1119 size_t nClsFirst = nClsTotal;
1120 bool anyLaneHasData = false;
1121 for (int32_t lane = 0; lane < maxLane; lane++) {
1122 uint32_t iSector = iSectorBase + lane;
1123 std::fill(&tmpNativeAccess->nClusters[iSector][0], &tmpNativeAccess->nClusters[iSector][0] + MAXGLOBALPADROW, 0);
1124 if (doGPU) {
1125 SynchronizeStream(lane);
1126 }
1127 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
1128 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1129
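    // Gather this lane's per-row clusters into the contiguous ClusterNative output, either via the dedicated gather kernel or with per-row copies.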
1130 if (laneHasData[lane]) {
1131 anyLaneHasData = true;
1132 if (buildNativeGPU && GetProcessingSettings().tpccfGatherKernel) {
1133 runKernel<GPUTPCCFGather>({GetGridBlk(GPUCA_ROW_COUNT, mRec->NStreams() - 1), {iSector}}, &mInputsShadow->mPclusterNativeBuffer[nClsTotal]);
1134 }
1135 for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
1136 if (nClsTotal + clusterer.mPclusterInRow[j] > mInputsHost->mNClusterNative) {
1137 clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
1138 continue;
1139 }
1140 if (buildNativeGPU) {
1141 if (!GetProcessingSettings().tpccfGatherKernel) {
1142 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&mInputsShadow->mPclusterNativeBuffer[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, -2);
1143 }
1144 } else if (buildNativeHost) {
1145 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, false);
1146 }
1147 tmpNativeAccess->nClusters[iSector][j] += clusterer.mPclusterInRow[j];
1148 nClsTotal += clusterer.mPclusterInRow[j];
1149 }
1150 if (transferRunning[lane]) {
1151 ReleaseEvent(mEvents->stream[lane], doGPU);
1152 }
1153 RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1);
1154 transferRunning[lane] = 1;
1155 }
1156
1157 if (not propagateMCLabels || not laneHasData[lane]) {
1158 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1159 continue;
1160 }
1161
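    // Flatten this sector's per-cluster MC labels into the linear label container using the CPU flattener kernels.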
1162 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::setRowOffsets>({GetGrid(GPUCA_ROW_COUNT, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
1164 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::flatten>({GetGrid(GPUCA_ROW_COUNT, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, &mcLinearLabels);
1165 clusterer.clearMCMemory();
1166 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1167 }
1168 if (propagateMCLabels) {
1169 for (int32_t lane = 0; lane < maxLane; lane++) {
1170 processors()->tpcClusterer[iSectorBase + lane].clearMCMemory();
1171 }
1172 }
1173 if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
1174 if (GetProcessingSettings().delayedOutput) {
1175 mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
1176 } else {
1177 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
1178 }
1179 }
1180
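  // When the sector loop reaches base sector 21, notify the foreign chain; at base sector 30, wait for the final inputs and apply queued calibration updates.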
1181 if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
1182 notifyForeignChainFinished();
1183 }
1184 if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
1185 mWaitForFinalInputs();
1186 synchronizeCalibUpdate = DoQueuedUpdates(0, false);
1187 }
1188 }
1189 for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1190 // if (GetProcessingSettings().nn.applyNNclusterizer) {
1191 // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1192 // nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1193 // nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1194 // nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1195 // }
1196 if (transferRunning[i]) {
1197 ReleaseEvent(mEvents->stream[i], doGPU);
1198 }
1199 }
1200
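  // If TPC trigger handling is enabled, copy the collected trigger words into the externally allocated output buffer and clear the local buffer.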
1201 if (GetProcessingSettings().param.tpcTriggerHandling) {
1203 if (triggerOutput && triggerOutput->allocator) {
1204 // GPUInfo("Storing %lu trigger words", mTriggerBuffer->triggers.size());
1205 auto* outputBuffer = (decltype(mTriggerBuffer->triggers)::value_type*)triggerOutput->allocator(mTriggerBuffer->triggers.size() * sizeof(decltype(mTriggerBuffer->triggers)::value_type));
1206 std::copy(mTriggerBuffer->triggers.begin(), mTriggerBuffer->triggers.end(), outputBuffer);
1207 }
1208 mTriggerBuffer->triggers.clear();
1209 }
1210
1211 ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr;
1212 if (propagateMCLabels) {
1213 // TODO: write to buffer directly
1215 std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
1218 throw std::runtime_error("Cluster MC Label buffer missing");
1219 }
1221 buffer = {&container->first, &container->second};
1222 } else {
1223 mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
1224 mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
1225 buffer.first = mIOMem.clusterNativeMCBuffer.get();
1226 buffer.second = mIOMem.clusterNativeMCView.get();
1227 }
1228
1229 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1230 assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
1231
1232 mcLabels.setFrom(mcLinearLabels.header, mcLinearLabels.data);
1233 mcLabels.flatten_to(*buffer.first);
1234 *buffer.second = *buffer.first;
1235 mcLabelsConstView = buffer.second;
1236 }
1237
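  // With delayed output, the queued copies stored byte offsets as destinations; rebase them onto the newly assigned host output buffer.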
1238 if (buildNativeHost && buildNativeGPU && GetProcessingSettings().delayedOutput) {
1239 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClsTotal;
1241 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
1242 for (uint32_t i = outputQueueStart; i < mOutputQueue.size(); i++) {
1243 mOutputQueue[i].dst = (char*)tmpNativeClusters + (size_t)mOutputQueue[i].dst;
1244 }
1245 }
1246
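  // Finalize the host-side ClusterNativeAccess (linear clusters, MC truth, offset pointers) and optionally apply the CPU cluster filter.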
1247 if (buildNativeHost) {
1248 tmpNativeAccess->clustersLinear = tmpNativeClusters;
1249 tmpNativeAccess->clustersMCTruth = mcLabelsConstView;
1250 tmpNativeAccess->setOffsetPtrs();
1251 mIOPtrs.clustersNative = tmpNativeAccess;
1252 if (GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
1253 auto allocator = [this, &tmpNativeClusters](size_t size) {
1254 this->mInputsHost->mNClusterNative = size;
1255 this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1256 return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
1257 };
1258 RunTPCClusterFilter(tmpNativeAccess, allocator, false);
1259 nClsTotal = tmpNativeAccess->nClustersTotal;
1260 }
1261 }
1262
1263 if (!mWaitForFinalInputs) {
1264 notifyForeignChainFinished();
1265 }
1266
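  // Publish the GPU-side cluster pointers: update ioPtrs in constant memory and transfer the native-access structure to the device.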
1267 if (buildNativeGPU) {
1268 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
1269 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
1270 *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
1271 mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
1272 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
1273 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceClusterNativeAccess, 0);
1274 }
1275 if (doGPU && synchronizeOutput) {
1277 }
1278 if (doGPU && synchronizeCalibUpdate) {
1280 }
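  // For deterministic reconstruction (or high debug levels), sort the clusters of every row on the host and mirror the sorted buffer back to the GPU.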
1281 if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) {
1282 for (uint32_t i = 0; i < NSECTORS; i++) {
1283 for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
1284 std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]);
1285 }
1286 }
1287 if (buildNativeGPU) {
1288 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
1289 }
1290 }
1291 mRec->MemoryScalers()->nTPCHits = nClsTotal;
1292 mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST"));
1293 if (mPipelineNotifyCtx) {
1295 mPipelineNotifyCtx = nullptr;
1296 }
1297
1298 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
1300 }
1301
1302#endif
1303 return 0;
1304}