GPUChainTrackingClusterizer.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "GPUChainTracking.h"
18#include "GPULogging.h"
19#include "GPUO2DataTypes.h"
22#include "GPUNewCalibValues.h"
23#include "GPUConstantMem.h"
24#include "CfChargePos.h"
25#include "CfArray2D.h"
26#include "GPUGeneralKernels.h"
27#include "GPUDefParametersRuntime.h"
30#include "GPUTPCCFDecodeZS.h"
32#include "GPUTPCCFPeakFinder.h"
35#include "GPUTPCCFClusterizer.h"
36#include "GPUTPCCFGather.h"
38#include "GPUTriggerOutputs.h"
39#include "GPUHostDataTypes.h"
45#include "TPCBase/RDHUtils.h"
46
47#ifdef GPUCA_HAS_ONNX
50#include "ORTRootSerializer.h"
51#endif
52
53#ifdef GPUCA_O2_LIB
55#endif
56
57#include "utils/VcShim.h"
58#include "utils/strtag.h"
59#include <fstream>
60
61using namespace o2::gpu;
62using namespace o2::tpc;
63using namespace o2::tpc::constants;
64using namespace o2::dataformats;
65
66#ifdef GPUCA_TPC_GEOMETRY_O2
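// Per-fragment update of the ZS decoding offsets for one sector: copies the fragment's page ranges into the
// clusterer and, on GPU, writes one ZSOffset entry (running digit offset, endpoint, page index) per non-empty page.
// Returns the digit and page counts of the fragment.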
67std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment)
68{
 69 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
 70 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
 71 GPUTPCClusterFinder::ZSOffset* o = processors()->tpcClusterer[iSector].mPzsOffsets;
 72 uint32_t digits = 0;
73 uint32_t pages = 0;
74 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
75 clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j];
76 if (doGPU) {
77 uint16_t posInEndpoint = 0;
78 uint16_t pagesEndpoint = 0;
79 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
80 const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
81 const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
82 for (uint32_t l = pageFirst; l < pageLast; l++) {
83 uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++];
84 if (pageDigits) {
85 *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint};
86 digits += pageDigits;
87 }
88 pagesEndpoint++;
89 }
90 }
91 if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) {
92 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
93 GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
94 return {0, 0};
95 } else {
96 GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
97 }
98 }
99 } else {
101 digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j];
102 pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j];
103 }
104 }
105 if (doGPU) {
106 pages = o - processors()->tpcClusterer[iSector].mPzsOffsets;
107 }
108 if (GetProcessingSettings().clusterizerZSSanityCheck && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
109 TPCClusterizerEnsureZSOffsets(iSector, fragment);
110 }
111 return {digits, pages};
112}
113
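// Optional sanity check (clusterizerZSSanityCheck): re-reads the raw RDH / TPCZSHDRV2 headers of every page
// assigned to the fragment and verifies that page counts, ADC-sample counts, and the accumulated ZS offsets
// agree with the results of the page scan.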
114void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment)
115{
116 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
 117 uint32_t nAdcs = 0;
 118 for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) {
 119 const auto& data = mCFContext->fragmentData[fragment.index];
120 uint32_t pagesEndpoint = 0;
121 const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint];
122 const uint32_t nPagesExpected = data.nPages[iSector][endpoint];
123
124 uint32_t nAdcDecoded = 0;
125 const auto& zs = mIOPtrs.tpcZS->sector[iSector];
126 for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) {
127 const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0;
128 const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i];
129 for (uint32_t j = pageFirst; j < pageLast; j++) {
130 const uint8_t* page = static_cast<const uint8_t*>(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE;
131 const header::RAWDataHeader* rawDataHeader = reinterpret_cast<const header::RAWDataHeader*>(page);
132 const TPCZSHDRV2* decHdr = reinterpret_cast<const TPCZSHDRV2*>(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2));
133 const uint16_t nSamplesInPage = decHdr->nADCsamples;
134
135 nAdcDecoded += nSamplesInPage;
136 pagesEndpoint++;
137 }
138 }
139
140 if (pagesEndpoint != nPagesExpected) {
141 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %u", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected);
142 }
143
144 if (nAdcDecoded != nAdcsExpected) {
145 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded);
146 }
147
148 if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) {
149 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset);
150 }
151
152 nAdcs += nAdcsExpected;
153 }
154}
155
156namespace
157{
158struct TPCCFDecodeScanTmp {
159 int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter;
160};
161} // namespace
162
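// Initial page scan for one sector: walks all endpoints and 8kB ZS pages, determines the ZS format version,
// the maximum time bin and per-endpoint ADC counts, records trigger words if enabled, and assigns every page
// to the time fragments it overlaps. Returns the total digit count and the largest digit count of any fragment.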
163std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment)
164{
165 mRec->getGeneralStepTimer(GeneralStep::Prepare).Start();
166 uint32_t nDigits = 0;
167 uint32_t nPages = 0;
168 uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS];
169 memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples));
171 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
172
173 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
174
175 if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
176 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
177 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
178 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
179 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
180 }
181 }
182 }
183
184 std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
185 fragments.reserve(mCFContext->nFragments);
186 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragment, {0, 0, 0, 0, 0, -1}});
187 for (uint32_t i = 1; i < mCFContext->nFragments; i++) {
188 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}});
189 }
190 std::vector<bool> fragmentExtends(mCFContext->nFragments, false);
191
192 uint32_t firstPossibleFragment = 0;
193 uint32_t pageCounter = 0;
194 uint32_t emptyPages = 0;
195 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
196 if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) {
197 break;
198 }
199 nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
200 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
201
202 if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
203 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
204 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
205 }
206
207 const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
 208 const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page;
 209 if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
210 emptyPages++;
211 continue;
212 }
213 pageCounter++;
214 const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader)));
215 if (mCFContext->zsVersion == -1) {
216 mCFContext->zsVersion = hdr->version;
217 if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask
218 static bool errorShown = false;
219 if (errorShown == false) {
220 GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion);
221 }
222 errorShown = true;
223 }
224 } else if (mCFContext->zsVersion != (int32_t)hdr->version) {
225 GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh));
226 constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1;
227 char dumpBuffer[bufferSize];
228 for (size_t i = 0; i < sizeof(*rdh); i++) {
229 // "%02X " guaranteed to be 3 chars + ending 0.
230 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]);
231 }
232 GPUAlarm("RDH of page: %s", dumpBuffer);
233 for (size_t i = 0; i < sizeof(*hdr); i++) {
234 // "%02X " guaranteed to be 3 chars + ending 0.
235 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]);
236 }
237 GPUAlarm("Metainfo of page: %s", dumpBuffer);
238 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
239 mCFContext->abandonTimeframe = true;
240 return {0, 0};
241 } else {
242 GPUFatal("Cannot process with invalid TPC ZS data, exiting");
243 }
244 }
245 if (GetProcessingSettings().param.tpcTriggerHandling) {
246 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
247 if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) {
248 const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE;
250 memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE);
251 tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh);
252 if (tmp.triggerWord.isValid(0)) {
253 mTriggerBuffer->triggers.emplace(tmp);
254 }
255 }
256 }
257 nDigits += hdr->nADCsamples;
258 endpointAdcSamples[j] += hdr->nADCsamples;
259 uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;
260 uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan;
261 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
262 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
263 if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) {
264 maxTimeBin += 256;
265 }
266 }
267 if (maxTimeBin > mCFContext->tpcMaxTimeBin) {
268 mCFContext->tpcMaxTimeBin = maxTimeBin;
269 }
270 bool extendsInNextPage = false;
271 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
272 if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) {
 273 const o2::header::RAWDataHeader* nextrdh = (const o2::header::RAWDataHeader*)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE);
 274 extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader);
275 }
276 }
277 while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) {
278 firstPossibleFragment--;
279 }
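// Pages whose payload continues into the following 8kB page mark their fragment as "extending"; the helper
// below accounts for that extra page once the fragment is closed (GPU only, see comments inside).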
280 auto handleExtends = [&](uint32_t ff) {
281 if (fragmentExtends[ff]) {
282 if (doGPU) {
283 // Only add extended page on GPU. On CPU the pages are in consecutive memory anyway.
 284 // Not adding the page prevents an issue where a page would be decoded twice on CPU, when only the extension should be decoded.
285 fragments[ff].second.zsPageLast++;
286 mCFContext->fragmentData[ff].nPages[iSector][j]++;
287 mCFContext->fragmentData[ff].pageDigits[iSector][j].emplace_back(0);
288 }
289 fragmentExtends[ff] = false;
290 }
291 };
292 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
293 for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) {
294 handleExtends(ff);
295 }
296 }
297 for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) {
298 if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) {
299 if (!fragments[f].second.hasData) {
300 fragments[f].second.hasData = 1;
301 fragments[f].second.zsPtrFirst = k;
302 fragments[f].second.zsPageFirst = l;
303 } else {
304 if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) {
305 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1;
306 for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) {
307 for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) {
308 if (doGPU) {
309 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
310 } else {
311 // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters
312 const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE;
313 const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp;
314 if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) {
315 const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader)));
316 mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples;
317 }
318 }
319 }
320 }
321 } else if (emptyPages) {
322 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages;
323 if (doGPU) {
324 for (uint32_t m = 0; m < emptyPages; m++) {
325 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
326 }
327 }
328 }
329 }
330 fragments[f].second.zsPtrLast = k + 1;
331 fragments[f].second.zsPageLast = l + 1;
332 fragments[f].second.pageCounter = pageCounter;
333 mCFContext->fragmentData[f].nPages[iSector][j]++;
334 mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples;
335 if (doGPU) {
336 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples);
337 }
338 fragmentExtends[f] = extendsInNextPage;
339 } else {
340 handleExtends(f);
341 if (timeBin < (uint32_t)fragments[f].first.last()) {
342 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
343 for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) {
344 handleExtends(ff);
345 }
346 }
347 break;
348 } else {
349 firstPossibleFragment = f + 1;
350 }
351 }
352 }
353 emptyPages = 0;
354 }
355 }
356 for (uint32_t f = 0; f < mCFContext->nFragments; f++) {
357 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast;
358 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst;
359 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast;
360 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageFirst = fragments[f].second.zsPageFirst;
361 }
362 }
363 mCFContext->nPagesTotal += nPages;
364 mCFContext->nPagesSector[iSector] = nPages;
365
366 mCFContext->nDigitsEndpointMax[iSector] = 0;
367 for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) {
368 if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) {
369 mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i];
370 }
371 }
372 uint32_t nDigitsFragmentMax = 0;
373 for (uint32_t i = 0; i < mCFContext->nFragments; i++) {
374 uint32_t pagesInFragment = 0;
375 uint32_t digitsInFragment = 0;
376 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
377 pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j];
378 digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j];
379 }
380 mCFContext->nPagesFragmentMax = std::max(mCFContext->nPagesFragmentMax, pagesInFragment);
381 nDigitsFragmentMax = std::max(nDigitsFragmentMax, digitsInFragment);
382 }
383 mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
384 return {nDigits, nDigitsFragmentMax};
385}
386
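// Stream compaction of peak candidates: stage 0 compacts peaks out of all charge positions, stage 1 compacts
// filtered peaks out of peaks. On GPU a multi-level scan (scanStart / scanUp / scanTop / scanDown) followed by
// compactDigits is used; on CPU a simple sequential copy of the flagged entries suffices.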
387void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane)
388{
389 auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions;
390 auto& out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions;
391 if (doGPU) {
392 const uint32_t iSector = clusterer.mISector;
393 auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
394
395 std::vector<size_t> counts;
396
397 uint32_t nSteps = clusterer.getNSteps(count);
398 if (nSteps > clusterer.mNBufs) {
399 GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs);
400 exit(1);
401 }
402
403 int32_t scanWorkgroupSize = mRec->getGPUParameters(doGPU).par_CF_SCAN_WORKGROUP_SIZE;
404 size_t tmpCount = count;
405 if (nSteps > 1) {
406 for (uint32_t i = 1; i < nSteps; i++) {
407 counts.push_back(tmpCount);
408 if (i == 1) {
409 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, stage);
410 } else {
411 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, tmpCount);
412 }
413 tmpCount = (tmpCount + scanWorkgroupSize - 1) / scanWorkgroupSize;
414 }
415
416 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, nSteps, tmpCount);
417
418 for (uint32_t i = nSteps - 1; i > 1; i--) {
419 tmpCount = counts[i - 1];
420 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({GetGrid(tmpCount - scanWorkgroupSize, scanWorkgroupSize, lane), {iSector}}, i, scanWorkgroupSize, tmpCount);
421 }
422 }
423
424 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({GetGrid(count, scanWorkgroupSize, lane), {iSector}}, 1, stage, in, out);
425 } else {
426 auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks;
427 auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
428 size_t count = 0;
429 for (size_t i = 0; i < nIn; i++) {
430 if (clusterer.mPisPeak[i]) {
431 out[count++] = in[i];
432 }
433 }
434 nOut = count;
435 }
436}
437
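// Copies the ZS pages selected for this fragment into the sector's GPU buffer (clustererShadow.mPzs), updates
// the page metadata used on the GPU, and transfers the per-page ZSOffset table; returns the {digits, pages}
// pair from TPCClusterizerDecodeZSCountUpdate.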
438std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane)
439{
440 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
441 if (mCFContext->abandonTimeframe) {
442 return {0, 0};
443 }
444 const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
445 if (doGPU) {
446 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
447 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
448 uint32_t nPagesSector = 0;
449 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
450 uint32_t nPages = 0;
451 mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
452 mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
453 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
454 const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
455 const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
456 if (max > min) {
457 char* src = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE;
458 char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
459 size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast);
460 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true);
461 }
462 nPages += max - min;
463 }
464 mInputsHost->mPzsMeta->sector[iSector].nZSPtr[j] = &mInputsShadow->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
465 mInputsHost->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = nPages;
466 mInputsHost->mPzsMeta->sector[iSector].count[j] = 1;
467 nPagesSector += nPages;
468 }
469 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzsOffsets, clusterer.mPzsOffsets, clusterer.mNMaxPages * sizeof(*clusterer.mPzsOffsets), lane, true);
470 }
471 return retVal;
472}
473
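// Prepares clusterization of a new time frame: optionally restores buffer pointers saved by the other pipeline,
// otherwise creates the CF context, runs the page scan for all sectors, derives buffer sizes, sets up the
// fragment iteration, and pre-transfers the first fragment's ZS pages.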
474int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
475{
 476 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
 477 if (restorePointers) {
478 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
479 processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost;
480 processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice;
481 processorsShadow()->tpcClusterer[iSector].mPzs = mCFContext->ptrSave[iSector].zsDevice;
482 }
483 processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave;
484 return 0;
485 }
486 const auto& threadContext = GetThreadContext();
 488 if (mCFContext == nullptr) {
 489 mCFContext.reset(new GPUTPCCFChainContext);
 490 }
491 const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
492 const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED;
493 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
494 const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen};
495 mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax);
496 if (GetProcessingSettings().param.tpcTriggerHandling) {
497 mTriggerBuffer->triggers.clear();
498 }
499 if (mIOPtrs.tpcZS) {
500 uint32_t nDigitsFragmentMax[NSECTORS];
501 mCFContext->zsVersion = -1;
502 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
503 if (mIOPtrs.tpcZS->sector[iSector].count[0]) {
504 const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0];
505 if (rdh && o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeaderV6>() > o2::raw::RDHUtils::getVersion(rdh)) {
506 GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeader>());
507 return 1;
508 }
509 }
510
511 if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
512 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
513 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
514 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
515 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
516 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
517 }
518 }
519 }
520 }
521
522 const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
523 nDigitsFragmentMax[iSector] = x.first;
524 processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first;
525 mRec->MemoryScalers()->nTPCdigits += x.first;
526 }
527 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
528 uint32_t nDigitsBase = nDigitsFragmentMax[iSector];
529 uint32_t threshold = 40000000;
530 uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);
531 processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
532 if (doGPU) {
533 processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
534 }
535 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
536 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec);
537 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec);
538 } else {
539 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId);
540 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId);
541 }
542 }
543 } else {
544 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
545 uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector];
546 mRec->MemoryScalers()->nTPCdigits += nDigits;
547 processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0);
548 }
549 }
550
551 if (mIOPtrs.tpcZS) {
552 GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits);
553 } else {
554 GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits);
555 }
556
557 if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) {
558 GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin);
559 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
560 mCFContext->abandonTimeframe = true;
561 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
562 } else {
563 return 1;
564 }
565 }
566
567 mCFContext->fragmentFirst = CfFragment{std::max<int32_t>(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen};
568 for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) {
569 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
570 mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector);
571 }
572 }
573
574 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
575 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
576 mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets;
577 mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets;
578 mCFContext->ptrSave[iSector].zsDevice = processorsShadow()->tpcClusterer[iSector].mPzs;
579 }
580 }
581 return 0;
582}
583#endif
584
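// Top-level TPC clusterizer: optionally forwards digits as clusters, otherwise prepares the CF context and
// processes the sectors in groups of nTPCClustererLanes, looping over time fragments and running ZS decoding,
// charge-map filling, peak finding, noise suppression, clusterization, and output assembly per lane.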
585int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
586{
587 if (param().rec.fwdTPCDigitsAsClusters) {
588 return ForwardTPCDigits();
589 }
590#ifdef GPUCA_TPC_GEOMETRY_O2
591 int32_t tpcTimeBinCut = (mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut) ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin;
592
594 const auto& threadContext = GetThreadContext();
595 const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
596 if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
597 return 1;
598 }
599 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
601 }
602
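// For low-occupancy time frames the cluster buffer is enlarged (bounded by a threshold derived from the number
// of HBFs per TF) so that noisy pads producing excess clusters do not overflow the allocation.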
604 float tpcHitLowOccupancyScalingFactor = 1.f;
 605 if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) {
 606 uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits;
607 uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF;
608 if (mIOPtrs.settingsTF->nHBFPerTF < 64) {
609 threshold *= 2;
610 }
 611 mRec->MemoryScalers()->nTPCHits = std::max<uint32_t>(nHitsBase, std::min<uint32_t>(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating excessive clusters
612 if (nHitsBase < threshold) {
613 float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 2.25f : 1.75f);
614 mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase);
615 tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
616 }
617 }
618 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
619 processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes
620 }
621 mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers
622 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
623 SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate
624 }
625 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
626 RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
627 }
628
629 if (doGPU && mIOPtrs.tpcZS) {
631 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
632 }
633 if (doGPU) {
634 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
635 }
636
637#ifdef GPUCA_HAS_ONNX
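// Optional neural-network clusterizer: one GPUTPCNNClusterizerHost per clusterer lane holds a classification
// model and up to two regression models; ONNX sessions are created per lane and bound to the corresponding
// GPU stream / device.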
638 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
639 GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
640
 641 // Maximum of 4 lanes supported, with 3 ONNX timers (classification + 2 regression models) per lane
642 HighResTimer* nnTimers[12];
643
644 if (nn_settings.applyNNclusterizer) {
645 int32_t deviceId = -1;
646 int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
647 int32_t maxThreads = mRec->getNKernelHostThreads(true);
648 // bool recreateMemoryAllocator = false;
649
650 if (GetProcessingSettings().debugLevel >= 1) {
651 nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
652 nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
653 nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
654 nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
655 nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
656 nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
657 nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
658 nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
659 nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
660 nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
661 nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
662 nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
663 }
664
665 mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
666 nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
667 if (nnApplications[lane].mModelsUsed[0]) {
668 SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
669 (nnApplications[lane].mModelClass).setDeviceId(deviceId);
670 if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
671 nnApplications[lane].mModelClass.setIntraOpNumThreads(maxThreads);
672 }
673 (nnApplications[lane].mModelClass).initEnvironment();
 674 // Registering this once seems to be enough, even with different environments / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation.
675 // But environment must be valid, so we init the model environment first and use it here afterwards.
676 // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
677 // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
678 // if (lane == 0) {
679 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
680 // }
681 // recreateMemoryAllocator = true;
682 if (!nn_settings.nnLoadFromCCDB) {
683 (nnApplications[lane].mModelClass).initSession(); // loads from file
684 } else {
685 (nnApplications[lane].mModelClass).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModelSize()); // loads from CCDB
686 }
687 }
688 if (nnApplications[lane].mModelsUsed[1]) {
689 SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
690 (nnApplications[lane].mModelReg1).setDeviceId(deviceId);
691 if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
692 nnApplications[lane].mModelReg1.setIntraOpNumThreads(maxThreads);
693 }
694 // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
695 (nnApplications[lane].mModelReg1).initEnvironment();
696 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
697 if (!nn_settings.nnLoadFromCCDB) {
698 (nnApplications[lane].mModelReg1).initSession(); // loads from file
699 } else {
700 (nnApplications[lane].mModelReg1).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModelSize()); // loads from CCDB
701 }
702 }
703 if (nnApplications[lane].mModelsUsed[2]) {
704 SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
705 (nnApplications[lane].mModelReg2).setDeviceId(deviceId);
706 if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
707 nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
708 }
709 // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
710 (nnApplications[lane].mModelReg2).initEnvironment();
711 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
712 if (!nn_settings.nnLoadFromCCDB) {
713 (nnApplications[lane].mModelReg2).initSession(); // loads from file
714 } else {
715 (nnApplications[lane].mModelReg2).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModelSize()); // loads from CCDB
716 }
717 }
718 if (nn_settings.nnClusterizerVerbosity > 0) {
719 LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
720 }
721 });
722 const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
723 const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED;
724 for (int32_t sector = 0; sector < NSECTORS; sector++) {
725 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
726 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
727 int32_t lane = sector % numLanes;
728 clustererNN.mDeviceId = deviceId;
729 clustererNN.mISector = sector;
731 nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
732 if (doGPU) {
733 clustererNNShadow.mDeviceId = deviceId;
734 clustererNNShadow.mISector = sector;
736 nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
737 }
738 if (nn_settings.nnClusterizerVerbosity > 2) {
739 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
740 }
742 if (nn_settings.nnClusterizerVerbosity > 2) {
743 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
744 }
745 // nnApplications[lane].createBoundary(clustererNNShadow);
746 // nnApplications[lane].createIndexLookup(clustererNNShadow);
747 }
748 if (doGPU) {
749 if (nn_settings.nnClusterizerVerbosity > 2) {
750 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
751 }
752 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
753 if (nn_settings.nnClusterizerVerbosity > 2) {
754 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
755 }
756 }
757 }
758#endif
759
760 size_t nClsTotal = 0;
761 ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get();
762 ClusterNative* tmpNativeClusters = nullptr;
763 std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
764
765 // setup MC Labels
 766 bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
 767
768 auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
769
770 bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
771 bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
772
773 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
774 if (buildNativeGPU) {
775 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
776 }
777 if (mWaitForFinalInputs && GetProcessingSettings().nTPCClustererLanes > 6) {
778 GPUFatal("ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
779 }
780 if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) {
781 if (mWaitForFinalInputs) {
782 GPUFatal("Cannot use waitForFinalInput callback without delayed output");
783 }
784 if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
785 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
786 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
787 } else {
788 tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
789 tmpNativeClusters = tmpNativeClusterBuffer.get();
790 }
791 }
792
793 GPUTPCLinearLabels mcLinearLabels;
794 if (propagateMCLabels) {
 795 // No need to overallocate here, nTPCHits is anyway an upper bound used for the GPU cluster buffer, and we can always enlarge the buffer later if needed
796 mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2);
797 mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits);
798 }
799
800 int8_t transferRunning[NSECTORS] = {0};
801 uint32_t outputQueueStart = mOutputQueue.size();
802
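// In double-pipeline mode: once the output stream has finished, signal the other pipeline instance so it may
// start updating the shared ioPtrs in constant memory.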
803 auto notifyForeignChainFinished = [this]() {
804 if (mPipelineNotifyCtx) {
805 SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory
806 {
807 std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
808 mPipelineNotifyCtx->ready = true;
809 }
810 mPipelineNotifyCtx->cond.notify_one();
811 }
812 };
813 bool synchronizeCalibUpdate = false;
814
815 for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) {
816 std::vector<bool> laneHasData(GetProcessingSettings().nTPCClustererLanes, false);
817 static_assert(NSECTORS <= GPUCA_MAX_STREAMS, "Stream events must be able to hold all sectors");
818 const int32_t maxLane = std::min<int32_t>(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase);
819 for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
820 if (GetProcessingSettings().debugLevel >= 3) {
821 GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
822 }
823 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
824 if (doGPU && fragment.index != 0) {
825 SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished
826 }
827
828 uint32_t iSector = iSectorBase + lane;
829 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
830 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
831 clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0;
832 clusterer.mPmemory->fragment = fragment;
833
 834 if (mIOPtrs.tpcPackedDigits) {
 835 bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS;
836 bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
837 auto* inDigits = mIOPtrs.tpcPackedDigits;
838 size_t numDigits = inDigits->nTPCDigits[iSector];
839 if (setDigitsOnGPU) {
840 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
841 }
842 if (setDigitsOnHost) {
843 clusterer.mPdigits = const_cast<o2::tpc::Digit*>(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast)
844 }
845 clusterer.mPmemory->counters.nDigits = numDigits;
846 }
847
848 if (mIOPtrs.tpcZS) {
849 if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
850 clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first;
851 clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second;
852 } else {
853 clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0;
854 }
855 }
856 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
857
858 using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
859 using PeakMapType = decltype(*clustererShadow.mPpeakMap);
860 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout<ChargeMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType));
861 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout<PeakMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType));
862 if (fragment.index == 0) {
863 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy));
864 }
866
867 if (doGPU) {
868 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
869 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane);
870 SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane);
871 }
872 SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory
873 }
874
875 if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) {
876 clusterer.mPmemory->counters.nPositions = 0;
877 return;
878 }
879 if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) {
880 clusterer.mPmemory->counters.nPositions = 0;
881 return;
882 }
883
884 if (propagateMCLabels && fragment.index == 0) {
885 clusterer.PrepareMC();
886 clusterer.mPinputLabels = digitsMC->v[iSector];
887 if (clusterer.mPinputLabels == nullptr) {
888 GPUFatal("MC label container missing, sector %d", iSector);
889 }
890 if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) {
891 GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize());
892 }
893 }
894
895 if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) {
896 if (not mIOPtrs.tpcZS) {
897 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
898 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
899 } else if (propagateMCLabels) {
900 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr);
901 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
902 }
903 }
904
905 if (mIOPtrs.tpcZS) {
906 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
907 uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS;
908
909 switch (mCFContext->zsVersion) {
910 default:
911 GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
912 break;
 913 case ZSVersion::ZSVersionRowBased10BitADC:
 914 case ZSVersion::ZSVersionRowBased12BitADC:
 915 runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
 916 break;
 917 case ZSVersion::ZSVersionLinkBasedWithMeta:
 918 runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
 919 break;
 920 case ZSVersion::ZSVersionDenseLinkBased:
 921 runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
922 break;
923 }
924 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
925 } // clang-format off
926 });
927 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
928 uint32_t iSector = iSectorBase + lane;
929 if (doGPU) {
930 SynchronizeStream(lane);
931 }
932 if (mIOPtrs.tpcZS) {
933 CfFragment f = fragment.next();
934 int32_t nextSector = iSector;
935 if (f.isEnd()) {
936 nextSector += GetProcessingSettings().nTPCClustererLanes;
937 f = mCFContext->fragmentFirst;
938 }
939 if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
940 mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane);
941 }
942 }
943 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
944 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
945 if (clusterer.mPmemory->counters.nPositions == 0) {
946 return;
947 }
948 if (!mIOPtrs.tpcZS) {
949 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
950 }
952 clusterer.DumpChargeMap(*mDebugFile, "Charges");
953 }
954
955 if (propagateMCLabels) {
956 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillIndexMap>({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
957 }
958
959 bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
960 checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
961 checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
962
963 if (checkForNoisyPads) {
965
966 runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
967 getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
968 }
969
970 runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
972 clusterer.DumpPeakMap(*mDebugFile, "Peaks");
973 }
974
975 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
976 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
977 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off
978 });
979 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
980 uint32_t iSector = iSectorBase + lane;
981 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
982 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
983 if (doGPU) {
984 SynchronizeStream(lane);
985 }
986 if (clusterer.mPmemory->counters.nPeaks == 0) {
987 return;
988 }
989 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
990 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
992 clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
993 }
994
995 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
996 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
998 });
999 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
1000 uint32_t iSector = iSectorBase + lane;
1001 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
1002 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1003
1004 if (doGPU) {
1005 SynchronizeStream(lane);
1006 }
1007
1008 if (fragment.index == 0) {
1009 deviceEvent* waitEvent = nullptr;
1010 if (transferRunning[lane] == 1) {
1011 waitEvent = &mEvents->stream[lane];
1012 transferRunning[lane] = 2;
1013 }
1014 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
1015 }
1016
1017 if (clusterer.mPmemory->counters.nClusters == 0) {
1018 return;
1019 }
1020
1021 if (GetProcessingSettings().nn.applyNNclusterizer) {
1022#ifdef GPUCA_HAS_ONNX
1023 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane];
1024 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
1025 GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
1026
1027 // int withMC = (doGPU && propagateMCLabels);
1028
1029 if (nn_settings.nnClusterizerApplyCfDeconvolution) {
1030 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
1031 } else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
1032 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false);
1033 }
1034
1035 // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
1036 if (nn_settings.nnClusterizerVerbosity > 2) {
1037 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << ", numLoops=" << std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode) << ", numClusters=" << clusterer.mPmemory->counters.nClusters << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1038 }
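// The cluster candidates are processed in batches of mNnClusterizerBatchedMode: the input tensors are filled
// (per element on GPU, per candidate on CPU), deconvolution flags are optionally published, then the
// classification network and, unless the CF regression is used, one or two regression networks are evaluated
// in the configured input/output precision (mNnInferenceInputDType / mNnInferenceOutputDType).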
1039 for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
1040 if (nn_settings.nnClusterizerVerbosity > 3) {
1041 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1042 }
1043 uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
1044 size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
1045
1046 // Filling the data
1047 if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
 1048 // Fills each input matrix element by element -> better parallelizability, but worse on CPU due to unnecessary computations
 1049 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
1050 } else {
1051 // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
1052 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
1053 }
1054 if (nn_settings.nnClusterizerVerbosity > 3) {
1055 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1056 }
1057
1058 if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
1059 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
1060 if (nn_settings.nnClusterizerVerbosity > 3) {
1061 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1062 }
1063 }
1064
1065 // NN evaluations
1066 if(clustererNNShadow.mNnClusterizerUseClassification) {
1067 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
1068 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1069 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1070 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
1071 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1072 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
1073 }
1074 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1075 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1076 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
1077 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1078 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
1079 }
1080 }
1081 if (GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane < 4: restrict timing to GPU runs or the first 4 CPU lanes to limit the number of concurrent timers, while still giving some CPU timing statistics
1082 if (nn_settings.nnClusterizerVerbosity > 3) {
1083 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1084 }
1085 }
1086 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1087 if (GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
1088 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1089 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1090 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
1091 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1092 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
1093 }
1094 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1095 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1096 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
1097 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1098 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
1099 }
1100 }
1101 if (GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
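// The second regression network is only evaluated if the classification network has more than one output node
// (multi-class case) and mModelReg2 was actually initialized.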
1102 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1103 if (GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
1104 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1105 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1106 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
1107 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1108 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
1109 }
1110 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1111 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1112 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
1113 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1114 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
1115 }
1116 }
1117 if (GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
1118 }
1119 if (nn_settings.nnClusterizerVerbosity > 3) {
1120 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1121 }
1122 }
1123
1124 // Publishing kernels for class labels and regression results
1125 // Even if classification is not used, this kernel must still be executed to fill the mOutputDataClass array with default values
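// A classification network with a single output node uses the class-1 label kernel; with several output nodes
// the class-2 kernel selects among the classes (presumably an argmax over the class probabilities).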
1126 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
1127 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
1128 } else {
1129 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
1130 }
1131 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1132 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 1 regression results
1133 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1134 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
1135 }
1136 }
1137 if (nn_settings.nnClusterizerVerbosity > 3) {
1138 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1139 }
1140 }
1141
1142 if (clustererNNShadow.mNnClusterizerUseCfRegression) {
1143 if (!nn_settings.nnClusterizerApplyCfDeconvolution) { // If the deconvolution was already applied, don't apply it twice; otherwise apply it now
1144 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
1145 }
1146 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
1147 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1148 if (nn_settings.nnClusterizerVerbosity > 3) {
1149 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1150 }
1151 }
1152#else
1153 GPUFatal("Project not compiled with neural network clusterization. Aborting.");
1154#endif
1155 } else {
1156 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
1157 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
1158 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
1159 }
1160
1161 if (doGPU && propagateMCLabels) {
1162 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
1163 if (doGPU) {
1164 SynchronizeStream(lane);
1165 }
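// MC-label computation runs on the host (krnlDeviceType::CPU), since the linear MC label containers are
// host-side; the scratch data is therefore copied back and the stream synchronized before the CPU pass below.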
1166 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
1167 }
1168
1169 if (GetProcessingSettings().debugLevel >= 3) {
1170 GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSector, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
1171 }
1172
1173 TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane);
1174 laneHasData[lane] = true;
1175 // Include clusters in default debug mask, exclude other debug output by default
1176 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
1177 });
1179 }
1180
1181 size_t nClsFirst = nClsTotal;
1182 bool anyLaneHasData = false;
1183 for (int32_t lane = 0; lane < maxLane; lane++) {
1184 uint32_t iSector = iSectorBase + lane;
1185 std::fill(&tmpNativeAccess->nClusters[iSector][0], &tmpNativeAccess->nClusters[iSector][0] + MAXGLOBALPADROW, 0);
1186 if (doGPU) {
1187 SynchronizeStream(lane);
1188 }
1189 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
1190 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1191
1192 if (laneHasData[lane]) {
1193 anyLaneHasData = true;
1194 if (buildNativeGPU && GetProcessingSettings().tpccfGatherKernel) {
1195 runKernel<GPUTPCCFGather>({GetGridBlk(GPUCA_ROW_COUNT, mRec->NStreams() - 1), {iSector}}, &mInputsShadow->mPclusterNativeBuffer[nClsTotal]);
1196 }
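// Clusters are appended row by row: each lane's per-row buffer (mPclusterByRow, strided by mNMaxClusterPerRow)
// is copied into the contiguous native cluster array, on the GPU via the gather kernel above or the per-row
// copies below, or directly into the host output buffer.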
1197 for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
1198 if (nClsTotal + clusterer.mPclusterInRow[j] > mInputsHost->mNClusterNative) {
1199 clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
1200 continue;
1201 }
1202 if (buildNativeGPU) {
1203 if (!GetProcessingSettings().tpccfGatherKernel) {
1204 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&mInputsShadow->mPclusterNativeBuffer[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, -2);
1205 }
1206 } else if (buildNativeHost) {
1207 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, false);
1208 }
1209 tmpNativeAccess->nClusters[iSector][j] += clusterer.mPclusterInRow[j];
1210 nClsTotal += clusterer.mPclusterInRow[j];
1211 }
1212 if (transferRunning[lane]) {
1213 ReleaseEvent(mEvents->stream[lane], doGPU);
1214 }
1215 RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1);
1216 transferRunning[lane] = 1;
1217 }
1218
1219 if (!propagateMCLabels || !laneHasData[lane]) {
1220 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1221 continue;
1222 }
1223
1224 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::setRowOffsets>({GetGrid(GPUCA_ROW_COUNT, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
1226 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::flatten>({GetGrid(GPUCA_ROW_COUNT, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, &mcLinearLabels);
1227 clusterer.clearMCMemory();
1228 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1229 }
1230 if (propagateMCLabels) {
1231 for (int32_t lane = 0; lane < maxLane; lane++) {
1232 processors()->tpcClusterer[iSectorBase + lane].clearMCMemory();
1233 }
1234 }
1235 if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
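// With delayedOutput the destination is recorded as a byte offset into the not-yet-allocated host output buffer;
// the queued entries are rebased to absolute pointers once that buffer is allocated further below.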
1236 if (GetProcessingSettings().delayedOutput) {
1237 mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
1238 } else {
1239 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
1240 }
1241 }
1242
1243 if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
1244 notifyForeignChainFinished();
1245 }
1246 if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
1247 mWaitForFinalInputs();
1248 synchronizeCalibUpdate = DoQueuedUpdates(0, false);
1249 }
1250 }
1251 for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1252#ifdef GPUCA_HAS_ONNX
1253 if (GetProcessingSettings().nn.applyNNclusterizer) {
1254 if (GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
1255 LOG(info) << "(ORT) Environment releasing...";
1256 }
1257 GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1258 nnApplication.mModelClass.release(true);
1259 nnApplication.mModelReg1.release(true);
1260 nnApplication.mModelReg2.release(true);
1261 }
1262#endif
1263 if (transferRunning[i]) {
1264 ReleaseEvent(mEvents->stream[i], doGPU);
1265 }
1266 }
1267
1268 if (GetProcessingSettings().param.tpcTriggerHandling) {
1270 if (triggerOutput && triggerOutput->allocator) {
1271 // GPUInfo("Storing %lu trigger words", mTriggerBuffer->triggers.size());
1272 auto* outputBuffer = (decltype(mTriggerBuffer->triggers)::value_type*)triggerOutput->allocator(mTriggerBuffer->triggers.size() * sizeof(decltype(mTriggerBuffer->triggers)::value_type));
1273 std::copy(mTriggerBuffer->triggers.begin(), mTriggerBuffer->triggers.end(), outputBuffer);
1274 }
1275 mTriggerBuffer->triggers.clear();
1276 }
1277
1278 // The number of clusters is normally logged by the tracking step; this ensures it is still printed when tracking is not running
1280 GPUInfo("Event has %zu TPC Clusters", nClsTotal);
1281 }
1282
1283 ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr;
1284 if (propagateMCLabels) {
1285 // TODO: write to buffer directly
1287 std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
1290 throw std::runtime_error("Cluster MC Label buffer missing");
1291 }
1293 buffer = {&container->first, &container->second};
1294 } else {
1295 mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
1296 mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
1297 buffer.first = mIOMem.clusterNativeMCBuffer.get();
1298 buffer.second = mIOMem.clusterNativeMCView.get();
1299 }
1300
1301 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1302 assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
1303
1304 mcLabels.setFrom(mcLinearLabels.header, mcLinearLabels.data);
1305 mcLabels.flatten_to(*buffer.first);
1306 *buffer.second = *buffer.first;
1307 mcLabelsConstView = buffer.second;
1308 }
1309
1310 if (buildNativeHost && buildNativeGPU && GetProcessingSettings().delayedOutput) {
1311 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClsTotal;
1312 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1313 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
1314 for (uint32_t i = outputQueueStart; i < mOutputQueue.size(); i++) {
1315 mOutputQueue[i].dst = (char*)tmpNativeClusters + (size_t)mOutputQueue[i].dst;
1316 }
1317 }
1318
1319 if (buildNativeHost) {
1320 tmpNativeAccess->clustersLinear = tmpNativeClusters;
1321 tmpNativeAccess->clustersMCTruth = mcLabelsConstView;
1322 tmpNativeAccess->setOffsetPtrs();
1323 mIOPtrs.clustersNative = tmpNativeAccess;
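// Optional CPU-side cluster filter: the allocator lambda reallocates the host output buffer to the filtered size
// (updating tmpNativeClusters), and nClsTotal is afterwards reduced to the remaining number of clusters.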
1324 if (GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
1325 auto allocator = [this, &tmpNativeClusters](size_t size) {
1326 this->mInputsHost->mNClusterNative = size;
1327 this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1328 return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
1329 };
1330 RunTPCClusterFilter(tmpNativeAccess, allocator, false);
1331 nClsTotal = tmpNativeAccess->nClustersTotal;
1332 }
1333 }
1334
1335 if (!mWaitForFinalInputs) {
1336 notifyForeignChainFinished();
1337 }
1338
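// For GPU builds, the cluster-native access struct is repointed to the GPU-side linear buffer and uploaded to
// constant memory so that subsequent GPU reconstruction steps see the freshly produced clusters.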
1339 if (buildNativeGPU) {
1340 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
1341 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
1342 *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
1343 mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
1344 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
1345 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceClusterNativeAccess, 0);
1346 }
1347 if (doGPU && synchronizeOutput) {
1349 }
1350 if (doGPU && synchronizeCalibUpdate) {
1352 }
1353 if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) {
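// For deterministic reconstruction (or high debug levels) the clusters of every row are sorted on the host and,
// for GPU builds, copied back so that host and device buffers stay identical.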
1354 for (uint32_t i = 0; i < NSECTORS; i++) {
1355 for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
1356 std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]);
1357 }
1358 }
1359 if (buildNativeGPU) {
1360 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
1361 }
1362 }
1363 mRec->MemoryScalers()->nTPCHits = nClsTotal;
1364 mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST"));
1365 if (mPipelineNotifyCtx) {
1367 mPipelineNotifyCtx = nullptr;
1368 }
1369
1370 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
1372 }
1373
1374#endif
1375 return 0;
1376}