Project
Loading...
Searching...
No Matches
GPUChainTrackingClusterizer.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "GPUChainTracking.h"
18#include "GPULogging.h"
19#include "GPUO2DataTypes.h"
20#include "GPUTPCExtraADC.h"
23#include "GPUNewCalibValues.h"
24#include "GPUConstantMem.h"
25#include "CfChargePos.h"
26#include "CfArray2D.h"
27#include "GPUGeneralKernels.h"
28#include "GPUDefParametersRuntime.h"
31#include "GPUTPCCFDecodeZS.h"
33#include "GPUTPCCFPeakFinder.h"
36#include "GPUTPCCFClusterizer.h"
37#include "GPUTPCCFGather.h"
39#include "GPUTriggerOutputs.h"
40#include "GPUHostDataTypes.h"
46#include "TPCBase/RDHUtils.h"
47
48#ifdef GPUCA_HAS_ONNX
51#include "ORTRootSerializer.h"
52#endif
53
54#ifndef GPUCA_STANDALONE
56#endif
57
58#include "utils/VcShim.h"
59#include "utils/strtag.h"
60#include "utils/vecpod.h"
61#include <numeric>
62#include <random>
63#include <vector>
64
65// #define INSERT_SATURATED_SIGNALS
66
67using namespace o2::gpu;
68using namespace o2::tpc;
69using namespace o2::tpc::constants;
70using namespace o2::dataformats;
71
72#ifndef GPUCA_RUN2
// Refreshes the per-endpoint ZS bookkeeping of one sector for the given fragment and returns {digits, pages}.
// For every endpoint the cached page range (minMaxCN) of the fragment is copied into the clusterer.
// With doGPU set, one ZSOffset entry is written through `o` for each page that holds digits and the
// number of visited pages is checked against the buffered pageDigits list; otherwise the cached
// nDigits / nPages of the fragment are summed up.
// Returns {0, 0} on a page count mismatch when ignoreNonFatalGPUErrors is set (GPUFatal otherwise).
// NOTE(review): `clusterer`, `doGPU` and `o` are used but not declared in the lines visible here, and the
// embedded line numbering skips 75-77, 106 and 112 — lines appear to be missing from this listing; confirm against the original file.
73std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment)
74{
78 uint32_t digits = 0;
79 uint32_t pages = 0;
80 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
81 clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j];
82 if (doGPU) {
 // posInEndpoint indexes the buffered per-page digit counts, pagesEndpoint counts the pages visited for this endpoint.
83 uint16_t posInEndpoint = 0;
84 uint16_t pagesEndpoint = 0;
85 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
 // Only the first and the last pointer of the range are restricted to a sub-range of their pages.
86 const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
87 const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
88 for (uint32_t l = pageFirst; l < pageLast; l++) {
89 uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++];
 // Pages without digits get no offset entry but still count as a page.
90 if (pageDigits) {
91 *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint};
92 digits += pageDigits;
93 }
94 pagesEndpoint++;
95 }
96 }
 // The number of pages walked must equal the number of buffered per-page digit counts.
97 if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) {
98 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
99 GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
100 return {0, 0};
101 } else {
102 GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
103 }
104 }
105 } else {
107 digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j];
108 pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j];
109 }
110 }
 // NOTE(review): the body of this branch (embedded line 112) is not visible; `pages` is never updated on the GPU path in the visible lines.
111 if (doGPU) {
113 }
 // Optional consistency check, only performed for dense-link-based (or newer) ZS versions.
114 if (GetProcessingSettings().clusterizerZSSanityCheck && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
115 TPCClusterizerEnsureZSOffsets(iSector, fragment);
116 }
117 return {digits, pages};
118}
119
// Sanity check of the ZS bookkeeping of one sector and fragment: re-reads the nADCsamples field from the
// TPCZSHDRV2 of every page in the fragment's page range and compares page count, ADC count and the
// accumulated ADC offset against the cached values. Any mismatch ends in GPUFatal.
// NOTE(review): `clusterer` and `endpoint` are used but not declared in the visible lines, and the closing
// brace at embedded line 159 has no visible opening — the embedded numbering skips 122 and 124, which
// presumably hold the clusterer reference and a loop over the endpoints; confirm against the original file.
120void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment)
121{
 // Running sum of the expected ADC counts, compared against the stored offset of each endpoint.
123 uint32_t nAdcs = 0;
125 const auto& data = mCFContext->fragmentData[fragment.index];
126 uint32_t pagesEndpoint = 0;
127 const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint];
128 const uint32_t nPagesExpected = data.nPages[iSector][endpoint];
129
130 uint32_t nAdcDecoded = 0;
131 const auto& zs = mIOPtrs.tpcZS->sector[iSector];
132 for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) {
133 const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0;
134 const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i];
135 for (uint32_t j = pageFirst; j < pageLast; j++) {
 // The TPCZSHDRV2 is read from the end of the page payload, whose size is taken from the RDH.
136 const uint8_t* page = static_cast<const uint8_t*>(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE;
137 const header::RAWDataHeader* rawDataHeader = reinterpret_cast<const header::RAWDataHeader*>(page);
138 const TPCZSHDRV2* decHdr = reinterpret_cast<const TPCZSHDRV2*>(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2));
139 const uint16_t nSamplesInPage = decHdr->nADCsamples;
140
141 nAdcDecoded += nSamplesInPage;
142 pagesEndpoint++;
143 }
144 }
145
 // NOTE(review): the message prints the counted pages as "expected" and nPagesExpected as "buffered";
 // the labels look swapped relative to the variable names — confirm the intended wording.
146 if (pagesEndpoint != nPagesExpected) {
147 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %u", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected);
148 }
149
150 if (nAdcDecoded != nAdcsExpected) {
151 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded);
152 }
153
 // The stored offset of this endpoint must equal the sum of the expected ADC counts seen so far.
154 if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) {
155 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset);
156 }
157
158 nAdcs += nAdcsExpected;
159 }
160}
161
162void GPUChainTracking::TPCClusterizerTransferExtraADC(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs)
163{
164 const int32_t iSector = clusterer.mISector;
165 const auto& fragment = clusterer.mPmemory->fragment;
166 const auto& digits = extraADCs.digitsBySector[iSector];
167
168 if (fragment.index != 0) {
169 return;
170 }
171
172 if (digits.empty()) {
173 return;
174 }
175
176 const size_t chargeMapSize = TPCMapMemoryLayout<PackedCharge>::items(GetProcessingSettings().overrideClusterizerFragmentLen);
177 const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge);
178
179 vecpod<uint16_t> chargeMapHostData;
180 chargeMapHostData.resize(chargeMapSize);
181
182 CfArray2D<PackedCharge> chargeMapHost(reinterpret_cast<PackedCharge*>(chargeMapHostData.data()));
183
184 vecpod<CfChargePos> extraPositions;
185 extraPositions.reserve(digits.size());
186
187 GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false);
188 SynchronizeStream(lane);
189
190 for (const auto& d : digits) {
191 if (!fragment.contains(d.getTimeStamp())) {
192 continue;
193 }
194
195 CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)};
196 chargeMapHost[pos] = PackedCharge(d.getChargeFloat());
197
198 extraPositions.push_back(pos);
199 }
200
201 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPchargeMap, chargeMapHostData.data(), chargeMapSizeBytes, lane, true);
202
203 const size_t nPositions = clusterer.mPmemory->counters.nPositions;
204 const size_t extraPositionsOffset = nPositions - extraPositions.size();
205 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPpositions + extraPositionsOffset, extraPositions.data(), extraPositions.size() * sizeof(CfChargePos), lane, true);
206}
207
208void GPUChainTracking::TPCClusterizerCheckExtraADCZeros(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int lane, const GPUTPCExtraADC& extraADCs)
209{
210 const int32_t iSector = clusterer.mISector;
211 const auto& fragment = clusterer.mPmemory->fragment;
212 const auto& digits = extraADCs.digitsBySector[iSector];
213
214 if (fragment.index != 0) {
215 return;
216 }
217
218 if (digits.empty()) {
219 return;
220 }
221
222 const size_t chargeMapSize = TPCMapMemoryLayout<PackedCharge>::items(GetProcessingSettings().overrideClusterizerFragmentLen);
223 const size_t chargeMapSizeBytes = chargeMapSize * sizeof(PackedCharge);
224
225 vecpod<uint16_t> chargeMapHostData;
226 chargeMapHostData.resize(chargeMapSize);
227
228 CfArray2D<PackedCharge> chargeMapHost(reinterpret_cast<PackedCharge*>(chargeMapHostData.data()));
229
230 GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.mPchargeMap, chargeMapSizeBytes, lane, false);
231 SynchronizeStream(lane);
232
233 size_t nNonZeroADCs = 0;
234
235 for (const auto& d : digits) {
236 if (!fragment.contains(d.getTimeStamp())) {
237 continue;
238 }
239
240 CfChargePos pos{(tpccf::Row)d.getRow(), (tpccf::Pad)d.getPad(), (tpccf::TPCFragmentTime)(d.getTimeStamp() - fragment.start)};
241
242 auto adc = chargeMapHost[pos].unpack();
243
244 if (adc != 0) {
245 nNonZeroADCs++;
246 }
247 }
248
249 if (nNonZeroADCs > 0) {
250 GPUInfo("Non Zero ADCs: %zu", nNonZeroADCs);
251 } else {
252 GPUInfo("Cleared all extra ADC values!", nNonZeroADCs);
253 }
254}
255
256namespace
257{
258struct TPCCFDecodeScanTmp {
259 int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter;
260};
261
262// Additional ADC values must be generated at start of clusterizer
263// This is required, so enough memory is allocated for the charge points
264// And ADCs can be injected by "simply"
265// -> copying chargeMap + chargePositions to host
266// -> writing additional adcs to chargeMap + positions
267// -> copying values to device
268GPUTPCExtraADC GenerateSaturatedSignals(size_t seed = 42)
269{
270 constexpr int32_t MinTailLength = 50;
271 constexpr int32_t MaxTailLength = 200;
272 constexpr int32_t TailWidth = 3; // Assume tails are 3 pads wide at the moment
273
274 constexpr GPUTPCGeometry geo;
275
276 GPUTPCExtraADC adcs;
277
278 const int32_t nHIPs = 50;
279 const int32_t firstTB = 0; // Place all HIPs in first fragment for now
280 const int32_t lastTB = 4000 - MaxTailLength; // Don't allow cut off tails at fragment borders
281 const int32_t tailADC = 250; // charge should decrease over time, but for now just hardcode ADC above the threshold
282
283 std::mt19937 gen{(uint32_t)seed};
284 std::uniform_int_distribution<> randomRow(0, GPUTPCGeometry::NROWS - 1);
285 std::uniform_int_distribution<> randomTB(firstTB, lastTB);
286 std::uniform_int_distribution<> randomTailLength(MinTailLength, MaxTailLength);
287 // std::normal_distribution<> tailLengthNoise(8, 2.0);
288
289 for (int32_t iHIP = 0; iHIP < nHIPs; iHIP++) {
290
291 const int32_t row = randomRow(gen);
292 const int32_t nPads = geo.NPads(row);
293 std::uniform_int_distribution<> randomPad(0, nPads - 1);
294
295 const int32_t basePad = randomPad(gen);
296 const int32_t baseTb = randomTB(gen);
297
298 auto& digits = adcs.digitsBySector[0];
299
300 const int32_t tailLength = randomTailLength(gen);
301
302 for (int32_t dPad = -TailWidth; dPad <= TailWidth; dPad++) {
303 const int32_t iPad = basePad + dPad;
304 if (iPad < 0 || iPad >= nPads) {
305 continue;
306 }
307
308 for (int32_t dTime = 0; dTime < tailLength; dTime++) {
309 const int32_t iTime = baseTb + dTime;
310
311 if (iTime >= 4000) {
312 break;
313 }
314
315 const auto adc = dTime == 0 && dPad == 0 ? 1023 : tailADC;
316
317 digits.emplace_back(0, adc, row, iPad, iTime);
318 }
319 }
320 }
321
322 GPUInfo("Generated %zu ADCs!", adcs.digitsBySector[0].size());
323
324 return adcs;
325}
326
327} // namespace
328
// Scans the TPC ZS raw pages of sector `iSector`, endpoint by endpoint, and records for every clusterizer
// fragment which pages overlap it: the page range goes to fragmentData[f].minMaxCN, page and digit counts
// to nPages / nDigits, and (when doGPU is set) the per-page digit counts to pageDigits.
// `fragment` is the first fragment; the following ones are obtained through next().
// Also updates mCFContext->zsVersion, tpcMaxTimeBin, nPagesTotal, nPagesSector, nDigitsEndpointMax and
// nPagesFragmentMax, and stores valid trigger words in mTriggerBuffer when trigger handling is enabled.
// Returns {total number of ADC samples of the sector, largest digit count of any fragment}, or {0, 0}
// after flagging abandonTimeframe when mixed ZS versions are found and ignoreNonFatalGPUErrors is set.
// NOTE(review): `doGPU`, `rdh`, `tmp` and `nextrdh` are used below but not declared in the visible lines; the
// embedded numbering skips 336, 374, 415 and 439 — lines appear to be missing from this listing; confirm against the original file.
329std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment)
330{
331 mRec->getGeneralStepTimer(GeneralStep::Prepare).Start();
332 uint32_t nDigits = 0;
333 uint32_t nPages = 0;
334 uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS];
335 memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples));
 // Reference orbit: tfStartOrbit when settingsTF provides it, otherwise the heartbeat orbit of the first page of endpoint 0, otherwise 0.
337 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
338
339 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
340
 // Optionally prefetch the page starts of the next endpoint while the current one is scanned.
341 if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
342 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
343 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
344 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
345 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
346 }
347 }
348 }
349
 // One scan record per fragment, starting from `fragment` and chained through next().
350 std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
351 fragments.reserve(mCFContext->nFragments);
352 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragment, {0, 0, 0, 0, 0, -1}});
353 for (uint32_t i = 1; i < mCFContext->nFragments; i++) {
354 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}});
355 }
 // fragmentExtends[f] is set when the last page added to fragment f continues into the following page.
356 std::vector<bool> fragmentExtends(mCFContext->nFragments, false);
357
358 uint32_t firstPossibleFragment = 0;
359 uint32_t pageCounter = 0;
360 uint32_t emptyPages = 0;
361 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
 // With tpcSingleSector set, all other sectors are not scanned.
362 if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) {
363 break;
364 }
365 nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
366 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
367
368 if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
369 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
370 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
371 }
372
373 const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
 // A page whose memory size equals the bare RDH size is counted as empty and skipped.
375 if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
376 emptyPages++;
377 continue;
378 }
379 pageCounter++;
 // For the DLBZS link ID the header is read from the end of the page payload (size of TPCZSHDRV2), otherwise from directly behind the RDH.
380 const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader)));
 // The first non-empty page fixes the ZS version; every later page must carry the same version.
381 if (mCFContext->zsVersion == -1) {
382 mCFContext->zsVersion = hdr->version;
383 if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask
 // The alarm is raised only once per process (static flag).
384 static bool errorShown = false;
385 if (errorShown == false) {
386 GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion);
387 }
388 errorShown = true;
389 }
390 } else if (mCFContext->zsVersion != (int32_t)hdr->version) {
391 GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh));
 // Hex-dump the RDH and the ZS header of the offending page; the buffer holds 3 characters per byte plus the terminator.
392 constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1;
393 char dumpBuffer[bufferSize];
394 for (size_t i = 0; i < sizeof(*rdh); i++) {
395 // "%02X " guaranteed to be 3 chars + ending 0.
396 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]);
397 }
398 GPUAlarm("RDH of page: %s", dumpBuffer);
399 for (size_t i = 0; i < sizeof(*hdr); i++) {
400 // "%02X " guaranteed to be 3 chars + ending 0.
401 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]);
402 }
403 GPUAlarm("Metainfo of page: %s", dumpBuffer);
404 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
405 mCFContext->abandonTimeframe = true;
406 return {0, 0};
407 } else {
408 GPUFatal("Cannot process with invalid TPC ZS data, exiting");
409 }
410 }
 // Collect the trigger word, which is read from directly in front of the header.
 // NOTE(review): `tmp` is not declared in the visible lines (embedded line 415 is missing).
411 if (GetProcessingSettings().param.tpcTriggerHandling) {
412 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
413 if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) {
414 const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE;
416 memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE);
417 tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh);
418 if (tmp.triggerWord.isValid(0)) {
419 mTriggerBuffer->triggers.emplace(tmp);
420 }
421 }
422 }
423 nDigits += hdr->nADCsamples;
424 endpointAdcSamples[j] += hdr->nADCsamples;
 // Time bin range [timeBin, maxTimeBin] covered by this page, with the orbit taken relative to firstHBF.
425 uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;
426 uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan;
427 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
428 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
 // The nTimeBinSpanBit8 flag extends the span by 256 time bins.
429 if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) {
430 maxTimeBin += 256;
431 }
432 }
433 if (maxTimeBin > mCFContext->tpcMaxTimeBin) {
434 mCFContext->tpcMaxTimeBin = maxTimeBin;
435 }
 // Dense-link-based data: a full-size page is treated as continuing into the next page if that page has the same
 // heartbeat orbit and is not empty. NOTE(review): `nextrdh` is presumably the RDH of page l + 1 — its declaration (embedded line 439) is not visible.
436 bool extendsInNextPage = false;
437 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
438 if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) {
440 extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader);
441 }
442 }
 // Step back while this page starts before the end of the preceding fragment.
443 while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) {
444 firstPossibleFragment--;
445 }
 // Resolves a pending "extends into next page" mark of fragment ff.
446 auto handleExtends = [&](uint32_t ff) {
447 if (fragmentExtends[ff]) {
448 if (doGPU) {
449 // Only add extended page on GPU. On CPU the pages are in consecutive memory anyway.
450 // Not adding the page prevents an issue where a page is decoded twice on CPU, when only the extend should be decoded.
451 fragments[ff].second.zsPageLast++;
452 mCFContext->fragmentData[ff].nPages[iSector][j]++;
453 mCFContext->fragmentData[ff].pageDigits[iSector][j].emplace_back(0);
454 }
455 fragmentExtends[ff] = false;
456 }
457 };
458 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
459 for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) {
460 handleExtends(ff);
461 }
462 }
 // Assign the page to every fragment whose time range overlaps [timeBin, maxTimeBin].
463 for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) {
464 if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) {
465 if (!fragments[f].second.hasData) {
466 fragments[f].second.hasData = 1;
467 fragments[f].second.zsPtrFirst = k;
468 fragments[f].second.zsPageFirst = l;
469 } else {
 // Non-empty pages were skipped since this fragment last received a page: account for all pages in between,
 // since the fragment is described by a single first/last page range.
470 if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) {
471 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1;
472 for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) {
473 for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) {
474 if (doGPU) {
475 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
476 } else {
477 // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters
478 const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE;
479 const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp;
480 if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) {
481 const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader)));
482 mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples;
483 }
484 }
485 }
486 }
487 } else if (emptyPages) {
 // Only empty pages lie between the previous page of this fragment and the current one.
488 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages;
489 if (doGPU) {
490 for (uint32_t m = 0; m < emptyPages; m++) {
491 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
492 }
493 }
494 }
495 }
 // The current page becomes the (exclusive) end of the fragment's page range.
496 fragments[f].second.zsPtrLast = k + 1;
497 fragments[f].second.zsPageLast = l + 1;
498 fragments[f].second.pageCounter = pageCounter;
499 mCFContext->fragmentData[f].nPages[iSector][j]++;
500 mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples;
501 if (doGPU) {
502 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples);
503 }
504 fragmentExtends[f] = extendsInNextPage;
505 } else {
506 handleExtends(f);
 // Page ends before this fragment starts: no later fragment can overlap. Otherwise the page lies behind this fragment, so it is excluded for the following pages.
507 if (timeBin < (uint32_t)fragments[f].first.last()) {
508 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
509 for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) {
510 handleExtends(ff);
511 }
512 }
513 break;
514 } else {
515 firstPossibleFragment = f + 1;
516 }
517 }
518 }
519 emptyPages = 0;
520 }
521 }
 // Publish the page ranges found for this endpoint.
522 for (uint32_t f = 0; f < mCFContext->nFragments; f++) {
523 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast;
524 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst;
525 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast;
526 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageFirst = fragments[f].second.zsPageFirst;
527 }
528 }
529 mCFContext->nPagesTotal += nPages;
530 mCFContext->nPagesSector[iSector] = nPages;
531
 // Largest number of ADC samples found on a single endpoint of this sector.
532 mCFContext->nDigitsEndpointMax[iSector] = 0;
533 for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) {
534 if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) {
535 mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i];
536 }
537 }
 // Largest page and digit counts of any fragment, summed over all endpoints.
538 uint32_t nDigitsFragmentMax = 0;
539 for (uint32_t i = 0; i < mCFContext->nFragments; i++) {
540 uint32_t pagesInFragment = 0;
541 uint32_t digitsInFragment = 0;
542 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
543 pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j];
544 digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j];
545 }
546 mCFContext->nPagesFragmentMax = std::max(mCFContext->nPagesFragmentMax, pagesInFragment);
547 nDigitsFragmentMax = std::max(nDigitsFragmentMax, digitsInFragment);
548 }
549 mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
550 return {nDigits, nDigitsFragmentMax};
551}
552
553void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane)
554{
555 auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions;
556 auto& out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions;
557 if (doGPU) {
558 const uint32_t iSector = clusterer.mISector;
559 auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
560
561 std::vector<size_t> counts;
562
563 uint32_t nSteps = clusterer.getNSteps(count);
564 if (nSteps > clusterer.mNBufs) {
565 GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs);
566 exit(1);
567 }
568
569 int32_t scanWorkgroupSize = mRec->getGPUParameters(doGPU).par_CF_SCAN_WORKGROUP_SIZE;
570 size_t tmpCount = count;
571 if (nSteps > 1) {
572 for (uint32_t i = 1; i < nSteps; i++) {
573 counts.push_back(tmpCount);
574 if (i == 1) {
575 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, stage);
576 } else {
577 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, tmpCount);
578 }
579 tmpCount = (tmpCount + scanWorkgroupSize - 1) / scanWorkgroupSize;
580 }
581
582 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, nSteps, tmpCount);
583
584 for (uint32_t i = nSteps - 1; i > 1; i--) {
585 tmpCount = counts[i - 1];
586 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({GetGrid(tmpCount - scanWorkgroupSize, scanWorkgroupSize, lane), {iSector}}, i, scanWorkgroupSize, tmpCount);
587 }
588 }
589
590 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({GetGrid(count, scanWorkgroupSize, lane), {iSector}}, 1, stage, in, out);
591 } else {
592 auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks;
593 auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
594 size_t count = 0;
595 for (size_t i = 0; i < nIn; i++) {
596 if (clusterer.mPisPeak[i]) {
597 out[count++] = in[i];
598 }
599 }
600 nOut = count;
601 }
602}
603
// Determines the digit / page counts of one sector and fragment through TPCClusterizerDecodeZSCountUpdate and,
// when TPC cluster finding is among the GPU reco steps, sets up the ZS meta pointers and issues copies of the
// fragment's raw pages and of the ZS offset table into the buffers of the shadow clusterer.
// Returns {0, 0} if the time frame was abandoned; otherwise the pair from TPCClusterizerDecodeZSCountUpdate, with
// the number of extra ADCs of the sector added to .first for fragment index 0.
// NOTE(review): `clusterer` and `src` are used but not declared in the visible lines; the embedded numbering
// skips 615 and 626 — lines appear to be missing from this listing; confirm against the original file.
604std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane, const GPUTPCExtraADC& extraADCs)
605{
606 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
607 if (mCFContext->abandonTimeframe) {
608 return {0, 0};
609 }
610 auto retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
 // The extra ADCs are accounted for in the first fragment only.
611 if (fragment.index == 0) {
612 retVal.first += extraADCs.digitsBySector[iSector].size();
613 }
614 if (doGPU) {
 // Inside this branch doGPU is true, so the conditional always selects the shadow processor.
616 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
617 uint32_t nPagesSector = 0;
618 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
619 uint32_t nPages = 0;
 // The meta structure references the shadow pointer table; the host table entry gets the start of this endpoint's pages inside mPzs.
620 mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
621 mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
622 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
623 const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
624 const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
625 if (max > min) {
 // Copy from `src` up to the end of the last page's payload: the last page contributes its RDH memory size rather than a full page.
 // NOTE(review): `src` is presumably the address of page `min` of this pointer — its declaration is not visible.
627 char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
628 size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast);
629 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true);
630 }
631 nPages += max - min;
632 }
 // All pages of the endpoint are exposed as one contiguous block (count = 1) of nPages pages.
633 mInputsHost->mPzsMeta->sector[iSector].nZSPtr[j] = &mInputsShadow->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
634 mInputsHost->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = nPages;
635 mInputsHost->mPzsMeta->sector[iSector].count[j] = 1;
636 nPagesSector += nPages;
637 }
638 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzsOffsets, clusterer.mPzsOffsets, clusterer.mNMaxPages * sizeof(*clusterer.mPzsOffsets), lane, true);
639 }
640 return retVal;
641}
642
// Prepares the TPC clusterizer for processing.
// With restorePointers set, only the previously saved zsOffset host / device pointers and the clustersNative
// pointer are restored and 0 is returned.
// Otherwise the maximum allowed time bin is determined, mCFContext is prepared, the ZS pages of all sectors
// are scanned (or the packed digit counts are used when no ZS input is present), the per-sector maximum digit
// counts are set, the ZS buffers are allocated and the ZS transfer of the first fragment is started for the
// first nTPCClustererLanes sectors.
// Returns 1 if the RDH version is too old, or if the data exceeds the allowed time bin and
// ignoreNonFatalGPUErrors is not set; returns 0 otherwise.
// NOTE(review): `doGPU` is used but not declared in the visible lines, and the `mCFContext == nullptr` branch has
// no visible body; the embedded numbering skips 645, 650, 656, 658, 693 and 747 — lines appear to be missing
// from this listing; confirm against the original file.
643int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers, const GPUTPCExtraADC& extraADCs)
644{
646 if (restorePointers) {
647 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
648 processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost;
649 processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice;
651 }
652 processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave;
653 return 0;
654 }
655 const auto& threadContext = GetThreadContext();
657 if (mCFContext == nullptr) {
659 }
 // In continuous tracking the limit is the larger of continuousMaxTimeBin and the fragment length; otherwise the triggered-mode constant.
660 const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
661 const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : constants::TPC_MAX_TIME_BIN_TRIGGERED;
662 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
663 const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen};
664 mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax);
665 if (GetProcessingSettings().param.tpcTriggerHandling) {
666 mTriggerBuffer->triggers.clear();
667 }
668 if (mIOPtrs.tpcZS) {
669 uint32_t nDigitsFragmentMax[NSECTORS];
 // -1 marks the ZS version as not yet determined; it is set by the page scan.
670 mCFContext->zsVersion = -1;
671 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
672 if (mIOPtrs.tpcZS->sector[iSector].count[0]) {
 // Reject data whose RDH version is older than that of RAWDataHeaderV6.
673 const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0];
674 if (rdh && o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeaderV6>() > o2::raw::RDHUtils::getVersion(rdh)) {
675 GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeader>());
676 return 1;
677 }
678 }
679
 // Prefetch the pages of the next sector.
 // NOTE(review): the loop bounds (count[j], nZSPtr[j][k]) are taken from sector iSector while the pointers are read
 // from sector iSector + 1; if the next sector holds fewer entries, zsPtr[j][k] is indexed past its valid range — confirm.
680 if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
681 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
682 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
683 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
684 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
685 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
686 }
687 }
688 }
689 }
690
 // NOTE(review): TPCClusterizerDecodeZSCount returns {nDigits, nDigitsFragmentMax}; the per-fragment maximum is
 // `.second`, but `.first` (all digits of the sector) is stored in nDigitsFragmentMax here — confirm this is intended.
691 const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
692 nDigitsFragmentMax[iSector] = x.first;
694 mRec->MemoryScalers()->nTPCdigits += x.first;
695 }
696 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
 // Above the threshold the base count is used unchanged; below it the count is scaled to min((threshold + base) / 2, 2 * base).
697 uint32_t nDigitsBase = nDigitsFragmentMax[iSector];
698 uint32_t threshold = 40000000;
699 uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);
700 processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
701 if (doGPU) {
702 processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
703 }
 // With the double pipeline active, the ZS buffers are allocated through the reconstruction instance of the notify context.
704 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
705 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec);
706 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec);
707 } else {
708 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId);
709 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId);
710 }
711 }
712 } else {
 // No ZS input: the digit counts are taken from the packed digits.
713 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
714 uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector];
715 mRec->MemoryScalers()->nTPCdigits += nDigits;
716 processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0);
717 }
718 }
719
720 if (mIOPtrs.tpcZS) {
721 GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits);
722 } else {
723 GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits);
724 }
725
 // The page scan may have raised tpcMaxTimeBin beyond the allowed limit.
726 if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) {
727 GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin);
728 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
729 mCFContext->abandonTimeframe = true;
730 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
731 } else {
732 return 1;
733 }
734 }
735
 // Start the ZS transfer of the first fragment for the first nTPCClustererLanes sectors, using lane indices offset by nTPCClustererLanes.
736 mCFContext->fragmentFirst = CfFragment{std::max<int32_t>(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen};
737 for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) {
738 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
739 mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector, extraADCs);
740 }
741 }
742
 // Save the ZS offset pointers so that a later call with restorePointers can put them back.
743 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
744 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
745 mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets;
746 mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets;
748 }
749 }
750 return 0;
751}
752#endif
753
754int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
755{
756 if (param().rec.fwdTPCDigitsAsClusters) {
757 return ForwardTPCDigits();
758 }
759#ifndef GPUCA_RUN2
760 int32_t tpcTimeBinCut = (mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut) ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin;
761
763 const auto& threadContext = GetThreadContext();
764 const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
765
766 GPUTPCExtraADC extraADCs;
767#ifdef INSERT_SATURATED_SIGNALS
768 extraADCs = GenerateSaturatedSignals();
769#endif
770
771 if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer, extraADCs)) {
772 return 1;
773 }
774 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
776 }
777
779 float tpcHitLowOccupancyScalingFactor = 1.f;
781 uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits;
782 uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF;
783 if (mIOPtrs.settingsTF->nHBFPerTF < 64) {
784 threshold *= 2;
785 }
786 mRec->MemoryScalers()->nTPCHits = std::max<uint32_t>(nHitsBase, std::min<uint32_t>(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating excessive clusters
787 if (nHitsBase < threshold) {
788 float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 2.25f : 1.75f);
789 mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase);
790 tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
791 }
792 }
793 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
794 processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes
795 }
796 mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers
797 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
798 SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate
799 }
800 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
801 RunTPCClusterizer_prepare(true, extraADCs); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
802 }
803
804 if (doGPU && mIOPtrs.tpcZS) {
806 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
807 }
808 if (doGPU) {
809 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
810 }
811
812#ifdef GPUCA_HAS_ONNX
813 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
814 GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
815
816 // Maximum of 4 lanes supported
817 HighResTimer* nnTimers[12];
818
819 if (nn_settings.applyNNclusterizer) {
820 int32_t deviceId = -1;
821 int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
822 int32_t maxThreads = mRec->getNKernelHostThreads(true);
823 // bool recreateMemoryAllocator = false;
824
825 if (GetProcessingSettings().debugLevel >= 1) {
826 nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
827 nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
828 nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
829 nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
830 nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
831 nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
832 nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
833 nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
834 nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
835 nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
836 nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
837 nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
838 }
839
840 mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
841 nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
842 if (nnApplications[lane].mModelsUsed[0]) {
843 SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
844 (nnApplications[lane].mModelClass).setDeviceId(deviceId);
845 if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
846 nnApplications[lane].mModelClass.setIntraOpNumThreads(maxThreads);
847 }
848 (nnApplications[lane].mModelClass).initEnvironment();
849 // Registering this once seems to be enough, even with different environments / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation.
850 // But environment must be valid, so we init the model environment first and use it here afterwards.
851 // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
852 // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
853 // if (lane == 0) {
854 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
855 // }
856 // recreateMemoryAllocator = true;
857 if (!nn_settings.nnLoadFromCCDB) {
858 (nnApplications[lane].mModelClass).initSession(); // loads from file
859 } else {
860 (nnApplications[lane].mModelClass).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModelSize()); // loads from CCDB
861 }
862 }
863 if (nnApplications[lane].mModelsUsed[1]) {
864 SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
865 (nnApplications[lane].mModelReg1).setDeviceId(deviceId);
866 if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
867 nnApplications[lane].mModelReg1.setIntraOpNumThreads(maxThreads);
868 }
869 // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
870 (nnApplications[lane].mModelReg1).initEnvironment();
871 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
872 if (!nn_settings.nnLoadFromCCDB) {
873 (nnApplications[lane].mModelReg1).initSession(); // loads from file
874 } else {
875 (nnApplications[lane].mModelReg1).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModelSize()); // loads from CCDB
876 }
877 }
878 if (nnApplications[lane].mModelsUsed[2]) {
879 SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
880 (nnApplications[lane].mModelReg2).setDeviceId(deviceId);
881 if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
882 nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
883 }
884 // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
885 (nnApplications[lane].mModelReg2).initEnvironment();
886 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
887 if (!nn_settings.nnLoadFromCCDB) {
888 (nnApplications[lane].mModelReg2).initSession(); // loads from file
889 } else {
890 (nnApplications[lane].mModelReg2).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModelSize()); // loads from CCDB
891 }
892 }
893 if (nn_settings.nnClusterizerVerbosity > 0) {
894 LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
895 }
896 });
897 const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
898 const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : constants::TPC_MAX_TIME_BIN_TRIGGERED;
899 for (int32_t sector = 0; sector < NSECTORS; sector++) {
900 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
901 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
902 int32_t lane = sector % numLanes;
903 clustererNN.mDeviceId = deviceId;
904 clustererNN.mISector = sector;
906 nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
907 if (doGPU) {
908 clustererNNShadow.mDeviceId = deviceId;
909 clustererNNShadow.mISector = sector;
911 nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
912 }
913 if (nn_settings.nnClusterizerVerbosity > 2) {
914 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
915 }
917 if (nn_settings.nnClusterizerVerbosity > 2) {
918 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
919 }
920 // nnApplications[lane].createBoundary(clustererNNShadow);
921 // nnApplications[lane].createIndexLookup(clustererNNShadow);
922 }
923 if (doGPU) {
924 if (nn_settings.nnClusterizerVerbosity > 2) {
925 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
926 }
927 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
928 if (nn_settings.nnClusterizerVerbosity > 2) {
929 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
930 }
931 }
932 }
933#endif
934
935 size_t nClsTotal = 0;
936 ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get();
937 ClusterNative* tmpNativeClusters = nullptr;
938 std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
939
940 const bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
941 const bool buildNativeHost = (mRec->GetRecoStepsOutputs() & gpudatatypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
942 const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
943 const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4);
944
946 GPUWarning("Requested to process MC labels, but no labels present");
947 }
948
949 auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
950
951 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
952 if (buildNativeGPU) {
953 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
954 }
955 if (mWaitForFinalInputs && GetProcessingSettings().nTPCClustererLanes > 6) {
956 GPUFatal("ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
957 }
958 if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) {
959 if (mWaitForFinalInputs) {
960 GPUFatal("Cannot use waitForFinalInput callback without delayed output");
961 }
962 if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
963 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
964 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
965 } else {
966 tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
967 tmpNativeClusters = tmpNativeClusterBuffer.get();
968 }
969 }
970
971 GPUTPCLinearLabels mcLinearLabels;
972 if (propagateMCLabels) {
973 // No need to overallocate here, nTPCHits is anyway an upper bound used for the GPU cluster buffer, and we can always enlarge the buffer anyway
974 mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2);
975 mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits);
976 }
977
978 int8_t transferRunning[NSECTORS] = {0};
979 uint32_t outputQueueStart = mOutputQueue.size();
980
981 auto notifyForeignChainFinished = [this]() {
982 if (mPipelineNotifyCtx) {
983 SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory
984 {
985 std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
986 mPipelineNotifyCtx->ready = true;
987 }
988 mPipelineNotifyCtx->cond.notify_one();
989 }
990 };
991 bool synchronizeCalibUpdate = false;
992
993 for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) {
994 std::vector<bool> laneHasData(GetProcessingSettings().nTPCClustererLanes, false);
995 static_assert(NSECTORS <= constants::GPU_MAX_STREAMS, "Stream events must be able to hold all sectors");
996 const int32_t maxLane = std::min<int32_t>(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase);
997 for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
998 if (GetProcessingSettings().debugLevel >= 3) {
999 GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
1000 }
1001 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
1002 if (doGPU && fragment.index != 0) {
1003 SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished
1004 }
1005
1006 uint32_t iSector = iSectorBase + lane;
1008 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1009 clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0;
1010 clusterer.mPmemory->fragment = fragment;
1011
1013 bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS;
1014 bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
1015 auto* inDigits = mIOPtrs.tpcPackedDigits;
1016 size_t numDigits = inDigits->nTPCDigits[iSector];
1017 if (setDigitsOnGPU) {
1018 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
1019 }
1020 if (setDigitsOnHost) {
1021 clusterer.mPdigits = const_cast<o2::tpc::Digit*>(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast)
1022 }
1023 clusterer.mPmemory->counters.nDigits = numDigits;
1024 }
1025
1026 if (mIOPtrs.tpcZS) {
1027 if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
1028 clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first;
1029 clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second;
1030 } else {
1031 clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0;
1032 }
1033 }
1034 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1035
1036 using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
1037 using PeakMapType = decltype(*clustererShadow.mPpeakMap);
1038 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout<ChargeMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType));
1039 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout<PeakMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType));
1040 if (fragment.index == 0) {
1041 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_CLUSTERER_STRIDED_PAD_COUNT * sizeof(*clustererShadow.mPpadIsNoisy));
1042 }
1044
1045 if (doGPU) {
1046 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
1047 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane);
1048 SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane);
1049 }
1050 SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory
1051 }
1052
1053 if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) {
1054 clusterer.mPmemory->counters.nPositions = 0;
1055 return;
1056 }
1058 clusterer.mPmemory->counters.nPositions = 0;
1059 return;
1060 }
1061
1062 if (propagateMCLabels) {
1063 if (fragment.index == 0) {
1064 // Must be only called on the first fragment as some buffers are used across the whole timeframe
1065 clusterer.AllocMCBuffers();
1066 }
1067 clusterer.InitMCBuffersForFragment();
1068 clusterer.mPinputLabels = digitsMC->v[iSector];
1069 if (clusterer.mPinputLabels == nullptr) {
1070 GPUFatal("MC label container missing, sector %d", iSector);
1071 }
1073 GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize());
1074 }
1075 }
1076
1077 if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) {
1078 if (not mIOPtrs.tpcZS) {
1079 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
1080 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1081 } else if (propagateMCLabels) {
1082 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr);
1083 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1084 }
1085 }
1086
1087 if (mIOPtrs.tpcZS) {
1088 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
1089 uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS;
1090
1091 switch (mCFContext->zsVersion) {
1092 default:
1093 GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
1094 break;
1097 runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
1098 break;
1100 runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
1101 break;
1103 runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
1104 break;
1105 }
1106 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1107 } // clang-format off
1108 });
1109 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
1110 uint32_t iSector = iSectorBase + lane;
1111 if (doGPU) {
1112 SynchronizeStream(lane);
1113 }
1114 if (mIOPtrs.tpcZS) {
1115 CfFragment f = fragment.next();
1116 int32_t nextSector = iSector;
1117 if (f.isEnd()) {
1118 nextSector += GetProcessingSettings().nTPCClustererLanes;
1119 f = mCFContext->fragmentFirst;
1120 }
1121 if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
1122 mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane, extraADCs);
1123 }
1124 }
1126 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1127 if (clusterer.mPmemory->counters.nPositions == 0) {
1128 return;
1129 }
1130 if (!mIOPtrs.tpcZS) {
1131 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
1132 }
1133
1134 TPCClusterizerTransferExtraADC(clusterer, clustererShadow, lane, extraADCs);
1135
1136 if (propagateMCLabels) {
1137 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillIndexMap>({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
1138 }
1139
1140 bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
1141 checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
1142 checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
1143 // TODO Move hipTailFilter flag to ProcessingSettings?
1144 // TODO Add some warning when re-enabling pad filter with this flag, so it's not just silently enabled when disabling was requested
1145 checkForNoisyPads |= rec()->GetParam().rec.tpc.hipTailFilter;
1146
1147 if (rec()->GetParam().rec.tpc.hipTailFilter && !doGPU) {
1148 GPUError("HIP tail filter enabled, but this is currently not supported on CPU");
1149 }
1150
1151 if (checkForNoisyPads) {
1152 if (rec()->GetParam().rec.tpc.hipTailFilter) {
1153 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPhipTailsByRow, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPhipTailsByRow) * GPUTPCCFHIPClusterizer::MaxHIPTailsPerRow);
1154 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPnHIPTails, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPnHIPTails));
1155 }
1156 const int32_t nBlocks = GPUTPCCFCheckPadBaseline::GetNBlocks(doGPU);
1157
1158 runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
1159 getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_REAL_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
1160 }
1161
1163 // Avoid additional sync when also dumping digits
1164 const bool debugSyncChargeMap = !(GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCClustererDigits);
1165 // DumpChargeMap should run after noisy pad filter to avoid yet another dump of intermediate data. When chargemap without pad filter is required, disable pad filter instead.
1166 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, debugSyncChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Charges");
1167
1168 TPCClusterizerCheckExtraADCZeros(clusterer, clustererShadow, lane, extraADCs);
1169
1170 runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
1172 clusterer.DumpPeakMap(*mDebugFile, "Peaks");
1173 }
1174
1175 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
1176 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1177 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off
1178 });
1179 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
1180 uint32_t iSector = iSectorBase + lane;
1182 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1183 if (doGPU) {
1184 SynchronizeStream(lane);
1185 }
1186 if (clusterer.mPmemory->counters.nPeaks == 0) {
1187 return;
1188 }
1189 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
1190 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
1192 clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
1193 }
1194
1195 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
1196 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1198 });
1199 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
1200 uint32_t iSector = iSectorBase + lane;
1202 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1203
1204 if (clusterer.mPmemory->counters.nPositions == 0) {
1205 return;
1206 }
1207
1208 if (doGPU) {
1209 SynchronizeStream(lane);
1210 }
1211
1212 if (fragment.index == 0) {
1213 deviceEvent* waitEvent = nullptr;
1214 if (transferRunning[lane] == 1) {
1215 waitEvent = &mEvents->stream[lane];
1216 transferRunning[lane] = 2;
1217 }
1218 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUTPCGeometry::NROWS * sizeof(*clustererShadow.mPclusterInRow));
1219 }
1220
1221 const auto nRegularClusters = clusterer.mPmemory->counters.nClusters;
1222 if (nRegularClusters != 0) {
1223 if (GetProcessingSettings().nn.applyNNclusterizer) {
1224#ifdef GPUCA_HAS_ONNX
1225 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane];
1226 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
1227 GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
1228
1229 // int withMC = (doGPU && propagateMCLabels);
1230
1231 if (nn_settings.nnClusterizerApplyCfDeconvolution) {
1232 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
1233 } else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
1234 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false);
1235 }
1236
1237 // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
1238 if (nn_settings.nnClusterizerVerbosity > 2) {
1239 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << ", numLoops=" << std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode) << ", numClusters=" << clusterer.mPmemory->counters.nClusters << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1240 }
1241 for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
1242 if (nn_settings.nnClusterizerVerbosity > 3) {
1243 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1244 }
1245 uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
1246 size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
1247
1248 // Filling the data
1249 if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
1250 // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
1251 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
1252 } else {
1253 // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
1254 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
1255 }
1256 if (nn_settings.nnClusterizerVerbosity > 3) {
1257 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1258 }
1259
1260 if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
1261 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
1262 if (nn_settings.nnClusterizerVerbosity > 3) {
1263 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1264 }
1265 }
1266
1267 // NN evaluations
1268 if(clustererNNShadow.mNnClusterizerUseClassification) {
1269 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
1270 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1271 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1272 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
1273 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1274 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
1275 }
1276 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1277 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1278 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
1279 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1280 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
1281 }
1282 }
1283 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
1284 if (nn_settings.nnClusterizerVerbosity > 3) {
1285 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1286 }
1287 }
1288 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1289 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
1290 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1291 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1292 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
1293 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1294 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
1295 }
1296 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1297 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1298 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
1299 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1300 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
1301 }
1302 }
1303 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
1304 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1305 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
1306 if (clustererNNShadow.mNnInferenceInputDType == 0) {
1307 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1308 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
1309 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1310 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
1311 }
1312 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1313 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1314 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
1315 } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1316 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
1317 }
1318 }
1319 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
1320 }
1321 if (nn_settings.nnClusterizerVerbosity > 3) {
1322 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1323 }
1324 }
1325
1326 // Publishing kernels for class labels and regression results
1327 // In case classification should not be used, this kernel should still be executed to fill the mOutputDataClass array with default values
1328 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
1329 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
1330 } else {
1331 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
1332 }
1333 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1334 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 1 regression results
1335 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1336 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
1337 }
1338 }
1339 if (nn_settings.nnClusterizerVerbosity > 3) {
1340 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1341 }
1342 }
1343
1344 if (clustererNNShadow.mNnClusterizerUseCfRegression) {
1345 if(!nn_settings.nnClusterizerApplyCfDeconvolution) { // If it is already applied don't do it twice, otherwise apply now
1346 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
1347 }
1349 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1350 if (nn_settings.nnClusterizerVerbosity > 3) {
1351 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1352 }
1353 }
1354#else
1355 GPUFatal("Project not compiled with neural network clusterization. Aborting.");
1356#endif
1357 } else {
1358 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
1360 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
1361 } // if (GetProcessingSettings().nn.applyNNclusterizer)
1362
1363 if (doGPU && propagateMCLabels) {
1364 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
1365 SynchronizeStream(lane);
1366 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
1367 }
1368 } // if (nRegularClusters != 0) {
1369
1370
1371 // TODO: Move this right after CheckPadBaseline once tail zeroing is moved into this kernel.
1372 if (rec()->GetParam().rec.tpc.hipTailFilter) {
1373 runKernel<GPUTPCCFHIPTailConnector>({GetGridBlk(GPUTPCGeometry::NROWS, lane), {iSector}});
1374 runKernel<GPUTPCCFHIPClusterizer>({GetGridBlk(GPUTPCGeometry::NROWS, lane), {iSector}}, 0);
1375 if (doGPU && (nRegularClusters == 0 || GetProcessingSettings().debugLevel >= 3)) {
1376 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
1377 SynchronizeStream(lane);
1378 }
1379 if (doGPU && propagateMCLabels) {
1380 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
1381 SynchronizeStream(lane);
1382 runKernel<GPUTPCCFHIPClusterizer>({GetGrid(GPUTPCGeometry::NROWS, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
1383 }
1384 }
1385
1386 bool hasClusters = nRegularClusters != 0;
1387
1388 // Paranoid edge case: If no regular clusters were found, need to still check that no HIP clusters were created
1389 // HIPClusterizer kernel doesn't update counters.nClusters, because:
1390 // - 64bit atomic support in OpenCL is flaky
1391 // - nClusters is only used internally by clusterizer to track #peaks that will probably become clusters,
1392 // so storing the number of HIP clusters there is only asking for trouble anyway
1393 if (rec()->GetParam().rec.tpc.hipTailFilter && nRegularClusters == 0) {
1394 for (uint32_t row = 0; row < GPUTPCGeometry::NROWS; row++) {
1395 hasClusters |= clusterer.mPclusterInRow[row] != 0;
1396 }
1397 }
1398
1399 if (!hasClusters) {
1400 return;
1401 }
1402
1403 if (GetProcessingSettings().debugLevel >= 3) {
1404 GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSector, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
1405 }
1406
1407 TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane);
1408 laneHasData[lane] = true;
1409 });
1411 }
1412
1413 size_t nClsFirst = nClsTotal;
1414 bool anyLaneHasData = false;
1415 for (int32_t lane = 0; lane < maxLane; lane++) {
1416 uint32_t iSector = iSectorBase + lane;
1417 std::fill(&tmpNativeAccess->nClusters[iSector][0], &tmpNativeAccess->nClusters[iSector][0] + MAXGLOBALPADROW, 0);
1418 if (doGPU) {
1419 SynchronizeStream(lane);
1420 }
1422 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1423
1424 if (laneHasData[lane]) {
1425 anyLaneHasData = true;
1426 // Include clusters in default debug mask, exclude other debug output by default.
1427 // The cluster buffers are accumulated per sector, so dump them once after all fragments.
1428 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
1429 if (buildNativeGPU && GetProcessingSettings().tpccfGatherKernel) {
1430 runKernel<GPUTPCCFGather>({GetGridBlk(GPUTPCGeometry::NROWS, mRec->NStreams() - 1), {iSector}}, &mInputsShadow->mPclusterNativeBuffer[nClsTotal]);
1431 }
1432 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
1433 if (nClsTotal + clusterer.mPclusterInRow[j] > mInputsHost->mNClusterNative) {
1434 clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
1435 continue;
1436 }
1437 if (buildNativeGPU) {
1438 if (!GetProcessingSettings().tpccfGatherKernel) {
1439 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&mInputsShadow->mPclusterNativeBuffer[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, -2);
1440 }
1441 } else if (buildNativeHost) {
1442 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, false);
1443 }
1444 tmpNativeAccess->nClusters[iSector][j] += clusterer.mPclusterInRow[j];
1445 nClsTotal += clusterer.mPclusterInRow[j];
1446 }
1447 if (transferRunning[lane]) {
1448 ReleaseEvent(mEvents->stream[lane], doGPU);
1449 }
1450 RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1);
1451 transferRunning[lane] = 1;
1452 }
1453
1454 if (not propagateMCLabels || not laneHasData[lane]) {
1455 continue;
1456 }
1457
1458 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::setRowOffsets>({GetGrid(GPUTPCGeometry::NROWS, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
1460 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::flatten>({GetGrid(GPUTPCGeometry::NROWS, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, &mcLinearLabels);
1461 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1462 }
1463 for (int32_t lane = 0; lane < maxLane; lane++) {
1464 processors()->tpcClusterer[iSectorBase + lane].FreeMCBuffers();
1465 }
1466 if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
1467 if (GetProcessingSettings().delayedOutput) {
1468 mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
1469 } else {
1470 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
1471 }
1472 }
1473
1474 if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
1475 notifyForeignChainFinished();
1476 }
1477 if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
1478 mWaitForFinalInputs();
1479 synchronizeCalibUpdate = DoQueuedUpdates(0, false);
1480 }
1481 }
1482 for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1483#ifdef GPUCA_HAS_ONNX
1484 if (GetProcessingSettings().nn.applyNNclusterizer) {
1485 if (GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
1486 LOG(info) << "(ORT) Environment releasing...";
1487 }
1488 GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1489 nnApplication.mModelClass.release(true);
1490 nnApplication.mModelReg1.release(true);
1491 nnApplication.mModelReg2.release(true);
1492 }
1493#endif
1494 if (transferRunning[i]) {
1495 ReleaseEvent(mEvents->stream[i], doGPU);
1496 }
1497 }
1498
1499 if (GetProcessingSettings().param.tpcTriggerHandling) {
1501 if (triggerOutput && triggerOutput->allocator) {
1502 // GPUInfo("Storing %lu trigger words", mTriggerBuffer->triggers.size());
1503 auto* outputBuffer = (decltype(mTriggerBuffer->triggers)::value_type*)triggerOutput->allocator(mTriggerBuffer->triggers.size() * sizeof(decltype(mTriggerBuffer->triggers)::value_type));
1504 std::copy(mTriggerBuffer->triggers.begin(), mTriggerBuffer->triggers.end(), outputBuffer);
1505 }
1506 mTriggerBuffer->triggers.clear();
1507 }
1508
1509 // Number of clusters is logged by tracking. This ensures clusters are still printed if it's not running
1511 GPUInfo("Event has %zu TPC Clusters", nClsTotal);
1512 }
1513
1514 ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr;
1515 if (propagateMCLabels) { // TODO: write to buffer directly
1517 std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
1519 if (!GetProcessingSettings().tpcWriteClustersAfterRejection && !sortClusters && labelOutputControl && labelOutputControl->useExternal()) {
1520 if (!labelOutputControl->allocator) {
1521 throw std::runtime_error("Cluster MC Label buffer missing");
1522 }
1524 buffer = {&container->first, &container->second};
1525 } else {
1526 mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
1527 mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
1529 }
1530
1531 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1532 assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
1533
1534 mcLabels.setFrom(mcLinearLabels.header, mcLinearLabels.data);
1535 mcLabels.flatten_to(*buffer.first);
1536 *buffer.second = *buffer.first;
1537 mcLabelsConstView = buffer.second;
1538 }
1539
1540 if (buildNativeHost && buildNativeGPU && GetProcessingSettings().delayedOutput) {
1541 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClsTotal;
1542 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1543 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
1544 for (uint32_t i = outputQueueStart; i < mOutputQueue.size(); i++) {
1545 mOutputQueue[i].dst = (char*)tmpNativeClusters + (size_t)mOutputQueue[i].dst;
1546 }
1547 }
1548
1549 if (buildNativeHost) {
1550 tmpNativeAccess->clustersLinear = tmpNativeClusters;
1551 tmpNativeAccess->clustersMCTruth = mcLabelsConstView;
1552 tmpNativeAccess->setOffsetPtrs();
1553 mIOPtrs.clustersNative = tmpNativeAccess;
1554 if (GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
1555 auto allocator = [this, &tmpNativeClusters](size_t size) {
1556 this->mInputsHost->mNClusterNative = size;
1557 this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1558 return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
1559 };
1560 RunTPCClusterFilter(tmpNativeAccess, allocator, false);
1561 nClsTotal = tmpNativeAccess->nClustersTotal;
1562 }
1563 }
1564
1565 if (!mWaitForFinalInputs) {
1566 notifyForeignChainFinished();
1567 }
1568
1569 if (buildNativeGPU) {
1570 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
1571 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
1572 *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
1573 mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
1574 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
1575 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceClusterNativeAccess, 0);
1576 }
1577 if (doGPU && synchronizeOutput) {
1579 }
1580 if (doGPU && synchronizeCalibUpdate) {
1582 }
1583 if (sortClusters) {
1584 SortClusters(buildNativeGPU, propagateMCLabels, tmpNativeAccess, tmpNativeClusters);
1585 }
1586 mRec->MemoryScalers()->nTPCHits = nClsTotal;
1587 mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST"));
1588 if (mPipelineNotifyCtx) {
1590 mPipelineNotifyCtx = nullptr;
1591 }
1592
1593 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
1595 }
1596
1597#endif
1598 return 0;
1599}
1600
// Sorts the clusters of every (sector, row) range of the linear cluster array in place,
// using the ordering defined by operator< on ClusterNative.
//
// Parameters:
//   buildNativeGPU    - if true, the sorted array is copied to mInputsShadow->mPclusterNativeBuffer at the end.
//   propagateMCLabels - if true, the sort permutation is recorded and the MC labels are rebuilt in the
//                       new cluster order; clusterAccess->clustersMCTruth is then redirected to the new view.
//   clusterAccess     - supplies nClustersTotal, clusterOffset[sector][row], nClusters[sector][row] and
//                       (label case only) clustersMCTruth.
//   clusters          - linear cluster array indexed with the offsets from clusterAccess; modified in place.
//
// NOTE(review): this listing jumps over original lines 1623, 1627 and 1637. The declarations of
// labelOutput, labelContainer and tmpContainer are not visible here and presumably live on those
// lines - confirm against the real source file before editing this function.
1601 void GPUChainTracking::SortClusters(bool buildNativeGPU, bool propagateMCLabels, ClusterNativeAccess* clusterAccess, ClusterNative* clusters)
1602{
1603 if (propagateMCLabels) {
     // Label-preserving path: sort an index permutation instead of the clusters themselves,
     // so that afterwards clsOrder[k] is the original (global) index of the cluster now at position k.
1604 std::vector<uint32_t> clsOrder(clusterAccess->nClustersTotal);
1605 std::iota(clsOrder.begin(), clsOrder.end(), 0);
     // Scratch copy of one row's clusters, reused (resized) for every (sector, row) range.
1606 std::vector<ClusterNative> tmpClusters;
1607 for (uint32_t i = 0; i < NSECTORS; i++) {
1608 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
1609 const uint32_t offset = clusterAccess->clusterOffset[i][j];
     // Order this row's slice of indices by comparing the clusters they refer to; clusters[] is untouched here.
     // NOTE(review): &clsOrder[offset + n] goes through operator[] with an index equal to clsOrder.size()
     // whenever a range ends at nClustersTotal (and with index 0 on an empty vector);
     // clsOrder.data() + ... would avoid that - confirm whether this matters for the build flags in use.
1610 std::sort(&clsOrder[offset], &clsOrder[offset + clusterAccess->nClusters[i][j]], [&clusters](const uint32_t a, const uint32_t b) {
1611 return clusters[a] < clusters[b];
1612 });
     // Snapshot the row in its unsorted order, because the permutation is applied in place right below.
1613 tmpClusters.resize(clusterAccess->nClusters[i][j]);
1614 memcpy(tmpClusters.data(), &clusters[offset], clusterAccess->nClusters[i][j] * sizeof(tmpClusters[0]));
1615 for (uint32_t k = 0; k < tmpClusters.size(); k++) {
     // clsOrder holds global indices; subtracting offset converts them to indices into the row-local snapshot.
1616 clusters[offset + k] = tmpClusters[clsOrder[offset + k] - offset];
1617 }
1618 }
1619 }
1620 tmpClusters.clear();
1621
     // labelBuffer = (container that receives the flattened labels, view that is assigned from it).
1622 std::pair<o2::dataformats::ConstMCLabelContainer*, o2::dataformats::ConstMCLabelContainerView*> labelBuffer;
     // Holders for the previous mIOMem label objects; they are only filled in the else-branch below.
1624 std::unique_ptr<ConstMCLabelContainerView> tmpUniqueContainerView;
1625 std::unique_ptr<ConstMCLabelContainer> tmpUniqueContainerBuffer;
1626 if (labelOutput && labelOutput->allocator) {
     // An output control with an allocator is present: write into the pair referenced by labelContainer
     // (its declaration/initialization is on a line missing from this listing).
1628 labelBuffer = {&labelContainer->first, &labelContainer->second};
1629 } else {
     // No such output: move the current mIOMem label view/buffer into the local holders and install
     // fresh, empty ones in mIOMem as the destination.
     // NOTE(review): presumably the old objects are kept alive because clusterAccess->clustersMCTruth
     // may still point at the old view while it is read in the loop below - confirm against the caller.
1630 tmpUniqueContainerView = std::move(mIOMem.clusterNativeMCView);
1631 tmpUniqueContainerBuffer = std::move(mIOMem.clusterNativeMCBuffer);
1632 mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
1633 mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
1634 labelBuffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()};
1635 }
1636
     // Rebuild the labels in sorted order: every label of the cluster originally at clsOrder[i]
     // is added to tmpContainer under the new index i.
1638 for (uint32_t i = 0; i < clusterAccess->nClustersTotal; i++) {
1639 for (const auto& element : clusterAccess->clustersMCTruth->getLabels(clsOrder[i])) {
1640 tmpContainer.addElement(i, element);
1641 }
1642 }
     // Flatten into the destination container, assign the view from it, and make the access
     // structure point at the new view.
1643 tmpContainer.flatten_to(*labelBuffer.first);
1644 *labelBuffer.second = *labelBuffer.first;
1645 clusterAccess->clustersMCTruth = labelBuffer.second;
1646 } else {
     // No labels to keep in sync: sort each (sector, row) range of clusters directly.
1647 for (uint32_t i = 0; i < NSECTORS; i++) {
1648 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
1649 std::sort(&clusters[clusterAccess->clusterOffset[i][j]], &clusters[clusterAccess->clusterOffset[i][j] + clusterAccess->nClusters[i][j]]);
1650 }
1651 }
1652 }
1653 if (buildNativeGPU) {
     // Copy all nClustersTotal sorted clusters from `clusters` into mInputsShadow->mPclusterNativeBuffer
     // (GPUMemCpy called with stream = -1 and toGPU = true).
1654 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)clusters, clusterAccess->nClustersTotal * sizeof(clusters[0]), -1, true);
1655 }
1656}
Definition of the TPC Digit.
default_random_engine gen(dev())
atype::type element
int32_t i
TPCZSHDR * hdr
zsPage * page
uint32_t iSector
uint32_t pageCounter
o2::raw::RawFileWriter * raw
int32_t retVal
Class to serialize ONNX objects for ROOT snapshots of CCDB objects at runtime.
std::enable_if_t< std::is_signed< T >::value, bool > hasData(const CalArray< T > &cal)
Definition Painter.cxx:600
uint16_t pos
Definition RawData.h:3
uint32_t j
Definition RawData.h:0
uint8_t endpoint
Definition RawData.h:0
Provides a basic fallback implementation for Vc.
Definitions of TPC Zero Suppression Data Headers.
void Start()
Definition timer.cxx:64
void Stop()
Definition timer.cxx:76
A container to hold and manage MC truth information/labels.
void addElement(uint32_t dataindex, TruthElement const &element, bool noElement=false)
void setFrom(std::vector< MCTruthHeaderElement > &header, std::vector< TruthElement > &truthArray)
size_t flatten_to(ContainerType &container) const
std::unique_ptr< o2::tpc::ClusterNativeAccess > mClusterNativeAccess
int32_t RunTPCClusterizer(bool synchronizeOutput=true)
std::unique_ptr< GPUTrackingInputProvider > mInputsHost
std::array< GPUOutputControl *, GPUTrackingOutputs::count()> mSubOutputControls
std::unique_ptr< std::ofstream > mDebugFile
std::unique_ptr< GPUTriggerOutputs > mTriggerBuffer
std::vector< outputQueueEntry > mOutputQueue
std::unique_ptr< GPUTPCCFChainContext > mCFContext
int32_t DoQueuedUpdates(int32_t stream, bool updateSlave=true)
std::unique_ptr< GPUNewCalibValues > mNewCalibValues
GPUTrackingInOutPointers & mIOPtrs
struct o2::gpu::GPUChainTracking::InOutMemory mIOMem
std::unique_ptr< GPUTrackingInputProvider > mInputsShadow
void RecordMarker(deviceEvent *ev, int32_t stream)
Definition GPUChain.h:108
void TransferMemoryResourceLinkToGPU(RecoStep step, int16_t res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:124
void GPUMemCpyAlways(RecoStep step, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:130
void GPUMemCpy(RecoStep step, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:129
bool DoDebugAndDump(RecoStep step, uint32_t mask, T &processor, S T::*func, Args &&... args)
Definition GPUChain.h:239
GPUReconstruction::RecoStepField GetRecoStepsGPU() const
Definition GPUChain.h:72
GPUReconstruction::RecoStepField GetRecoSteps() const
Definition GPUChain.h:71
void WriteToConstantMemory(RecoStep step, size_t offset, const void *src, size_t size, int32_t stream=-1, deviceEvent *ev=nullptr)
Definition GPUChain.h:128
void ReleaseEvent(deviceEvent ev, bool doGPU=true)
Definition GPUChain.h:111
krnlExec GetGrid(uint32_t totalItems, uint32_t nThreads, int32_t stream, GPUReconstruction::krnlDeviceType d=GPUReconstruction::krnlDeviceType::Auto, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
Definition GPUChain.cxx:21
size_t AllocateRegisteredMemory(GPUProcessor *proc)
Definition GPUChain.h:228
virtual std::unique_ptr< GPUReconstructionProcessing::threadContext > GetThreadContext()
Definition GPUChain.h:109
GPUConstantMem * processors()
Definition GPUChain.h:84
static constexpr krnlRunRange krnlRunRangeNone
Definition GPUChain.h:41
void SetONNXGPUStream(Ort::SessionOptions &opt, int32_t stream, int32_t *deviceId)
Definition GPUChain.h:90
krnlExec GetGridAutoStep(int32_t stream, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
Definition GPUChain.cxx:47
GPUParam & param()
Definition GPUChain.h:87
void SetupGPUProcessor(T *proc, bool allocate)
Definition GPUChain.h:231
const GPUSettingsProcessing & GetProcessingSettings() const
Definition GPUChain.h:76
void SynchronizeStream(int32_t stream)
Definition GPUChain.h:89
GPUReconstructionCPU * mRec
Definition GPUChain.h:79
GPUConstantMem * processorsShadow()
Definition GPUChain.h:85
krnlExec GetGridBlk(uint32_t nBlocks, int32_t stream, GPUReconstruction::krnlDeviceType d=GPUReconstruction::krnlDeviceType::Auto, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
Definition GPUChain.cxx:32
static constexpr int32_t NSECTORS
Definition GPUChain.h:58
const GPUParam & GetParam() const
Definition GPUChain.h:63
void TransferMemoryResourceLinkToHost(RecoStep step, int16_t res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:125
void TransferMemoryResourcesToHost(RecoStep step, GPUProcessor *proc, int32_t stream=-1, bool all=false)
Definition GPUChain.h:123
GPUReconstruction * rec()
Definition GPUChain.h:66
HighResTimer & getGeneralStepTimer(GeneralStep step)
void runParallelOuterLoop(bool doGPU, uint32_t nThreads, std::function< void(uint32_t)> lambda)
const GPUDefParameters & getGPUParameters(bool doGPU) const override
void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction *rec, GPUOutputControl *control=nullptr)
void ComputeReuseMax(GPUProcessor *proc)
RecoStepField GetRecoStepsGPU() const
void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor *proc=nullptr)
const GPUParam & GetParam() const
void PushNonPersistentMemory(uint64_t tag)
InOutTypeField GetRecoStepsOutputs() const
GPUMemorySizeScalers * MemoryScalers()
static void setGlobalOffsetsAndAllocate(GPUTPCClusterFinder &, GPUTPCLinearLabels &)
void SetMaxData(const GPUTrackingInOutPointers &io)
void SetNMaxDigits(size_t nDigits, size_t nPages, size_t nDigitsFragment, size_t nDigitsEndpointMax)
void DumpSuppressedPeaks(std::ostream &out)
void DumpPeakMap(std::ostream &out, std::string_view)
o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > const * mPinputLabels
void DumpChargeMap(std::ostream &out, std::string_view)
uint32_t getNSteps(size_t items) const
void DumpSuppressedPeaksCompacted(std::ostream &out)
void DumpPeaksCompacted(std::ostream &out)
tpc::ClusterNative * mPclusterByRow
static constexpr uint32_t NROWS
void init(const GPUSettingsProcessingNNclusterizer &, bool=false)
void initClusterizer(const GPUSettingsProcessingNNclusterizer &, GPUTPCNNClusterizer &, int32_t=-1, int32_t=-1)
OrtDataType::Float16_t * mInputData_16
OrtDataType::Float16_t * mOutputDataReg2_16
OrtDataType::Float16_t * mModelProbabilities_16
OrtDataType::Float16_t * mOutputDataReg1_16
void release(bool=false)
void setIntraOpNumThreads(int threads)
std::vector< std::vector< int64_t > > getNumOutputNodes() const
#define TPC_REAL_PADS_IN_SECTOR
#define TPC_CLUSTERER_STRIDED_PAD_COUNT
GLint GLenum GLint x
Definition glcorearb.h:403
const GLfloat * m
Definition glcorearb.h:4066
GLenum src
Definition glcorearb.h:1767
GLint GLsizei count
Definition glcorearb.h:399
GLuint buffer
Definition glcorearb.h:655
GLsizeiptr size
Definition glcorearb.h:659
GLdouble f
Definition glcorearb.h:310
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLboolean * data
Definition glcorearb.h:298
GLintptr offset
Definition glcorearb.h:660
GLenum GLfloat param
Definition glcorearb.h:271
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
uint8_t itsSharedClusterMap uint8_t
constexpr int LHCMaxBunches
Definition of a container to keep/associate and arbitrary number of labels associated to an index wit...
RAWDataHeaderV7 RAWDataHeader
const float k2
Definition MathUtils.h:75
void dumpBuffer(gsl::span< const std::byte > buffer, std::ostream &out=std::cout, size_t maxbytes=std::numeric_limits< size_t >::max())
Definition DumpBuffer.h:139
std::unique_ptr< const o2::dataformats::MCTruthContainer< MCLabel > > getLabels(framework::ProcessingContext &pc, std::string_view dataBind, EventType eventType=EventType::Standard)
constexpr int LHCBCPERTIMEBIN
Definition Constants.h:38
constexpr int MAXGLOBALPADROW
Definition Constants.h:34
Global TPC definitions and constants.
Definition SimTraits.h:168
@ ZSVersionDenseLinkBased
@ ZSVersionLinkBasedWithMeta
@ ZSVersionRowBased10BitADC
@ ZSVersionRowBased12BitADC
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
constexpr T qStr2Tag(const char(&str)[N])
Definition strtag.h:24
tpccf::TPCTime start
Definition CfFragment.h:31
S< o2::tpc::ORTRootSerializer >::type * nnClusterizerNetworks[3]
std::unique_ptr< o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > > clusterNativeMCView
std::unique_ptr< o2::dataformats::ConstMCTruthContainer< o2::MCCompLabel > > clusterNativeMCBuffer
deviceEvent stream[constants::GPU_MAX_STREAMS]
GPUCalibObjectsConst calibObjects
GPUTPCClusterFinder tpcClusterer[GPUTPCGeometry::NSECTORS]
GPUTrackingInOutPointers ioPtrs
size_t NTPCClusters(size_t tpcDigits, bool perSector=false)
std::function< void *(size_t)> allocator
struct o2::gpu::GPUTPCClusterFinder::Memory::counters_t counters
std::array< std::vector< tpc::Digit >, tpc::constants::MAXSECTOR > digitsBySector
std::vector< o2::MCCompLabel > data
std::vector< o2::dataformats::MCTruthHeaderElement > header
const GPUTPCDigitsMCInput * tpcDigitsMC
const o2::tpc::ClusterNativeAccess * clustersNative
const GPUTrackingInOutZS * tpcZS
const GPUTrackingInOutDigits * tpcPackedDigits
GPUTrackingInOutZSSector sector[NSECTORS]
static constexpr uint32_t NENDPOINTS
size_t getIndex(const GPUOutputControl &v)
static constexpr int getVersion()
get numeric version of the RDH
Definition RDHUtils.h:60
unsigned int nClusters[constants::MAXSECTOR][constants::MAXGLOBALPADROW]
const o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > * clustersMCTruth
std::pair< ConstMCLabelContainer, ConstMCLabelContainerView > ConstMCLabelContainerViewWithBuffer
unsigned int clusterOffset[constants::MAXSECTOR][constants::MAXGLOBALPADROW]
const ClusterNative * clustersLinear
static constexpr unsigned int TRIGGER_WORD_SIZE
unsigned char nTimeBinSpan
unsigned char version
unsigned short timeOffset
static constexpr size_t TPC_ZS_PAGE_SIZE
unsigned short nADCsamples
Trigger info including the orbit.
uint32_t orbit
orbit of the trigger word
TriggerWordDLBZS triggerWord
trigger Word information
bool isValid(int entry=0) const
constexpr size_t min
constexpr size_t max
LOG(info)<< "Compressed in "<< sw.CpuTime()<< " s"
std::vector< Cluster > clusters
std::vector< Digit > digits
std::vector< int > row
ArrayADC adc
typename std::vector< T, vecpod_allocator< T > > vecpod
Definition vecpod.h:31