GPUChainTrackingClusterizer.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "GPUChainTracking.h"
17#include "GPULogging.h"
18#include "GPUO2DataTypes.h"
21#include "GPUNewCalibValues.h"
22#include <fstream>
23
24#ifdef GPUCA_O2_LIB
26#endif
27#include "GPUTriggerOutputs.h"
28#include "GPUHostDataTypes.h"
34#include "TPCBase/RDHUtils.h"
35
36#include "utils/strtag.h"
37
38#ifndef GPUCA_NO_VC
39#include <Vc/Vc>
40#endif
41
42#ifdef GPUCA_HAS_ONNX
45#endif
46
47using namespace o2::gpu;
48using namespace o2::tpc;
49using namespace o2::tpc::constants;
50using namespace o2::dataformats;
51
52#ifdef GPUCA_TPC_GEOMETRY_O2
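// Re-derives the per-endpoint digit and page counts of sector iSector for one time-frame fragment from the
// page ranges (mMinMaxCN) filled by the TPCClusterizerDecodeZSCount scan; on GPU it additionally builds the
// ZSOffset table consumed by the decoding kernels. Returns {number of digits, number of pages} for the fragment.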
53std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment)
54{
55 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
56 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
57 GPUTPCClusterFinder::ZSOffset* o = processors()->tpcClusterer[iSector].mPzsOffsets;
58 uint32_t digits = 0;
59 uint32_t pages = 0;
60 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
61 clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j];
62 if (doGPU) {
63 uint16_t posInEndpoint = 0;
64 uint16_t pagesEndpoint = 0;
65 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
66 const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
67 const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
68 for (uint32_t l = pageFirst; l < pageLast; l++) {
69 uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++];
70 if (pageDigits) {
71 *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint};
72 digits += pageDigits;
73 }
74 pagesEndpoint++;
75 }
76 }
77 if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) {
78 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
79 GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
80 return {0, 0};
81 } else {
82 GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size());
83 }
84 }
85 } else {
87 digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j];
88 pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j];
89 }
90 }
91 if (doGPU) {
92 pages = o - processors()->tpcClusterer[iSector].mPzsOffsets;
93 }
94 if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
95 TPCClusterizerEnsureZSOffsets(iSector, fragment);
96 }
97 return {digits, pages};
98}
99
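// Consistency check used in debug mode for dense-link-based ZS on the CPU path: walks the raw pages of the
// fragment again and verifies that the buffered page counts, ADC counts and accumulated ZS offsets match the
// values stored in the page headers.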
100void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment)
101{
102 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
103 uint32_t nAdcs = 0;
104 for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) {
105 const auto& data = mCFContext->fragmentData[fragment.index];
106 uint32_t pagesEndpoint = 0;
107 const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint];
108 const uint32_t nPagesExpected = data.nPages[iSector][endpoint];
109
110 uint32_t nAdcDecoded = 0;
111 const auto& zs = mIOPtrs.tpcZS->sector[iSector];
112 for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) {
113 const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0;
114 const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i];
115 for (uint32_t j = pageFirst; j < pageLast; j++) {
116 const uint8_t* page = static_cast<const uint8_t*>(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE;
117 const header::RAWDataHeader* rawDataHeader = reinterpret_cast<const header::RAWDataHeader*>(page);
118 const TPCZSHDRV2* decHdr = reinterpret_cast<const TPCZSHDRV2*>(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2));
119 const uint16_t nSamplesInPage = decHdr->nADCsamples;
120
121 nAdcDecoded += nSamplesInPage;
122 pagesEndpoint++;
123 }
124 }
125
126 if (pagesEndpoint != nPagesExpected) {
127 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %u", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected);
128 }
129
130 if (nAdcDecoded != nAdcsExpected) {
131 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded);
132 }
133
134 if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) {
135 GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset);
136 }
137
138 nAdcs += nAdcsExpected;
139 }
140}
141
142namespace
143{
144struct TPCCFDecodeScanTmp {
145 int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter;
146};
147} // namespace
148
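// Scans all ZS pages of one sector once: detects the ZS version, harvests trigger words, tracks the maximum
// time bin, and records per time-frame fragment which page ranges, page counts and digit counts belong to it
// (stored in mCFContext->fragmentData). Returns {total digits, maximum digits in any fragment}.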
149std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment)
150{
151 mRec->getGeneralStepTimer(GeneralStep::Prepare).Start();
152 uint32_t nDigits = 0;
153 uint32_t nPages = 0;
154 uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS];
155 memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples));
157 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
158
159 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
160#ifndef GPUCA_NO_VC
161 if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) {
162 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) {
163 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) {
164 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
165 Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
166 }
167 }
168 }
169#endif
170
171 std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
172 fragments.reserve(mCFContext->nFragments);
173 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragment, {0, 0, 0, 0, 0, -1}});
174 for (uint32_t i = 1; i < mCFContext->nFragments; i++) {
175 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}});
176 }
177 std::vector<bool> fragmentExtends(mCFContext->nFragments, false);
178
179 uint32_t firstPossibleFragment = 0;
180 uint32_t pageCounter = 0;
181 uint32_t emptyPages = 0;
182 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
183 if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) {
184 break;
185 }
186 nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
187 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
188#ifndef GPUCA_NO_VC
189 if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) {
190 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE);
191 Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
192 }
193#endif
194 const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE;
195 const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page;
196 if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) {
197 emptyPages++;
198 continue;
199 }
200 pageCounter++;
201 const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader)));
202 if (mCFContext->zsVersion == -1) {
203 mCFContext->zsVersion = hdr->version;
204 if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask
205 static bool errorShown = false;
206 if (errorShown == false) {
207 GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion);
208 }
209 errorShown = true;
210 }
211 } else if (mCFContext->zsVersion != (int32_t)hdr->version) {
212 GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh));
213 constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1;
214 char dumpBuffer[bufferSize];
215 for (size_t i = 0; i < sizeof(*rdh); i++) {
216 // "%02X " guaranteed to be 3 chars + ending 0.
217 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]);
218 }
219 GPUAlarm("RDH of page: %s", dumpBuffer);
220 for (size_t i = 0; i < sizeof(*hdr); i++) {
221 // "%02X " guaranteed to be 3 chars + ending 0.
222 snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]);
223 }
224 GPUAlarm("Metainfo of page: %s", dumpBuffer);
225 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
226 mCFContext->abandonTimeframe = true;
227 return {0, 0};
228 } else {
229 GPUFatal("Cannot process with invalid TPC ZS data, exiting");
230 }
231 }
232 if (GetProcessingSettings().param.tpcTriggerHandling) {
233 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
234 if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) {
235 const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE;
237 memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE);
238 tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh);
239 if (tmp.triggerWord.isValid(0)) {
240 mTriggerBuffer->triggers.emplace(tmp);
241 }
242 }
243 }
244 nDigits += hdr->nADCsamples;
245 endpointAdcSamples[j] += hdr->nADCsamples;
246 uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN;
247 uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan;
248 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
249 const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr;
250 if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) {
251 maxTimeBin += 256;
252 }
253 }
254 if (maxTimeBin > mCFContext->tpcMaxTimeBin) {
255 mCFContext->tpcMaxTimeBin = maxTimeBin;
256 }
257 bool extendsInNextPage = false;
258 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
259 if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) {
260 const o2::header::RAWDataHeader* nextrdh = (const o2::header::RAWDataHeader*)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE);
261 extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader);
262 }
263 }
264 while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) {
265 firstPossibleFragment--;
266 }
267 auto handleExtends = [&](uint32_t ff) {
268 if (fragmentExtends[ff]) {
269 if (doGPU) {
270 // Only add extended page on GPU. On CPU the pages are in consecutive memory anyway.
271 // Not adding the page prevents an issue where a page would be decoded twice on CPU, when only the extension should be decoded.
272 fragments[ff].second.zsPageLast++;
273 mCFContext->fragmentData[ff].nPages[iSector][j]++;
274 mCFContext->fragmentData[ff].pageDigits[iSector][j].emplace_back(0);
275 }
276 fragmentExtends[ff] = false;
277 }
278 };
279 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
280 for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) {
281 handleExtends(ff);
282 }
283 }
284 for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) {
285 if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) {
286 if (!fragments[f].second.hasData) {
287 fragments[f].second.hasData = 1;
288 fragments[f].second.zsPtrFirst = k;
289 fragments[f].second.zsPageFirst = l;
290 } else {
291 if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) {
292 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1;
293 for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) {
294 for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) {
295 if (doGPU) {
296 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
297 } else {
298 // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters
299 const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE;
300 const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp;
301 if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) {
302 const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader)));
303 mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples;
304 }
305 }
306 }
307 }
308 } else if (emptyPages) {
309 mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages;
310 if (doGPU) {
311 for (uint32_t m = 0; m < emptyPages; m++) {
312 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0);
313 }
314 }
315 }
316 }
317 fragments[f].second.zsPtrLast = k + 1;
318 fragments[f].second.zsPageLast = l + 1;
319 fragments[f].second.pageCounter = pageCounter;
320 mCFContext->fragmentData[f].nPages[iSector][j]++;
321 mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples;
322 if (doGPU) {
323 mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples);
324 }
325 fragmentExtends[f] = extendsInNextPage;
326 } else {
327 handleExtends(f);
328 if (timeBin < (uint32_t)fragments[f].first.last()) {
329 if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
330 for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) {
331 handleExtends(ff);
332 }
333 }
334 break;
335 } else {
336 firstPossibleFragment = f + 1;
337 }
338 }
339 }
340 emptyPages = 0;
341 }
342 }
343 for (uint32_t f = 0; f < mCFContext->nFragments; f++) {
344 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast;
345 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst;
346 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast;
347 mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageFirst = fragments[f].second.zsPageFirst;
348 }
349 }
350 mCFContext->nPagesTotal += nPages;
351 mCFContext->nPagesSector[iSector] = nPages;
352
353 mCFContext->nDigitsEndpointMax[iSector] = 0;
354 for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) {
355 if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) {
356 mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i];
357 }
358 }
359 uint32_t nDigitsFragmentMax = 0;
360 for (uint32_t i = 0; i < mCFContext->nFragments; i++) {
361 uint32_t pagesInFragment = 0;
362 uint32_t digitsInFragment = 0;
363 for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
364 pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j];
365 digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j];
366 }
367 mCFContext->nPagesFragmentMax = std::max(mCFContext->nPagesFragmentMax, pagesInFragment);
368 nDigitsFragmentMax = std::max(nDigitsFragmentMax, digitsInFragment);
369 }
370 mRec->getGeneralStepTimer(GeneralStep::Prepare).Stop();
371 return {nDigits, nDigitsFragmentMax};
372}
373
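// Stream compaction of pad positions/peaks (stage 0: positions -> peak positions, stage 1: peaks -> filtered
// peak positions). On GPU this uses a multi-level prefix sum (scanStart/scanUp/scanTop/scanDown) followed by
// compactDigits; on CPU it reduces to a simple copy loop over the mPisPeak flags.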
374void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane)
375{
376 auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions;
377 auto& out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions;
378 if (doGPU) {
379 const uint32_t iSector = clusterer.mISector;
380 auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
381
382 std::vector<size_t> counts;
383
384 uint32_t nSteps = clusterer.getNSteps(count);
385 if (nSteps > clusterer.mNBufs) {
386 GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs);
387 exit(1);
388 }
389
390 size_t tmpCount = count;
391 if (nSteps > 1) {
392 for (uint32_t i = 1; i < nSteps; i++) {
393 counts.push_back(tmpCount);
394 if (i == 1) {
395 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, stage);
396 } else {
397 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, tmpCount);
398 }
399 tmpCount = (tmpCount + clusterer.mScanWorkGroupSize - 1) / clusterer.mScanWorkGroupSize;
400 }
401
402 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, nSteps, tmpCount);
403
404 for (uint32_t i = nSteps - 1; i > 1; i--) {
405 tmpCount = counts[i - 1];
406 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({GetGrid(tmpCount - clusterer.mScanWorkGroupSize, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, clusterer.mScanWorkGroupSize, tmpCount);
407 }
408 }
409
410 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({GetGrid(count, clusterer.mScanWorkGroupSize, lane), {iSector}}, 1, stage, in, out);
411 } else {
412 auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks;
413 auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
414 size_t count = 0;
415 for (size_t i = 0; i < nIn; i++) {
416 if (clusterer.mPisPeak[i]) {
417 out[count++] = in[i];
418 }
419 }
420 nOut = count;
421 }
422}
423
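// Transfers the ZS pages selected for one fragment of a sector to the GPU: copies contiguous page ranges per
// endpoint into the clusterer ZS buffer and updates the corresponding pointers, sizes and offsets of the input
// metadata. On the CPU path only the count update is performed. Returns the {digits, pages} pair from
// TPCClusterizerDecodeZSCountUpdate.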
424std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane)
425{
426 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
427 if (mCFContext->abandonTimeframe) {
428 return {0, 0};
429 }
430 const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment);
431 if (doGPU) {
432 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
433 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
434 uint32_t nPagesSector = 0;
435 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
436 uint32_t nPages = 0;
437 mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
438 mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
439 for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) {
440 const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0;
441 const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k];
442 if (max > min) {
443 char* src = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE;
444 char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE;
445 size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast);
446 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true);
447 }
448 nPages += max - min;
449 }
450 mInputsHost->mPzsMeta->sector[iSector].nZSPtr[j] = &mInputsShadow->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j];
451 mInputsHost->mPzsSizes[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = nPages;
452 mInputsHost->mPzsMeta->sector[iSector].count[j] = 1;
453 nPagesSector += nPages;
454 }
455 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzsOffsets, clusterer.mPzsOffsets, clusterer.mNMaxPages * sizeof(*clusterer.mPzsOffsets), lane, true);
456 }
457 return retVal;
458}
459
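// Prepares the cluster finder for a new time frame: scans the ZS input (or packed digits) of all sectors to
// size the buffers, allocates the per-sector ZS memory, and schedules the first fragment transfers. With
// restorePointers == true it only re-installs the pointers saved by the other pipeline instance.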
460int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
461{
462 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
463 if (restorePointers) {
464 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
465 processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost;
466 processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice;
467 processorsShadow()->tpcClusterer[iSector].mPzs = mCFContext->ptrSave[iSector].zsDevice;
468 }
469 processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave;
470 return 0;
471 }
472 const auto& threadContext = GetThreadContext();
474 if (mCFContext == nullptr) {
475 mCFContext.reset(new GPUTPCCFChainContext);
476 }
477 const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
478 const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED;
479 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
480 const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen};
481 mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax);
482 if (GetProcessingSettings().param.tpcTriggerHandling) {
483 mTriggerBuffer->triggers.clear();
484 }
485 if (mIOPtrs.tpcZS) {
486 uint32_t nDigitsFragmentMax[NSECTORS];
487 mCFContext->zsVersion = -1;
488 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
489 if (mIOPtrs.tpcZS->sector[iSector].count[0]) {
490 const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0];
491 if (rdh && o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeaderV6>() > o2::raw::RDHUtils::getVersion(rdh)) {
492 GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeader>());
493 return 1;
494 }
495 }
496#ifndef GPUCA_NO_VC
497 if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) {
498 for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) {
499 for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) {
500 for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) {
501 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE);
502 Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader));
503 }
504 }
505 }
506 }
507#endif
508 const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax);
509 nDigitsFragmentMax[iSector] = x.first;
510 processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first;
511 mRec->MemoryScalers()->nTPCdigits += x.first;
512 }
513 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
514 uint32_t nDigitsBase = nDigitsFragmentMax[iSector];
515 uint32_t threshold = 40000000;
516 uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);
517 processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
518 if (doGPU) {
519 processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]);
520 }
521 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
522 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec);
523 mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec);
524 } else {
525 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId);
526 AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId);
527 }
528 }
529 } else {
530 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
531 uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector];
532 mRec->MemoryScalers()->nTPCdigits += nDigits;
533 processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0);
534 }
535 }
536
537 if (mIOPtrs.tpcZS) {
538 GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits);
539 } else {
540 GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits);
541 }
542
543 if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) {
544 GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin);
545 if (GetProcessingSettings().ignoreNonFatalGPUErrors) {
546 mCFContext->abandonTimeframe = true;
547 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
548 } else {
549 return 1;
550 }
551 }
552
553 mCFContext->fragmentFirst = CfFragment{std::max<int32_t>(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen};
554 for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) {
555 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
556 mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector);
557 }
558 }
559
560 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
561 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
562 mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets;
563 mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets;
564 mCFContext->ptrSave[iSector].zsDevice = processorsShadow()->tpcClusterer[iSector].mPzs;
565 }
566 }
567 return 0;
568}
569#endif
570
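// Main TPC clusterization driver: runs the sectors in groups of nTPCClustererLanes, iterates over the
// time-frame fragments per group (ZS decoding -> charge map -> peak finding -> noise suppression ->
// clusterization), and gathers the per-row output into a flat ClusterNative buffer, optionally together with
// MC labels and trigger words.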
571int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
572{
573 if (param().rec.fwdTPCDigitsAsClusters) {
574 return ForwardTPCDigits();
575 }
576#ifdef GPUCA_TPC_GEOMETRY_O2
577 int32_t tpcTimeBinCut = mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin;
579 const auto& threadContext = GetThreadContext();
580 const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding;
581 if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
582 return 1;
583 }
584 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
586 }
587
589 float tpcHitLowOccupancyScalingFactor = 1.f;
591 uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits;
592 uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF;
593 if (mIOPtrs.settingsTF->nHBFPerTF < 64) {
594 threshold *= 2;
595 }
596 mRec->MemoryScalers()->nTPCHits = std::max<uint32_t>(nHitsBase, std::min<uint32_t>(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating excessive clusters
597 if (nHitsBase < threshold) {
598 float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 2.25f : 1.75f);
599 mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase);
600 tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
601 }
602 }
603 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
604 processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes
605 }
606 mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers
607 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
608 SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate
609 }
610 if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) {
611 RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
612 }
613
614#ifdef GPUCA_HAS_ONNX
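// Optional neural-network clusterizer: copy the NN settings into the per-sector GPUTPCNNClusterizer instances
// (input window size, batching, boundary fill value, class threshold with optional sigmoid transformation,
// verbosity, data type) before the processing loop starts.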
615 if (GetProcessingSettings().nn.applyNNclusterizer) {
616 uint32_t maxClusters = 0; // largest per-sector mNMaxClusters, determined below
617 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
618 maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters);
619 }
620 for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
621 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
622 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
623 clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
624 clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
625 clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
626 clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
627 clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
628 clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0);
629 clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
630 clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
631 clustererNN.nnClusterizerTotalClusters = maxClusters;
632 clustererNN.nnClassThreshold = nn_settings.nnClassThreshold;
633 clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
634 if (clustererNN.nnSigmoidTrafoClassThreshold) {
635 clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold));
636 }
637 if (nn_settings.nnClusterizerVerbosity < 0) {
638 clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
639 } else {
640 clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
641 }
642 clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
643 GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
645 }
646 }
647#endif
648
649 if (doGPU && mIOPtrs.tpcZS) {
651 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
652 }
653 if (doGPU) {
654 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
655 }
656
657 size_t nClsTotal = 0;
658 ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get();
659 ClusterNative* tmpNativeClusters = nullptr;
660 std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
661
662 // setup MC Labels
663 bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC;
664
665 auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
666
667 bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
668 bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
669
670 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
671 if (buildNativeGPU) {
672 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
673 }
674 if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) {
675 if (mWaitForFinalInputs) {
676 GPUFatal("Cannot use waitForFinalInput callback without delayed output");
677 }
678 if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
679 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
680 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
681 } else {
682 tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
683 tmpNativeClusters = tmpNativeClusterBuffer.get();
684 }
685 }
686
687 GPUTPCLinearLabels mcLinearLabels;
688 if (propagateMCLabels) {
689 // No need to overallocate here, nTPCHits is an upper bound for the GPU cluster buffer anyway, and we can always enlarge the buffer later
690 mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2);
691 mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits);
692 }
693
694 int8_t transferRunning[NSECTORS] = {0};
695 uint32_t outputQueueStart = mOutputQueue.size();
696
697 auto notifyForeignChainFinished = [this]() {
698 if (mPipelineNotifyCtx) {
699 SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory
700 {
701 std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
702 mPipelineNotifyCtx->ready = true;
703 }
704 mPipelineNotifyCtx->cond.notify_one();
705 }
706 };
707 bool synchronizeCalibUpdate = false;
708
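// Process the sectors in groups of nTPCClustererLanes (one lane per sector). For every time-frame fragment the
// four parallel blocks below run (1) ZS decoding / charge-map filling, (2) peak finding and compaction,
// (3) noise suppression, (4) deconvolution and clusterization (CF or NN); the clusters of the group are then
// gathered into the flat output buffer.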
709 for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) {
710 std::vector<bool> laneHasData(GetProcessingSettings().nTPCClustererLanes, false);
711 static_assert(NSECTORS <= GPUCA_MAX_STREAMS, "Stream events must be able to hold all sectors");
712 const int32_t maxLane = std::min<int32_t>(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase);
713 for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
714 if (GetProcessingSettings().debugLevel >= 3) {
715 GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
716 }
717 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
718 if (doGPU && fragment.index != 0) {
719 SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished
720 }
721
722 uint32_t iSector = iSectorBase + lane;
723 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
724 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
725 clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0;
726 clusterer.mPmemory->fragment = fragment;
727
728 if (mIOPtrs.tpcPackedDigits) {
729 bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS;
730 bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
731 auto* inDigits = mIOPtrs.tpcPackedDigits;
732 size_t numDigits = inDigits->nTPCDigits[iSector];
733 if (setDigitsOnGPU) {
734 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
735 }
736 if (setDigitsOnHost) {
737 clusterer.mPdigits = const_cast<o2::tpc::Digit*>(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast)
738 }
739 clusterer.mPmemory->counters.nDigits = numDigits;
740 }
741
742 if (mIOPtrs.tpcZS) {
743 if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
744 clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first;
745 clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second;
746 } else {
747 clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0;
748 }
749 }
750 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
751
752 using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
753 using PeakMapType = decltype(*clustererShadow.mPpeakMap);
754 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout<ChargeMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!!
755 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout<PeakMapType>::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType));
756 if (fragment.index == 0) {
757 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy));
758 }
759 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges");
760
761 if (doGPU) {
762 if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
763 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane);
764 SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane);
765 }
766 SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory
767 }
768
769 if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) {
770 clusterer.mPmemory->counters.nPositions = 0;
771 return;
772 }
773 if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) {
774 clusterer.mPmemory->counters.nPositions = 0;
775 return;
776 }
777
778 if (propagateMCLabels && fragment.index == 0) {
779 clusterer.PrepareMC();
780 clusterer.mPinputLabels = digitsMC->v[iSector];
781 if (clusterer.mPinputLabels == nullptr) {
782 GPUFatal("MC label container missing, sector %d", iSector);
783 }
784 if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) {
785 GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize());
786 }
787 }
788
789 if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) {
790 if (not mIOPtrs.tpcZS) {
791 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
792 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
793 } else if (propagateMCLabels) {
794 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr);
795 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
796 }
797 }
798
799 if (mIOPtrs.tpcZS) {
800 int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0);
801 uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS;
802
803 (void)tpcTimeBinCut; // TODO: To be used in decoding kernels
804 switch (mCFContext->zsVersion) {
805 default:
806 GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
807 break;
808 case ZSVersionRowBased10BitADC:
809 case ZSVersionRowBased12BitADC:
810 runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
811 break;
812 case ZSVersionLinkBasedWithMeta:
813 runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
814 break;
815 case ZSVersionDenseLinkBased:
816 runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
817 break;
818 }
819 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
820 } // clang-format off
821 });
822 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
823 uint32_t iSector = iSectorBase + lane;
824 if (doGPU) {
825 SynchronizeStream(lane);
826 }
827 if (mIOPtrs.tpcZS) {
828 CfFragment f = fragment.next();
829 int32_t nextSector = iSector;
830 if (f.isEnd()) {
831 nextSector += GetProcessingSettings().nTPCClustererLanes;
832 f = mCFContext->fragmentFirst;
833 }
834 if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
835 mCFContext->nextPos[nextSector] = RunTPCClusterizer_transferZS(nextSector, f, GetProcessingSettings().nTPCClustererLanes + lane);
836 }
837 }
838 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
839 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
840 if (clusterer.mPmemory->counters.nPositions == 0) {
841 return;
842 }
843 if (!mIOPtrs.tpcZS) {
844 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
845 }
846 if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) {
847 clusterer.DumpChargeMap(*mDebugFile, "Charges");
848 }
849
850 if (propagateMCLabels) {
851 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillIndexMap>({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
852 }
853
854 bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
855 checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
856 checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter;
857
858 if (checkForNoisyPads) {
860
861 runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
862 }
863
864 runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
865 if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) {
866 clusterer.DumpPeakMap(*mDebugFile, "Peaks");
867 }
868
869 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
870 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
871 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off
872 });
873 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
874 uint32_t iSector = iSectorBase + lane;
875 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
876 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
877 if (doGPU) {
878 SynchronizeStream(lane);
879 }
880 if (clusterer.mPmemory->counters.nPeaks == 0) {
881 return;
882 }
883 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
884 runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
885 if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
886 clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
887 }
888
889 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
890 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
891 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); // clang-format off
892 });
893 mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
894 uint32_t iSector = iSectorBase + lane;
895 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
896 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
897
898 if (doGPU) {
899 SynchronizeStream(lane);
900 }
901
902 if (fragment.index == 0) {
903 deviceEvent* waitEvent = nullptr;
904 if (transferRunning[lane] == 1) {
905 waitEvent = &mEvents->stream[lane];
906 transferRunning[lane] = 2;
907 }
908 runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow));
909 }
910
911 if (clusterer.mPmemory->counters.nClusters == 0) {
912 return;
913 }
914
915 if (GetProcessingSettings().nn.applyNNclusterizer) {
916#ifdef GPUCA_HAS_ONNX
917 GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
918 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
919 GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
920
921 if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
922 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
923 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
924 }
925
926 float time_clusterizer = 0, time_fill = 0;
927 for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
928 uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;
929 size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
930
931 auto start0 = std::chrono::high_resolution_clock::now();
932 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
933
934 auto stop0 = std::chrono::high_resolution_clock::now();
935 auto start1 = std::chrono::high_resolution_clock::now();
936 nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
937 if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
938 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
939 } else {
940 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
941 }
942
943 if (!clustererNN.nnClusterizerUseCfRegression) {
944 nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
945 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
946 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
947 nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
948 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
949 }
950 }
951 auto stop1 = std::chrono::high_resolution_clock::now();
952
953 time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
954 time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
955 }
956 auto start1 = std::chrono::high_resolution_clock::now();
957 if (clustererNN.nnClusterizerUseCfRegression) {
958 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
959 }
960 auto stop1 = std::chrono::high_resolution_clock::now();
961 time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
962 if (clustererNN.nnClusterizerVerbosity < 3) {
963 int acceptedClusters = 0;
964 for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
965 acceptedClusters += clustererNN.outputDataClass[i];
966 }
967 LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
968 }
969#else
970 GPUFatal("Project not compiled with neural network clusterization. Aborting.");
971#endif
972 } else {
973 runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
974 DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
975 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
976 }
977
978 if (doGPU && propagateMCLabels) {
979 TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane);
980 if (doGPU) {
981 SynchronizeStream(lane);
982 }
983 runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels
984 }
985
986 if (GetProcessingSettings().debugLevel >= 3) {
987 GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSector, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters);
988 }
989
990 TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane);
991 laneHasData[lane] = true;
992 // Include clusters in default debug mask, exclude other debug output by default
993 DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
994 });
996 }
997
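// After all fragments of this sector group are processed: copy the per-row cluster output of each lane into
// the flat ClusterNative buffer (via the gather kernel or row-wise memcpy), update the access structure and
// the total cluster count, and flatten the MC labels where requested.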
998 size_t nClsFirst = nClsTotal;
999 bool anyLaneHasData = false;
1000 for (int32_t lane = 0; lane < maxLane; lane++) {
1001 uint32_t iSector = iSectorBase + lane;
1002 std::fill(&tmpNativeAccess->nClusters[iSector][0], &tmpNativeAccess->nClusters[iSector][0] + MAXGLOBALPADROW, 0);
1003 if (doGPU) {
1004 SynchronizeStream(lane);
1005 }
1006 GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
1007 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
1008
1009 if (laneHasData[lane]) {
1010 anyLaneHasData = true;
1011 if (buildNativeGPU && GetProcessingSettings().tpccfGatherKernel) {
1012 runKernel<GPUTPCCFGather>({GetGridBlk(GPUCA_ROW_COUNT, mRec->NStreams() - 1), {iSector}}, &mInputsShadow->mPclusterNativeBuffer[nClsTotal]);
1013 }
1014 for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
1015 if (nClsTotal + clusterer.mPclusterInRow[j] > mInputsHost->mNClusterNative) {
1016 clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
1017 continue;
1018 }
1019 if (buildNativeGPU) {
1020 if (!GetProcessingSettings().tpccfGatherKernel) {
1021 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&mInputsShadow->mPclusterNativeBuffer[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, -2);
1022 }
1023 } else if (buildNativeHost) {
1024 GPUMemCpyAlways(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsTotal], (const void*)&clustererShadow.mPclusterByRow[j * clusterer.mNMaxClusterPerRow], sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * clusterer.mPclusterInRow[j], mRec->NStreams() - 1, false);
1025 }
1026 tmpNativeAccess->nClusters[iSector][j] += clusterer.mPclusterInRow[j];
1027 nClsTotal += clusterer.mPclusterInRow[j];
1028 }
1029 if (transferRunning[lane]) {
1030 ReleaseEvent(mEvents->stream[lane], doGPU);
1031 }
1032 RecordMarker(&mEvents->stream[lane], mRec->NStreams() - 1);
1033 transferRunning[lane] = 1;
1034 }
1035
1036 if (not propagateMCLabels || not laneHasData[lane]) {
1037 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1038 continue;
1039 }
1040
1041 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::setRowOffsets>({GetGrid(GPUCA_ROW_COUNT, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}});
1043 runKernel<GPUTPCCFMCLabelFlattener, GPUTPCCFMCLabelFlattener::flatten>({GetGrid(GPUCA_ROW_COUNT, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, &mcLinearLabels);
1044 clusterer.clearMCMemory();
1045 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1046 }
1047 if (propagateMCLabels) {
1048 for (int32_t lane = 0; lane < maxLane; lane++) {
1049 processors()->tpcClusterer[iSectorBase + lane].clearMCMemory();
1050 }
1051 }
1052 if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
1053 if (GetProcessingSettings().delayedOutput) {
1054 mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
1055 } else {
1056 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
1057 }
1058 }
1059
1060 if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
1061 notifyForeignChainFinished();
1062 }
1063 if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
1064 mWaitForFinalInputs();
1065 synchronizeCalibUpdate = DoQueuedUpdates(0, false);
1066 }
1067 }
1068 for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1069 if (transferRunning[i]) {
1070 ReleaseEvent(mEvents->stream[i], doGPU);
1071 }
1072 }
1073
1074 if (GetProcessingSettings().param.tpcTriggerHandling) {
1076 if (triggerOutput && triggerOutput->allocator) {
1077 // GPUInfo("Storing %lu trigger words", mTriggerBuffer->triggers.size());
1078 auto* outputBuffer = (decltype(mTriggerBuffer->triggers)::value_type*)triggerOutput->allocator(mTriggerBuffer->triggers.size() * sizeof(decltype(mTriggerBuffer->triggers)::value_type));
1079 std::copy(mTriggerBuffer->triggers.begin(), mTriggerBuffer->triggers.end(), outputBuffer);
1080 }
1081 mTriggerBuffer->triggers.clear();
1082 }
1083
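// Convert the linear MC labels into a flat container, stored either in an externally provided
// buffer or in internally owned memory (mIOMem), and keep a const view to attach to the
// cluster-native output.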
1084 ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr;
1085 if (propagateMCLabels) {
1086 // TODO: write to buffer directly
1088 std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
1091 throw std::runtime_error("Cluster MC Label buffer missing");
1092 }
1094 buffer = {&container->first, &container->second};
1095 } else {
1096 mIOMem.clusterNativeMCView = std::make_unique<ConstMCLabelContainerView>();
1097 mIOMem.clusterNativeMCBuffer = std::make_unique<ConstMCLabelContainer>();
1098 buffer.first = mIOMem.clusterNativeMCBuffer.get();
1099 buffer.second = mIOMem.clusterNativeMCView.get();
1100 }
1101
1102 assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
1103 assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
1104
1105 mcLabels.setFrom(mcLinearLabels.header, mcLinearLabels.data);
1106 mcLabels.flatten_to(*buffer.first);
1107 *buffer.second = *buffer.first;
1108 mcLabelsConstView = buffer.second;
1109 }
1110
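// Delayed output: the final host output buffer is only fixed now that nClsTotal is known;
// convert the queued copy destinations from byte offsets into absolute addresses in that buffer.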
1111 if (buildNativeHost && buildNativeGPU && GetProcessingSettings().delayedOutput) {
1112 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClsTotal;
1113 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1114 tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
1115 for (uint32_t i = outputQueueStart; i < mOutputQueue.size(); i++) {
1116 mOutputQueue[i].dst = (char*)tmpNativeClusters + (size_t)mOutputQueue[i].dst;
1117 }
1118 }
1119
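// Finalize the host-side ClusterNativeAccess (cluster buffer, MC truth, offset pointers) and
// publish it via mIOPtrs; optionally run the CPU cluster filter, which may reallocate the output
// buffer through the provided lambda and update the total cluster count.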
1120 if (buildNativeHost) {
1121 tmpNativeAccess->clustersLinear = tmpNativeClusters;
1122 tmpNativeAccess->clustersMCTruth = mcLabelsConstView;
1123 tmpNativeAccess->setOffsetPtrs();
1124 mIOPtrs.clustersNative = tmpNativeAccess;
1125 if (GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
1126 auto allocator = [this, &tmpNativeClusters](size_t size) {
1127 this->mInputsHost->mNClusterNative = size;
1128 this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
1129 return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
1130 };
1131 RunTPCClusterFilter(tmpNativeAccess, allocator, false);
1132 nClsTotal = tmpNativeAccess->nClustersTotal;
1133 }
1134 }
1135
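// Without a final-input callback the foreign chain is notified only now, after all clusters have been gathered.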
1136 if (!mWaitForFinalInputs) {
1137 notifyForeignChainFinished();
1138 }
1139
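// Point the GPU-side ioPtrs at the device ClusterNativeAccess, update the constant-memory copy,
// and upload an access structure whose clustersLinear points to the GPU-resident cluster buffer.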
1140 if (buildNativeGPU) {
1141 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
1142 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
1143 *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
1144 mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
1145 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
1146 TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceClusterNativeAccess, 0);
1147 }
1148 if (doGPU && synchronizeOutput) {
1149 SynchronizeStream(mRec->NStreams() - 1);
1150 }
1151 if (doGPU && synchronizeCalibUpdate) {
1152 SynchronizeStream(0);
1153 }
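// Deterministic reconstruction / debug: sort the clusters of every row on the host and copy the
// sorted buffer back to the GPU so the host and GPU copies agree.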
1154 if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) {
1155 for (uint32_t i = 0; i < NSECTORS; i++) {
1156 for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
1157 std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]);
1158 }
1159 }
1160 if (buildNativeGPU) {
1161 GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
1162 }
1163 }
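// Record the final cluster count for the memory scalers and release the non-persistent memory of
// the cluster-finding step.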
1164 mRec->MemoryScalers()->nTPCHits = nClsTotal;
1165 mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST"));
1166 if (mPipelineNotifyCtx) {
1168 mPipelineNotifyCtx = nullptr;
1169 }
1170
1171 if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) {
1173 }
1174
1175#endif
1176 return 0;
1177}