28#include "GPUDefParametersRuntime.h"
54#ifndef GPUCA_STANDALONE
73std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t
iSector,
const CfFragment& fragment)
83 uint16_t posInEndpoint = 0;
84 uint16_t pagesEndpoint = 0;
88 for (uint32_t l = pageFirst; l < pageLast; l++) {
99 GPUError(
"TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint,
mCFContext->fragmentData[fragment.
index].pageDigits[
iSector][
j].size());
102 GPUFatal(
"TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint,
mCFContext->fragmentData[fragment.
index].pageDigits[
iSector][
j].size());
115 TPCClusterizerEnsureZSOffsets(
iSector, fragment);
120void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t
iSector,
const CfFragment& fragment)
126 uint32_t pagesEndpoint = 0;
130 uint32_t nAdcDecoded = 0;
135 for (uint32_t
j = pageFirst;
j < pageLast;
j++) {
139 const uint16_t nSamplesInPage = decHdr->
nADCsamples;
141 nAdcDecoded += nSamplesInPage;
146 if (pagesEndpoint != nPagesExpected) {
147 GPUFatal(
"Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %u",
iSector,
endpoint, fragment.
index, pagesEndpoint, nPagesExpected);
150 if (nAdcDecoded != nAdcsExpected) {
151 GPUFatal(
"Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u",
iSector,
endpoint, fragment.
index, nAdcsExpected, nAdcDecoded);
158 nAdcs += nAdcsExpected;
168 if (fragment.
index != 0) {
177 const size_t chargeMapSizeBytes = chargeMapSize *
sizeof(
PackedCharge);
180 chargeMapHostData.resize(chargeMapSize);
185 extraPositions.reserve(
digits.size());
187 GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.
mPchargeMap, chargeMapSizeBytes, lane,
false);
190 for (
const auto& d :
digits) {
191 if (!fragment.contains(d.getTimeStamp())) {
198 extraPositions.push_back(
pos);
201 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.
mPchargeMap, chargeMapHostData.data(), chargeMapSizeBytes, lane,
true);
204 const size_t extraPositionsOffset = nPositions - extraPositions.size();
205 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.
mPpositions + extraPositionsOffset, extraPositions.data(), extraPositions.size() *
sizeof(
CfChargePos), lane,
true);
214 if (fragment.
index != 0) {
223 const size_t chargeMapSizeBytes = chargeMapSize *
sizeof(
PackedCharge);
226 chargeMapHostData.resize(chargeMapSize);
230 GPUMemCpy(RecoStep::TPCClusterFinding, chargeMapHostData.data(), clustererShadow.
mPchargeMap, chargeMapSizeBytes, lane,
false);
233 size_t nNonZeroADCs = 0;
235 for (
const auto& d :
digits) {
236 if (!fragment.contains(d.getTimeStamp())) {
242 auto adc = chargeMapHost[
pos].unpack();
249 if (nNonZeroADCs > 0) {
250 GPUInfo(
"Non Zero ADCs: %zu", nNonZeroADCs);
252 GPUInfo(
"Cleared all extra ADC values!", nNonZeroADCs);
258struct TPCCFDecodeScanTmp {
270 constexpr int32_t MinTailLength = 50;
271 constexpr int32_t MaxTailLength = 200;
272 constexpr int32_t TailWidth = 3;
278 const int32_t nHIPs = 50;
279 const int32_t firstTB = 0;
280 const int32_t lastTB = 4000 - MaxTailLength;
281 const int32_t tailADC = 250;
283 std::mt19937
gen{(uint32_t)seed};
285 std::uniform_int_distribution<> randomTB(firstTB, lastTB);
286 std::uniform_int_distribution<> randomTailLength(MinTailLength, MaxTailLength);
289 for (int32_t iHIP = 0; iHIP < nHIPs; iHIP++) {
291 const int32_t
row = randomRow(
gen);
292 const int32_t nPads = geo.NPads(
row);
293 std::uniform_int_distribution<> randomPad(0, nPads - 1);
295 const int32_t basePad = randomPad(
gen);
296 const int32_t baseTb = randomTB(
gen);
300 const int32_t tailLength = randomTailLength(
gen);
302 for (int32_t dPad = -TailWidth; dPad <= TailWidth; dPad++) {
303 const int32_t iPad = basePad + dPad;
304 if (iPad < 0 || iPad >= nPads) {
308 for (int32_t dTime = 0; dTime < tailLength; dTime++) {
309 const int32_t iTime = baseTb + dTime;
315 const auto adc = dTime == 0 && dPad == 0 ? 1023 : tailADC;
329std::pair<uint32_t, uint32_t> GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t
iSector,
const CfFragment& fragment)
332 uint32_t nDigits = 0;
335 memset(endpointAdcSamples, 0,
sizeof(endpointAdcSamples));
350 std::vector<std::pair<CfFragment, TPCCFDecodeScanTmp>> fragments;
352 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragment, {0, 0, 0, 0, 0, -1}});
354 fragments.emplace_back(std::pair<CfFragment, TPCCFDecodeScanTmp>{fragments.back().
first.next(), {0, 0, 0, 0, 0, -1}});
356 std::vector<bool> fragmentExtends(
mCFContext->nFragments,
false);
358 uint32_t firstPossibleFragment = 0;
360 uint32_t emptyPages = 0;
384 static bool errorShown =
false;
385 if (errorShown ==
false) {
386 GPUAlarm(
"Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling",
mCFContext->zsVersion);
391 GPUError(
"Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)",
mCFContext->zsVersion, (int32_t)
hdr->
version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh));
392 constexpr size_t bufferSize = 3 * std::max(
sizeof(*rdh),
sizeof(*
hdr)) + 1;
394 for (
size_t i = 0;
i <
sizeof(*rdh);
i++) {
396 snprintf(dumpBuffer + 3 *
i, 4,
"%02X ", (int32_t)((uint8_t*)rdh)[
i]);
398 GPUAlarm(
"RDH of page: %s", dumpBuffer);
399 for (
size_t i = 0;
i <
sizeof(*hdr);
i++) {
401 snprintf(dumpBuffer + 3 *
i, 4,
"%02X ", (int32_t)((uint8_t*)
hdr)[
i]);
403 GPUAlarm(
"Metainfo of page: %s", dumpBuffer);
408 GPUFatal(
"Cannot process with invalid TPC ZS data, exiting");
413 if (hdr2->
flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) {
417 tmp.
orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh);
427 if (
mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
429 if (hdr2->
flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) {
436 bool extendsInNextPage =
false;
437 if (
mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
440 extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) >
sizeof(
o2::header::RAWDataHeader);
443 while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) {
444 firstPossibleFragment--;
446 auto handleExtends = [&](uint32_t ff) {
447 if (fragmentExtends[ff]) {
451 fragments[ff].second.zsPageLast++;
455 fragmentExtends[ff] =
false;
458 if (
mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
459 for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) {
463 for (uint32_t
f = firstPossibleFragment;
f <
mCFContext->nFragments;
f++) {
464 if (timeBin < (uint32_t)fragments[
f].
first.last() && (uint32_t)fragments[
f].first.first() <= maxTimeBin) {
465 if (!fragments[
f].second.hasData) {
466 fragments[
f].second.hasData = 1;
467 fragments[
f].second.zsPtrFirst = k;
468 fragments[
f].second.zsPageFirst = l;
470 if (
pageCounter > (uint32_t)fragments[
f].second.pageCounter + 1) {
472 for (uint32_t k2 = fragments[
f].second.zsPtrLast - 1;
k2 <= k;
k2++) {
481 const TPCZSHDR*
const hdrTmp = (
const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) -
sizeof(
TPCZSHDRV2)) : (pageTmp +
sizeof(
o2::header::RAWDataHeader)));
487 }
else if (emptyPages) {
490 for (uint32_t
m = 0;
m < emptyPages;
m++) {
496 fragments[
f].second.zsPtrLast = k + 1;
497 fragments[
f].second.zsPageLast = l + 1;
504 fragmentExtends[
f] = extendsInNextPage;
507 if (timeBin < (uint32_t)fragments[
f].
first.last()) {
508 if (
mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) {
509 for (uint32_t ff =
f + 1; ff <
mCFContext->nFragments; ff++) {
515 firstPossibleFragment =
f + 1;
538 uint32_t nDigitsFragmentMax = 0;
540 uint32_t pagesInFragment = 0;
541 uint32_t digitsInFragment = 0;
547 nDigitsFragmentMax = std::max(nDigitsFragmentMax, digitsInFragment);
550 return {nDigits, nDigitsFragmentMax};
561 std::vector<size_t> counts;
564 if (nSteps > clusterer.
mNBufs) {
565 GPUError(
"Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.
mNBufs);
570 size_t tmpCount =
count;
572 for (uint32_t
i = 1;
i < nSteps;
i++) {
573 counts.push_back(tmpCount);
575 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({
GetGrid(tmpCount, scanWorkgroupSize, lane), {
iSector}},
i, stage);
577 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({
GetGrid(tmpCount, scanWorkgroupSize, lane), {
iSector}},
i, tmpCount);
579 tmpCount = (tmpCount + scanWorkgroupSize - 1) / scanWorkgroupSize;
582 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({
GetGrid(tmpCount, scanWorkgroupSize, lane), {
iSector}}, nSteps, tmpCount);
584 for (uint32_t
i = nSteps - 1;
i > 1;
i--) {
585 tmpCount = counts[
i - 1];
586 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({
GetGrid(tmpCount - scanWorkgroupSize, scanWorkgroupSize, lane), {
iSector}},
i, scanWorkgroupSize, tmpCount);
590 runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({
GetGrid(
count, scanWorkgroupSize, lane), {
iSector}}, 1, stage, in, out);
595 for (
size_t i = 0;
i < nIn;
i++) {
604std::pair<uint32_t, uint32_t> GPUChainTracking::RunTPCClusterizer_transferZS(int32_t
iSector,
const CfFragment& fragment, int32_t lane,
const GPUTPCExtraADC& extraADCs)
610 auto retVal = TPCClusterizerDecodeZSCountUpdate(
iSector, fragment);
611 if (fragment.
index == 0) {
617 uint32_t nPagesSector = 0;
636 nPagesSector += nPages;
643int32_t GPUChainTracking::RunTPCClusterizer_prepare(
bool restorePointers,
const GPUTPCExtraADC& extraADCs)
646 if (restorePointers) {
661 const uint32_t maxAllowedTimebin =
param().
par.continuousTracking ? std::max<int32_t>(
param().continuousMaxTimeBin, maxFragmentLen) : constants::TPC_MAX_TIME_BIN_TRIGGERED;
662 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
669 uint32_t nDigitsFragmentMax[
NSECTORS];
675 GPUError(
"Data has invalid RDH version %d, %d required\n",
o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion<o2::header::RAWDataHeader>());
691 const auto&
x = TPCClusterizerDecodeZSCount(
iSector, fragmentMax);
692 nDigitsFragmentMax[
iSector] =
x.first;
697 uint32_t nDigitsBase = nDigitsFragmentMax[
iSector];
698 uint32_t threshold = 40000000;
699 uint32_t nDigitsScaled = nDigitsBase > threshold ? nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase);
726 if (
mCFContext->tpcMaxTimeBin > maxAllowedTimebin) {
727 GPUError(
"Input data has invalid time bin %u > %d",
mCFContext->tpcMaxTimeBin, maxAllowedTimebin);
730 mCFContext->tpcMaxTimeBin = maxAllowedTimebin;
756 if (
param().
rec.fwdTPCDigitsAsClusters) {
767#ifdef INSERT_SATURATED_SIGNALS
768 extraADCs = GenerateSaturatedSignals();
771 if (RunTPCClusterizer_prepare(mPipelineNotifyCtx &&
GetProcessingSettings().doublePipelineClusterizer, extraADCs)) {
779 float tpcHitLowOccupancyScalingFactor = 1.f;
787 if (nHitsBase < threshold) {
790 tpcHitLowOccupancyScalingFactor = std::min(3.5f, (
float)threshold / nHitsBase);
801 RunTPCClusterizer_prepare(
true, extraADCs);
819 if (nn_settings.applyNNclusterizer) {
820 int32_t deviceId = -1;
826 nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>(
"GPUTPCNNClusterizer_ONNXClassification_0_", 0);
827 nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>(
"GPUTPCNNClusterizer_ONNXRegression_1_", 1);
828 nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>(
"GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
829 nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>(
"GPUTPCNNClusterizer_ONNXClassification_0_", 3);
830 nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>(
"GPUTPCNNClusterizer_ONNXRegression_1_", 4);
831 nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>(
"GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
832 nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>(
"GPUTPCNNClusterizer_ONNXClassification_0_", 6);
833 nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>(
"GPUTPCNNClusterizer_ONNXRegression_1_", 7);
834 nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>(
"GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
835 nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>(
"GPUTPCNNClusterizer_ONNXClassification_0_", 9);
836 nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>(
"GPUTPCNNClusterizer_ONNXRegression_1_", 10);
837 nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>(
"GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
842 if (nnApplications[lane].mModelsUsed[0]) {
843 SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
844 (nnApplications[lane].
mModelClass).setDeviceId(deviceId);
845 if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
848 (nnApplications[lane].
mModelClass).initEnvironment();
857 if (!nn_settings.nnLoadFromCCDB) {
863 if (nnApplications[lane].mModelsUsed[1]) {
864 SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
865 (nnApplications[lane].
mModelReg1).setDeviceId(deviceId);
866 if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
870 (nnApplications[lane].
mModelReg1).initEnvironment();
872 if (!nn_settings.nnLoadFromCCDB) {
873 (nnApplications[lane].
mModelReg1).initSession();
878 if (nnApplications[lane].mModelsUsed[2]) {
879 SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
880 (nnApplications[lane].
mModelReg2).setDeviceId(deviceId);
881 if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
885 (nnApplications[lane].
mModelReg2).initEnvironment();
887 if (!nn_settings.nnLoadFromCCDB) {
888 (nnApplications[lane].
mModelReg2).initSession();
893 if (nn_settings.nnClusterizerVerbosity > 0) {
894 LOG(info) <<
"(ORT) Allocated ONNX stream for lane " << lane <<
" and device " << deviceId;
898 const uint32_t maxAllowedTimebin =
param().
par.continuousTracking ? std::max<int32_t>(
param().continuousMaxTimeBin, maxFragmentLen) : constants::TPC_MAX_TIME_BIN_TRIGGERED;
899 for (int32_t sector = 0; sector <
NSECTORS; sector++) {
902 int32_t lane = sector % numLanes;
906 nnApplications[lane].
initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
909 clustererNNShadow.
mISector = sector;
911 nnApplications[lane].
initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
913 if (nn_settings.nnClusterizerVerbosity > 2) {
914 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Processor initialized. Sector " << sector <<
", lane " << lane <<
", max clusters " << clustererNN.
mNnClusterizerTotalClusters <<
" (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
917 if (nn_settings.nnClusterizerVerbosity > 2) {
918 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Memory registered for memoryId " << clustererNN.
mMemoryId <<
" (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
924 if (nn_settings.nnClusterizerVerbosity > 2) {
925 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Writing to constant memory...";
928 if (nn_settings.nnClusterizerVerbosity > 2) {
929 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Writing to constant memory done";
935 size_t nClsTotal = 0;
938 std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
940 const bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
946 GPUWarning(
"Requested to process MC labels, but no labels present");
952 if (buildNativeGPU) {
956 GPUFatal(
"ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
959 if (mWaitForFinalInputs) {
960 GPUFatal(
"Cannot use waitForFinalInput callback without delayed output");
964 tmpNativeClusters =
mInputsHost->mPclusterNativeOutput;
966 tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(
mInputsHost->mNClusterNative);
967 tmpNativeClusters = tmpNativeClusterBuffer.get();
972 if (propagateMCLabels) {
978 int8_t transferRunning[
NSECTORS] = {0};
981 auto notifyForeignChainFinished = [
this]() {
982 if (mPipelineNotifyCtx) {
985 std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->
mutex);
986 mPipelineNotifyCtx->
ready =
true;
988 mPipelineNotifyCtx->
cond.notify_one();
991 bool synchronizeCalibUpdate =
false;
995 static_assert(
NSECTORS <= constants::GPU_MAX_STREAMS,
"Stream events must be able to hold all sectors");
997 for (
CfFragment fragment =
mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
999 GPUInfo(
"Processing time bins [%d, %d) for sectors %d to %d", fragment.
start, fragment.last(), iSectorBase, iSectorBase +
GetProcessingSettings().nTPCClustererLanes - 1);
1002 if (doGPU && fragment.
index != 0) {
1003 SynchronizeStream(lane);
1006 uint32_t
iSector = iSectorBase + lane;
1014 bool setDigitsOnHost = (not doGPU && not
mIOPtrs.
tpcZS) || propagateMCLabels;
1017 if (setDigitsOnGPU) {
1018 GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.
mPdigits, inDigits->tpcDigits[
iSector],
sizeof(clustererShadow.
mPdigits[0]) * numDigits, lane,
true);
1020 if (setDigitsOnHost) {
1036 using ChargeMapType =
decltype(*clustererShadow.
mPchargeMap);
1037 using PeakMapType =
decltype(*clustererShadow.
mPpeakMap);
1040 if (fragment.
index == 0) {
1062 if (propagateMCLabels) {
1063 if (fragment.
index == 0) {
1070 GPUFatal(
"MC label container missing, sector %d",
iSector);
1079 runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({
GetGrid(1, lane), {
iSector}},
mIOPtrs.
tpcZS ==
nullptr);
1081 }
else if (propagateMCLabels) {
1093 GPUFatal(
"Data with invalid TPC ZS mode (%d) received",
mCFContext->zsVersion);
1097 runKernel<GPUTPCCFDecodeZS>({
GetGridBlk(nBlocks, lane), {
iSector}}, firstHBF, tpcTimeBinCut);
1100 runKernel<GPUTPCCFDecodeZSLink>({
GetGridBlk(nBlocks, lane), {
iSector}}, firstHBF, tpcTimeBinCut);
1103 runKernel<GPUTPCCFDecodeZSDenseLink>({
GetGridBlk(nBlocks, lane), {
iSector}}, firstHBF, tpcTimeBinCut);
1110 uint32_t
iSector = iSectorBase + lane;
1121 if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] &&
mCFContext->zsVersion != -1 && !
mCFContext->abandonTimeframe) {
1134 TPCClusterizerTransferExtraADC(clusterer, clustererShadow, lane, extraADCs);
1136 if (propagateMCLabels) {
1140 bool checkForNoisyPads = (
rec()->
GetParam().
rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (
rec()->
GetParam().
rec.tpc.maxConsecTimeBinAboveThreshold > 0);
1141 checkForNoisyPads &= (
rec()->
GetParam().
rec.tpc.noisyPadsQuickCheck ? fragment.
index == 0 :
true);
1148 GPUError(
"HIP tail filter enabled, but this is currently not supported on CPU");
1151 if (checkForNoisyPads) {
1168 TPCClusterizerCheckExtraADCZeros(clusterer, clustererShadow, lane, extraADCs);
1175 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
1180 uint32_t
iSector = iSectorBase + lane;
1195 RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
1200 uint32_t
iSector = iSectorBase + lane;
1212 if (fragment.
index == 0) {
1214 if (transferRunning[lane] == 1) {
1216 transferRunning[lane] = 2;
1222 if (nRegularClusters != 0) {
1224#ifdef GPUCA_HAS_ONNX
1231 if (nn_settings.nnClusterizerApplyCfDeconvolution) {
1238 if (nn_settings.nnClusterizerVerbosity > 2) {
1242 if (nn_settings.nnClusterizerVerbosity > 3) {
1243 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Start. Loop=" << batch <<
". (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1256 if (nn_settings.nnClusterizerVerbosity > 3) {
1257 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Done filling data. Loop=" << batch <<
". (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1262 if (nn_settings.nnClusterizerVerbosity > 3) {
1263 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Done setting deconvolution flags. Loop=" << batch <<
". (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1284 if (nn_settings.nnClusterizerVerbosity > 3) {
1285 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Done with NN classification inference. Loop=" << batch <<
". (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1321 if (nn_settings.nnClusterizerVerbosity > 3) {
1322 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Done with NN regression inference. Loop=" << batch <<
". (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1339 if (nn_settings.nnClusterizerVerbosity > 3) {
1340 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Done publishing. Loop=" << batch <<
". (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1345 if(!nn_settings.nnClusterizerApplyCfDeconvolution) {
1350 if (nn_settings.nnClusterizerVerbosity > 3) {
1351 LOG(info) <<
"(NNCLUS, GPUChainTrackingClusterizer, this=" <<
this <<
") Done with CF regression. (clustererNN=" << &clustererNN <<
", clustererNNShadow=" << &clustererNNShadow <<
")";
1355 GPUFatal(
"Project not compiled with neural network clusterization. Aborting.");
1363 if (doGPU && propagateMCLabels) {
1379 if (doGPU && propagateMCLabels) {
1386 bool hasClusters = nRegularClusters != 0;
1393 if (
rec()->
GetParam().
rec.tpc.hipTailFilter && nRegularClusters == 0) {
1408 laneHasData[lane] =
true;
1413 size_t nClsFirst = nClsTotal;
1414 bool anyLaneHasData =
false;
1415 for (int32_t lane = 0; lane < maxLane; lane++) {
1416 uint32_t
iSector = iSectorBase + lane;
1424 if (laneHasData[lane]) {
1425 anyLaneHasData =
true;
1437 if (buildNativeGPU) {
1441 }
else if (buildNativeHost) {
1447 if (transferRunning[lane]) {
1451 transferRunning[lane] = 1;
1454 if (not propagateMCLabels || not laneHasData[lane]) {
1461 assert(propagateMCLabels ? mcLinearLabels.
header.size() == nClsTotal :
true);
1463 for (int32_t lane = 0; lane < maxLane; lane++) {
1466 if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
1468 mOutputQueue.emplace_back(
outputQueueEntry{(
void*)((
char*)&tmpNativeClusters[nClsFirst] - (
char*)&tmpNativeClusters[0]), &
mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) *
sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
1470 GPUMemCpy(RecoStep::TPCClusterFinding, (
void*)&tmpNativeClusters[nClsFirst], (
const void*)&
mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) *
sizeof(tmpNativeClusters[0]),
mRec->
NStreams() - 1,
false);
1474 if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 +
GetProcessingSettings().nTPCClustererLanes) {
1475 notifyForeignChainFinished();
1477 if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 +
GetProcessingSettings().nTPCClustererLanes) {
1478 mWaitForFinalInputs();
1483#ifdef GPUCA_HAS_ONNX
1486 LOG(info) <<
"(ORT) Environment releasing...";
1494 if (transferRunning[
i]) {
1501 if (triggerOutput && triggerOutput->
allocator) {
1511 GPUInfo(
"Event has %zu TPC Clusters", nClsTotal);
1515 if (propagateMCLabels) {
1517 std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*>
buffer;
1519 if (!
GetProcessingSettings().tpcWriteClustersAfterRejection && !sortClusters && labelOutputControl && labelOutputControl->useExternal()) {
1520 if (!labelOutputControl->allocator) {
1521 throw std::runtime_error(
"Cluster MC Label buffer missing");
1524 buffer = {&container->first, &container->second};
1531 assert(propagateMCLabels ? mcLinearLabels.
header.size() == nClsTotal :
true);
1532 assert(propagateMCLabels ? mcLinearLabels.
data.size() >= nClsTotal :
true);
1537 mcLabelsConstView =
buffer.second;
1543 tmpNativeClusters =
mInputsHost->mPclusterNativeOutput;
1549 if (buildNativeHost) {
1555 auto allocator = [
this, &tmpNativeClusters](
size_t size) {
1558 return (tmpNativeClusters = this->
mInputsHost->mPclusterNativeOutput);
1560 RunTPCClusterFilter(tmpNativeAccess, allocator,
false);
1565 if (!mWaitForFinalInputs) {
1566 notifyForeignChainFinished();
1569 if (buildNativeGPU) {
1574 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
1577 if (doGPU && synchronizeOutput) {
1580 if (doGPU && synchronizeCalibUpdate) {
1584 SortClusters(buildNativeGPU, propagateMCLabels, tmpNativeAccess, tmpNativeClusters);
1588 if (mPipelineNotifyCtx) {
1590 mPipelineNotifyCtx =
nullptr;
1603 if (propagateMCLabels) {
1605 std::iota(clsOrder.begin(), clsOrder.end(), 0);
1606 std::vector<ClusterNative> tmpClusters;
1611 return clusters[a] < clusters[b];
1613 tmpClusters.resize(clusterAccess->
nClusters[
i][
j]);
1615 for (uint32_t k = 0; k < tmpClusters.size(); k++) {
1620 tmpClusters.clear();
1622 std::pair<o2::dataformats::ConstMCLabelContainer*, o2::dataformats::ConstMCLabelContainerView*> labelBuffer;
1624 std::unique_ptr<ConstMCLabelContainerView> tmpUniqueContainerView;
1625 std::unique_ptr<ConstMCLabelContainer> tmpUniqueContainerBuffer;
1626 if (labelOutput && labelOutput->
allocator) {
1628 labelBuffer = {&labelContainer->first, &labelContainer->second};
1639 for (
const auto&
element : clusterAccess->clustersMCTruth->
getLabels(clsOrder[
i])) {
1644 *labelBuffer.second = *labelBuffer.first;
1653 if (buildNativeGPU) {
default_random_engine gen(dev())
o2::raw::RawFileWriter * raw
Class to serialize ONNX objects for ROOT snapshots of CCDB objects at runtime.
std::enable_if_t< std::is_signed< T >::value, bool > hasData(const CalArray< T > &cal)
Provides a basic fallback implementation for Vc.
Definitions of TPC Zero Suppression Data Headers.
std::unique_ptr< o2::tpc::ClusterNativeAccess > mClusterNativeAccess
int32_t RunTPCClusterizer(bool synchronizeOutput=true)
std::unique_ptr< GPUTrackingInputProvider > mInputsHost
std::array< GPUOutputControl *, GPUTrackingOutputs::count()> mSubOutputControls
std::unique_ptr< std::ofstream > mDebugFile
std::unique_ptr< GPUTriggerOutputs > mTriggerBuffer
std::vector< outputQueueEntry > mOutputQueue
bool mUpdateNewCalibObjects
std::unique_ptr< GPUTPCCFChainContext > mCFContext
int32_t DoQueuedUpdates(int32_t stream, bool updateSlave=true)
std::unique_ptr< GPUNewCalibValues > mNewCalibValues
GPUTrackingInOutPointers & mIOPtrs
struct o2::gpu::GPUChainTracking::InOutMemory mIOMem
std::unique_ptr< GPUTrackingInputProvider > mInputsShadow
int32_t ForwardTPCDigits()
void RecordMarker(deviceEvent *ev, int32_t stream)
void TransferMemoryResourceLinkToGPU(RecoStep step, int16_t res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
void GPUMemCpyAlways(RecoStep step, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
void GPUMemCpy(RecoStep step, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
bool DoDebugAndDump(RecoStep step, uint32_t mask, T &processor, S T::*func, Args &&... args)
GPUReconstruction::RecoStepField GetRecoStepsGPU() const
GPUReconstruction::RecoStepField GetRecoSteps() const
void WriteToConstantMemory(RecoStep step, size_t offset, const void *src, size_t size, int32_t stream=-1, deviceEvent *ev=nullptr)
void ReleaseEvent(deviceEvent ev, bool doGPU=true)
krnlExec GetGrid(uint32_t totalItems, uint32_t nThreads, int32_t stream, GPUReconstruction::krnlDeviceType d=GPUReconstruction::krnlDeviceType::Auto, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
size_t AllocateRegisteredMemory(GPUProcessor *proc)
virtual std::unique_ptr< GPUReconstructionProcessing::threadContext > GetThreadContext()
GPUConstantMem * processors()
static constexpr krnlRunRange krnlRunRangeNone
void SetONNXGPUStream(Ort::SessionOptions &opt, int32_t stream, int32_t *deviceId)
krnlExec GetGridAutoStep(int32_t stream, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
void SetupGPUProcessor(T *proc, bool allocate)
const GPUSettingsProcessing & GetProcessingSettings() const
void SynchronizeStream(int32_t stream)
GPUReconstructionCPU * mRec
GPUConstantMem * processorsShadow()
krnlExec GetGridBlk(uint32_t nBlocks, int32_t stream, GPUReconstruction::krnlDeviceType d=GPUReconstruction::krnlDeviceType::Auto, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
static constexpr int32_t NSECTORS
const GPUParam & GetParam() const
void TransferMemoryResourceLinkToHost(RecoStep step, int16_t res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
void TransferMemoryResourcesToHost(RecoStep step, GPUProcessor *proc, int32_t stream=-1, bool all=false)
GPUReconstruction * rec()
HighResTimer & getGeneralStepTimer(GeneralStep step)
void runParallelOuterLoop(bool doGPU, uint32_t nThreads, std::function< void(uint32_t)> lambda)
void SetNActiveThreads(int32_t n)
int32_t getNKernelHostThreads(bool splitCores)
const GPUDefParameters & getGPUParameters(bool doGPU) const override
void SetNActiveThreadsOuterLoop(uint32_t f)
void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction *rec, GPUOutputControl *control=nullptr)
void ComputeReuseMax(GPUProcessor *proc)
RecoStepField GetRecoStepsGPU() const
void UnblockStackedMemory()
void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor *proc=nullptr)
uint32_t NStreams() const
const GPUParam & GetParam() const
void PushNonPersistentMemory(uint64_t tag)
InOutTypeField GetRecoStepsOutputs() const
GPUMemorySizeScalers * MemoryScalers()
static int32_t GetNBlocks(bool isGPU)
static void setGlobalOffsetsAndAllocate(GPUTPCClusterFinder &, GPUTPCLinearLabels &)
void DumpDigits(std::ostream &out)
void SetMaxData(const GPUTrackingInOutPointers &io)
CfChargePos * mPpeakPositions
void DumpClusters(std::ostream &out)
void SetNMaxDigits(size_t nDigits, size_t nPages, size_t nDigitsFragment, size_t nDigitsEndpointMax)
void DumpSuppressedPeaks(std::ostream &out)
uint32_t mNMaxClusterPerRow
HIPTailDescriptor * mPhipTailsByRow
void DumpPeakMap(std::ostream &out, std::string_view)
o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > const * mPinputLabels
void InitMCBuffersForFragment()
uint32_t * mPclusterInRow
void DumpChargeMap(std::ostream &out, std::string_view)
uint32_t getNSteps(size_t items) const
CfChargePos * mPpositions
CfChargePos * mPfilteredPeakPositions
void DumpSuppressedPeaksCompacted(std::ostream &out)
void DumpPeaksCompacted(std::ostream &out)
tpc::ClusterNative * mPclusterByRow
void DumpPeaks(std::ostream &out)
static constexpr uint32_t NROWS
o2::ml::OrtModel mModelReg1
o2::ml::OrtModel mModelClass
o2::ml::OrtModel mModelReg2
void init(const GPUSettingsProcessingNNclusterizer &, bool=false)
void initClusterizer(const GPUSettingsProcessingNNclusterizer &, GPUTPCNNClusterizer &, int32_t=-1, int32_t=-1)
float * mOutputDataReg2_32
OrtDataType::Float16_t * mInputData_16
int32_t mNnClusterizerBatchedMode
int32_t mNnClusterizerTotalClusters
OrtDataType::Float16_t * mOutputDataReg2_16
float * mModelProbabilities_32
int32_t mNnClusterizerUseCfRegression
int32_t mNnInferenceInputDType
int32_t mNnInferenceOutputDType
float * mOutputDataReg1_32
uint32_t mNnClusterizerRowTimeSizeThreads
OrtDataType::Float16_t * mModelProbabilities_16
int8_t mNnClusterizerSetDeconvolutionFlags
OrtDataType::Float16_t * mOutputDataReg1_16
int8_t mNnClusterizerUseClassification
void setIntraOpNumThreads(int threads)
std::vector< std::vector< int64_t > > getNumOutputNodes() const
#define TPC_REAL_PADS_IN_SECTOR
#define TPC_CLUSTERER_STRIDED_PAD_COUNT
GLboolean GLboolean GLboolean b
GLboolean GLboolean GLboolean GLboolean a
uint8_t itsSharedClusterMap uint8_t
constexpr int LHCMaxBunches
@ TPCClustererSuppressedPeaks
@ TPCClustererZeroedCharges
@ TPCClustererChargeMapSplit
void dumpBuffer(gsl::span< const std::byte > buffer, std::ostream &out=std::cout, size_t maxbytes=std::numeric_limits< size_t >::max())
std::unique_ptr< const o2::dataformats::MCTruthContainer< MCLabel > > getLabels(framework::ProcessingContext &pc, std::string_view dataBind, EventType eventType=EventType::Standard)
constexpr int LHCBCPERTIMEBIN
constexpr int MAXGLOBALPADROW
Global TPC definitions and constants.
@ ZSVersionDenseLinkBased
@ ZSVersionLinkBasedWithMeta
@ ZSVersionRowBased10BitADC
@ ZSVersionRowBased12BitADC
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
constexpr T qStr2Tag(const char(&str)[N])
S< o2::tpc::ORTRootSerializer >::type * nnClusterizerNetworks[3]
std::condition_variable cond
std::unique_ptr< o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > > clusterNativeMCView
std::unique_ptr< o2::dataformats::ConstMCTruthContainer< o2::MCCompLabel > > clusterNativeMCBuffer
deviceEvent stream[constants::GPU_MAX_STREAMS]
GPUCalibObjectsConst calibObjects
GPUTPCClusterFinder tpcClusterer[GPUTPCGeometry::NSECTORS]
GPUTrackingInOutPointers ioPtrs
size_t NTPCClusters(size_t tpcDigits, bool perSector=false)
std::function< void *(size_t)> allocator
tpccf::SizeT nDigitsInFragment
struct o2::gpu::GPUTPCClusterFinder::Memory::counters_t counters
std::vector< o2::MCCompLabel > data
std::vector< o2::dataformats::MCTruthHeaderElement > header
size_t nTPCDigits[NSECTORS]
const GPUTPCDigitsMCInput * tpcDigitsMC
const o2::tpc::ClusterNativeAccess * clustersNative
const GPUSettingsTF * settingsTF
const GPUTrackingInOutZS * tpcZS
const GPUTrackingInOutDigits * tpcPackedDigits
const void *const * zsPtr[NENDPOINTS]
uint32_t count[NENDPOINTS]
const uint32_t * nZSPtr[NENDPOINTS]
GPUTrackingInOutZSSector sector[NSECTORS]
static constexpr uint32_t NENDPOINTS
GPUOutputControl clustersNative
size_t getIndex(const GPUOutputControl &v)
GPUOutputControl clusterLabels
GPUOutputControl tpcTriggerWords
static constexpr int getVersion()
get numeric version of the RDH
unsigned int nClusters[constants::MAXSECTOR][constants::MAXGLOBALPADROW]
const o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > * clustersMCTruth
std::pair< ConstMCLabelContainer, ConstMCLabelContainerView > ConstMCLabelContainerViewWithBuffer
unsigned int nClustersTotal
unsigned int clusterOffset[constants::MAXSECTOR][constants::MAXGLOBALPADROW]
const ClusterNative * clustersLinear
static constexpr unsigned int TRIGGER_WORD_SIZE
unsigned char nTimeBinSpan
unsigned short timeOffset
static constexpr size_t TPC_ZS_PAGE_SIZE
unsigned short nADCsamples
Trigger info including the orbit.
uint32_t orbit
orbit of the trigger word
TriggerWordDLBZS triggerWord
trigger Word information
bool isValid(int entry=0) const
LOG(info)<< "Compressed in "<< sw.CpuTime()<< " s"
std::vector< Cluster > clusters
std::vector< Digit > digits
typename std::vector< T, vecpod_allocator< T > > vecpod