if (param().rec.fwdTPCDigitsAsClusters) {
#ifdef GPUCA_TPC_GEOMETRY_O2
if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
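// Low-occupancy handling: when the measured hit count falls below the threshold, derive a scaling factor (capped at 3.5) from threshold / nHitsBase.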
float tpcHitLowOccupancyScalingFactor = 1.f;
if (nHitsBase < threshold) {
tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
RunTPCClusterizer_prepare(true);
int32_t deviceId = -1;
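// Timers for the ONNX inference stages (classification, regression step 1, regression step 2), instantiated in groups of three; presumably one group per clusterizer lane.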
nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
if (nnApplications[lane].mModelsUsed[0]) {
SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelClass).setDeviceId(deviceId);
if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelClass).initEnvironment();
if (nnApplications[lane].mModelsUsed[1]) {
SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelReg1).setDeviceId(deviceId);
if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelReg1).initEnvironment();
(nnApplications[lane].mModelReg1).initSession();
if (nnApplications[lane].mModelsUsed[2]) {
SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelReg2).setDeviceId(deviceId);
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelReg2).initEnvironment();
(nnApplications[lane].mModelReg2).initSession();
if (nn_settings.nnClusterizerVerbosity > 0) {
LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
for (int32_t sector = 0; sector < NSECTORS; sector++) {
int32_t lane = sector % numLanes;
nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
clustererNNShadow.mISector = sector;
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
size_t nClsTotal = 0;
std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
if (buildNativeGPU) {
GPUFatal("ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
if (mWaitForFinalInputs) {
GPUFatal("Cannot use waitForFinalInput callback without delayed output");
tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
tmpNativeClusters = tmpNativeClusterBuffer.get();
if (propagateMCLabels) {
int8_t transferRunning[NSECTORS] = {0};
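// Signal the foreign chain of the double pipeline that it may proceed: set the ready flag under the context mutex and wake the waiting thread.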
auto notifyForeignChainFinished = [this]() {
if (mPipelineNotifyCtx) {
std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
mPipelineNotifyCtx->ready = true;
mPipelineNotifyCtx->cond.notify_one();
bool synchronizeCalibUpdate = false;
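// Main fragment loop: the timeframe is processed in CfFragment chunks of time bins, with nTPCClustererLanes sectors handled in parallel per iteration.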
for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
if (doGPU && fragment.index != 0) {
SynchronizeStream(lane);
uint32_t iSector = iSectorBase + lane;
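// Stage digits for this sector: copy them to the GPU when clustering there; keep a host copy when running on the CPU without ZS input or when MC labels must be propagated.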
bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
size_t numDigits = inDigits->nTPCDigits[iSector];
if (setDigitsOnGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
if (setDigitsOnHost) {
using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
using PeakMapType = decltype(*clustererShadow.mPpeakMap);
if (fragment.index == 0) {
if (propagateMCLabels && fragment.index == 0) {
GPUFatal("MC label container missing, sector %d", iSector);
runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
} else if (propagateMCLabels) {
GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
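// Select the zero-suppression decoding kernel matching the ZS format version (row-based ZS, link-based ZS, or dense-link ZS).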
runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
uint32_t iSector = iSectorBase + lane;
int32_t nextSector = iSector;
if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
if (propagateMCLabels) {
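// Optional noisy-pad detection, enabled via the time-bin-above-threshold cuts; with noisyPadsQuickCheck it runs only on the first fragment.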
bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
if (checkForNoisyPads) {
runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
uint32_t iSector = iSectorBase + lane;
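// Noise suppression over the compacted peak list, followed by a second peak compaction pass.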
runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
uint32_t iSector = iSectorBase + lane;
if (fragment.index == 0) {
if (transferRunning[lane] == 1) {
transferRunning[lane] = 2;
#ifdef GPUCA_HAS_ONNX
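// Neural-network clusterization path (available only when compiled with ONNX support): clusters are processed in batches through data filling, optional deconvolution-flag setting, NN classification, NN regression, and publishing, with per-step progress logging at high verbosity.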
if (nn_settings.nnClusterizerApplyCfDeconvolution) {
if (nn_settings.nnClusterizerVerbosity > 2) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (!nn_settings.nnClusterizerApplyCfDeconvolution) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
if (doGPU && propagateMCLabels) {
laneHasData[lane] = true;
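// Gather the fragment's results per lane: accumulate cluster counts, raise a global overflow error if the native buffer would be exceeded, and transfer or queue the clusters for the GPU and/or host output buffers.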
size_t nClsFirst = nClsTotal;
bool anyLaneHasData = false;
for (int32_t lane = 0; lane < maxLane; lane++) {
uint32_t iSector = iSectorBase + lane;
if (laneHasData[lane]) {
anyLaneHasData = true;
clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
if (buildNativeGPU) {
} else if (buildNativeHost) {
if (transferRunning[lane]) {
transferRunning[lane] = 1;
if (not propagateMCLabels || not laneHasData[lane]) {
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
if (propagateMCLabels) {
for (int32_t lane = 0; lane < maxLane; lane++) {
if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
notifyForeignChainFinished();
if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
mWaitForFinalInputs();
#ifdef GPUCA_HAS_ONNX
LOG(info) << "(ORT) Environment releasing...";
if (transferRunning[i]) {
if (triggerOutput && triggerOutput->allocator) {
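// MC label output: obtain the externally provided cluster label container (throwing if it is missing), check that the label count matches the total cluster count, and keep a const view for the cluster native output.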
if (propagateMCLabels) {
std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
throw std::runtime_error("Cluster MC Label buffer missing");
buffer = {&container->first, &container->second};
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
mcLabelsConstView = buffer.second;
tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
if (buildNativeHost) {
auto allocator = [this, &tmpNativeClusters](size_t size) {
return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
RunTPCClusterFilter(tmpNativeAccess, allocator, false);
if (!mWaitForFinalInputs) {
notifyForeignChainFinished();
if (buildNativeGPU) {
mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
if (doGPU && synchronizeOutput) {
if (doGPU && synchronizeCalibUpdate) {
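// Copy the complete ClusterNative array from the host buffer to the GPU buffer when GPU-resident clusters are required.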
if (buildNativeGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
if (mPipelineNotifyCtx) {
mPipelineNotifyCtx = nullptr;