if (param().rec.fwdTPCDigitsAsClusters) {
#ifdef GPUCA_TPC_GEOMETRY_O2
if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
float tpcHitLowOccupancyScalingFactor = 1.f;
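// If the number of TPC hits falls below the configured threshold, the scaling factor is
// raised (capped at 3.5), presumably to keep per-sector buffer estimates reasonable for
// low-occupancy timeframes.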
if (nHitsBase < threshold) {
tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
RunTPCClusterizer_prepare(true);
int32_t deviceId = -1;
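// NN clusterizer setup: one timer per (lane, model) pair is registered below, with names
// cycling through classification, regression 1 and regression 2. For every lane, the models
// that are actually enabled (mModelsUsed[0..2]) then appear to get their ONNX session bound
// to the lane's GPU stream and device before environment and session initialization.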
nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
if (nnApplications[lane].mModelsUsed[0]) {
SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelClass).setDeviceId(deviceId);
if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelClass).initEnvironment();
if (nnApplications[lane].mModelsUsed[1]) {
SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelReg1).setDeviceId(deviceId);
if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelReg1).initEnvironment();
(nnApplications[lane].mModelReg1).initSession();
if (nnApplications[lane].mModelsUsed[2]) {
SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelReg2).setDeviceId(deviceId);
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelReg2).initEnvironment();
(nnApplications[lane].mModelReg2).initSession();
if (nn_settings.nnClusterizerVerbosity > 0) {
LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
for (int32_t sector = 0; sector < NSECTORS; sector++) {
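// Per-sector processor setup: sectors appear to be distributed round-robin over the
// clusterizer lanes (sector % numLanes); the host-side and shadow NN clusterizer processors
// are initialized with the fragment length and maximum allowed time bin, and the
// configuration is subsequently written to constant memory.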
int32_t lane = sector % numLanes;
nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
clustererNNShadow.mISector = sector;
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
size_t nClsTotal = 0;
std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
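// Output strategy: native clusters are kept on the GPU only if a later step needs them there
// (buildNativeGPU); on the host they are written either directly into the preallocated output
// buffer or, with delayed output, into a temporary buffer that is presumably flushed later.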
if (buildNativeGPU) {
GPUFatal("ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
if (mWaitForFinalInputs) {
GPUFatal("Cannot use waitForFinalInput callback without delayed output");
tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
tmpNativeClusters = tmpNativeClusterBuffer.get();
if (propagateMCLabels) {
int8_t transferRunning[NSECTORS] = {0};
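// transferRunning tracks per sector whether an asynchronous output transfer has been queued;
// notifyForeignChainFinished signals the partner chain of a double pipeline by setting the
// ready flag and notifying the condition variable under the shared mutex.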
auto notifyForeignChainFinished = [this]() {
if (mPipelineNotifyCtx) {
std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
mPipelineNotifyCtx->ready = true;
mPipelineNotifyCtx->cond.notify_one();
bool synchronizeCalibUpdate = false;
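// Main cluster-finding loop: the timeframe is split into CfFragments (time-bin windows) and,
// within each fragment, sectors are processed in groups of nTPCClustererLanes, one sector per
// lane, as the GPUInfo message below indicates.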
for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
if (doGPU && fragment.index != 0) {
SynchronizeStream(lane);
uint32_t iSector = iSectorBase + lane;
bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
size_t numDigits = inDigits->nTPCDigits[iSector];
if (setDigitsOnGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
if (setDigitsOnHost) {
using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
using PeakMapType = decltype(*clustererShadow.mPpeakMap);
if (fragment.index == 0) {
if (propagateMCLabels && fragment.index == 0) {
GPUFatal("MC label container missing, sector %d", iSector);
runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
} else if (propagateMCLabels) {
GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
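// The decoding kernel appears to be selected by the zero-suppression format version stored in
// mCFContext->zsVersion: row-based ZS, link-based ZS, or dense-link ZS.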
runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
uint32_t iSector = iSectorBase + lane;
int32_t nextSector = iSector;
if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
if (propagateMCLabels) {
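// Noisy-pad detection only runs if one of the two time-bin-above-threshold cuts is enabled;
// with noisyPadsQuickCheck set, only the first fragment is checked.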
bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
if (checkForNoisyPads) {
runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
uint32_t iSector = iSectorBase + lane;
runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
uint32_t iSector = iSectorBase + lane;
if (fragment.index == 0) {
if (transferRunning[lane] == 1) {
transferRunning[lane] = 2;
#ifdef GPUCA_HAS_ONNX
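// Neural-network clusterizer path (requires ONNX support at compile time). The fragment's
// candidates are presumably processed in batches: input data is filled, deconvolution flags
// are optionally set, the classification and regression networks are run, and the resulting
// clusters are published; without nnClusterizerApplyCfDeconvolution the conventional CF
// regression appears to run in addition.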
if (nn_settings.nnClusterizerApplyCfDeconvolution) {
if (nn_settings.nnClusterizerVerbosity > 2) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (!nn_settings.nnClusterizerApplyCfDeconvolution) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
if (doGPU && propagateMCLabels) {
laneHasData[lane] = true;
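// Merge per-lane results: clusters from each lane are appended to the common native cluster
// array, raising ERROR_CF_GLOBAL_CLUSTER_OVERFLOW if the preallocated mNClusterNative capacity
// would be exceeded.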
size_t nClsFirst = nClsTotal;
bool anyLaneHasData = false;
for (int32_t lane = 0; lane < maxLane; lane++) {
uint32_t iSector = iSectorBase + lane;
if (laneHasData[lane]) {
anyLaneHasData = true;
clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
if (buildNativeGPU) {
} else if (buildNativeHost) {
if (transferRunning[lane]) {
transferRunning[lane] = 1;
if (not propagateMCLabels || not laneHasData[lane]) {
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
if (propagateMCLabels) {
for (int32_t lane = 0; lane < maxLane; lane++) {
if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
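// When clusters are built on both host and GPU, the range just appended is copied from the
// device cluster buffer to the host output, presumably deferred via mOutputQueue in the
// double-pipeline case and otherwise copied immediately on the last stream.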
mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
notifyForeignChainFinished();
if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
mWaitForFinalInputs();
#ifdef GPUCA_HAS_ONNX
LOG(info) << "(ORT) Environment releasing...";
if (transferRunning[i]) {
if (triggerOutput && triggerOutput->allocator) {
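// MC label finalization: when label propagation is enabled, the flat per-cluster label arrays
// are moved into a ConstMCLabelContainer pair obtained from the output allocator (a missing
// buffer throws std::runtime_error), and the assertions verify one label header per cluster.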
if (propagateMCLabels) {
std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
throw std::runtime_error("Cluster MC Label buffer missing");
buffer = {&container->first, &container->second};
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
mcLabelsConstView = buffer.second;
tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
if (buildNativeHost) {
auto allocator = [this, &tmpNativeClusters](size_t size) {
return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
RunTPCClusterFilter(tmpNativeAccess, allocator, false);
if (!mWaitForFinalInputs) {
notifyForeignChainFinished();
if (buildNativeGPU) {
mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
if (doGPU && synchronizeOutput) {
if (doGPU && synchronizeCalibUpdate) {
if (buildNativeGPU) {
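// Final upload: if native clusters are needed on the GPU, the complete host-side cluster array
// is presumably copied to the device buffer in a single transfer before the pipeline context
// is released.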
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
if (mPipelineNotifyCtx) {
mPipelineNotifyCtx = nullptr;