if (param().rec.fwdTPCDigitsAsClusters) {
#ifdef GPUCA_TPC_GEOMETRY_O2
if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
float tpcHitLowOccupancyScalingFactor = 1.f;
if (nHitsBase < threshold) {
tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
RunTPCClusterizer_prepare(true);
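// Optional neural-network clusterizer (ONNX) setup: per-lane timers and model sessions,
// followed by per-sector processor initialization.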
if (nn_settings.applyNNclusterizer) {
int32_t deviceId = -1;
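// Timers for the three ONNX models (classification, regression, regression2). The timer
// index is a compile-time template parameter of getTimer, hence the explicit enumeration
// instead of a loop.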
nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
// Per-lane ONNX model setup: bind the lane's GPU stream and device to each session,
// check the intra-op thread count against maxThreads, and create the environments.
// Sessions are created directly only when the models are not loaded from CCDB.
if (nnApplications[lane].mModelsUsed[0]) {
SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelClass).setDeviceId(deviceId);
if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelClass).initEnvironment();
if (!nn_settings.nnLoadFromCCDB) {
if (nnApplications[lane].mModelsUsed[1]) {
SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelReg1).setDeviceId(deviceId);
if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelReg1).initEnvironment();
if (!nn_settings.nnLoadFromCCDB) {
(nnApplications[lane].mModelReg1).initSession();
if (nnApplications[lane].mModelsUsed[2]) {
SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
(nnApplications[lane].mModelReg2).setDeviceId(deviceId);
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
(nnApplications[lane].mModelReg2).initEnvironment();
if (!nn_settings.nnLoadFromCCDB) {
(nnApplications[lane].mModelReg2).initSession();
if (nn_settings.nnClusterizerVerbosity > 0) {
LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
// Assign each sector to a lane (round-robin) and initialize the NN clusterizer
// processor and its shadow copy for that sector.
for (int32_t sector = 0; sector < NSECTORS; sector++) {
int32_t lane = sector % numLanes;
nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
clustererNNShadow.mISector = sector;
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
if (nn_settings.nnClusterizerVerbosity > 2) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
// Output buffer setup: tmpNativeClusters points either at the preallocated host output
// or at a temporary host buffer; buildNativeGPU indicates that the clusters are also
// needed in GPU memory.
size_t nClsTotal = 0;
std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
if (buildNativeGPU) {
GPUFatal("ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
if (mWaitForFinalInputs) {
GPUFatal("Cannot use waitForFinalInput callback without delayed output");
tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
tmpNativeClusters = tmpNativeClusterBuffer.get();
if (propagateMCLabels) {
int8_t transferRunning[NSECTORS] = {0};
auto notifyForeignChainFinished = [this]() {
if (mPipelineNotifyCtx) {
std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
mPipelineNotifyCtx->ready = true;
mPipelineNotifyCtx->cond.notify_one();
bool synchronizeCalibUpdate = false;
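// Loop over the time-frame charge fragments; within each fragment, the sectors of the
// current group (iSectorBase .. iSectorBase + nTPCClustererLanes - 1) are processed in
// parallel lanes.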
for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
if (doGPU && fragment.index != 0) {
SynchronizeStream(lane);
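// Per-lane input stage: make the raw TPC digits available on the GPU and/or on the host,
// depending on the input format and whether MC labels are propagated.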
uint32_t iSector = iSectorBase + lane;
bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
size_t numDigits = inDigits->nTPCDigits[iSector];
if (setDigitsOnGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
if (setDigitsOnHost) {
using ChargeMapType = decltype(*clustererShadow.mPchargeMap);
using PeakMapType = decltype(*clustererShadow.mPpeakMap);
if (fragment.index == 0) {
if (propagateMCLabels && fragment.index == 0) {
GPUFatal("MC label container missing, sector %d", iSector);
runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
} else if (propagateMCLabels) {
GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF, tpcTimeBinCut);
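// Baseline / noisy-pad check (enabled via the tpc.maxTimeBinAboveThresholdIn1000Bin and
// maxConsecTimeBinAboveThreshold cuts), followed by the first peak-compaction step.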
uint32_t iSector = iSectorBase + lane;
int32_t nextSector = iSector;
if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
if (propagateMCLabels) {
bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
if (checkForNoisyPads) {
runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
getKernelTimer<GPUTPCCFCheckPadBaseline>(RecoStep::TPCClusterFinding, iSector, TPC_PADS_IN_SECTOR * fragment.lengthWithoutOverlap() * sizeof(PackedCharge), false);
RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
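// Noise suppression on the found peaks, followed by the second peak-compaction step.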
uint32_t iSector = iSectorBase + lane;
runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
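// Per-lane cluster building; on the first fragment, transferRunning appears to track the
// state of the asynchronous per-lane output transfers.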
uint32_t iSector = iSectorBase + lane;
if (fragment.index == 0) {
if (transferRunning[lane] == 1) {
transferRunning[lane] = 2;
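// Neural-network clusterization path (only compiled with GPUCA_HAS_ONNX): the input data
// is filled in batches, classification and regression inference run per batch, and the
// resulting clusters are published; without ONNX support the chain aborts via GPUFatal.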
#ifdef GPUCA_HAS_ONNX
if (nn_settings.nnClusterizerApplyCfDeconvolution) {
if (nn_settings.nnClusterizerVerbosity > 2) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
if (!nn_settings.nnClusterizerApplyCfDeconvolution) {
if (nn_settings.nnClusterizerVerbosity > 3) {
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
// Merge the per-lane results into the global cluster output, track which lanes produced
// data, and raise ERROR_CF_GLOBAL_CLUSTER_OVERFLOW if the preallocated native-cluster
// buffer would overflow.
if (doGPU && propagateMCLabels) {
laneHasData[lane] = true;
size_t nClsFirst = nClsTotal;
bool anyLaneHasData = false;
for (int32_t lane = 0; lane < maxLane; lane++) {
uint32_t iSector = iSectorBase + lane;
if (laneHasData[lane]) {
anyLaneHasData = true;
clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
if (buildNativeGPU) {
} else if (buildNativeHost) {
if (transferRunning[lane]) {
transferRunning[lane] = 1;
if (not propagateMCLabels || not laneHasData[lane]) {
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
if (propagateMCLabels) {
for (int32_t lane = 0; lane < maxLane; lane++) {
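// Transfer of the clusters produced for this sector group from the GPU buffer to the
// host output: either queued in mOutputQueue (presumably the delayed-output path) or
// copied immediately on the last stream.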
if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
notifyForeignChainFinished();
if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
mWaitForFinalInputs();
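// After the sector loop: release the ONNX environments, handle any still-running output
// transfers (transferRunning), handle the trigger output if an allocator is provided,
// and report the total number of TPC clusters.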
#ifdef GPUCA_HAS_ONNX
LOG(info) << "(ORT) Environment releasing...";
if (transferRunning[i]) {
if (triggerOutput && triggerOutput->allocator) {
GPUInfo("Event has %zu TPC Clusters", nClsTotal);
// Assemble the MC label output into the ConstMCLabelContainer / ConstMCLabelContainerView
// pair; the asserts cross-check the flat label headers against the final cluster count.
if (propagateMCLabels) {
std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
throw std::runtime_error("Cluster MC Label buffer missing");
buffer = {&container->first, &container->second};
assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
mcLabelsConstView = buffer.second;
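// Optional TPC cluster filtering on the host output; the allocator lambda hands the
// filter its destination buffer (here the preallocated host cluster output).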
tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
if (buildNativeHost) {
auto allocator = [this, &tmpNativeClusters](size_t size) {
return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
RunTPCClusterFilter(tmpNativeAccess, allocator, false);
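// Finalization: notify a possibly waiting foreign chain, set the cluster-access offset
// pointers, synchronize GPU streams where required, copy the host clusters into the GPU
// native-cluster buffer if needed, and release the pipeline notification context.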
if (!mWaitForFinalInputs) {
notifyForeignChainFinished();
if (buildNativeGPU) {
mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
if (doGPU && synchronizeOutput) {
if (doGPU && synchronizeCalibUpdate) {
if (buildNativeGPU) {
GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
if (mPipelineNotifyCtx) {
mPipelineNotifyCtx = nullptr;