  586  if (param().rec.fwdTPCDigitsAsClusters) {
 
  589#ifdef GPUCA_TPC_GEOMETRY_O2
  594  if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) {
 
  602  float tpcHitLowOccupancyScalingFactor = 1.f;
 
  610    if (nHitsBase < threshold) {
 
  613      tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase);
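
// Annotation (not part of the listed source): lines 602-613 implement a simple
// low-occupancy correction. A minimal, self-contained sketch of the same
// arithmetic, with a hypothetical helper name:
#include <algorithm>
#include <cstdint>

static float lowOccupancyScaling(uint64_t nHitsBase, uint64_t threshold)
{
  float factor = 1.f;                                                // default: no scaling
  if (nHitsBase > 0 && nHitsBase < threshold) {
    factor = std::min(3.5f, (float)threshold / (float)nHitsBase);    // boost the estimate, capped at 3.5x
  }
  return factor;
}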
 
  616  for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
 
  620  for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
 
  624    RunTPCClusterizer_prepare(true);
 
  643    int32_t deviceId = -1;
 
  649      nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);

  650      nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);

  651      nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);

  652      nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);

  653      nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);

  654      nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);

  655      nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);

  656      nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);

  657      nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);

  658      nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);

  659      nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);

  660      nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
 
  665      if (nnApplications[lane].mModelsUsed[0]) {
 
  666        SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
 
  667        (nnApplications[lane].mModelClass).setDeviceId(deviceId);
 
  668        if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) {
 
  671        (nnApplications[lane].mModelClass).initEnvironment();
 
  682      if (nnApplications[lane].mModelsUsed[1]) {
 
  683        SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId);
 
  684        (nnApplications[lane].mModelReg1).setDeviceId(deviceId);
 
  685        if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) {
 
  689        (nnApplications[lane].mModelReg1).initEnvironment();

  691        (nnApplications[lane].mModelReg1).initSession();
 
  693      if (nnApplications[lane].mModelsUsed[2]) {
 
  694        SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId);
 
  695        (nnApplications[lane].mModelReg2).setDeviceId(deviceId);
 
  696        if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
 
  700        (nnApplications[lane].mModelReg2).initEnvironment();

  702        (nnApplications[lane].mModelReg2).initSession();
 
  704      if (nn_settings.nnClusterizerVerbosity > 0) {
 
  705        LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
 
  710    for (int32_t sector = 0; sector < NSECTORS; sector++) {
 
  713      int32_t lane = sector % numLanes;
 
  717      nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
 
  720        clustererNNShadow.mISector = sector;
 
  722      nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
 
  724      if (nn_settings.nnClusterizerVerbosity > 2) {
 
  725        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
  728      if (nn_settings.nnClusterizerVerbosity > 2) {
 
  729        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
  735      if (nn_settings.nnClusterizerVerbosity > 2) {
 
  736        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
 
  739      if (nn_settings.nnClusterizerVerbosity > 2) {
 
  740        LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
 
  746  size_t nClsTotal = 0;
 
  749  std::unique_ptr<ClusterNative[]> tmpNativeClusterBuffer;
 
  756  bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
 
  760  if (buildNativeGPU) {
 
  764    GPUFatal("ERROR, mWaitForFinalInputs cannot be called with nTPCClustererLanes > 6");
 
  767    if (mWaitForFinalInputs) {
 
  768      GPUFatal("Cannot use waitForFinalInput callback without delayed output");
 
  772      tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
 
  774      tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
 
  775      tmpNativeClusters = tmpNativeClusterBuffer.get();
 
  780  if (propagateMCLabels) {
 
  786  int8_t transferRunning[NSECTORS] = {0};
 
  789  auto notifyForeignChainFinished = [this]() {
 
  790    if (mPipelineNotifyCtx) {
 
  793        std::lock_guard<std::mutex> lock(mPipelineNotifyCtx->mutex);
 
  794        mPipelineNotifyCtx->ready = true;

  796      mPipelineNotifyCtx->cond.notify_one();
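
// Annotation (not part of the listed source): the lambda above uses the
// standard mutex / flag / condition-variable handshake to wake the other
// pipeline chain. Standalone sketch of that pattern (the context struct is a
// mock with the same member names, not the real mPipelineNotifyCtx type):
#include <condition_variable>
#include <mutex>

struct PipelineNotifyCtx {
  std::mutex mutex;
  bool ready = false;
  std::condition_variable cond;
};

void notifyFinished(PipelineNotifyCtx* ctx)
{
  if (ctx) {
    {
      std::lock_guard<std::mutex> lock(ctx->mutex);  // protect the ready flag
      ctx->ready = true;
    }
    ctx->cond.notify_one();  // wake the waiting chain
  }
}

void waitUntilFinished(PipelineNotifyCtx& ctx)  // consumer side, for illustration
{
  std::unique_lock<std::mutex> lock(ctx.mutex);
  ctx.cond.wait(lock, [&] { return ctx.ready; });
}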
 
  799  bool synchronizeCalibUpdate = false;
 
  805    for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) {
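
// Annotation (not part of the listed source): the loop above walks the
// timeframe in time-bin fragments. Standalone mock of that iteration pattern
// (the real CfFragment API differs; names and numbers here are illustrative):
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct Fragment {
  int32_t start = 0;
  int32_t length = 0;
  int32_t total = 0;
  int32_t index = 0;

  bool isEnd() const { return start >= total; }
  int32_t last() const { return std::min(start + length, total); }
  Fragment next() const { return {start + length, length, total, index + 1}; }
};

int main()
{
  for (Fragment f{0, 1000, 4200, 0}; !f.isEnd(); f = f.next()) {
    std::printf("Processing time bins [%d, %d)\n", f.start, f.last());
  }
}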
 
  807        GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1);
 
  810        if (doGPU && fragment.index != 0) {
 
  811          SynchronizeStream(lane); 
 
  814        uint32_t iSector = iSectorBase + lane;
 
  822          bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels;
 
  824          size_t numDigits = inDigits->nTPCDigits[iSector];
 
  825          if (setDigitsOnGPU) {
 
  826            GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true);
 
  828          if (setDigitsOnHost) {
 
  844        using ChargeMapType = decltype(*clustererShadow.mPchargeMap);

  845        using PeakMapType = decltype(*clustererShadow.mPpeakMap);
 
  848        if (fragment.index == 0) {
 
  870        if (propagateMCLabels && fragment.index == 0) {
 
  874            GPUFatal("MC label container missing, sector %d", iSector);
 
  883            runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::findFragmentStart>({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr);
 
  885          } else if (propagateMCLabels) {
 
  897              GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion);
 
  901              runKernel<GPUTPCCFDecodeZS>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
 
  904              runKernel<GPUTPCCFDecodeZSLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
 
  907              runKernel<GPUTPCCFDecodeZSDenseLink>({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF);
 
  914        uint32_t iSector = iSectorBase + lane;
 
  920          int32_t nextSector = iSector;
 
  925          if (nextSector < NSECTORS && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSector] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) {
 
  941        if (propagateMCLabels) {
 
  945        bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0);
 
  946        checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true);
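
// Annotation (not part of the listed source): condensed restatement of the two
// conditions above as a free function (a sketch; parameter names mirror the
// reconstruction settings referenced in the listing):
static bool wantNoisyPadCheck(int maxTimeBinAbove1000, int maxConsecAbove, bool quickCheck, int fragmentIndex)
{
  bool check = (maxTimeBinAbove1000 > 0) || (maxConsecAbove > 0);  // any threshold configured?
  if (quickCheck) {
    check = check && (fragmentIndex == 0);  // quick mode: only on the first fragment
  }
  return check;
}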
 
  949        if (checkForNoisyPads) {
 
  952          runKernel<GPUTPCCFCheckPadBaseline>({GetGridBlk(nBlocks, lane), {iSector}});
 
  960        RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
 
  965        uint32_t iSector = iSectorBase + lane;
 
  974        runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
 
  980        RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
 
  985        uint32_t iSector = iSectorBase + lane;
 
  993        if (fragment.index == 0) {
 
  995          if (transferRunning[lane] == 1) {
 
  997            transferRunning[lane] = 2;
 
 1007#ifdef GPUCA_HAS_ONNX 
 1014          if (nn_settings.nnClusterizerApplyCfDeconvolution) {
 
 1021          if (nn_settings.nnClusterizerVerbosity > 2) {
 
 1025            if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1026              LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1039            if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1040              LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1045              if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1046                LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1067              if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1068                LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1104              if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1105                LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1122            if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1123              LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1128            if (!nn_settings.nnClusterizerApplyCfDeconvolution) {
 
 1133            if (nn_settings.nnClusterizerVerbosity > 3) {
 
 1134              LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
 
 1138          GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 
 1146        if (doGPU && propagateMCLabels) {
 
 1159        laneHasData[lane] = true;
 
 1166    size_t nClsFirst = nClsTotal;
 
 1167    bool anyLaneHasData = false;
 
 1168    for (int32_t lane = 0; lane < maxLane; lane++) {
 
 1169      uint32_t iSector = iSectorBase + lane;
 
 1177      if (laneHasData[lane]) {
 
 1178        anyLaneHasData = true;
 
 1184            clusterer.raiseError(GPUErrors::ERROR_CF_GLOBAL_CLUSTER_OVERFLOW, iSector * 1000 + j, nClsTotal + clusterer.mPclusterInRow[j], mInputsHost->mNClusterNative);
 
 1187          if (buildNativeGPU) {
 
 1191          } else if (buildNativeHost) {
 
 1197        if (transferRunning[lane]) {
 
 1201        transferRunning[lane] = 1;
 
 1204      if (not propagateMCLabels || not laneHasData[lane]) {
 
 1205        assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
 
 1213      assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);
 
 1215    if (propagateMCLabels) {
 
 1216      for (int32_t lane = 0; lane < maxLane; lane++) {
 
 1220    if (buildNativeHost && buildNativeGPU && anyLaneHasData) {
 
 1222        mOutputQueue.emplace_back(outputQueueEntry{(void*)((char*)&tmpNativeClusters[nClsFirst] - (char*)&tmpNativeClusters[0]), &mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), RecoStep::TPCClusterFinding});
 
 1224        GPUMemCpy(RecoStep::TPCClusterFinding, (void*)&tmpNativeClusters[nClsFirst], (const void*)&mInputsShadow->mPclusterNativeBuffer[nClsFirst], (nClsTotal - nClsFirst) * sizeof(tmpNativeClusters[0]), mRec->NStreams() - 1, false);
 
 1228    if (mWaitForFinalInputs && iSectorBase >= 21 && (int32_t)iSectorBase < 21 + GetProcessingSettings().nTPCClustererLanes) {
 
 1229      notifyForeignChainFinished();
 
 1231    if (mWaitForFinalInputs && iSectorBase >= 30 && (int32_t)iSectorBase < 30 + GetProcessingSettings().nTPCClustererLanes) {
 
 1232      mWaitForFinalInputs();
 
 1237#ifdef GPUCA_HAS_ONNX 
 1240        LOG(info) << "(ORT) Environment releasing...";
 
 1248    if (transferRunning[i]) {
 
 1255    if (triggerOutput && triggerOutput->allocator) {
 
 1264  if (propagateMCLabels) {
 
 1267    std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
 
 1270        throw std::runtime_error("Cluster MC Label buffer missing");
 
 1273      buffer = {&container->first, &container->second};
 
 1281    assert(propagateMCLabels ? mcLinearLabels.header.size() == nClsTotal : true);

 1282    assert(propagateMCLabels ? mcLinearLabels.data.size() >= nClsTotal : true);
 
 1287    mcLabelsConstView = buffer.second;
 
 1293    tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
 
 1299  if (buildNativeHost) {
 
 1305      auto allocator = [this, &tmpNativeClusters](size_t size) {
 
 1308        return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
 
 1310      RunTPCClusterFilter(tmpNativeAccess, allocator, false);
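
// Annotation (not part of the listed source): RunTPCClusterFilter is handed an
// allocator callback so it can request the output buffer once the filtered
// cluster count is known; the lambda above simply hands back the host output
// buffer. Generic sketch of that callback idiom (types and the selection cut
// are illustrative only, not the real interface):
#include <cstddef>
#include <functional>
#include <vector>

struct Cluster { float pad, time, charge; };

size_t filterClusters(const std::vector<Cluster>& in,
                      const std::function<Cluster*(size_t)>& allocate)
{
  size_t n = 0;
  for (const auto& c : in) {
    n += (c.charge > 10.f);    // placeholder selection
  }
  Cluster* out = allocate(n);  // caller decides where the output lives
  size_t i = 0;
  for (const auto& c : in) {
    if (c.charge > 10.f) {
      out[i++] = c;
    }
  }
  return n;
}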
 
 1315  if (!mWaitForFinalInputs) {
 
 1316    notifyForeignChainFinished();
 
 1319  if (buildNativeGPU) {
 
 1324    mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
 
 1327  if (doGPU && synchronizeOutput) {
 
 1330  if (doGPU && synchronizeCalibUpdate) {
 
 1339    if (buildNativeGPU) {
 
 1340      GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true);
 
 1345  if (mPipelineNotifyCtx) {
 
 1347    mPipelineNotifyCtx = nullptr;