auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
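// Guard against out-of-range threads, then assemble the NN input tensor for
// this thread's peak: a (row, pad, time) window of charges around the peak,
// normalized to the central charge (summary inferred from the code below).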
if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
const int32_t row = static_cast<int32_t>(peak.row());
const int32_t pad = static_cast<int32_t>(peak.pad());
const int32_t time = static_cast<int32_t>(peak.time());
const float central_charge = static_cast<float>(chargeMap[peak].unpack());
const float inverse_charge = 1.f / central_charge;
const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow;
const int32_t npads_row = GPUTPCGeometry::NPads(row);
float output_value = clustererNN.mNnClusterizerBoundaryFillValue;
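// Walk the input window row by row; pads and time bins follow below. Cells
// outside the detector acceptance keep the boundary fill value.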
for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) {
  const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row));
  const int32_t npads_reference = is_boundary ? 0 : GPUTPCGeometry::NPads(target_row - row_offset);
  for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) {
    is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);
    for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) {
      if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) {
        output_value = clustererNN.mNnClusterizerBoundaryFillValue;
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
        } else if (dtype == 1) {
          clustererNN.mInputData_32[write_idx] = output_value;
        }
      } else {
        CfChargePos tmp_pos(target_row, target_pad, target_time);
        output_value = chargeMap[tmp_pos].unpack() * inverse_charge;
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
        } else if (dtype == 1) {
          clustererNN.mInputData_32[write_idx] = output_value;
        }
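// Optionally append index features behind the charge window, e.g. the peak's
// pad position normalized to the number of pads in its row.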
if (clustererNN.mNnClusterizerAddIndexData) {
  if (dtype == 0) {
    clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
  } else if (dtype == 1) {
    clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
  }
}
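// Without precomputed deconvolution flags, derive the split flags by counting
// peaks among the eight inner neighbors of the central charge.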
if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
  clustererNN.mClusterFlags[2 * glo_idx] = 0;
  clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
  for (uint16_t i = 0; i < 8; ++i) {
    Delta2 d = cfconsts::InnerNeighbors[i];
    clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
  }
  clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
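// Second fill variant: the same input tensor is written cooperatively, with
// mNnClusterizerRowTimeSizeThreads threads per cluster, each handling one
// (row, time) pair of the window (structure inferred from the index math below).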
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) {
  return;
}
const uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads;
const uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads);
if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
  return;
}
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
const float central_charge = chargeMap[peak].unpack();
const int32_t row = static_cast<int32_t>(peak.row());
const int32_t pad = static_cast<int32_t>(peak.pad());
const int32_t time = static_cast<int32_t>(peak.time());
if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
  uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize;
  const int32_t npads = GPUTPCGeometry::NPads(row);
  if (dtype == 0) {
    clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
  } else if (dtype == 1) {
    clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads;
  }
}
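// Threads whose transient_index falls inside mNnClusterizerRowTimeSize fill
// the charge window itself: each thread handles one (row, time) pair and
// loops over the pads.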
if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
  const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
  const int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
  int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;
  const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow;
  float output_value = clustererNN.mNnClusterizerBoundaryFillValue;
  for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) {
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
    } else if (dtype == 1) {
      clustererNN.mInputData_32[write_idx] = output_value;
    }
    write_idx += clustererNN.mNnClusterizerFullTimeSize;
  }
  const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
  const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
  const float inverse_central_charge = 1.f / central_charge;
  const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
  const int32_t npads = GPUTPCGeometry::NPads(target_row);
  const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
  const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;
  for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
    if (target_pad >= npads || target_pad < 0 || is_time_boundary) {
      output_value = clustererNN.mNnClusterizerBoundaryFillValue;
    } else {
      CfChargePos pos(target_row, target_pad, target_time);
      output_value = chargeMap[pos].unpack() * inverse_central_charge;
    }
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
    } else if (dtype == 1) {
      clustererNN.mInputData_32[write_idx] = output_value;
    }
    write_idx += clustererNN.mNnClusterizerFullTimeSize;
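// Class-label assignment: find the most probable class among the
// classification outputs. Label 0 rejects a candidate and labels above 1
// appear to mark overlapping clusters (inferred from the flag handling
// below); without classification, every candidate is accepted with label 1.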
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
if (clustererNN.mNnClusterizerUseClassification) {
  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
  float current_max_prob = 0.f;
  uint32_t class_label = 0;
  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
    if (pIdx == elem_iterator) {
      if (dtype == 0) {
        current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
      } else if (dtype == 1) {
        current_max_prob = clustererNN.mModelProbabilities_32[pIdx];
      }
    } else {
      if (dtype == 0) {
        current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
      } else if (dtype == 1) {
        current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]);
      }
  clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
  if (class_label > 1) {
    clustererNN.mClusterFlags[2 * glo_idx] = 1;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
  }
} else {
  clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
}
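// Publish clusters from the single-cluster regression head. Per-cluster
// output layout, inferred from the indices used below: [0] pad offset,
// [1] time offset, [2] sigma(pad), [3] sigma(time), [4] qTot scale.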
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
uint32_t full_glo_idx = glo_idx + batchStart;
int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
uint32_t peakIndex = CAMath::Min(full_glo_idx, maxClusterNum - 1);
CfChargePos peak = clusterer.mPfilteredPeakPositions[peakIndex];
float central_charge = static_cast<float>(chargeMap[peak].unpack());
if (full_glo_idx >= maxClusterNum) {
  GPUTPCCFClusterizer::buildCluster(
    clusterer.Param().rec,
    smem.innerAboveThreshold,
GPUTPCCFClusterizer::buildCluster(
  clusterer.Param().rec,
  smem.innerAboveThreshold,
if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
bool notSinglePad = false, notSingleTime = false;
for (uint16_t i = 0; i < 8; i++) {
  Delta2 d = cfconsts::InnerNeighbors[i];
  float v = static_cast<float>(chargeMap[tmp_pos].unpack());
  notSinglePad |= (d.x != 0) && (v > 0.f);
  notSingleTime |= (d.y != 0) && (v > 0.f);
}
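// A cluster confined to a single pad (or a single time bin) is published
// with zero width in that direction.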
float publishPadPosition = 0.f, publishTimePosition = 0.f;
if (dtype == 0) {
  publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat();
  publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat();
  isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
  pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
             publishPadPosition,
             notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
             (clusterer.mPmemory->fragment).start + publishTimePosition,
             notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
} else if (dtype == 1) {
  publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index];
  publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1];
  isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
  pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
             publishPadPosition,
             notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
             (clusterer.mPmemory->fragment).start + publishTimePosition,
             notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
}
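// Convert to the native cluster format, reject candidates the classification
// network labeled as background (class <= 0), and sort survivors into their
// per-row buckets.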
bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
if (clustererNN.mNnClusterizerUseClassification) {
  rejectCluster |= (clustererNN.mOutputDataClass[peakIndex] <= 0);
}
if (rejectCluster) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
  }
  return;
}
uint32_t rowIndex = 0;
if (clusterOut != nullptr) {
  rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
    clusterer.mNMaxClusterPerRow,
    clusterer.mPclusterInRow,
  if (clusterer.mPclusterPosInRow != nullptr) {
    clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
  }
} else if (clusterer.mPclusterPosInRow) {
  rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
}
CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
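// Publish clusters from the two-cluster regression head used for overlapping
// peaks. Outputs come in per-quantity pairs, inferred from the indices used
// below: [0,1] pad offsets, [2,3] time offsets, [4,5] sigma(pad),
// [6,7] sigma(time), [8,9] qTot scales.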
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
float central_charge = static_cast<float>(chargeMap[peak].unpack());
uint32_t full_glo_idx = glo_idx + batchStart;
if (full_glo_idx >= maxClusterNum) {
  GPUTPCCFClusterizer::buildCluster(
    clusterer.Param().rec,
    smem.innerAboveThreshold,
uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
GPUTPCCFClusterizer::buildCluster(
  clusterer.Param().rec,
  smem.innerAboveThreshold,
if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
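// First regressed cluster (first element of each output pair).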
float publishPadPosition = 0.f, publishTimePosition = 0.f;
if (dtype == 0) {
  publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat();
  publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat();
  isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
             publishPadPosition,
             clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
             (clusterer.mPmemory->fragment).start + publishTimePosition,
             clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
} else if (dtype == 1) {
  publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index];
  publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2];
  isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
             publishPadPosition,
             clustererNN.mOutputDataReg2_32[model_output_index + 4],
             (clusterer.mPmemory->fragment).start + publishTimePosition,
             clustererNN.mOutputDataReg2_32[model_output_index + 6],
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
}
bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
if (clustererNN.mNnClusterizerUseClassification) {
  rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
}
if (clusterer.mPclusterPosInRow) {
  clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
}
uint32_t rowIndex = 0;
if (clusterOut != nullptr) {
  rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
    clusterer.mNMaxClusterPerRow,
    clusterer.mPclusterInRow,
  if (clusterer.mPclusterPosInRow != nullptr) {
    clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
  }
} else if (clusterer.mPclusterPosInRow) {
  rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
}
CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
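// Second regressed cluster (second element of each output pair), published
// through the same accumulate, convert, and sort sequence.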
if (dtype == 0) {
  publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
  publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat();
  isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
             publishPadPosition,
             clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
             (clusterer.mPmemory->fragment).start + publishTimePosition,
             clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
} else if (dtype == 1) {
  publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
  publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3];
  isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
             publishPadPosition,
             clustererNN.mOutputDataReg2_32[model_output_index + 5],
             (clusterer.mPmemory->fragment).start + publishTimePosition,
             clustererNN.mOutputDataReg2_32[model_output_index + 7],
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
}
rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
if (clustererNN.mNnClusterizerUseClassification) {
  rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
}
if (clusterer.mPclusterPosInRow) {
  clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
}
if (clusterOut != nullptr) {
  rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
    clusterer.mNMaxClusterPerRow,
    clusterer.mPclusterInRow,
  if (clusterer.mPclusterPosInRow != nullptr) {
    clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
  }
} else if (clusterer.mPclusterPosInRow) {
  rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
}