auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1);
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment,
                                         reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec,
                                         CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
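
// Fill the NN input tensor, one thread per cluster: loop over the (row, pad, time)
// window around the peak and store charges normalized to the central (peak) charge.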
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
int32_t row = static_cast<int>(peak.row());
int32_t pad = static_cast<int>(peak.pad());
int32_t time = static_cast<int>(peak.time());
float central_charge = static_cast<float>(chargeMap[peak].unpack());
int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);

for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
  int32_t target_row = row + r;
  // ...
  int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
  for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
    int32_t target_pad = pad + p;
    bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);
    for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
      int32_t target_time = time + t;
      if (is_boundary) {
        // Outside the charge map: store the configured fill value.
        float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
        } else if (dtype == 1) {
          clustererNN.mInputData_32[write_idx] = boundary_value;
        }
      } else {
        // Inside the charge map: store the charge normalized to the central charge.
        CfChargePos tmp_pos(target_row, target_pad, target_time);
        float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
        } else if (dtype == 1) {
          clustererNN.mInputData_32[write_idx] = normalized_charge;
        }
      }
      ++write_idx;
    }
  }
}

if (clustererNN.mNnClusterizerAddIndexData) {
  // Append per-cluster index data after the charge window; the third entry is the
  // pad position normalized to the number of pads in the row.
  if (dtype == 0) {
    // ...
    clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
  } else if (dtype == 1) {
    // ...
    clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
  }
}

if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
  // Without precomputed deconvolution flags, count neighbouring peaks to fill the cluster flags.
  clustererNN.mClusterFlags[2 * glo_idx] = 0;
  clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
  for (uint16_t i = 0; i < 8; ++i) {
    Delta2 d = cfconsts::InnerNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
  }
  clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
}
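
// Element-parallel variant of the input fill: one thread per (row, time) element of the
// window loops over the pads of that element; the extra threads per cluster write the
// per-cluster index data.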
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) {
  return;
}
uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);
if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
  return;
}
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
float central_charge = static_cast<float>(chargeMap[peak].unpack());
int32_t row = static_cast<int>(peak.row());
int32_t pad = static_cast<int>(peak.pad());
int32_t time = static_cast<int>(peak.time());

if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
  // Threads past the charge window write one entry of the per-cluster index data.
  int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
  uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
  float index_values[3] = {/* ... */
                           static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
  if (dtype == 0) {
    clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
  } else if (dtype == 1) {
    clustererNN.mInputData_32[write_idx] = index_values[data_idx];
  }

  if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) {
    // Only one of the index threads counts neighbouring peaks for the cluster flags.
    uint8_t cluster_flags = 0;
    for (uint16_t i = 0; i < 8; i++) {
      Delta2 d = cfconsts::InnerNeighbors[i];
      CfChargePos tmp_pos = peak.delta(d);
      cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
    }
    clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
    clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
  }
}

if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
  // Threads inside the charge window fill one (row, time) slot for all pads.
  int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
  int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow;
  int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
  int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime;
  int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;
  int32_t target_row = row + r_local;
  // ...
  int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
  int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
  for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
    if (is_row_boundary) {
      float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
      if (dtype == 0) {
        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val;
      } else if (dtype == 1) {
        clustererNN.mInputData_32[write_idx] = boundary_val;
      }
      write_idx += clustererNN.mNnClusterizerFullTimeSize;
      continue;
    }
    int32_t target_pad = pad + p_local;
    int32_t target_time = time + t_local;
    int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);
    float output_value;
    if (is_boundary) {
      output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
    } else {
      CfChargePos tmp_pos(target_row, target_pad, target_time);
      output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
    }
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
    } else if (dtype == 1) {
      clustererNN.mInputData_32[write_idx] = output_value;
    }
    write_idx += clustererNN.mNnClusterizerFullTimeSize;
  }
}
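
// Classification: scan the classifier outputs for each cluster and store the resulting
// class label; labels > 1 additionally set both cluster flags. Without classification,
// every cluster is labelled as accepted (1).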
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
if (clustererNN.mNnClusterizerUseClassification) {
  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
  float current_max_prob = 0.f;
  uint32_t class_label = 0;
  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
    if (pIdx == elem_iterator) {
      if (dtype == 0) {
        current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
      } else if (dtype == 1) {
        current_max_prob = clustererNN.mModelProbabilities_32[pIdx];
      }
    } else {
      if (dtype == 0) {
        current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
      } else if (dtype == 1) {
        current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]);
      }
    }
  }
  // ...
  clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
  if (class_label > 1) {
    clustererNN.mClusterFlags[2 * glo_idx] = 1;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
  }
} else {
  clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
}
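
// Publish the first regression head (Reg1). Judging by the setFull() call below, the five
// outputs per peak act as pad offset, time offset, pad width, time width, and a charge
// scale; the result is converted to a native cluster and sorted into its row bucket.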
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
uint32_t full_glo_idx = glo_idx + batchStart;
int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;

CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)];
float central_charge = static_cast<float>(chargeMap[peak].unpack());

if (full_glo_idx >= maxClusterNum) {
  GPUTPCCFClusterizer::buildCluster(
    clusterer.Param().rec,
    /* ... */
    smem.innerAboveThreshold,
    /* ... */);
  return;
}

GPUTPCCFClusterizer::buildCluster(
  clusterer.Param().rec,
  /* ... */
  smem.innerAboveThreshold,
  /* ... */);

if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
  }
  return;
}

// Suppress the widths if the peak has no charged neighbour in pad or time direction.
bool notSinglePad = false, notSingleTime = false;
for (uint16_t i = 0; i < 8; i++) {
  Delta2 d = cfconsts::InnerNeighbors[i];
  CfChargePos tmp_pos = peak.delta(d);
  notSinglePad |= (d.x != 0) && (static_cast<float>(chargeMap[tmp_pos].unpack()) > 0);
  notSingleTime |= (d.y != 0) && (static_cast<float>(chargeMap[tmp_pos].unpack()) > 0);
}

if (dtype == 0) {
  pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
             static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
             notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
             (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
             notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
} else if (dtype == 1) {
  pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
             static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
             notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
             (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
             notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
}

bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
if (clustererNN.mNnClusterizerUseClassification) {
  rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
}
if (rejectCluster) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
  }
  return;
}

uint32_t rowIndex = 0;
if (clusterOut != nullptr) {
  rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
    clusterer,
    myCluster,
    peak.row(),
    clusterer.mNMaxClusterPerRow,
    clusterer.mPclusterInRow,
    clusterOut);
  if (clusterer.mPclusterPosInRow != nullptr) {
    clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
  }
} else if (clusterer.mPclusterPosInRow) {
  rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
}
CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
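
// Publish the second regression head (Reg2): it carries two clusters per peak, with the
// even output indices describing the first cluster and the odd indices the second; each
// is converted to a native cluster and stored separately.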
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
  return;
}
uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
float central_charge = static_cast<float>(chargeMap[peak].unpack());
uint32_t full_glo_idx = glo_idx + batchStart;

if (full_glo_idx >= maxClusterNum) {
  GPUTPCCFClusterizer::buildCluster(
    clusterer.Param().rec,
    /* ... */
    smem.innerAboveThreshold,
    /* ... */);
  return;
}

uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;

GPUTPCCFClusterizer::buildCluster(
  clusterer.Param().rec,
  /* ... */
  smem.innerAboveThreshold,
  /* ... */);

if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
  }
  return;
}

// First of the two regressed clusters (even output indices).
if (dtype == 0) {
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
             static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
             clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
             (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
             clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
} else if (dtype == 1) {
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
             static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
             clustererNN.mOutputDataReg2_32[model_output_index + 4],
             (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
             clustererNN.mOutputDataReg2_32[model_output_index + 6],
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
}

bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
if (clustererNN.mNnClusterizerUseClassification) {
  rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
}
if (rejectCluster) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
  }
  return;
}

uint32_t rowIndex = 0;
if (clusterOut != nullptr) {
  rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
    clusterer,
    myCluster,
    peak.row(),
    clusterer.mNMaxClusterPerRow,
    clusterer.mPclusterInRow,
    clusterOut);
  if (clusterer.mPclusterPosInRow != nullptr) {
    clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
  }
} else if (clusterer.mPclusterPosInRow) {
  rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
}
CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));

// Second of the two regressed clusters (odd output indices).
if (dtype == 0) {
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
             static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
             clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
             (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
             clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
} else if (dtype == 1) {
  pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
             static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
             clustererNN.mOutputDataReg2_32[model_output_index + 5],
             (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
             clustererNN.mOutputDataReg2_32[model_output_index + 7],
             clustererNN.mClusterFlags[2 * glo_idx],
             clustererNN.mClusterFlags[2 * glo_idx + 1]);
}

rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
if (clustererNN.mNnClusterizerUseClassification) {
  rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
}
if (rejectCluster) {
  if (clusterer.mPclusterPosInRow) {
    clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
  }
  return;
}

if (clusterOut != nullptr) {
  rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
    clusterer,
    myCluster,
    peak.row(),
    clusterer.mNMaxClusterPerRow,
    clusterer.mPclusterInRow,
    clusterOut);
  if (clusterer.mPclusterPosInRow != nullptr) {
    clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
  }
} else if (clusterer.mPclusterPosInRow) {
  rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
}