#if !defined(GPUCA_GPUCODE)
#include "GPUTPCCFClusterizer.inc"
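// Invocation of the standard GPUTPCCFClusterizer; the NN classification result (outputDataClass) decides whether this path is taken for a given peak.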
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
if (clustererNN.outputDataClass[glo_idx] == 0) {
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec,
                                         CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart);
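// Single-output classification: a peak is accepted when the network probability exceeds the configured threshold.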
processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold);
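// Multi-output classification: take the argmax over the model's class probabilities for this peak.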
auto& clusterer = processors.tpcNNClusterer[sector];
uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes;
float current_max_prob = 0.f;
uint class_label = 0;
for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) {
  if (pIdx == elem_iterator) {
    current_max_prob = clusterer.modelProbabilities[pIdx];
  } else {
    class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label);
  }
}
clusterer.outputDataClass[glo_idx + batchStart] = class_label;
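// Publish clusters produced by the one-cluster regression model (Reg1); one thread handles one peak.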
if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) {
  return;
}
GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart);
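// Publish clusters produced by the two-cluster regression model (Reg2).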
if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) {
  return;
}
GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart);
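// Pad offset between a reference row and the current row, keeping the input window centered when the number of pads per row changes.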
return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
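// Row offset applied above the IROC/OROC boundary (row 62) to account for the gap rows inserted between the two readout chambers in the NN input window.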
return (row > 62 ? global_shift : 0);
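// A position is a boundary if it lies outside the pad plane of its row or inside the artificial gap between IROC and OROC; boundary elements receive the fill value instead of a charge.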
if (pad < 0 || row < 0) {
  return true;
} else if (row < 63) {
  return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
} else if (row < (63 + global_shift)) {
  return true; // rows in the gap between IROC and OROC are treated as boundary
} else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
  return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
} else {
  return true;
}
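// fillInputData: builds the NN input for one peak as a charge window of (2*nnClusterizerSizeInputRow+1) x (2*nnClusterizerSizeInputPad+1) x (2*nnClusterizerSizeInputTime+1) elements around the peak, normalized to the central charge, optionally followed by index data (sector, row, pad).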
GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize;

  ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
  int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time());
  float central_charge = static_cast<float>(chargeMap[peak].unpack());
  clustererNN.peakPositions[glo_idx] = peak;
  clustererNN.centralCharges[glo_idx] = central_charge;
  clustererNN.outputDataClass[glo_idx + batchStart] = -1;
  int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow);
  for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) {
    int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
    for (int p = -clustererNN.nnClusterizerSizeInputPad + pad_offset; p <= clustererNN.nnClusterizerSizeInputPad + pad_offset; p++) {
      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow);
      for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) {
        if (!is_boundary) {
          ChargePos tmp_pos(row + r, pad + p, time + t);
          if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) {
            clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]);
            clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx];
          }
          if (dtype == 0) {
            clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
          } else {
            clustererNN.inputData32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
          }
        } else {
          if (dtype == 0) {
            clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue));
          } else {
            clustererNN.inputData32[write_idx] = static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue);
          }
        }
        write_idx++;
      }
    }
  }
  if (clustererNN.nnClusterizerAddIndexData) {
    if (dtype == 0) {
      clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f);
      clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
      clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
    } else {
      clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f;
      clustererNN.inputData32[write_idx + 1] = row / 152.f;
      clustererNN.inputData32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
    }
  }
}
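// publishClustersReg1: converts the one-cluster regression output (pad/time corrections, sigmas, charge scale) into a native cluster and sorts it into the per-row output buckets.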
GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint full_glo_idx = glo_idx + batchStart;
  int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes;
  ClusterAccumulator pc;

  if (clustererNN.outputDataClass[full_glo_idx] == 1) {
    CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack()));
    // argument list follows GPUTPCCFClusterizer::buildCluster (charge map, peak position, shared-memory broadcast/scratch buffers, cluster and MC-label accumulators)
    GPUTPCCFClusterizer::buildCluster(
      clusterer.Param().rec, chargeMap, clustererNN.peakPositions[glo_idx],
      smem.posBcast, smem.buf, smem.innerAboveThreshold,
      &pc, labelAcc);
    if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }
    pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4],
               static_cast<float>(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index],
               clustererNN.outputDataReg1[model_output_index + 2],
               (clusterer.mPmemory->fragment).start + static_cast<float>(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1],
               clustererNN.outputDataReg1[model_output_index + 3],
               clustererNN.clusterFlags[2 * glo_idx],
               clustererNN.clusterFlags[2 * glo_idx + 1]);
    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }
    uint32_t rowIndex = 0;
    if (clusterer.mPclusterByRow != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        clustererNN.peakPositions[glo_idx].row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterer.mPclusterByRow);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow));
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
  }
}
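// publishClustersReg2: same flow as Reg1, but the two-cluster regression model predicts two cluster candidates per peak; both are converted and sorted into the output buckets.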
GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint full_glo_idx = glo_idx + batchStart;
  int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes;
  ClusterAccumulator pc;

  if (clustererNN.outputDataClass[full_glo_idx] > 0) {
    CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack()));
    GPUTPCCFClusterizer::buildCluster(
      clusterer.Param().rec, chargeMap, clustererNN.peakPositions[glo_idx],
      smem.posBcast, smem.buf, smem.innerAboveThreshold,
      &pc, labelAcc);
    if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }
    pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8],
               static_cast<float>(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index],
               clustererNN.outputDataReg2[model_output_index + 4],
               (clusterer.mPmemory->fragment).start + static_cast<float>(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2],
               clustererNN.outputDataReg2[model_output_index + 6],
               clustererNN.clusterFlags[2 * glo_idx],
               clustererNN.clusterFlags[2 * glo_idx + 1]);
    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }
    uint32_t rowIndex = 0;
    if (clusterer.mPclusterByRow != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        clustererNN.peakPositions[glo_idx].row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterer.mPclusterByRow);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow));
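    // Second cluster candidate from the two-cluster regression output (odd-indexed pad/time/sigma outputs, charge scale at index 9).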
    pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9],
               static_cast<float>(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1],
               clustererNN.outputDataReg2[model_output_index + 5],
               (clusterer.mPmemory->fragment).start + static_cast<float>(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3],
               clustererNN.outputDataReg2[model_output_index + 7],
               clustererNN.clusterFlags[2 * glo_idx],
               clustererNN.clusterFlags[2 * glo_idx + 1]);
    rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }
    if (clusterer.mPclusterByRow != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        clustererNN.peakPositions[glo_idx].row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterer.mPclusterByRow);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
  }
}