// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCNNClusterizerKernels.cxx

#include "clusterFinderDefs.h"
#include "PackedCharge.h"
#include "GPUTPCNNClusterizerKernels.h"
#include "GPUConstantMem.h"
#include "GPUTPCClusterFinder.h"
#include "GPUTPCCFClusterizer.h"
#include "GPUTPCGeometry.h"

using namespace o2::gpu;
using namespace o2::gpu::tpccf;

#include "CfConsts.h"
#include "CfUtils.h"
#include "ClusterAccumulator.h"

#if !defined(GPUCA_GPUCODE)
#include "GPUHostDataTypes.h"
#include "MCLabelAccumulator.h"
#endif

#ifdef GPUCA_GPUCODE
#include "GPUTPCCFClusterizer.inc"
#endif

// Individual thread functions for input-data filling, class-label determination and running the CF clusterizer
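// runCfClusterizer: one thread per peak; runs the standard heuristic CF clusterizer. If the NN classification
// output is in use, peaks with a non-positive class label are passed on as not accepted.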
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
  tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
  int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1);
  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
}

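// fillInputNNCPU: one thread per peak. Sequentially fills the network input tensor with the charges in the
// row/pad/time window around the peak, normalized to the central charge (boundary positions get the fill value),
// optionally appends the index data (sector, row, pad), and, if the deconvolution flags are not provided by a
// dedicated kernel, derives the cluster flags from the number of neighboring peaks in the 3x3 window.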
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  const uint32_t glo_idx = get_global_id(0);
  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
    return;
  }

  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  const int32_t row = static_cast<int>(peak.row());
  const int32_t pad = static_cast<int>(peak.pad());
  const int32_t time = static_cast<int>(peak.time());
  const float central_charge = static_cast<float>(chargeMap[peak].unpack());
  const float inverse_charge = 1.f / central_charge;

  const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
  const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow;
  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow;
  const int32_t npads_row = GPUTPCGeometry::NPads(row);
  float output_value = clustererNN.mNnClusterizerBoundaryFillValue;

  for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) {
    uint8_t is_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
    const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row));
    const int32_t npads_reference = is_boundary ? 0 : GPUTPCGeometry::NPads(target_row - row_offset);

    for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) {
      is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);

      for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) {
        if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) {
          // Fill boundary value
          output_value = clustererNN.mNnClusterizerBoundaryFillValue;
          if (dtype == 0) {
            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
          } else {
            clustererNN.mInputData_32[write_idx] = output_value;
          }
        } else {
          CfChargePos tmp_pos(target_row, target_pad, target_time);
          output_value = chargeMap[tmp_pos].unpack() * inverse_charge;
          if (dtype == 0) {
            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
          } else {
            clustererNN.mInputData_32[write_idx] = output_value;
          }
        }
        // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
        //   printf("Warning: Input data mismatch at index %d, %d - row, pad, time: %d, %d, %d : %f -> %f\n", glo_idx, glo_idx + batchStart, r, p, t,
        //          static_cast<float>(clustererNN.mInputData_16_Test[write_idx]), static_cast<float>(clustererNN.mInputData_16[write_idx]));
        // }
        write_idx++;
      }
    }
  }

  if (clustererNN.mNnClusterizerAddIndexData) {
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
    } else {
      clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
      clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
    }
  }

  if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
    clustererNN.mClusterFlags[2 * glo_idx] = 0;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;

    for (uint16_t i = 0; i < 8; ++i) {
      Delta2 d = cfconsts::InnerNeighbors[i];
      CfChargePos tmp_pos = peak.delta(d);
      clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
    }
    clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
  }
}

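// fillInputNNGPU: GPU-oriented variant of the input filling. Each peak is processed by
// mNnClusterizerRowTimeSizeThreads threads; every thread fills one (row, time) slice of the charge window by
// looping over the pad direction, and the threads beyond the row-time range write the optional index data.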
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  const uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) {
    return;
  }

  const uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads;
  const uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads);

  // Early exit for out-of-bounds threads
  if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
    return;
  }
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);

  // Use dedicated neural network shared memory arrays for warp-level caching
  // First thread in each warp loads shared data
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  const float central_charge = chargeMap[peak].unpack();
  const int32_t row = static_cast<int>(peak.row());
  const int32_t pad = static_cast<int>(peak.pad());
  const int32_t time = static_cast<int>(peak.time());

  // Handle index data with fewer branches
  if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
    uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize;
    const int32_t npads = GPUTPCGeometry::NPads(row);
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
    } else {
      clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
      clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads;
    }
  }

  // Main data processing - optimize index calculations
  if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
    // Optimize 3D index calculation
    const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
    const int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
    int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;

    // Early boundary check for row
    const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow;
    float output_value = clustererNN.mNnClusterizerBoundaryFillValue;

    if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) {
      for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) {
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
        } else {
          clustererNN.mInputData_32[write_idx] = output_value;
        }
        write_idx += clustererNN.mNnClusterizerFullTimeSize;
      }
      return;
    } else {
      // Calculate offsets
      const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
      const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
      const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide
      const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
      const int32_t npads = GPUTPCGeometry::NPads(target_row);

      const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
      const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;

      for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
        if (target_pad >= npads || target_pad < 0 || is_time_boundary) {
          output_value = clustererNN.mNnClusterizerBoundaryFillValue;
        } else {
          CfChargePos pos(target_row, target_pad, target_time);
          // one load + one multiply
          output_value = chargeMap[pos].unpack() * inverse_central_charge;
        }
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
        } else {
          clustererNN.mInputData_32[write_idx] = output_value;
        }
        write_idx += clustererNN.mNnClusterizerFullTimeSize;
      }
      return;
    }
  }
}

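// determineClass1Labels: binary classification. A cluster is accepted (label 1) if the single network output
// exceeds mNnClassThreshold; without classification all clusters are accepted.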
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
    return;
  }
  if (clustererNN.mNnClusterizerUseClassification) {
    if (dtype == 0) {
      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
    } else if (dtype == 1) {
      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold);
    }
  } else {
    clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
  }
}

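// determineClass2Labels: multi-class classification. The class with the largest network output becomes the
// label; labels greater than 1 additionally set both cluster flags.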
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
    return;
  }
  if (clustererNN.mNnClusterizerUseClassification) {
    uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
    float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
    uint32_t class_label = 0;
    // The outputs of the class network are the per-class probabilities: the class with the highest output "wins"
    for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
      float prob = 0.f;
      if (dtype == 0) {
        prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
      } else if (dtype == 1) {
        prob = clustererNN.mModelProbabilities_32[pIdx];
      }
      if (pIdx == elem_iterator || prob > current_max_prob) {
        current_max_prob = prob;
        class_label = pIdx - elem_iterator;
      }
    }
    clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
    if (class_label > 1) {
      clustererNN.mClusterFlags[2 * glo_idx] = 1;
      clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
    }
  } else {
    clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
  }
}

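// publishClass1Regression: publishes one cluster per peak from the regression-network outputs (pad/time offset
// relative to the peak, widths and a charge scale applied to the central charge), converts it to ClusterNative,
// applies the classification-based rejection and sorts it into the per-row output buckets. MC labels are
// accumulated on the CPU through the same buildCluster call as in the heuristic clusterizer.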
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
    return;
  }

  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
  uint32_t full_glo_idx = glo_idx + batchStart;
  int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  uint32_t peakIndex = CAMath::Min(full_glo_idx, maxClusterNum - 1);
  CfChargePos peak = clusterer.mPfilteredPeakPositions[peakIndex];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);

  if (full_glo_idx >= maxClusterNum) {
    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    return;
  }

  tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
  ClusterAccumulator pc;

  if (withMC) {
    ClusterAccumulator dummy_pc;
    CPU_ONLY(labelAcc->collect(peak, central_charge));
    GPUTPCCFClusterizer::buildCluster(
      clusterer.Param().rec,
      chargeMap,
      peak,
      smem.posBcast,
      smem.buf,
      smem.innerAboveThreshold,
      &dummy_pc,
      labelAcc);
  }
  if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }

  bool notSinglePad = false, notSingleTime = false;
  for (uint16_t i = 0; i < 8; i++) {
    Delta2 d = cfconsts::InnerNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    float v = static_cast<float>(chargeMap[tmp_pos].unpack());
    notSinglePad |= (d.x != 0) && (v > 0.f);
    notSingleTime |= (d.y != 0) && (v > 0.f);
  }

  float publishPadPosition = 0.f, publishTimePosition = 0.f;
  if (dtype == 0) {
    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat();
    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat();
    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
    pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
               publishPadPosition,
               notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
               (clusterer.mPmemory->fragment).start + publishTimePosition,
               notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
               clustererNN.mClusterFlags[2 * glo_idx],
               clustererNN.mClusterFlags[2 * glo_idx + 1]);
  } else {
    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index];
    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1];
    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
    pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
               publishPadPosition,
               notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
               (clusterer.mPmemory->fragment).start + publishTimePosition,
               notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
               clustererNN.mClusterFlags[2 * glo_idx],
               clustererNN.mClusterFlags[2 * glo_idx + 1]);
  }

  // if (boundaryFlag != 0) { // Prints the entire NN input for the given index
  //   // Build a simple buffer manually (float with 3 decimals)
  //   const int MAX_CHARS = 4096;
  //   char buffer[MAX_CHARS];
  //   int pos = 0;
  //
  //   auto appendChar = [&](char c) {
  //     if (pos < MAX_CHARS - 1) buffer[pos++] = c;
  //   };
  //   auto appendStr = [&](const char* s) {
  //     while (*s && pos < MAX_CHARS - 1) buffer[pos++] = *s++;
  //   };
  //   auto appendUInt = [&](uint32_t v) {
  //     char tmp[16]; int tp = 0;
  //     if (v == 0) { appendChar('0'); return; }
  //     while (v && tp < 16) { tmp[tp++] = char('0' + (v % 10)); v /= 10; }
  //     while (tp--) appendChar(tmp[tp]);
  //   };
  //   auto appendInt = [&](int v) {
  //     if (v < 0) { appendChar('-'); v = -v; }
  //     appendUInt((uint32_t)v);
  //   };
  //   auto appendFloat = [&](float f) {
  //     if (f < 0) { appendChar('-'); f = -f; }
  //     int ip = (int)f;
  //     float frac = f - (float)ip;
  //     appendInt(ip);
  //     appendChar('.');
  //     for (int i = 0; i < 3; i++) {
  //       frac *= 10.f;
  //       int d = (int)frac;
  //       appendChar((char)('0' + (d < 0 ? 0 : (d > 9 ? 9 : d))));
  //       frac -= d;
  //       if (frac < 0) frac = 0;
  //     }
  //   };
  //
  //   appendStr("(NN CLUS) DEBUG: Boundary cluster detected (sector ");
  //   appendUInt(sector);
  //   appendStr(", row ");
  //   appendUInt(peak.row());
  //   appendStr(", pad ");
  //   appendFloat(publishPadPosition);
  //   appendStr(", time ");
  //   appendFloat(publishTimePosition);
  //   appendStr(") [glo_idx=");
  //   appendUInt(glo_idx);
  //   appendStr(" elemSize=");
  //   appendInt(clustererNN.mNnClusterizerElementSize);
  //   appendStr(" dtype=");
  //   appendInt(dtype);
  //   appendStr("] INPUT:");
  //
  //   int elemSize = clustererNN.mNnClusterizerElementSize;
  //   int baseIdx = glo_idx * elemSize;
  //
  //   int maxPrint = elemSize;
  //   for (int i = 0; i < maxPrint; ++i) {
  //     appendChar(' ');
  //     float v = (dtype == 0) ? clustererNN.mInputData_16[baseIdx + i].ToFloat()
  //                            : clustererNN.mInputData_32[baseIdx + i];
  //     appendFloat(v);
  //     if (pos > (MAX_CHARS - 32)) { appendStr(" ..."); break; }
  //   }
  //
  //   buffer[pos] = 0;
  //   printf("%s\n", buffer);
  // }

  tpc::ClusterNative myCluster;
  bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
  if (clustererNN.mNnClusterizerUseClassification) {
    rejectCluster |= (clustererNN.mOutputDataClass[peakIndex] <= 0);
  }
  if (rejectCluster) {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }

  uint32_t rowIndex = 0;
  if (clusterOut != nullptr) {
    rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
      clusterer,
      myCluster,
      peak.row(),
      clusterer.mNMaxClusterPerRow,
      clusterer.mPclusterInRow,
      clusterOut);
    if (clusterer.mPclusterPosInRow != nullptr) {
      clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
    }
  } else if (clusterer.mPclusterPosInRow) {
    rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
  }
  CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
}

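// publishClass2Regression: like publishClass1Regression, but publishes up to two clusters per peak from the
// class-2 regression outputs (split clusters). The second cluster is converted and sorted independently; how its
// MC labels should be committed is still open (see the commented commit at the end).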
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
    return;
  }

  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
  tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
  uint32_t full_glo_idx = glo_idx + batchStart;

  if (full_glo_idx >= maxClusterNum) {
    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    return;
  }

  uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
  ClusterAccumulator pc;

  if (withMC) {
    ClusterAccumulator dummy_pc;
    CPU_ONLY(labelAcc->collect(peak, central_charge));
    GPUTPCCFClusterizer::buildCluster(
      clusterer.Param().rec,
      chargeMap,
      peak,
      smem.posBcast,
      smem.buf,
      smem.innerAboveThreshold,
      &dummy_pc,
      labelAcc);
  }
  if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }

  // Cluster 1
  float publishPadPosition = 0.f, publishTimePosition = 0.f;
  if (dtype == 0) {
    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat();
    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
    pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
               publishPadPosition,
               clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
               (clusterer.mPmemory->fragment).start + publishTimePosition,
               clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
               clustererNN.mClusterFlags[2 * glo_idx],
               clustererNN.mClusterFlags[2 * glo_idx + 1]);
  } else if (dtype == 1) {
    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index];
    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
    pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
               publishPadPosition,
               clustererNN.mOutputDataReg2_32[model_output_index + 4],
               (clusterer.mPmemory->fragment).start + publishTimePosition,
               clustererNN.mOutputDataReg2_32[model_output_index + 6],
               clustererNN.mClusterFlags[2 * glo_idx],
               clustererNN.mClusterFlags[2 * glo_idx + 1]);
  }

  tpc::ClusterNative myCluster;
  bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
  if (clustererNN.mNnClusterizerUseClassification) {
    rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
  }
  if (rejectCluster) {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }

  uint32_t rowIndex = 0;
  if (clusterOut != nullptr) {
    rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
      clusterer,
      myCluster,
      peak.row(),
      clusterer.mNMaxClusterPerRow,
      clusterer.mPclusterInRow,
      clusterOut);
    if (clusterer.mPclusterPosInRow != nullptr) {
      clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
    }
  } else if (clusterer.mPclusterPosInRow) {
    rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
  }
  CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));

  // Cluster 2
  if (dtype == 0) {
    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat();
    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
    pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
               publishPadPosition,
               clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
               (clusterer.mPmemory->fragment).start + publishTimePosition,
               clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
               clustererNN.mClusterFlags[2 * glo_idx],
               clustererNN.mClusterFlags[2 * glo_idx + 1]);
  } else if (dtype == 1) {
    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3];
    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
    pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
               publishPadPosition,
               clustererNN.mOutputDataReg2_32[model_output_index + 5],
               (clusterer.mPmemory->fragment).start + publishTimePosition,
               clustererNN.mOutputDataReg2_32[model_output_index + 7],
               clustererNN.mClusterFlags[2 * glo_idx],
               clustererNN.mClusterFlags[2 * glo_idx + 1]);
  }

  rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
  if (clustererNN.mNnClusterizerUseClassification) {
    rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
  }
  if (rejectCluster) {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }

  if (clusterOut != nullptr) {
    rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
      clusterer,
      myCluster,
      peak.row(),
      clusterer.mNMaxClusterPerRow,
      clusterer.mPclusterInRow,
      clusterOut);
    if (clusterer.mPclusterPosInRow != nullptr) {
      clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
    }
  } else if (clusterer.mPclusterPosInRow) {
    rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
  }
  // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
}

// ---------------------------------
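// publishDeconvolutionFlags: counts split charges among the inner (3x3) and outer neighbors of each peak,
// separately for offsets in time (d.y) and in pad (d.x), and stores the counts as the two cluster flags.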
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  // Implements the same flag logic as the heuristic clusterizer's deconvolution kernel
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
    return;
  }
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];

  clustererNN.mClusterFlags[2 * glo_idx] = 0;
  clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
  for (int32_t i = 0; i < 8; i++) {
    Delta2 d = cfconsts::InnerNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    PackedCharge charge = chargeMap[tmp_pos];
    clustererNN.mClusterFlags[2 * glo_idx] += (d.y != 0 && charge.isSplit());
    clustererNN.mClusterFlags[2 * glo_idx + 1] += (d.x != 0 && charge.isSplit());
  }
  for (int32_t i = 0; i < 16; i++) {
    Delta2 d = cfconsts::OuterNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    PackedCharge charge = chargeMap[tmp_pos];
    clustererNN.mClusterFlags[2 * glo_idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
    clustererNN.mClusterFlags[2 * glo_idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
  }
}

// The following arithmetic is done because the network is trained with a split at the IROC/OROC boundary
GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
{
  if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) {
    return 0; // Short-circuit for out-of-bounds rows
  } else {
    return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
  }
}

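// Row offset applied to OROC rows (row > 62) so that the row coordinate used for the network input leaves an
// artificial gap between the IROC and the OROC.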
GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset)
{
  return (row > 62 ? offset : 0);
}

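// Boundary check in the shifted row/pad coordinates used for the network input: outside the pad range of the
// row, inside the artificial IROC/OROC gap, or beyond the last row. The caller then writes the boundary fill value.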
GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t maxrow, int32_t iroc_row, int32_t npads_row, int32_t npads_reference)
{
  if (pad < 0) { // Faster short-circuit
    return true;
  } else if (row < 63) {
    return (pad >= npads_row);
  } else if (row < iroc_row) { // Accounts for the gap between IROC and OROC: the charge is set to the boundary fill value to signal the boundary to the neural network
    return true;
  } else if (row < maxrow) {
    return (pad >= npads_reference);
  } else {
    return true;
  }
}

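// Clamps a regressed pad/time position to the valid pad range of the row and to the time-frame fragment length
// before publishing; returns true if a clamp was applied.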
GPUd() bool GPUTPCNNClusterizerKernels::isBoundaryPublish(int32_t idx, int32_t row, float& pad, float& time)
{
  if (pad < 0) {
    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
    pad = 0.f;
    return true;
  } else if (pad >= static_cast<int>(GPUTPCGeometry::NPads(row))) {
    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad >= static_cast<int>(GPUTPCGeometry::NPads(row)): row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
    pad = static_cast<float>(GPUTPCGeometry::NPads(row) - 1);
    return true;
  } else if (time < 0) {
    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
    time = 0.f;
    return true;
  } else if (time >= TPC_MAX_FRAGMENT_LEN_GPU) {
    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time >= TPC_MAX_FRAGMENT_LEN_GPU: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
    time = static_cast<float>(TPC_MAX_FRAGMENT_LEN_GPU - 1);
    return true;
  } else {
    return false;
  }
}