GPUTPCNNClusterizerKernels.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "clusterFinderDefs.h"
16#include "PackedCharge.h"
18#include "GPUConstantMem.h"
19#include "GPUTPCClusterFinder.h"
20#include "GPUTPCCFClusterizer.h"
21#include "GPUTPCGeometry.h"
22
23using namespace o2::gpu;
24using namespace o2::gpu::tpccf;
25
26#include "CfConsts.h"
27#include "CfUtils.h"
28#include "ClusterAccumulator.h"
30
31#if !defined(GPUCA_GPUCODE)
32#include "GPUHostDataTypes.h"
33#include "MCLabelAccumulator.h"
34#endif
35
36#ifdef GPUCA_GPUCODE
37#include "GPUTPCCFClusterizer.inc"
38#endif
39
40// Individual thread functions for input-data filling, class-label determination, regression publishing, and running the CF clusterizer
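// Rough kernel sequence: fillInputNNCPU / fillInputNNGPU assemble the network input tensors, the ONNX
// inference itself runs outside of these kernels, determineClass1Labels / determineClass2Labels digest the
// classification output, and the publish* kernels turn the regression output into ClusterNative clusters.
// runCfClusterizer is the fallback path that publishes clusters with the conventional CF regression and,
// if enabled, uses the NN class output only as an accept/reject decision.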
41template <>
42GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
43{
44 uint32_t glo_idx = get_global_id(0);
45 auto& clusterer = processors.tpcClusterer[sector];
46 auto& clustererNN = processors.tpcNNClusterer[sector];
47 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
48 CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
49 tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
50 int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1);
51 GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
52}
53
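// fillInputNNCPU: one thread per cluster in the current batch. The thread walks a (row, pad, time) window
// around the peak and writes charges, normalized to the central charge, into the flat input array; positions
// outside the sensitive area get mNnClusterizerBoundaryFillValue. As an illustration (hypothetical window
// sizes, not a fixed configuration): with mNnClusterizerSizeInputRow = mNnClusterizerSizeInputPad =
// mNnClusterizerSizeInputTime = 3 the charge window holds 7 * 7 * 7 = 343 values, plus 3 normalized index
// values (sector, row, pad) if mNnClusterizerAddIndexData is set, i.e. mNnClusterizerElementSize = 346.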
54template <>
55GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
56{
57 auto& clusterer = processors.tpcClusterer[sector];
58 auto& clustererNN = processors.tpcNNClusterer[sector];
59
60 uint32_t glo_idx = get_global_id(0);
61 if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
62 return;
63 }
64
65 uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;
66
67 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
68 CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
69 CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
70 int32_t row = static_cast<int>(peak.row());
71 int32_t pad = static_cast<int>(peak.pad());
72 int32_t time = static_cast<int>(peak.time());
73 float central_charge = static_cast<float>(chargeMap[peak].unpack());
74 int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
75
76 for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
77 int32_t target_row = row + r;
78 bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
79 int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
80
81 for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
82 int32_t target_pad = pad + p;
83 bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);
84
85 for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
86 int32_t target_time = time + t;
87
88 if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) {
89 // Fill boundary value
90 float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
91 if (dtype == 0) {
92 clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
93 } else {
94 clustererNN.mInputData_32[write_idx] = boundary_value;
95 }
96 } else {
97 CfChargePos tmp_pos(target_row, target_pad, target_time);
98 float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
99 if (dtype == 0) {
100 clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
101 } else {
102 clustererNN.mInputData_32[write_idx] = normalized_charge;
103 }
104 }
105 // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
106 // printf("Warning: Input data mismatch at index %d, %d - row, pad, time: %d, %d, %d : %f -> %f\n", glo_idx, glo_idx + batchStart, r, p, t,
107 // static_cast<float>(clustererNN.mInputData_16_Test[write_idx]), static_cast<float>(clustererNN.mInputData_16[write_idx]));
108 // }
109 write_idx++;
110 }
111 }
112 }
113
114 if (clustererNN.mNnClusterizerAddIndexData) {
115 if (dtype == 0) {
116 clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
117 clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
118 clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
119 } else {
120 clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
121 clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
122 clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
123 }
124 }
125
126 if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
127 clustererNN.mClusterFlags[2 * glo_idx] = 0;
128 clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
129
130 for (uint16_t i = 0; i < 8; ++i) {
131 Delta2 d = cfconsts::InnerNeighbors[i];
132 CfChargePos tmp_pos = peak.delta(d);
133 clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
134 }
135 clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
136 }
137}
138
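// fillInputNNGPU: same input layout as fillInputNNCPU ([row][pad][time]), but parallelized over
// (cluster, row, time) instead of over clusters only; each thread then loops over the pad dimension.
// The flat per-cluster index is decomposed as row_idx = transient_index / mNnClusterizerFullTimeSize and
// time_idx = transient_index % mNnClusterizerFullTimeSize. Illustration (hypothetical sizes): with row and
// time half-sizes of 3, mNnClusterizerFullTimeSize = 7 and transient_index = 23 gives row_idx = 3 (the
// central row) and time_idx = 2, i.e. t_local = -1. Threads whose transient_index lies beyond the
// (row, time) range write the optional index data instead.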
139template <>
140GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
141{
142 uint32_t glo_idx = get_global_id(0);
143 auto& clusterer = processors.tpcClusterer[sector];
144 auto& clustererNN = processors.tpcNNClusterer[sector];
145
146 if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) {
147 return;
148 }
149
150 uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
151 uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);
152
153 // Early exit for out-of-bounds threads
154 if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
155 return;
156 }
157 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
158 CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
159
160 // Each thread loads the peak position and central charge of the cluster it contributes to
162 CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
163 float central_charge = static_cast<float>(chargeMap[peak].unpack());
164 int32_t row = static_cast<int>(peak.row());
165 int32_t pad = static_cast<int>(peak.pad());
166 int32_t time = static_cast<int>(peak.time());
167
168 // Handle index data with fewer branches
169 if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
170 int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
171 uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
172
173 float index_values[3] = {
174 static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
175 static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
176 static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
177
178 if (dtype == 0) {
179 clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
180 } else {
181 clustererNN.mInputData_32[write_idx] = index_values[data_idx];
182 }
183
184 // Handle deconvolution flags only once per cluster (last thread in element)
185 if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) {
186 uint8_t cluster_flags = 0;
187 for (uint16_t i = 0; i < 8; i++) {
188 Delta2 d = cfconsts::InnerNeighbors[i];
189 CfChargePos tmp_pos = peak.delta(d);
190 cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
191 }
192 clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
193 clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
194 }
195 return;
196 }
197
198 // Main charge-window filling
199 if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
200 // Decompose the flat transient index into a (row, time) position within the input window
201 int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
202 int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow;
203 int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
204 int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime;
205 int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;
206
207 // Early boundary check for row
208 int32_t target_row = row + r_local;
209 int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));
210
211 // Calculate offsets
212 int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
213 int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
214 for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
215 if (is_row_boundary) {
216 // Use boundary fill value
217 float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
218 if (dtype == 0) {
219 clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val;
220 } else {
221 clustererNN.mInputData_32[write_idx] = boundary_val;
222 }
223 write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
224 continue;
225 }
226
227 // Calculate target pad and time
228 int32_t target_pad = pad + p_local;
229 int32_t target_time = time + t_local;
230
231 // Boundary check in pad (including the IROC/OROC gap) and time
232 int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
233
234 float output_value;
235 if (is_boundary) {
236 output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
237 } else {
238 // Read the charge at the target position
239 CfChargePos tmp_pos(target_row, target_pad, target_time);
240 output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge
241 }
242
243 // Write output with reduced branching
244 if (dtype == 0) {
245 clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
246 } else {
247 clustererNN.mInputData_32[write_idx] = output_value;
248 }
249 // if (write_idx >= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) {
250 // printf("Error: Write index out of bounds (central array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
251 // write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
252 // }
253 // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) {
254 // if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) {
255 // printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
256 // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
257 // }
258 // if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) {
259 // printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
260 // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
261 // }
262 // }
263
264 write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
265 }
266 }
267}
268
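// determineClass1Labels: binary classification. The single network output per cluster is compared against
// mNnClassThreshold; clusters above the threshold are marked as accepted (1), everything else as rejected (0).
// With classification disabled, every cluster is accepted.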
269template <>
270GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
271{
272 uint32_t glo_idx = get_global_id(0);
273 auto& clusterer = processors.tpcClusterer[sector];
274 auto& clustererNN = processors.tpcNNClusterer[sector];
275 if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
276 return;
277 }
278 if (clustererNN.mNnClusterizerUseClassification) {
279 if (dtype == 0) {
280 clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
281 } else if (dtype == 1) {
282 clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold);
283 }
284 } else {
285 clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
286 }
287}
288
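// determineClass2Labels: multi-class classification. The class with the largest network output wins (an
// explicit softmax is not required, since the argmax is invariant under it). A label > 1 is interpreted as an
// overlapping cluster and sets both split flags; label 0 leads to rejection in the publishing kernels.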
289template <>
290GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
291{
292 uint32_t glo_idx = get_global_id(0);
293 auto& clusterer = processors.tpcClusterer[sector];
294 auto& clustererNN = processors.tpcNNClusterer[sector];
295 if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
296 return;
297 }
298 if (clustererNN.mNnClusterizerUseClassification) {
299 uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
300 float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
301 uint32_t class_label = 0;
302 for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
303 float prob = (dtype == 0 ? clustererNN.mModelProbabilities_16[pIdx].ToFloat() : clustererNN.mModelProbabilities_32[pIdx]);
304 if (pIdx == elem_iterator || prob > current_max_prob) {
305 current_max_prob = prob;
306 class_label = pIdx - elem_iterator; // The class with the highest output "wins"
307 }
308 }
317 // uint32_t class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
318 clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
319 if (class_label > 1) {
320 clustererNN.mClusterFlags[2 * glo_idx] = 1;
321 clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
322 }
323 } else {
324 clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
325 }
326}
327
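// publishClass1Regression: builds one ClusterNative per peak from the regression output and rejects it based
// on the classification output. Layout of the reg-1 output nodes as consumed below (relative to
// model_output_index): +0 pad offset w.r.t. the peak, +1 time offset, +2 sigma(pad), +3 sigma(time),
// +4 total-charge scale factor applied to the central charge. MC labels are still accumulated with the
// conventional buildCluster so that label handling stays identical to the heuristic clusterizer.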
328template <>
329GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
330{
331 uint32_t glo_idx = get_global_id(0);
332 auto& clusterer = processors.tpcClusterer[sector];
333 auto& clustererNN = processors.tpcNNClusterer[sector];
334 if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
335 return;
336 }
337
338 uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
339 uint32_t full_glo_idx = glo_idx + batchStart;
340 int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
341
342 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
343 uint32_t peakIndex = CAMath::Min(full_glo_idx, maxClusterNum - 1);
344 CfChargePos peak = clusterer.mPfilteredPeakPositions[peakIndex];
345 float central_charge = static_cast<float>(chargeMap[peak].unpack());
346
347 CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
348 MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
349
350 if (full_glo_idx >= maxClusterNum) {
351 if (withMC) {
352 ClusterAccumulator dummy_pc;
353 CPU_ONLY(labelAcc->collect(peak, central_charge));
354 GPUTPCCFClusterizer::buildCluster(
355 clusterer.Param().rec,
356 chargeMap,
357 peak,
358 smem.posBcast,
359 smem.buf,
360 smem.innerAboveThreshold,
361 &dummy_pc,
362 labelAcc);
363 }
364 return;
365 }
366
367 tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
368
369 ClusterAccumulator pc;
370
371 if (withMC) {
372 ClusterAccumulator dummy_pc;
373 CPU_ONLY(labelAcc->collect(peak, central_charge));
374 GPUTPCCFClusterizer::buildCluster(
375 clusterer.Param().rec,
376 chargeMap,
377 peak,
378 smem.posBcast,
379 smem.buf,
380 smem.innerAboveThreshold,
381 &dummy_pc,
382 labelAcc);
383 }
384 if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
385 if (clusterer.mPclusterPosInRow) {
386 clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
387 }
388 return;
389 }
390
391 bool notSinglePad = false, notSingleTime = false;
392 for (uint16_t i = 0; i < 8; i++) {
393 Delta2 d = cfconsts::InnerNeighbors[i];
394 CfChargePos tmp_pos = peak.delta(d);
395 float v = static_cast<float>(chargeMap[tmp_pos].unpack());
396 notSinglePad |= (d.x != 0) && (v > 0.f);
397 notSingleTime |= (d.y != 0) && (v > 0.f);
398 }
399
400 float publishPadPosition = 0.f, publishTimePosition = 0.f;
401 if (dtype == 0) {
402 publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat();
403 publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat();
404 isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
405 pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
406 publishPadPosition,
407 notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
408 (clusterer.mPmemory->fragment).start + publishTimePosition,
409 notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
410 clustererNN.mClusterFlags[2 * glo_idx],
411 clustererNN.mClusterFlags[2 * glo_idx + 1]);
412 } else {
413 publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index];
414 publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1];
415 isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
416 pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
417 publishPadPosition,
418 notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
419 (clusterer.mPmemory->fragment).start + publishTimePosition,
420 notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
421 clustererNN.mClusterFlags[2 * glo_idx],
422 clustererNN.mClusterFlags[2 * glo_idx + 1]);
423 }
424
425 // if (boundaryFlag != 0) { // Prints the entire NN input for the given index
426 // // Build a simple buffer manually (float with 3 decimals)
427 // const int MAX_CHARS = 4096;
428 // char buffer[MAX_CHARS];
429 // int pos = 0;
430 //
431 // auto appendChar = [&](char c) {
432 // if (pos < MAX_CHARS - 1) buffer[pos++] = c;
433 // };
434 // auto appendStr = [&](const char* s) {
435 // while (*s && pos < MAX_CHARS - 1) buffer[pos++] = *s++;
436 // };
437 // auto appendUInt = [&](uint32_t v) {
438 // char tmp[16]; int tp = 0;
439 // if (v == 0) { appendChar('0'); return; }
440 // while (v && tp < 16) { tmp[tp++] = char('0' + (v % 10)); v /= 10; }
441 // while (tp--) appendChar(tmp[tp]);
442 // };
443 // auto appendInt = [&](int v) {
444 // if (v < 0) { appendChar('-'); v = -v; }
445 // appendUInt((uint32_t)v);
446 // };
447 // auto appendFloat = [&](float f) {
448 // if (f < 0) { appendChar('-'); f = -f; }
449 // int ip = (int)f;
450 // float frac = f - (float)ip;
451 // appendInt(ip);
452 // appendChar('.');
453 // for (int i = 0; i < 3; i++) {
454 // frac *= 10.f;
455 // int d = (int)frac;
456 // appendChar((char)('0' + (d < 0 ? 0 : (d > 9 ? 9 : d))));
457 // frac -= d;
458 // if (frac < 0) frac = 0;
459 // }
460 // };
461 //
462 // appendStr("(NN CLUS) DEBUG: Boundary cluster detected (sector ");
463 // appendUInt(sector);
464 // appendStr(", row ");
465 // appendUInt(peak.row());
466 // appendStr(", pad ");
467 // appendFloat(publishPadPosition);
468 // appendStr(", time ");
469 // appendFloat(publishTimePosition);
470 // appendStr(") [glo_idx=");
471 // appendUInt(glo_idx);
472 // appendStr(" elemSize=");
473 // appendInt(clustererNN.mNnClusterizerElementSize);
474 // appendStr(" dtype=");
475 // appendInt(dtype);
476 // appendStr("] INPUT:");
477 //
478 // int elemSize = clustererNN.mNnClusterizerElementSize;
479 // int baseIdx = glo_idx * elemSize;
480 //
481 // int maxPrint = elemSize;
482 // for (int i = 0; i < maxPrint; ++i) {
483 // appendChar(' ');
484 // float v = (dtype == 0) ? clustererNN.mInputData_16[baseIdx + i].ToFloat()
485 // : clustererNN.mInputData_32[baseIdx + i];
486 // appendFloat(v);
487 // if (pos > (MAX_CHARS - 32)) { appendStr(" ..."); break; }
488 // }
489 //
490 // buffer[pos] = 0;
491 // printf("%s\n", buffer);
492 // }
493
494 tpc::ClusterNative myCluster;
495 bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
496 if (clustererNN.mNnClusterizerUseClassification) {
497 rejectCluster |= (clustererNN.mOutputDataClass[peakIndex] <= 0);
498 }
499 if (rejectCluster) {
500 if (clusterer.mPclusterPosInRow) {
501 clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
502 }
503 return;
504 }
505
506 uint32_t rowIndex = 0;
507 if (clusterOut != nullptr) {
508 rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
509 clusterer,
510 myCluster,
511 peak.row(),
512 clusterer.mNMaxClusterPerRow,
513 clusterer.mPclusterInRow,
514 clusterOut);
515 if (clusterer.mPclusterPosInRow != nullptr) {
516 clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
517 }
518 } else if (clusterer.mPclusterPosInRow) {
519 rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
520 }
521 CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
522}
523
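// publishClass2Regression: publishes up to two clusters per peak. The reg-2 output nodes are assumed to be
// grouped per quantity, i.e. (relative to model_output_index): +0/+1 pad offsets, +2/+3 time offsets,
// +4/+5 sigma(pad), +6/+7 sigma(time), +8/+9 charge scale for cluster 1 / cluster 2 respectively.
// Note that only the first cluster currently commits MC labels.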
524template <>
525GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
526{
527 uint32_t glo_idx = get_global_id(0);
528 auto& clusterer = processors.tpcClusterer[sector];
529 auto& clustererNN = processors.tpcNNClusterer[sector];
530 if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
531 return;
532 }
533
534 uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
535 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
536 CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
537 float central_charge = static_cast<float>(chargeMap[peak].unpack());
538
539 CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
540 MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
541 tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
542 uint32_t full_glo_idx = glo_idx + batchStart;
543
544 if (full_glo_idx >= maxClusterNum) {
545 if (withMC) {
546 ClusterAccumulator dummy_pc;
547 CPU_ONLY(labelAcc->collect(peak, central_charge));
548 GPUTPCCFClusterizer::buildCluster(
549 clusterer.Param().rec,
550 chargeMap,
551 peak,
552 smem.posBcast,
553 smem.buf,
554 smem.innerAboveThreshold,
555 &dummy_pc,
556 labelAcc);
557 }
558 return;
559 }
560
561 uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
562
563 ClusterAccumulator pc;
564
565 if (withMC) {
566 ClusterAccumulator dummy_pc;
567 CPU_ONLY(labelAcc->collect(peak, central_charge));
568 GPUTPCCFClusterizer::buildCluster(
569 clusterer.Param().rec,
570 chargeMap,
571 peak,
572 smem.posBcast,
573 smem.buf,
574 smem.innerAboveThreshold,
575 &dummy_pc,
576 labelAcc);
577 }
578 if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
579 if (clusterer.mPclusterPosInRow) {
580 clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
581 }
582 return;
583 }
584
585 // Cluster 1
586 float publishPadPosition = 0.f, publishTimePosition = 0.f;
587 if (dtype == 0) {
588 publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat();
589 publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(); // +2: time offset of cluster 1 (outputs grouped per quantity)
590 isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
591 pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
592 publishPadPosition,
593 clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
594 (clusterer.mPmemory->fragment).start + publishTimePosition,
595 clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
596 clustererNN.mClusterFlags[2 * glo_idx],
597 clustererNN.mClusterFlags[2 * glo_idx + 1]);
598 } else if (dtype == 1) {
599 publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index];
600 publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2]; // +2: time offset of cluster 1 (outputs grouped per quantity)
601 isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
602 pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
603 publishPadPosition,
604 clustererNN.mOutputDataReg2_32[model_output_index + 4],
605 (clusterer.mPmemory->fragment).start + publishTimePosition,
606 clustererNN.mOutputDataReg2_32[model_output_index + 6],
607 clustererNN.mClusterFlags[2 * glo_idx],
608 clustererNN.mClusterFlags[2 * glo_idx + 1]);
609 }
610
611 tpc::ClusterNative myCluster;
612 bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
613 if (clustererNN.mNnClusterizerUseClassification) {
614 rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
615 }
616 if (rejectCluster) {
617 if (clusterer.mPclusterPosInRow) {
618 clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
619 }
620 return;
621 }
622
623 uint32_t rowIndex = 0;
624 if (clusterOut != nullptr) {
625 rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
626 clusterer,
627 myCluster,
628 peak.row(),
629 clusterer.mNMaxClusterPerRow,
630 clusterer.mPclusterInRow,
631 clusterOut);
632 if (clusterer.mPclusterPosInRow != nullptr) {
633 clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
634 }
635 } else if (clusterer.mPclusterPosInRow) {
636 rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
637 }
638 CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
639
640 // Cluster 2
641 if (dtype == 0) {
642 publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
643 publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat();
644 isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
645 pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
646 publishPadPosition,
647 clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
648 (clusterer.mPmemory->fragment).start + publishTimePosition,
649 clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
650 clustererNN.mClusterFlags[2 * glo_idx],
651 clustererNN.mClusterFlags[2 * glo_idx + 1]);
652 } else if (dtype == 1) {
653 publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
654 publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3];
655 isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
656 pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
657 publishPadPosition,
658 clustererNN.mOutputDataReg2_32[model_output_index + 5],
659 (clusterer.mPmemory->fragment).start + publishTimePosition,
660 clustererNN.mOutputDataReg2_32[model_output_index + 7],
661 clustererNN.mClusterFlags[2 * glo_idx],
662 clustererNN.mClusterFlags[2 * glo_idx + 1]);
663 }
664
665 rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
666 if (clustererNN.mNnClusterizerUseClassification) {
667 rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
668 }
669 if (rejectCluster) {
670 if (clusterer.mPclusterPosInRow) {
671 clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
672 }
673 return;
674 }
675
676 if (clusterOut != nullptr) {
677 rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
678 clusterer,
679 myCluster,
680 peak.row(),
681 clusterer.mNMaxClusterPerRow,
682 clusterer.mPclusterInRow,
683 clusterOut);
684 if (clusterer.mPclusterPosInRow != nullptr) {
685 clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
686 }
687 } else if (clusterer.mPclusterPosInRow) {
688 rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
689 }
690 // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
691}
692
693// ---------------------------------
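// publishDeconvolutionFlags: counts split charges among the 8 inner and 16 outer neighbours of each peak,
// mirroring the heuristic deconvolution step. mClusterFlags[2 * idx] accumulates splits displaced in time
// (d.y != 0), mClusterFlags[2 * idx + 1] those displaced in pad (d.x != 0).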
694template <>
695GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
696{
697 // Sets the split flags in the same way as the heuristic clusterizer's deconvolution kernel
698 uint32_t glo_idx = get_global_id(0);
699 auto& clusterer = processors.tpcClusterer[sector];
700 auto& clustererNN = processors.tpcNNClusterer[sector];
701 if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
702 return;
703 }
704 CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
705 CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
706
707 clustererNN.mClusterFlags[2 * glo_idx] = 0;
708 clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
709 for (int i = 0; i < 8; i++) {
710 Delta2 d = cfconsts::InnerNeighbors[i];
711 CfChargePos tmp_pos = peak.delta(d);
712 PackedCharge charge = chargeMap[tmp_pos];
713 clustererNN.mClusterFlags[2 * glo_idx] += (d.y != 0 && charge.isSplit());
714 clustererNN.mClusterFlags[2 * glo_idx + 1] += (d.x != 0 && charge.isSplit());
715 }
716 for (int i = 0; i < 16; i++) {
717 Delta2 d = cfconsts::OuterNeighbors[i];
718 CfChargePos tmp_pos = peak.delta(d);
719 PackedCharge charge = chargeMap[tmp_pos];
720 clustererNN.mClusterFlags[2 * glo_idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
721 clustererNN.mClusterFlags[2 * glo_idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
722 }
723}
724
725// The following arithmetic is needed because the network is trained with an explicit gap at the IROC/OROC boundary
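// Example (illustrative pad counts, not taken from the actual pad-plane geometry): if NPads(row_ref) = 66 and
// NPads(row_current) = 76, the window is shifted by (76 - 66) / 2 = 5 pads so that it stays centred on the
// same local pad coordinate of the wider row.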
726GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
727{
728 if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) {
729 return 0; // Short-circuit for out-of-bound rows
730 } else {
731 return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
732 }
733}
734
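// Rows 0-62 belong to the IROC, rows >= 63 to the OROC. For OROC rows the window row index is shifted by the
// row half-size so that the network input contains an explicit gap between the two readout chambers
// (see isBoundary below).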
735GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset)
736{
737 return (row > 62 ? offset : 0);
738}
739
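// With the row offset applied, the valid row range splits into three regions. Example with offset = 3
// (i.e. mNnClusterizerSizeInputRow = 3): rows 0-62 are IROC rows, rows 63-65 form the artificial IROC/OROC
// gap and always count as boundary, and rows 66-154 map back to the physical OROC rows 63-151 via (row - offset).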
740GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset)
741{
742 if (pad < 0 || row < 0) { // Faster short-circuit
743 return true;
744 } else if (row < 63) {
745 return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
746 } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
747 return true;
748 } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) {
749 return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - offset)));
750 } else {
751 return true;
752 }
753}
754
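// Clamps the network-regressed pad and time positions to the valid ranges [0, NPads(row)) and
// [0, TPC_MAX_FRAGMENT_LEN_GPU) and reports whether clamping was necessary, so that out-of-range regression
// output cannot produce clusters outside the sector geometry.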
755GPUd() bool GPUTPCNNClusterizerKernels::isBoundaryPublish(int32_t idx, int32_t row, float& pad, float& time)
756{
757 if (pad < 0) {
758 // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
759 pad = 0.f;
760 return true;
761 } else if (pad >= static_cast<int>(GPUTPCGeometry::NPads(row))) {
762 // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad >= static_cast<int>(GPUTPCGeometry::NPads(row): row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
763 pad = static_cast<float>(GPUTPCGeometry::NPads(row) - 1);
764 return true;
765 } else if (time < 0) {
766 // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
767 time = 0.f;
768 return true;
769 } else if (time >= TPC_MAX_FRAGMENT_LEN_GPU) {
770 // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time >= TPC_MAX_FRAGMENT_LEN_GPU: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
771 time = static_cast<float>(TPC_MAX_FRAGMENT_LEN_GPU - 1);
772 return true;
773 } else {
774 return false;
775 }
776}