// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCNNClusterizerKernels.cxx

#include "GPUTPCNNClusterizerKernels.h"
#include "GPUTPCCFClusterizer.h"
#include "GPUTPCGeometry.h"

using namespace o2::gpu;
using namespace o2::gpu::tpccf;

#include "CfConsts.h"
#include "CfUtils.h"
#include "ClusterAccumulator.h"

#if !defined(GPUCA_GPUCODE)
#include "GPUHostDataTypes.h"
#include "MCLabelAccumulator.h"
#endif

#ifdef GPUCA_GPUCODE
#include "GPUTPCCFClusterizer.inc"
#endif

// Individual thread functions for filling the network input data, determining the class label, and running the default CF clusterizer
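//
// Processing order, as implied by the kernels below: fillInputNN{CPU,GPU} builds the charge
// windows around each peak, the classification output is thresholded or arg-maxed in
// determineClass{1,2}Labels, and publishClass{1,2}Regression converts the regression output
// into tpc::ClusterNative objects, reusing the publishing logic of GPUTPCCFClusterizer.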
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (clustererNN.mOutputDataClass[glo_idx] == 0) { // the default clusterizer must not run in batched mode, since the thread indices would no longer map to the correct clusters
    return;
  }
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
  GPUTPCCFClusterizer::GPUSharedMemory smem_new;
  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
}

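// The network input for one cluster candidate appears to be a flattened
// (2 * mNnClusterizerSizeInputRow + 1) x (2 * mNnClusterizerSizeInputPad + 1) x (2 * mNnClusterizerSizeInputTime + 1)
// charge window around the peak, normalized to the central charge, optionally followed by three
// index values (sector, row, pad) when mNnClusterizerAddIndexData is set.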
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint32_t glo_idx = get_global_id(0);
  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
    return;
  }

  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  int32_t row = static_cast<int>(peak.row());
  int32_t pad = static_cast<int>(peak.pad());
  int32_t time = static_cast<int>(peak.time());
  float central_charge = static_cast<float>(chargeMap[peak].unpack());
  int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);

  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
    int32_t target_row = row + r;
    bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);

    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
      int32_t target_pad = pad + p;
      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);

      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
        int32_t target_time = time + t;

        if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) {
          // Fill boundary value
          float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
          if (dtype == 0) {
            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
          } else {
            clustererNN.mInputData_32[write_idx] = boundary_value;
          }
        } else {
          CfChargePos tmp_pos(target_row, target_pad, target_time);
          float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;

          if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) {
            clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
            clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
          }

          if (dtype == 0) {
            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
          } else {
            clustererNN.mInputData_32[write_idx] = normalized_charge;
          }
        }
        // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
        //   printf("Warning: Input data mismatch at index %d, %d - row, pad, time: %d, %d, %d : %f -> %f\n", glo_idx, glo_idx + batchStart, r, p, t,
        //          static_cast<float>(clustererNN.mInputData_16_Test[write_idx]), static_cast<float>(clustererNN.mInputData_16[write_idx]));
        // }
        write_idx++;
      }
    }
  }

  if (clustererNN.mNnClusterizerAddIndexData) {
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
    } else {
      clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
      clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
    }
  }

  if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
    clustererNN.mClusterFlags[2 * glo_idx] = 0;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;

    for (uint16_t i = 0; i < 8; ++i) {
      Delta2 d = cfconsts::InnerNeighbors[i];
      CfChargePos tmp_pos = peak.delta(d);
      clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
    }
    clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
  }
}

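// GPU variant of the input filling: instead of one thread per cluster looping over the full
// window (as in fillInputNNCPU above), each thread here appears to handle one (row, time)
// position of a cluster's charge window and loops over the pad coordinate, plus a few extra
// threads per cluster for the optional index data, so adjacent threads write adjacent entries
// of the input tensor.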
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);

  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  // Split the global thread index into the cluster index and the index within one input element
  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);

  // Early exit for out-of-bounds threads
  if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
    return;
  }
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);

  // Load the peak position and central charge of the cluster this thread belongs to
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());
  int32_t row = static_cast<int>(peak.row());
  int32_t pad = static_cast<int>(peak.pad());
  int32_t time = static_cast<int>(peak.time());

  // Handle index data with fewer branches
  if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
    int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
    uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;

    float index_values[3] = {
      static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
      static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
      static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};

    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
    } else {
      clustererNN.mInputData_32[write_idx] = index_values[data_idx];
    }

    // Handle deconvolution flags only once per cluster (last thread in element)
    if (data_idx == 2 && !clustererNN.mNnClusterizerSetDeconvolutionFlags) {
      uint8_t cluster_flags = 0;
      for (uint16_t i = 0; i < 8; i++) {
        Delta2 d = cfconsts::InnerNeighbors[i];
        CfChargePos tmp_pos = peak.delta(d);
        cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
      }
      clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
      clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
    }
    return;
  }

  // Main data processing: fill the charge window
  if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
    // Decompose the transient index into the row and time position within the charge window
    int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
    int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow;
    int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
    int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime;
    int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;

    // Early boundary check for row
    int32_t target_row = row + r_local;
    int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));

    // Calculate offsets
    int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
    int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
    for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
      if (is_row_boundary) {
        // Use boundary fill value
        float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
        if (dtype == 0) {
          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val;
        } else {
          clustererNN.mInputData_32[write_idx] = boundary_val;
        }
        write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
        continue;
      }

      // Calculate target pad and time
      int32_t target_pad = pad + p_local;
      int32_t target_time = time + t_local;

      // Combined boundary check for pad and time
      int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);

      float output_value;
      if (is_boundary) {
        output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
      } else {
        // Coalesced memory access - create position and read charge
        CfChargePos tmp_pos(target_row, target_pad, target_time);
        output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge
      }

      // Write output with reduced branching
      if (dtype == 0) {
        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
      } else {
        clustererNN.mInputData_32[write_idx] = output_value;
      }
      write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
    }
  }
}

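// Classification with a single output node: the network output (presumably a probability from a
// sigmoid layer) is thresholded against mNnClassThreshold to accept (1) or reject (0) the
// cluster candidate.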
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  if (dtype == 0) {
    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
  } else if (dtype == 1) {
    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].mModelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].mNnClassThreshold);
  }
}

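// Classification with several output nodes: the outputs are interpreted as per-class scores and
// the highest scoring class wins (argmax). Class labels > 1 mark candidates treated as
// overlapping/split clusters, so both deconvolution flags are set.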
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  auto& clustererNN = processors.tpcNNClusterer[sector];
  uint32_t glo_idx = get_global_id(0);
  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
  float current_max_prob = 0.f; // If the neural network doesn't contain a softmax as the last layer, the outputs can range in (-inf, inf)
  uint32_t class_label = 0;
  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
    float prob = 0.f;
    if (dtype == 0) {
      prob = clustererNN.mModelProbabilities_16[pIdx].ToFloat();
    } else if (dtype == 1) {
      prob = clustererNN.mModelProbabilities_32[pIdx];
    }
    if (pIdx == elem_iterator || prob > current_max_prob) {
      current_max_prob = prob;
      class_label = pIdx - elem_iterator; // The outputs of the class network are the probabilities for each class. The highest one "wins"
    }
  }
  clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
  if (class_label > 1) {
    clustererNN.mClusterFlags[2 * glo_idx] = 1;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
  }
}

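// Publishing for the single-cluster regression head. Judging by the indices used below, the
// regression output per cluster appears to be ordered as
// [pad offset, time offset, sigma_pad, sigma_time, qTot / central charge]; the kernel converts
// it into a tpc::ClusterNative with the same publishing logic as the default clusterizer.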
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
  uint32_t full_glo_idx = glo_idx + batchStart;
  int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);

  if (full_glo_idx >= maxClusterNum) {
    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    return;
  }

  tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;

  // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();

  if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerUseClassification <= 0)) {

    ClusterAccumulator pc;

    // Publishing logic is taken from the default clusterizer
    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    bool notSinglePad = false, notSingleTime = false;
    for (uint16_t i = 0; i < 8; i++) {
      Delta2 d = cfconsts::InnerNeighbors[i];
      CfChargePos tmp_pos = peak.delta(d);
      notSinglePad |= (d.x != 0) && (static_cast<float>(chargeMap[tmp_pos].unpack()) > 0);
      notSingleTime |= (d.y != 0) && (static_cast<float>(chargeMap[tmp_pos].unpack()) > 0);
    }

    if (dtype == 0) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
                 notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
                 notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    } else if (dtype == 1) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
                 notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
                 notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    }

    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    uint32_t rowIndex = 0;
    if (clusterOut != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        peak.row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }
}

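// Publishing for the two-cluster regression head (overlapping clusters). Judging by the indices
// used below, the regression output appears to hold the two clusters interleaved: pad offsets at
// [0,1], time offsets at [2,3], sigma_pad at [4,5], sigma_time at [6,7] and charge ratios at
// [8,9]; both clusters are published with the same logic as above.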
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
  uint32_t full_glo_idx = glo_idx + batchStart;

  if (full_glo_idx >= maxClusterNum) {
    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    return;
  }

  uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;

  if ((clustererNN.mOutputDataClass[full_glo_idx] > 0) || (clustererNN.mNnClusterizerUseClassification <= 0)) {

    ClusterAccumulator pc;

    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    // Cluster 1
    if (dtype == 0) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    } else if (dtype == 1) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
                 clustererNN.mOutputDataReg2_32[model_output_index + 4],
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
                 clustererNN.mOutputDataReg2_32[model_output_index + 6],
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    }

    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    uint32_t rowIndex = 0;
    if (clusterOut != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        peak.row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));

    // Cluster 2
    if (dtype == 0) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    } else if (dtype == 1) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
                 clustererNN.mOutputDataReg2_32[model_output_index + 5],
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
                 clustererNN.mOutputDataReg2_32[model_output_index + 7],
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    }

    rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    if (clusterOut != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        peak.row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }
}

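// The deconvolution flags counted below mirror what the heuristic clusterizer stores for its
// split-in-time / split-in-pad flags: neighbors carrying charge marked as split (and, for the
// outer ring, without an own 3x3 peak) increment the respective flag of the cluster.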
// ---------------------------------
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  // Implements the same deconvolution-flag logic as the heuristic clusterizer and its deconvolution kernel
  uint32_t idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];

  clustererNN.mClusterFlags[2 * idx] = 0;
  clustererNN.mClusterFlags[2 * idx + 1] = 0;
  for (int i = 0; i < 8; i++) {
    Delta2 d = cfconsts::InnerNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    PackedCharge charge = chargeMap[tmp_pos];
    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit());
    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit());
  }
  for (int i = 0; i < 16; i++) {
    Delta2 d = cfconsts::OuterNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    PackedCharge charge = chargeMap[tmp_pos];
    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
  }
}

// The following arithmetic is needed because the network is trained with a split at the IROC / OROC boundary
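// padOffset centers the pad window of a neighboring row on the window of the reference row:
// rows have different pad counts, so the window is shifted by half the difference in NPads.
// For example, if the target row has 4 more pads than the reference row, the window's pad
// indices are shifted by +2.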
GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
{
  if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) {
    return 0; // Short-circuit for out-of-range rows
  } else {
    return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
  }
}

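// rowOffset shifts OROC rows (row > 62) by the row-window half-size, so that the gap between
// IROC and OROC shows up as `offset` rows of boundary-fill values in the network input
// (see isBoundary below).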
GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset)
{
  return (row > 62 ? offset : 0);
}

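// isBoundary works in the shifted row coordinates produced by rowOffset: rows [0, 62] are the
// IROC, rows [63, 63 + offset) represent the artificial IROC/OROC gap, and rows above that are
// the OROC (shifted back by `offset` before looking up the pad count).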
GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset)
{
  if (pad < 0 || row < 0) { // Faster short-circuit
    return true;
  } else if (row < 63) {
    return ((pad < 0) || (pad >= static_cast<int>(GPUTPCGeometry::NPads(row))));
  } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
    return true;
  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) {
    return ((pad < 0) || (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - offset))));
  } else {
    return true;
  }
}