// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCNNClusterizerKernels.cxx

#include "GPUTPCNNClusterizerKernels.h"
#include "GPUTPCCFClusterizer.h"
#include "GPUTPCGeometry.h"

using namespace o2::gpu;
using namespace o2::gpu::tpccf;

#include "CfConsts.h"
#include "CfUtils.h"
#include "ClusterAccumulator.h"

#if !defined(GPUCA_GPUCODE)
#include "GPUHostDataTypes.h"
#include "MCLabelAccumulator.h"
#endif

#ifdef GPUCA_GPUCODE
#include "GPUTPCCFClusterizer.inc"
#endif

// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
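//
// Rough processing sequence as implied by the kernels below (the network inference itself and the
// kernel orchestration are assumed to live in the calling code and are not part of this file):
//   1. fillInputNN: build the normalized charge window around each peak as network input
//   2. determineClass1Labels / determineClass2Labels: turn the network scores (modelProbabilities)
//      into a class label per peak (outputDataClass)
//   3. publishClass1Regression / publishClass2Regression: convert the regression outputs
//      (outputDataReg1 / outputDataReg2) into ClusterNative objects
//   4. runCfClusterizer: run the default GPUTPCCFClusterizer cluster building for peaks that were
//      not rejected by the network (outputDataClass != 0)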
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  uint glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (clustererNN.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
    return;
  }
  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
  GPUTPCCFClusterizer::GPUSharedMemory& smem_new = reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem); // reuse this kernel's shared memory for the default CF clusterizer
  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart);
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  uint glo_idx = get_global_id(0);
  processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold);
}
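// Illustrative example (threshold value assumed, not read from the configuration): with
// nnClassThreshold = 0.5, a model score of 0.8 stores class 1 (peak kept), while 0.2 stores class 0 (peak rejected).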

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  auto& clusterer = processors.tpcNNClusterer[sector];
  uint glo_idx = get_global_id(0);
  uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes;
  float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
  uint class_label = 0;
  for (uint pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) {
    if (pIdx == elem_iterator || clusterer.modelProbabilities[pIdx] > current_max_prob) {
      current_max_prob = clusterer.modelProbabilities[pIdx];
      class_label = pIdx - elem_iterator; // store the class index, not the flat index into modelProbabilities
    }
  }
  // The outputs of the class network are the per-class scores; the class with the highest score "wins" (argmax)
  clusterer.outputDataClass[glo_idx + batchStart] = class_label;
}
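// Illustrative example (scores assumed): with 3 output nodes and raw scores {0.1, 2.3, -0.5} for one
// peak, the stored label is 1, i.e. the index of the highest-scoring class.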

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  uint glo_idx = get_global_id(0);
  if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) {
    return;
  }
  GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart);
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  uint glo_idx = get_global_id(0);
  if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) {
    return;
  }
  GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart);
}

// The following arithmetic is needed because the network is trained with a split at the IROC/OROC boundary
GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current)
{
  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
}
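// Illustrative example (pad counts assumed, not the real geometry): if the reference row has 66 pads
// and the current row has 76 pads, the offset is 5, which keeps the pad window centered on the peak
// when stepping across rows of different widths.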

GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift)
{
  return (row > 62 ? global_shift : 0);
}
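// Rows 0-62 (IROC) keep their index; rows 63 and above (OROC) are shifted up by global_shift, so the
// network input contains an artificial gap of global_shift rows between the two readout chambers.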

GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
{
  if (pad < 0 || row < 0) { // Faster short-circuit
    return true;
  } else if (row < 63) {
    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
    return true;
  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
  } else {
    return true;
  }
}
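// Illustrative example (global_shift value assumed): with global_shift = 2, shifted rows 63 and 64 are
// always boundary (the artificial IROC/OROC gap), shifted row 65 maps back to physical row 63, and
// everything at or beyond MAXGLOBALPADROW + 2 is boundary again.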

// Fill the neural-network input with the charge window around each peak; positions outside the sensitive pad area receive the boundary fill value
GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart)
{
  uint glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  Array2D<uint8_t> isPeakMap(clusterer.mPpeakMap);

  uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId

  ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
  int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time()); // Explicit casting to avoid conversion errors
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  clustererNN.peakPositions[glo_idx] = peak;
  clustererNN.centralCharges[glo_idx] = central_charge;
  clustererNN.outputDataClass[glo_idx + batchStart] = -1;

  int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow);
#ifndef GPUCA_GPUCODE
  GPUCA_UNROLL(U(), U());
#endif
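  // The loop nest below fills a (2*SizeInputRow+1) x (2*SizeInputPad+1) x (2*SizeInputTime+1) charge
  // window around the peak into inputData16 / inputData32, normalized to the central charge; positions
  // outside the sensitive area are filled with nnClusterizerBoundaryFillValue instead.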
  for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) {
    bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
    int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
    for (int p = -clustererNN.nnClusterizerSizeInputPad + pad_offset; p <= clustererNN.nnClusterizerSizeInputPad + pad_offset; p++) {
      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow);
      for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) {
        if (!is_boundary) {
          ChargePos tmp_pos(row + r, pad + p, time + t);
          if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
            clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]);
            clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx];
          }
          if (dtype == 0) {
            clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
          } else {
            clustererNN.inputData32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
          }
        } else {
          // Fill boundary positions explicitly so that no input values are left uninitialized
          if (dtype == 0) {
            clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue));
          } else {
            clustererNN.inputData32[write_idx] = static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue);
          }
        }
        write_idx++;
      }
    }
  }
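  // Optionally append three normalized index features after the charge window: sector / 36 (number of
  // TPC sectors), row / 152 (number of pad rows) and pad / NPads(row).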
  if (clustererNN.nnClusterizerAddIndexData) {
    if (dtype == 0) {
      clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f);
      clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
      clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
    } else {
      clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f;
      clustererNN.inputData32[write_idx + 1] = row / 152.f;
      clustererNN.inputData32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
    }
  }
}

GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
  uint full_glo_idx = glo_idx + batchStart;
  int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes;

  // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();

  if (clustererNN.outputDataClass[full_glo_idx] == 1) {

    ClusterAccumulator pc;
    // Publishing logic is taken from default clusterizer
    if (onlyMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack()));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        clustererNN.peakPositions[glo_idx],
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }

    if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

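    // Layout of outputDataReg1 per peak, as implied by the indexing below (5 regression outputs, assumed
    // ordering): [0] pad correction, [1] time correction, [2] pad sigma, [3] time sigma,
    // [4] total-charge scale relative to the central charge.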
    pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4],
               static_cast<float>(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index],
               clustererNN.outputDataReg1[model_output_index + 2],
               (clusterer.mPmemory->fragment).start + static_cast<float>(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1],
               clustererNN.outputDataReg1[model_output_index + 3],
               clustererNN.clusterFlags[2 * glo_idx],
               clustererNN.clusterFlags[2 * glo_idx + 1]);

    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    uint rowIndex = 0;
    if (clusterer.mPclusterByRow != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        clustererNN.peakPositions[glo_idx].row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow));
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }
}

GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
{
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
  uint full_glo_idx = glo_idx + batchStart;
  int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes;

  // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size();

  if (clustererNN.outputDataClass[full_glo_idx] > 0) {

    ClusterAccumulator pc;
    if (onlyMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack()));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        clustererNN.peakPositions[glo_idx],
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }

    if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

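    // Layout of outputDataReg2 per peak, as implied by the indexing below (10 regression outputs for up
    // to two clusters, assumed ordering): [0]/[1] pad corrections, [2]/[3] time corrections, [4]/[5] pad
    // sigmas, [6]/[7] time sigmas, [8]/[9] total-charge scales relative to the central charge.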
    // Cluster 1
    pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8],
               static_cast<float>(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index],
               clustererNN.outputDataReg2[model_output_index + 4],
               (clusterer.mPmemory->fragment).start + static_cast<float>(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2],
               clustererNN.outputDataReg2[model_output_index + 6],
               clustererNN.clusterFlags[2 * glo_idx],
               clustererNN.clusterFlags[2 * glo_idx + 1]);

    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    uint rowIndex = 0;
    if (clusterer.mPclusterByRow != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        clustererNN.peakPositions[glo_idx].row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow));

    // Cluster 2
    pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9],
               static_cast<float>(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1],
               clustererNN.outputDataReg2[model_output_index + 5],
               (clusterer.mPmemory->fragment).start + static_cast<float>(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3],
               clustererNN.outputDataReg2[model_output_index + 7],
               clustererNN.clusterFlags[2 * glo_idx],
               clustererNN.clusterFlags[2 * glo_idx + 1]);

    rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    if (clusterer.mPclusterByRow != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        clustererNN.peakPositions[glo_idx].row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    // CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }
}