GPUTPCNNClusterizerKernels.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

#include "GPUTPCNNClusterizerKernels.h"
#include "GPUTPCCFClusterizer.h"
#include "GPUTPCGeometry.h"

using namespace o2::gpu;
using namespace o2::gpu::tpccf;

#include "CfConsts.h"
#include "CfUtils.h"
#include "ClusterAccumulator.h"

#if !defined(GPUCA_GPUCODE)
#include "GPUHostDataTypes.h"
#include "MCLabelAccumulator.h"
#endif

#ifdef GPUCA_GPUCODE
#include "GPUTPCCFClusterizer.inc"
#endif

// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  if (clustererNN.mOutputDataClass[glo_idx] == 0) { // The default clusterizer should not be called in batched mode, since the thread indices would not match
    return;
  }
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
  auto& smem_new = reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem); // View the shared memory with the CF clusterizer's layout
  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: either choose mNnClusterizerBatchedMode as a power of 2 or calculate the index from threadId and blockId

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  int32_t row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time()); // Explicit casting to avoid conversion errors
  float central_charge = static_cast<float>(chargeMap[peak].unpack());
  int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);

  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
    bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
        int32_t time_pos = time + t;
        if (!is_boundary && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) {
          CfChargePos tmp_pos(row + r, pad + p, time + t);
          if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // Ordering is chosen for short-circuit optimization
            clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
            clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
          }
          if (dtype == 0) {
            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
          } else if (dtype == 1) {
            clustererNN.mInputData_32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
          }
        } else {
          // Fill the boundary value so that no input slots are left uninitialized
          if (dtype == 0) {
            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
          } else {
            clustererNN.mInputData_32[write_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
          }
        }
        write_idx++;
      }
    }
  }
  if (clustererNN.mNnClusterizerAddIndexData) {
    if (dtype == 0) {
      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f);
      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
    } else {
      clustererNN.mInputData_32[write_idx] = sector / 36.f;
      clustererNN.mInputData_32[write_idx + 1] = row / 152.f;
      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
    }
  }
  if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
    clustererNN.mClusterFlags[2 * glo_idx] = 0;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
    for (uint16_t i = 0; i < 8; i++) {
      Delta2 d = cfconsts::InnerNeighbors[i];
      CfChargePos tmp_pos = peak.delta(d);
      clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
    }
    clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
  }
}

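// Editor's note: the following standalone sketch (not part of the kernel) illustrates how
// mNnClusterizerElementSize relates to the loop structure above. The half-window sizes R, P, T
// and the use of mNnClusterizerAddIndexData are assumed example values; the real values come
// from the NN clusterizer configuration.
#if 0 // illustrative only, never compiled
#include <cstdint>

int main()
{
  // Assumed half-window sizes: mNnClusterizerSizeInputRow/Pad/Time = 3
  constexpr int32_t R = 3, P = 3, T = 3;
  // One charge value is written per (r, p, t) position visited by the three nested loops above
  constexpr int32_t chargeSlots = (2 * R + 1) * (2 * P + 1) * (2 * T + 1); // 7 * 7 * 7 = 343
  // With mNnClusterizerAddIndexData enabled, three normalized index values (sector, row, pad) follow
  constexpr int32_t elementSize = chargeSlots + 3; // 346 input slots per cluster candidate
  static_assert(elementSize == 346, "per-cluster input length for a 7x7x7 charge window plus index data");
  return 0;
}
#endif
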
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize);

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  int32_t row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad());

  if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
    uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
    if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set
      clustererNN.mClusterFlags[2 * base_idx] = 0;
      clustererNN.mClusterFlags[2 * base_idx + 1] = 0;
      for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags
        Delta2 d = cfconsts::InnerNeighbors[i];
        CfChargePos tmp_pos = peak.delta(d);
        clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
      }
      clustererNN.mClusterFlags[2 * base_idx + 1] = clustererNN.mClusterFlags[2 * base_idx];
    }
    if (dtype == 0) {
      clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f);
      clustererNN.mInputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f);
      clustererNN.mInputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
    } else {
      clustererNN.mInputData_32[top_idx - 3] = sector / 36.f;
      clustererNN.mInputData_32[top_idx - 2] = row / 152.f;
      clustererNN.mInputData_32[top_idx - 1] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
    }
  } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) {
    int32_t time = static_cast<int>(peak.time());
    int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
    bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
    if (is_row_boundary) {
      if (dtype == 0) {
        clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
      } else {
        clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
      }
    } else {
      int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
      int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
      int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
      int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
      int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time;

      // Boundary if outside the pad-row geometry or outside the time frame (consistent with fillInputNNCPU)
      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) || (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU);

      if (!is_boundary) {
        float central_charge = static_cast<float>(chargeMap[peak].unpack());
        CfChargePos tmp_pos(row + r, pad + p, time_pos);
        if (dtype == 0) {
          clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
        } else if (dtype == 1) {
          clustererNN.mInputData_32[glo_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
        }
      } else {
        if (dtype == 0) {
          clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
        } else {
          clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
        }
      }
    }
  }
}

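// Editor's note: the following standalone sketch (not part of the kernel) checks the index
// arithmetic used above to recover (r, p, t) from the flat per-element index (transient_index).
// The half-window sizes R, P, T are assumed example values; pad_offset is ignored for simplicity.
#if 0 // illustrative only, never compiled
#include <cstdint>

int main()
{
  constexpr int32_t R = 3, P = 3, T = 3;                       // assumed mNnClusterizerSizeInputRow/Pad/Time
  constexpr int32_t padTimeStride = (2 * P + 1) * (2 * T + 1); // 49 entries per row slice
  constexpr int32_t idx = 200;                                 // example flat index within the 343 charge slots
  constexpr int32_t r = idx / padTimeStride - R;               // 200 / 49 = 4  -> r = 1
  constexpr int32_t rest = idx % padTimeStride;                // 200 - 4 * 49  = 4
  constexpr int32_t p = rest / (2 * T + 1) - P;                // 4 / 7 = 0     -> p = -3 (before pad_offset)
  constexpr int32_t t = rest % (2 * T + 1) - T;                // 4 % 7 = 4     -> t = 1
  static_assert(r == 1 && p == -3 && t == 1, "flat index 200 maps to (r, p, t) = (1, -3, 1)");
  return 0;
}
#endif
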
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  if (dtype == 0) {
    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
  } else if (dtype == 1) {
    processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].mModelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].mNnClassThreshold);
  }
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  auto& clustererNN = processors.tpcNNClusterer[sector];
  uint32_t glo_idx = get_global_id(0);
  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
  float current_max_prob = 0.f; // If the neural network doesn't contain a softmax as its last layer, the outputs can range in [-infty, infty]
  uint32_t class_label = 0;
  // The outputs of the class network are the per-class probabilities; the class with the highest probability wins
  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
    float prob = 0.f;
    if (dtype == 0) {
      prob = clustererNN.mModelProbabilities_16[pIdx].ToFloat();
    } else if (dtype == 1) {
      prob = clustererNN.mModelProbabilities_32[pIdx];
    }
    if (pIdx == elem_iterator || prob > current_max_prob) {
      current_max_prob = prob;
      class_label = pIdx - elem_iterator;
    }
  }
  clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label;
  if (class_label > 1) {
    clustererNN.mClusterFlags[2 * glo_idx] = 1;
    clustererNN.mClusterFlags[2 * glo_idx + 1] = 1;
  }
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
  uint32_t full_glo_idx = glo_idx + batchStart;
  if (full_glo_idx >= maxClusterNum) {
    return;
  }
  int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;

  // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();

  if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes != -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
    ClusterAccumulator pc;

    // Publishing logic is taken from the default clusterizer
    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    if (dtype == 0) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
                 clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat(),
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
                 clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat(),
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    } else if (dtype == 1) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
                 clustererNN.mOutputDataReg1_32[model_output_index + 2],
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
                 clustererNN.mOutputDataReg1_32[model_output_index + 3],
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    }

    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    uint32_t rowIndex = 0;
    if (clusterOut != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        peak.row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  uint32_t glo_idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];

  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
  float central_charge = static_cast<float>(chargeMap[peak].unpack());

  CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
  MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
  uint32_t full_glo_idx = glo_idx + batchStart;
  uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;

  if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
    ClusterAccumulator pc;

    if (withMC) {
      ClusterAccumulator dummy_pc;
      CPU_ONLY(labelAcc->collect(peak, central_charge));
      GPUTPCCFClusterizer::buildCluster(
        clusterer.Param().rec,
        chargeMap,
        peak,
        smem.posBcast,
        smem.buf,
        smem.innerAboveThreshold,
        &dummy_pc,
        labelAcc);
    }
    if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    // Cluster 1
    if (dtype == 0) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    } else if (dtype == 1) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
                 clustererNN.mOutputDataReg2_32[model_output_index + 4],
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
                 clustererNN.mOutputDataReg2_32[model_output_index + 6],
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    }

    tpc::ClusterNative myCluster;
    bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    uint32_t rowIndex = 0;
    if (clusterOut != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        peak.row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow));

    // Cluster 2
    if (dtype == 0) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
                 clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    } else if (dtype == 1) {
      pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
                 static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
                 clustererNN.mOutputDataReg2_32[model_output_index + 5],
                 (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
                 clustererNN.mOutputDataReg2_32[model_output_index + 7],
                 clustererNN.mClusterFlags[2 * glo_idx],
                 clustererNN.mClusterFlags[2 * glo_idx + 1]);
    }

    rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
    if (rejectCluster) {
      if (clusterer.mPclusterPosInRow) {
        clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
      }
      return;
    }

    if (clusterOut != nullptr) {
      rowIndex = GPUTPCCFClusterizer::sortIntoBuckets(
        clusterer,
        myCluster,
        peak.row(),
        clusterer.mNMaxClusterPerRow,
        clusterer.mPclusterInRow,
        clusterOut);
      if (clusterer.mPclusterPosInRow != nullptr) {
        clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex;
      }
    } else if (clusterer.mPclusterPosInRow) {
      rowIndex = clusterer.mPclusterPosInRow[full_glo_idx];
    }
    // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
    }
    return;
  }
}

// ---------------------------------
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
  // Implements the same flag-setting logic as the heuristic clusterizer and the deconvolution kernel
  uint32_t idx = get_global_id(0);
  auto& clusterer = processors.tpcClusterer[sector];
  auto& clustererNN = processors.tpcNNClusterer[sector];
  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
  CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];

  clustererNN.mClusterFlags[2 * idx] = 0;
  clustererNN.mClusterFlags[2 * idx + 1] = 0;
  for (int i = 0; i < 8; i++) {
    Delta2 d = cfconsts::InnerNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    PackedCharge charge = chargeMap[tmp_pos];
    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit());
    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit());
  }
  for (int i = 0; i < 16; i++) {
    Delta2 d = cfconsts::OuterNeighbors[i];
    CfChargePos tmp_pos = peak.delta(d);
    PackedCharge charge = chargeMap[tmp_pos];
    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
  }
}

// The following arithmetic is needed because the network is trained with a split at the IROC/OROC boundary
GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
{
  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
}

GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t global_shift)
{
  return (row > 62 ? global_shift : 0);
}

GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift)
{
  if (pad < 0 || row < 0) { // Faster short-circuit
    return true;
  } else if (row < 63) {
    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
  } else if (row < (63 + global_shift)) { // To account for the gap between IROC and OROC: the charge is set to -1 (the boundary fill value) to signal the boundary to the neural network
    return true;
  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
  } else {
    return true;
  }
}
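
// Editor's note: the following standalone sketch (not part of the kernel) walks through the
// IROC/OROC handling of rowOffset()/isBoundary(). The shift value of 3 is an assumed example
// (it equals mNnClusterizerSizeInputRow at runtime); MAXGLOBALPADROW is 152.
#if 0 // illustrative only, never compiled
#include <cstdint>

int main()
{
  constexpr int32_t shift = 3;         // assumed global_shift = mNnClusterizerSizeInputRow
  constexpr int32_t firstOrocRow = 63; // rows 0..62 belong to the IROC, rows 63..151 to the OROCs
  // rowOffset() shifts OROC rows up by `shift`, opening a gap of `shift` virtual rows:
  constexpr int32_t networkRow = firstOrocRow + (firstOrocRow > 62 ? shift : 0);
  static_assert(networkRow == 66, "the first OROC row appears at network row 66, after 3 virtual gap rows");
  // isBoundary() then classifies the network rows as:
  //   0..62           real IROC rows, pad checked against NPads(row)
  //   63..65          virtual gap rows, always boundary (filled with the boundary fill value)
  //   66..154         real OROC rows, pad checked against NPads(row - shift)
  //   >= 152 + shift  beyond the last OROC row, always boundary
  return 0;
}
#endif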