GPUTPCTrackingData.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

#include "GPUParam.h"
#include "GPUTPCClusterData.h"
#include "GPUTPCHit.h"
#include "GPUTPCTrackingData.h"
#include "GPUProcessor.h"
#include "GPUO2DataTypes.h"
#include "GPUTPCConvertImpl.h"
#include "GPUTPCGeometry.h"
#include "GPUCommonMath.h"

#ifndef GPUCA_GPUCODE_DEVICE
#include "utils/vecpod.h"
#include <iostream>
#include <cstring>
#include "GPUReconstruction.h"
#endif

using namespace o2::gpu;

#ifndef GPUCA_GPUCODE

void GPUTPCTrackingData::InitializeRows(const GPUParam& p)
{
  // initialisation of rows
  for (int32_t i = 0; i < GPUCA_ROW_COUNT + 1; i++) {
    new (&mRows[i]) GPUTPCRow;
  }
  for (int32_t i = 0; i < GPUCA_ROW_COUNT; i++) {
    mRows[i].mX = GPUTPCGeometry::Row2X(i);
    mRows[i].mMaxY = CAMath::Tan(p.par.dAlpha / 2.f) * mRows[i].mX;
  }
}
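
// Illustrative sketch (not part of the original file): mMaxY above is the half-width of
// a pad row at radius x for a sector spanning the azimuthal angle dAlpha, i.e.
// yMax = tan(dAlpha / 2) * x. A standalone version of that relation; the 20-degree
// opening angle is only an assumption used for the example numbers:
static float exampleRowHalfWidth(float x, float dAlphaRad = 0.349066f /* ~20 degrees */)
{
  return CAMath::Tan(dAlphaRad / 2.f) * x; // e.g. x = 100 cm -> about 17.6 cm half-width
}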

void GPUTPCTrackingData::SetClusterData(const GPUTPCClusterData* data, int32_t nClusters, int32_t clusterIdOffset)
{
  mClusterData = data;
  mNumberOfHits = nClusters;
  mClusterIdOffset = clusterIdOffset;
}

{
  int32_t hitMemCount = GPUCA_ROW_COUNT * GPUCA_ROWALIGNMENT + mNumberOfHits;
  const uint32_t kVectorAlignment = 256;
  mNumberOfHitsPlusAlign = GPUProcessor::nextMultipleOf<(kVectorAlignment > GPUCA_ROWALIGNMENT ? kVectorAlignment : GPUCA_ROWALIGNMENT) / sizeof(int32_t)>(hitMemCount);
}
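
// Illustrative sketch (not part of the original file): GPUProcessor::nextMultipleOf<N>
// rounds the hit count up so the buffers allocated from it keep N-element alignment.
// A plain constexpr version of that round-up rule:
static constexpr uint32_t exampleNextMultipleOf(uint32_t count, uint32_t multiple)
{
  return ((count + multiple - 1) / multiple) * multiple; // e.g. (1000, 64) -> 1024
}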

{
  GPUProcessor::computePointerWithAlignment(mem, mLinkUpData, mNumberOfHitsPlusAlign);
  GPUProcessor::computePointerWithAlignment(mem, mLinkDownData, mNumberOfHitsPlusAlign);
  return mem;
}

{
  GPUProcessor::computePointerWithAlignment(mem, mHitWeights, mNumberOfHitsPlusAlign + 16 / sizeof(*mHitWeights));
  return mem;
}

void* GPUTPCTrackingData::SetPointersScratch(void* mem, bool idsOnGPU)
{
  const int32_t firstHitInBinSize = GetGridSize(mNumberOfHits, GPUCA_ROW_COUNT) + GPUCA_ROW_COUNT * GPUCA_ROWALIGNMENT / sizeof(int32_t);
  GPUProcessor::computePointerWithAlignment(mem, mHitData, mNumberOfHitsPlusAlign);
  GPUProcessor::computePointerWithAlignment(mem, mFirstHitInBin, firstHitInBinSize);
  if (idsOnGPU) {
    mem = SetPointersClusterIds(mem, false); // Hijack the allocation from SetPointersClusterIds
  }
  return mem;
}
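
// Illustrative sketch (not part of the original file): each SetPointers* method above
// carves typed arrays out of one raw buffer by advancing a cursor, which is what
// GPUProcessor::computePointerWithAlignment does for it. A minimal standalone version
// of that pattern (the 64-byte alignment is an assumption for the example):
template <typename T>
static T* exampleCarveArray(void*& mem, size_t nEntries, size_t alignment = 64)
{
  char* p = reinterpret_cast<char*>(mem);
  const size_t misalign = reinterpret_cast<size_t>(p) % alignment;
  if (misalign != 0) {
    p += alignment - misalign; // round the cursor up to the requested alignment
  }
  T* array = reinterpret_cast<T*>(p);
  mem = p + nEntries * sizeof(T); // advance the cursor past the reserved array
  return array;
}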

void* GPUTPCTrackingData::SetPointersClusterIds(void* mem, bool idsOnGPU)
{
  if (!idsOnGPU) {
    GPUProcessor::computePointerWithAlignment(mem, mClusterDataIndex, mNumberOfHitsPlusAlign);
  }
  return mem;
}

{
  return mem;
}

#endif

GPUd() void GPUTPCTrackingData::GetMaxNBins(GPUconstantref() const GPUConstantMem* mem, GPUTPCRow* GPUrestrict() row, int32_t& maxY, int32_t& maxZ)
{
  maxY = row->mMaxY * 2.f / GPUCA_MIN_BIN_SIZE + 1;
  maxZ = (mem->param.continuousMaxTimeBin > 0 ? (mem->calibObjects.fastTransformHelper->getCorrMap()->convTimeToZinTimeFrame(0, 0, mem->param.continuousMaxTimeBin)) : GPUTPCGeometry::TPCLength()) + 50;
}

GPUd() uint32_t GPUTPCTrackingData::GetGridSize(uint32_t nHits, uint32_t nRows)
{
  return 128 * nRows + 4 * nHits;
}
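
// Note (not part of the original file): GetGridSize is the storage budget for the
// mFirstHitInBin arrays - a flat 128 entries per row plus 4 entries per hit. The same
// value is used further down as the overflow limit when a row's grid is created.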

GPUdi() void GPUTPCTrackingData::CreateGrid(GPUconstantref() const GPUConstantMem* mem, GPUTPCRow* GPUrestrict() row, float yMin, float yMax, float zMin, float zMax)
{
  float dz = zMax - zMin;
  float tfFactor = 1.f;
  if (dz > GPUTPCGeometry::TPCLength() + 20.f) {
    tfFactor = dz / GPUTPCGeometry::TPCLength();
    dz = GPUTPCGeometry::TPCLength();
  }
  const float norm = CAMath::InvSqrt(row->mNHits / tfFactor);
  float sy = CAMath::Min(CAMath::Max((yMax - yMin) * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE);
  float sz = CAMath::Min(CAMath::Max(dz * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE);
  int32_t maxy, maxz;
  GetMaxNBins(mem, row, maxy, maxz);
  int32_t ny = CAMath::Max(1, CAMath::Min<int32_t>(maxy, (yMax - yMin) / sy + 1));
  int32_t nz = CAMath::Max(1, CAMath::Min<int32_t>(maxz, (zMax - zMin) / sz + 1));
  row->mGrid.Create(yMin, yMax, zMin, zMax, ny, nz);
}
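
// Illustrative sketch (not part of the original file): CreateGrid picks the bin size
// from the hit density - the row extent divided by sqrt(nHits), clamped to the
// configured limits - so the 2D grid ends up with roughly one hit per bin. A standalone
// version of that sizing rule for one axis (the clamp limits are assumptions):
static int32_t exampleNBins(float extent, int32_t nHits, float minBin = 1.f, float maxBin = 10.f)
{
  float s = extent / CAMath::Sqrt((float)nHits); // bin edge shrinks as the density grows
  s = CAMath::Min(CAMath::Max(s, minBin), maxBin);
  return CAMath::Max(1, (int32_t)(extent / s + 1));
}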

GPUdi() static void UpdateMinMaxYZ(float& yMin, float& yMax, float& zMin, float& zMax, float y, float z)
{
  if (yMax < y) {
    yMax = y;
  }
  if (yMin > y) {
    yMin = y;
  }
  if (zMax < z) {
    zMax = z;
  }
  if (zMin > z) {
    zMin = z;
  }
}

GPUd() void GPUTPCTrackingData::SetRowGridEmpty(GPUTPCRow& row)
{
  GPUAtomic(calink)* c = (GPUAtomic(calink)*)mFirstHitInBin + row.mFirstHitInBinOffset;
  row.mGrid.CreateEmpty();
  row.mNHits = 0;
  row.mHitNumberOffset = 0;
  row.mHy0 = 0.f;
  row.mHz0 = 0.f;
  row.mHstepY = 1.f;
  row.mHstepZ = 1.f;
  row.mHstepYi = 1.f;
  row.mHstepZi = 1.f;
  for (int32_t i = 0; i < 4; i++) {
    c[i] = 0;
  }
}

GPUdii() int32_t GPUTPCTrackingData::InitFromClusterData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUconstantref() const GPUConstantMem* GPUrestrict() mem, int32_t iSector, float* tmpMinMax)
{
#ifdef GPUCA_GPUCODE
  constexpr bool EarlyTransformWithoutClusterNative = false;
#else
  bool EarlyTransformWithoutClusterNative = mem->param.par.earlyTpcTransform && mem->ioPtrs.clustersNative == nullptr;
#endif
  int32_t* tmpHitIndex = nullptr;
  const uint32_t* NumberOfClustersInRow = nullptr;
  const uint32_t* RowOffsets = nullptr;

#ifndef GPUCA_GPUCODE
  vecpod<float2> YZData(mNumberOfHits);
  vecpod<calink> binMemory(mNumberOfHits);
  uint32_t RowOffsetsA[GPUCA_ROW_COUNT];
  uint32_t NumberOfClustersInRowA[GPUCA_ROW_COUNT];

  vecpod<int32_t> tmpHitIndexA;
  if (EarlyTransformWithoutClusterNative) { // Implies mem->param.par.earlyTpcTransform but no ClusterNative present
    NumberOfClustersInRow = NumberOfClustersInRowA;
    RowOffsets = RowOffsetsA;
    tmpHitIndexA.resize(mNumberOfHits);
    tmpHitIndex = tmpHitIndexA.data();

    memset(NumberOfClustersInRowA, 0, GPUCA_ROW_COUNT * sizeof(NumberOfClustersInRowA[0]));
    for (int32_t i = 0; i < mNumberOfHits; i++) {
      const int32_t tmpRow = mClusterData[i].row;
      NumberOfClustersInRowA[tmpRow]++;
    }
    int32_t tmpOffset = 0;
    for (int32_t i = 0; i < GPUCA_ROW_COUNT; i++) {
      RowOffsetsA[i] = tmpOffset;
      tmpOffset += NumberOfClustersInRow[i];
    }
    int32_t RowsFilled[GPUCA_ROW_COUNT];
    memset(RowsFilled, 0, GPUCA_ROW_COUNT * sizeof(int32_t));
    for (int32_t i = 0; i < mNumberOfHits; i++) {
      float2 tmp;
      tmp.x = mClusterData[i].y;
      tmp.y = mClusterData[i].z;
      int32_t tmpRow = mClusterData[i].row;
      int32_t newIndex = RowOffsetsA[tmpRow] + (RowsFilled[tmpRow])++;
      YZData[newIndex] = tmp;
      tmpHitIndex[newIndex] = i;
    }
  } // Other cases below in loop over rows
#else
  float2* YZData = (float2*)mLinkUpData; // TODO: we can do this as well on the CPU, just must make sure that CPU has the scratch memory
  calink* binMemory = (calink*)mHitWeights;
  static_assert(sizeof(*YZData) <= (sizeof(*mLinkUpData) + sizeof(*mLinkDownData)), "Cannot reuse memory");
  static_assert(sizeof(*binMemory) <= sizeof(*mHitWeights), "Cannot reuse memory");
#endif

  for (int32_t rowIndex = iBlock; rowIndex < GPUCA_ROW_COUNT; rowIndex += nBlocks) {
    float yMin = 1.e6f;
    float yMax = -1.e6f;
    float zMin = 1.e6f;
    float zMax = -1.e6f;

    const uint32_t NumberOfClusters = EarlyTransformWithoutClusterNative ? NumberOfClustersInRow[rowIndex] : mem->ioPtrs.clustersNative->nClusters[iSector][rowIndex];
    const uint32_t RowOffset = EarlyTransformWithoutClusterNative ? RowOffsets[rowIndex] : (mem->ioPtrs.clustersNative->clusterOffset[iSector][rowIndex] - mem->ioPtrs.clustersNative->clusterOffset[iSector][0]);
    constexpr const uint32_t maxN = 1u << (sizeof(calink) < 3 ? (sizeof(calink) * 8) : 24);
    GPUTPCRow& row = mRows[rowIndex];
    if (iThread == 0) {
      row.mFirstHitInBinOffset = CAMath::nextMultipleOf<GPUCA_ROWALIGNMENT / sizeof(calink)>(GetGridSize(RowOffset, rowIndex) + rowIndex * GPUCA_ROWALIGNMENT / sizeof(int32_t));
    }
    if (NumberOfClusters >= maxN) {
      if (iThread == 0) {
        mem->errorCodes.raiseError(GPUErrors::ERROR_SECTORDATA_HITINROW_OVERFLOW, iSector * 1000 + rowIndex, NumberOfClusters, maxN);
        SetRowGridEmpty(row);
      }
      continue;
    }

    if (iThread == 0) {
      tmpMinMax[0] = yMin;
      tmpMinMax[1] = yMax;
      tmpMinMax[2] = zMin;
      tmpMinMax[3] = zMax;
    }
    GPUbarrier();
    GPUAtomic(calink)* c = (GPUAtomic(calink)*)mFirstHitInBin + row.mFirstHitInBinOffset;
    if (NumberOfClusters == 0) {
      if (iThread == 0) {
        SetRowGridEmpty(row);
      }
      continue;
    }

    if (EarlyTransformWithoutClusterNative) {
      for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
        UpdateMinMaxYZ(yMin, yMax, zMin, zMax, YZData[RowOffset + i].x, YZData[RowOffset + i].y);
      }
    } else if (mem->param.par.earlyTpcTransform) { // Early transform case with ClusterNative present
      for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
        float2 tmp;
        tmp.x = mClusterData[RowOffset + i].y;
        tmp.y = mClusterData[RowOffset + i].z;
        UpdateMinMaxYZ(yMin, yMax, zMin, zMax, tmp.x, tmp.y);
        YZData[RowOffset + i] = tmp;
      }
    } else {
      for (uint32_t i = iThread; i < NumberOfClusters; i += nThreads) {
        float x, y, z;
        GPUTPCConvertImpl::convert(*mem, iSector, rowIndex, mem->ioPtrs.clustersNative->clusters[iSector][rowIndex][i].getPad(), mem->ioPtrs.clustersNative->clusters[iSector][rowIndex][i].getTime(), x, y, z);
        UpdateMinMaxYZ(yMin, yMax, zMin, zMax, y, z);
        YZData[RowOffset + i] = CAMath::MakeFloat2(y, z);
      }
    }

    if (iThread == 0) {
      row.mNHits = NumberOfClusters;
      row.mHitNumberOffset = CAMath::nextMultipleOf<GPUCA_ROWALIGNMENT / sizeof(calink)>(RowOffset + rowIndex * GPUCA_ROWALIGNMENT / sizeof(calink));
    }

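    // Merge the per-thread y/z ranges into the shared tmpMinMax[4]: either directly
    // with atomic float min/max, or (fallback branch below) by letting the threads
    // take turns behind barriers.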
#ifdef GPUCA_HAVE_ATOMIC_MINMAX_FLOAT
    CAMath::AtomicMinShared(&tmpMinMax[0], yMin);
    CAMath::AtomicMaxShared(&tmpMinMax[1], yMax);
    CAMath::AtomicMinShared(&tmpMinMax[2], zMin);
    CAMath::AtomicMaxShared(&tmpMinMax[3], zMax);
#else
    for (int32_t i = 0; i < nThreads; i++) {
      GPUbarrier();
      if (iThread == i) {
        if (tmpMinMax[0] > yMin) {
          tmpMinMax[0] = yMin;
        }
        if (tmpMinMax[1] < yMax) {
          tmpMinMax[1] = yMax;
        }
        if (tmpMinMax[2] > zMin) {
          tmpMinMax[2] = zMin;
        }
        if (tmpMinMax[3] < zMax) {
          tmpMinMax[3] = zMax;
        }
      }
    }
#endif
    GPUbarrier();
    if (iThread == 0) {
      CreateGrid(mem, &row, tmpMinMax[0], tmpMinMax[1], tmpMinMax[2], tmpMinMax[3]);
    }
    GPUbarrier();
    const GPUTPCGrid& grid = row.mGrid;
    const int32_t numberOfBins = grid.N();
    constexpr const int32_t maxBins = sizeof(calink) < 4 ? (int32_t)(1ul << (sizeof(calink) * 8)) : 0x7FFFFFFF; // NOLINT: false warning
    if (sizeof(calink) < 4 && numberOfBins >= maxBins) {
      if (iThread == 0) {
        mem->errorCodes.raiseError(GPUErrors::ERROR_SECTORDATA_BIN_OVERFLOW, iSector * 1000 + rowIndex, numberOfBins, maxBins);
        SetRowGridEmpty(row);
      }
      continue;
    }
    const uint32_t nn = numberOfBins + grid.Ny() + 3;
    const uint32_t maxnn = GetGridSize(NumberOfClusters, 1);
    if (nn >= maxnn) {
      if (iThread == 0) {
        mem->errorCodes.raiseError(GPUErrors::ERROR_SECTORDATA_FIRSTHITINBIN_OVERFLOW, iSector, nn, maxnn);
        SetRowGridEmpty(row);
      }
      continue;
    }

    calink* bins = &binMemory[RowOffset]; // Reuse scratch memory (binMemory aliases mHitWeights on the GPU) as temporary bin storage

    for (int32_t bin = iThread; bin < numberOfBins; bin += nThreads) {
      c[bin] = 0; // initialize to 0
    }
    GPUbarrier();
    for (int32_t hitIndex = iThread; hitIndex < row.mNHits; hitIndex += nThreads) {
      const int32_t globalHitIndex = RowOffset + hitIndex;
      const calink bin = row.mGrid.GetBin(YZData[globalHitIndex].x, YZData[globalHitIndex].y);

      bins[hitIndex] = bin;
      CAMath::AtomicAdd(&c[bin], 1u);
    }
    GPUbarrier();

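    // Turn the per-bin hit counts in c[] into an inclusive prefix sum: after this block,
    // c[bin] is the number of hits falling into bins 0..bin, i.e. the end offset of bin
    // within the bin-sorted hit array. The trailing entries up to nn are padded with the
    // total so lookups past the last bin stay valid.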
    if (iThread == 0) {
      calink n = 0;
      for (int32_t bin = 0; bin < numberOfBins; ++bin) { // TODO: Parallelize
        n += c[bin];
        c[bin] = n;
      }
      for (uint32_t bin = numberOfBins; bin < nn; bin++) {
        c[bin] = n;
      }
    }
    GPUbarrier();

    constexpr float maxVal = (((int64_t)1 << (sizeof(cahit) < 3 ? sizeof(cahit) * 8 : 24)) - 1); // Stay within float precision in any case!
    constexpr float packingConstant = 1.f / (maxVal - 2.f);
    const float y0 = row.mGrid.YMin();
    const float z0 = row.mGrid.ZMin();
    const float stepY = (row.mGrid.YMax() - y0) * packingConstant;
    const float stepZ = (row.mGrid.ZMax() - z0) * packingConstant;
    const float stepYi = 1.f / stepY;
    const float stepZi = 1.f / stepZ;

    if (iThread == 0) {
      row.mHy0 = y0;
      row.mHz0 = z0;
      row.mHstepY = stepY;
      row.mHstepZ = stepZ;
      row.mHstepYi = stepYi;
      row.mHstepZi = stepZi;
    }

    GPUbarrier();

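    // Scatter pass: every hit is moved to its bin-sorted slot by decrementing its bin's
    // end offset in c[], and its (y, z) position is stored in mHitData as fixed-point
    // cahit values relative to (y0, z0) with step sizes stepY/stepZ.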
    for (int32_t hitIndex = iThread; hitIndex < row.mNHits; hitIndex += nThreads) {
      const calink bin = bins[hitIndex];
      const calink ind = CAMath::AtomicAdd(&c[bin], (calink)-1) - 1; // generate an index for this hit that is >= c[bin] and < c[bin + 1]
      const int32_t globalBinsortedIndex = row.mHitNumberOffset + ind;
      const int32_t globalHitIndex = RowOffset + hitIndex;

      // allows finding the global hit index / coordinates from a global bin-sorted hit index
      mClusterDataIndex[globalBinsortedIndex] = EarlyTransformWithoutClusterNative ? tmpHitIndex[globalHitIndex] : (RowOffset + hitIndex);

      const float xx = ((YZData[globalHitIndex].x - y0) * stepYi) + .5;
      const float yy = ((YZData[globalHitIndex].y - z0) * stepZi) + .5;
#if !defined(GPUCA_GPUCODE) && !defined(NDEBUG)
      if (xx < 0 || yy < 0 || xx > maxVal || yy > maxVal) {
        std::cout << "!!!! hit packing error!!! " << xx << " " << yy << " (" << maxVal << ")" << std::endl;
        return 1;
      }
#endif
      // HitData is bin sorted
      mHitData[globalBinsortedIndex].x = (cahit)xx;
      mHitData[globalBinsortedIndex].y = (cahit)yy;
    }

    GPUbarrier();

    if (iThread == 0 && !mem->param.par.continuousTracking) {
      const float maxAbsZ = CAMath::Max(CAMath::Abs(tmpMinMax[2]), CAMath::Abs(tmpMinMax[3]));
      if (maxAbsZ > 300) {
        mem->errorCodes.raiseError(GPUErrors::ERROR_SECTORDATA_Z_OVERFLOW, iSector, (uint32_t)maxAbsZ);
        SetRowGridEmpty(row);
        continue;
      }
    }
  }

  return 0;
}
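
// Illustrative sketch (not part of the original file): the core of InitFromClusterData
// is a counting sort by grid bin - count the hits per bin, turn the counts into offsets
// with a prefix sum, then scatter each hit to its slot while decrementing the per-bin
// counter. A serial, standalone version of that scheme (names and parameters here are
// assumptions for illustration only):
static void exampleBinSort(const uint32_t* hitBin, uint32_t nHits, uint32_t* count /* size nBins, zero-initialized */, uint32_t nBins, uint32_t* order /* size nHits */)
{
  for (uint32_t hit = 0; hit < nHits; hit++) {
    count[hitBin[hit]]++; // histogram: number of hits per bin
  }
  uint32_t sum = 0;
  for (uint32_t bin = 0; bin < nBins; bin++) {
    sum += count[bin];
    count[bin] = sum; // inclusive prefix sum: end offset of each bin in the sorted order
  }
  for (uint32_t hit = 0; hit < nHits; hit++) {
    order[--count[hitBin[hit]]] = hit; // place each hit just below its bin's end offset
  }
  // Afterwards count[bin] holds the first sorted index of bin, analogous to mFirstHitInBin.
}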