Project
Loading...
Searching...
No Matches
GPUTPCCompressionKernels.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
16#include "GPUConstantMem.h"
17#include "GPUO2DataTypes.h"
18#include "GPUParam.h"
19#include "GPUCommonAlgorithm.h"
22#include "GPUTPCCompressionKernels.inc"
23
24using namespace o2::gpu;
25using namespace o2::tpc;
26
27template <>
28GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0attached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
29{
30 const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
32 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
33 const GPUParam& GPUrestrict() param = processors.param;
34
35 int32_t myTrack = 0;
36 for (uint32_t i = get_global_id(0); i < ioPtrs.nMergedTracks; i += get_global_size(0)) {
38 const GPUTPCGMMergedTrack& GPUrestrict() trk = ioPtrs.mergedTracks[i];
39 if (!trk.OK()) {
40 continue;
41 }
42 bool rejectTrk = GPUTPCClusterRejection::IsTrackRejected(trk, param);
43 uint32_t nClustersStored = 0;
45 uint8_t lastRow = 0, lastSector = 0;
47 float zOffset = 0;
48 for (int32_t k = trk.NClusters() - 1; k >= 0; k--) {
49 const GPUTPCGMMergedTrackHit& GPUrestrict() hit = ioPtrs.mergedTrackHits[trk.FirstClusterRef() + k];
51 continue;
52 }
53
54 int32_t hitId = hit.num;
55 int32_t attach = ioPtrs.mergedTrackHitAttachment[hitId];
56 if ((attach & gputpcgmmergertypes::attachTrackMask) != i) {
57 continue; // Main attachment to different track
58 }
59 bool rejectCluster = processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA && !(attach & gputpcgmmergertypes::attachProtect) && (rejectTrk || GPUTPCClusterRejection::GetIsRejected(attach));
60 if (rejectCluster) {
61 compressor.mClusterStatus[hitId] = 1; // Cluster rejected, do not store
62 continue;
63 } else if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA && rejectTrk) {
64 continue;
65 }
66
67 if (!(param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) {
68 continue; // No track model compression
69 }
70 const ClusterNative& GPUrestrict() orgCl = clusters -> clusters[hit.sector][hit.row][hit.num - clusters->clusterOffset[hit.sector][hit.row]];
71 constexpr GPUTPCGeometry geo;
72 float x = geo.Row2X(hit.row);
73 float y = track.LinearPad2Y(hit.sector, orgCl.getPad(), geo.PadWidth(hit.row), geo.NPads(hit.row));
74 float z = geo.LinearTime2Z(hit.sector, orgCl.getTime());
75 if (nClustersStored) {
76 if ((hit.sector < GPUTPCGeometry::NSECTORS) ^ (lastSector < GPUTPCGeometry::NSECTORS)) {
77 break;
78 }
79 if (track.Propagate(geo.Row2X(hit.row), GPUTPCGeometry::SectorAlpha(hit.sector))) {
80 break;
81 }
82 }
83
84 compressor.mClusterStatus[hitId] = 1; // Cluster compressed in track model, do not store as difference
85
86 int32_t cidx = trk.FirstClusterRef() + nClustersStored++;
87 if (nClustersStored == 1) {
88 uint8_t qpt = fabs(trk.GetParam().GetQPt()) < 20.f ? (trk.GetParam().GetQPt() * (127.f / 20.f) + 127.5f) : (trk.GetParam().GetQPt() > 0 ? 254 : 0);
89 zOffset = z;
90 track.Init(x, y, z - zOffset, GPUTPCGeometry::SectorAlpha(hit.sector), qpt, param);
91
92 myTrack = CAMath::AtomicAdd(&compressor.mMemory->nStoredTracks, 1u);
93 compressor.mAttachedClusterFirstIndex[myTrack] = trk.FirstClusterRef();
94 c.qPtA[myTrack] = qpt;
95 c.rowA[myTrack] = hit.row;
96 c.sliceA[myTrack] = hit.sector;
97 c.timeA[myTrack] = orgCl.getTimePacked();
98 c.padA[myTrack] = orgCl.padPacked;
99 } else {
100 uint32_t row = hit.row;
101 uint32_t sector = hit.sector;
102
103 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
104 if (lastRow > row) {
106 }
107 row -= lastRow;
108 if (lastSector > sector) {
109 sector += compressor.NSECTORS;
110 }
111 sector -= lastSector;
112 }
113 c.rowDiffA[cidx] = row;
114 c.sliceLegDiffA[cidx] = sector;
115 float pad = CAMath::Max(0.f, CAMath::Min((float)geo.NPads(GPUTPCGeometry::NROWS - 1), track.LinearY2Pad(hit.sector, track.Y(), geo.PadWidth(hit.row), geo.NPads(hit.row))));
116 c.padResA[cidx] = orgCl.padPacked - orgCl.packPad(pad);
117 float time = CAMath::Max(0.f, geo.LinearZ2Time(hit.sector, track.Z() + zOffset));
118 c.timeResA[cidx] = (orgCl.getTimePacked() - orgCl.packTime(time)) & 0xFFFFFF;
119 }
120 uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
121 uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
122 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
123 compressor.truncateSignificantBitsChargeMax(qmax, param);
124 compressor.truncateSignificantBitsWidth(sigmapad, param);
125 if (!orgCl.isSaturated()) [[likely]] {
126 compressor.truncateSignificantBitsCharge(qtot, param);
127 compressor.truncateSignificantBitsWidth(sigmatime, param);
128 }
129 }
130 c.qTotA[cidx] = qtot;
131 c.qMaxA[cidx] = qmax;
132 c.sigmaPadA[cidx] = sigmapad;
133 c.sigmaTimeA[cidx] = sigmatime;
134 c.flagsA[cidx] = orgCl.getFlags();
135 if (k && track.Filter(y, z - zOffset, hit.row)) {
136 break;
137 }
138 lastRow = hit.row;
139 lastSector = hit.sector;
140 }
141 if (nClustersStored) {
142 CAMath::AtomicAdd(&compressor.mMemory->nStoredAttachedClusters, nClustersStored);
143 c.nTrackClusters[myTrack] = nClustersStored;
144 }
145 }
146}
147
148template <>
149GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>::operator()(uint32_t a, uint32_t b) const
150{
151 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
152}
153
154template <>
155GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>::operator()(uint32_t a, uint32_t b) const
156{
157 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
158}
159
160template <>
161GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>::operator()(uint32_t a, uint32_t b) const
162{
163 if (mClsPtr[a].getTimePacked() >> 3 == mClsPtr[b].getTimePacked() >> 3) {
164 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
165 }
166 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
167}
168
169template <>
170GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>::operator()(uint32_t a, uint32_t b) const
171{
172 if (mClsPtr[a].padPacked >> 3 == mClsPtr[b].padPacked >> 3) {
173 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
174 }
175 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
176}
177
178template <> // Deterministic comparison
179GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::operator()(uint32_t a, uint32_t b) const
180{
181 if (mClsPtr[a].getTimePacked() != mClsPtr[b].getTimePacked()) {
182 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
183 }
184 if (mClsPtr[a].padPacked != mClsPtr[b].padPacked) {
185 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
186 }
187 return mClsPtr[a].qTot < mClsPtr[b].qTot;
188}
189
190GPUd() bool GPUTPCCompression::rejectCluster(int32_t idx, const GPUParam& GPUrestrict() param, const GPUTrackingInOutPointers& GPUrestrict() ioPtrs) const
191{
192 if (mClusterStatus[idx]) {
193 return true;
194 }
195 int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];
196 bool unattached = attach == 0;
197
198 if (unattached) {
199 if (param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {
200 return true;
201 }
202 } else if (param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {
203 if (GPUTPCClusterRejection::GetIsRejected(attach)) {
204 return true;
205 }
207 return false;
208 }
209 int32_t id = attach & gputpcgmmergertypes::attachTrackMask;
210 auto& trk = ioPtrs.mergedTracks[id];
211 if (GPUTPCClusterRejection::IsTrackRejected(trk, param)) {
212 return true;
213 }
214 }
215 return false;
216}
217
218template <>
219GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
220{
221 const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
222 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;
223 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
224 GPUParam& GPUrestrict() param = processors.param;
225 uint32_t* sortBuffer = smem.sortBuffer;
226 for (uint32_t iSectorRow = iBlock; iSectorRow < GPUTPCGeometry::NSECTORS * GPUTPCGeometry::NROWS; iSectorRow += nBlocks) {
227 const uint32_t iSector = iSectorRow / GPUTPCGeometry::NROWS;
228 const uint32_t iRow = iSectorRow % GPUTPCGeometry::NROWS;
229 const uint32_t idOffset = clusters->clusterOffset[iSector][iRow];
230 const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024; // 32 bit enough for number of clusters per row * 1024
231 const uint32_t idOffsetOutMax = ((const uint32_t*)clusters->clusterOffset[iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024; // Array out of bounds access is ok, since it goes to the correct nClustersTotal
232 if (iThread == nThreads - 1) {
233 smem.nCount = 0;
234 }
235 uint32_t totalCount = 0;
236 GPUbarrier();
237
238 CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;
239
240 const uint32_t nn = CAMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);
241 for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {
242 const int32_t idx = idOffset + i;
243 int32_t storeCluster = i < clusters->nClusters[iSector][iRow] && !compressor.rejectCluster(idx, param, ioPtrs);
244
245 GPUbarrier();
246 int32_t myIndex = work_group_scan_inclusive_add(storeCluster);
247 int32_t storeLater = -1;
248 if (storeCluster) {
249 if (smem.nCount + myIndex <= constants::TPC_COMP_CHUNK_SIZE) {
250 sortBuffer[smem.nCount + myIndex - 1] = i;
251 } else {
252 storeLater = smem.nCount + myIndex - 1 - constants::TPC_COMP_CHUNK_SIZE;
253 }
254 }
255 GPUbarrier();
256 if (iThread == nThreads - 1) {
257 smem.nCount += myIndex;
258 }
259 GPUbarrier();
260
261 if (smem.nCount < constants::TPC_COMP_CHUNK_SIZE && i < nn) {
262 continue;
263 }
264
265 uint32_t count = CAMath::Min(smem.nCount, (uint32_t)constants::TPC_COMP_CHUNK_SIZE);
266 if (idOffsetOut + totalCount + count > idOffsetOutMax) {
267 if (iThread == nThreads - 1) {
268 compressor.raiseError(GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW, iSector * 1000 + iRow, idOffsetOut + totalCount + count, idOffsetOutMax);
269 }
270 break;
271 }
272 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
273#ifdef GPUCA_GPUCODE
274 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= constants::TPC_COMP_CHUNK_SIZE);
275#endif
276#ifdef GPUCA_DETERMINISTIC_MODE
277 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
278#else // GPUCA_DETERMINISTIC_MODE
279 if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
280 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
281 } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
282 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
283 } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
284 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
285 } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
286 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
287 }
288#endif // GPUCA_DETERMINISTIC_MODE
289 GPUbarrier();
290 }
291
292 for (uint32_t j = get_local_id(0); j < count; j += get_local_size(0)) {
293 int32_t outidx = idOffsetOut + totalCount + j;
294 const ClusterNative& GPUrestrict() orgCl = clusters -> clusters[iSector][iRow][sortBuffer[j]];
295
296 int32_t preId = j != 0 ? (int32_t)sortBuffer[j - 1] : (totalCount != 0 ? (int32_t)smem.lastIndex : -1);
297 GPUTPCCompression_EncodeUnattached(param.rec.tpc.compressionTypeMask, orgCl, c.timeDiffU[outidx], c.padDiffU[outidx], preId == -1 ? nullptr : &clusters->clusters[iSector][iRow][preId]);
298
299 uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
300 uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
301 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
302 compressor.truncateSignificantBitsChargeMax(qmax, param);
303 compressor.truncateSignificantBitsWidth(sigmapad, param);
304 if (!orgCl.isSaturated()) [[likely]] {
305 compressor.truncateSignificantBitsCharge(qtot, param);
306 compressor.truncateSignificantBitsWidth(sigmatime, param);
307 }
308 }
309 c.qTotU[outidx] = qtot;
310 c.qMaxU[outidx] = qmax;
311 c.sigmaPadU[outidx] = sigmapad;
312 c.sigmaTimeU[outidx] = sigmatime;
313 c.flagsU[outidx] = orgCl.getFlags();
314 }
315
316 GPUbarrier();
317 if (storeLater >= 0) {
318 sortBuffer[storeLater] = i;
319 }
320 totalCount += count;
321 if (iThread == nThreads - 1 && count) {
322 smem.lastIndex = sortBuffer[count - 1];
323 smem.nCount -= count;
324 }
325 }
326
327 if (iThread == nThreads - 1) {
328 c.nSliceRowClusters[iSector * GPUTPCGeometry::NROWS + iRow] = totalCount;
329 CAMath::AtomicAdd(&compressor.mMemory->nStoredUnattachedClusters, totalCount);
330 }
331 GPUbarrier();
332 }
333}
334
335template <>
337{
338 return buf32[iWarp];
339}
340
341template <>
343{
344 return buf64[iWarp];
345}
346
347template <>
349{
350 return buf128[iWarp];
351}
352
353template <typename T, typename S>
354GPUdi() bool GPUTPCCompressionGatherKernels::isAlignedTo(const S* ptr)
355{
356 if constexpr (alignof(S) >= alignof(T)) {
357 static_cast<void>(ptr);
358 return true;
359 } else {
360 return reinterpret_cast<size_t>(ptr) % alignof(T) == 0;
361 }
362}
363
364template <>
365GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint8_t>(uint8_t* GPUrestrict() dst, const uint8_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
366{
367 constexpr const int32_t vec128Elems = CpyVector<uint8_t, Vec128>::Size;
368 constexpr const int32_t vec64Elems = CpyVector<uint8_t, Vec64>::Size;
369 constexpr const int32_t vec32Elems = CpyVector<uint8_t, Vec32>::Size;
370 constexpr const int32_t vec16Elems = CpyVector<uint8_t, Vec16>::Size;
371
372 if (size >= uint32_t(nThreads * vec128Elems)) {
373 compressorMemcpyVectorised<uint8_t, Vec128>(dst, src, size, nThreads, iThread);
374 } else if (size >= uint32_t(nThreads * vec64Elems)) {
375 compressorMemcpyVectorised<uint8_t, Vec64>(dst, src, size, nThreads, iThread);
376 } else if (size >= uint32_t(nThreads * vec32Elems)) {
377 compressorMemcpyVectorised<uint8_t, Vec32>(dst, src, size, nThreads, iThread);
378 } else if (size >= uint32_t(nThreads * vec16Elems)) {
379 compressorMemcpyVectorised<uint8_t, Vec16>(dst, src, size, nThreads, iThread);
380 } else {
381 compressorMemcpyBasic(dst, src, size, nThreads, iThread);
382 }
383}
384
385template <>
386GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint16_t>(uint16_t* GPUrestrict() dst, const uint16_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
387{
388 constexpr const int32_t vec128Elems = CpyVector<uint16_t, Vec128>::Size;
389 constexpr const int32_t vec64Elems = CpyVector<uint16_t, Vec64>::Size;
390 constexpr const int32_t vec32Elems = CpyVector<uint16_t, Vec32>::Size;
391
392 if (size >= uint32_t(nThreads * vec128Elems)) {
393 compressorMemcpyVectorised<uint16_t, Vec128>(dst, src, size, nThreads, iThread);
394 } else if (size >= uint32_t(nThreads * vec64Elems)) {
395 compressorMemcpyVectorised<uint16_t, Vec64>(dst, src, size, nThreads, iThread);
396 } else if (size >= uint32_t(nThreads * vec32Elems)) {
397 compressorMemcpyVectorised<uint16_t, Vec32>(dst, src, size, nThreads, iThread);
398 } else {
399 compressorMemcpyBasic(dst, src, size, nThreads, iThread);
400 }
401}
402
403template <>
404GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint32_t>(uint32_t* GPUrestrict() dst, const uint32_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
405{
406 constexpr const int32_t vec128Elems = CpyVector<uint32_t, Vec128>::Size;
407 constexpr const int32_t vec64Elems = CpyVector<uint32_t, Vec64>::Size;
408
409 if (size >= uint32_t(nThreads * vec128Elems)) {
410 compressorMemcpyVectorised<uint32_t, Vec128>(dst, src, size, nThreads, iThread);
411 } else if (size >= uint32_t(nThreads * vec64Elems)) {
412 compressorMemcpyVectorised<uint32_t, Vec64>(dst, src, size, nThreads, iThread);
413 } else {
414 compressorMemcpyBasic(dst, src, size, nThreads, iThread);
415 }
416}
417
418template <typename Scalar, typename BaseVector>
419GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyVectorised(Scalar* dst, const Scalar* src, uint32_t size, int32_t nThreads, int32_t iThread)
420{
421 if (not isAlignedTo<BaseVector>(dst)) {
422 size_t dsti = reinterpret_cast<size_t>(dst);
423 int32_t offset = (alignof(BaseVector) - dsti % alignof(BaseVector)) / sizeof(Scalar);
424 compressorMemcpyBasic(dst, src, offset, nThreads, iThread);
425 src += offset;
426 dst += offset;
427 size -= offset;
428 }
429
430 BaseVector* GPUrestrict() dstAligned = reinterpret_cast<BaseVector*>(dst);
431
432 using CpyVec = CpyVector<Scalar, BaseVector>;
433 uint32_t sizeAligned = size / CpyVec::Size;
434
435 if (isAlignedTo<BaseVector>(src)) {
436 const BaseVector* GPUrestrict() srcAligned = reinterpret_cast<const BaseVector*>(src);
437 compressorMemcpyBasic(dstAligned, srcAligned, sizeAligned, nThreads, iThread);
438 } else {
439 for (uint32_t i = iThread; i < sizeAligned; i += nThreads) {
440 CpyVec buf;
441 for (uint32_t j = 0; j < CpyVec::Size; j++) {
442 buf.elems[j] = src[i * CpyVec::Size + j];
443 }
444 dstAligned[i] = buf.all;
445 }
446 }
447
448 int32_t leftovers = size % CpyVec::Size;
449 compressorMemcpyBasic(dst + size - leftovers, src + size - leftovers, leftovers, nThreads, iThread);
450}
451
452template <typename T>
453GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBasic(T* GPUrestrict() dst, const T* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread, int32_t nBlocks, int32_t iBlock)
454{
455 uint32_t start = (size + nBlocks - 1) / nBlocks * iBlock + iThread;
456 uint32_t end = CAMath::Min(size, (size + nBlocks - 1) / nBlocks * (iBlock + 1));
457 for (uint32_t i = start; i < end; i += nThreads) {
458 dst[i] = src[i];
459 }
460}
461
462template <typename V, typename T, typename S>
463GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict() dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const uint32_t* GPUrestrict() srcOffsets, uint32_t nEntries, int32_t nLanes, int32_t iLane, int32_t diff, size_t scaleBase1024)
464{
465 int32_t shmPos = 0;
466 uint32_t dstOffset = 0;
467 V* GPUrestrict() dstAligned = nullptr;
468
469 T* bufT = reinterpret_cast<T*>(buf);
470 constexpr const int32_t bufSize = GPUCA_WARP_SIZE;
471 constexpr const int32_t bufTSize = bufSize * sizeof(V) / sizeof(T);
472
473 for (uint32_t i = 0; i < nEntries; i++) {
474 uint32_t srcPos = 0;
475 uint32_t srcOffset = (srcOffsets[i] * scaleBase1024 / 1024) + diff;
476 uint32_t srcSize = nums[i] - diff;
477
478 if (dstAligned == nullptr) {
479 if (not isAlignedTo<V>(dst)) {
480 size_t dsti = reinterpret_cast<size_t>(dst);
481 uint32_t offset = (alignof(V) - dsti % alignof(V)) / sizeof(T);
482 offset = CAMath::Min<uint32_t>(offset, srcSize);
483 compressorMemcpyBasic(dst, src + srcOffset, offset, nLanes, iLane);
484 dst += offset;
485 srcPos += offset;
486 }
487 if (isAlignedTo<V>(dst)) {
488 dstAligned = reinterpret_cast<V*>(dst);
489 }
490 }
491 while (srcPos < srcSize) {
492 uint32_t shmElemsLeft = bufTSize - shmPos;
493 uint32_t srcElemsLeft = srcSize - srcPos;
494 uint32_t size = CAMath::Min(srcElemsLeft, shmElemsLeft);
495 compressorMemcpyBasic(bufT + shmPos, src + srcOffset + srcPos, size, nLanes, iLane);
496 srcPos += size;
497 shmPos += size;
499
500 if (shmPos >= bufTSize) {
501 compressorMemcpyBasic(dstAligned + dstOffset, buf, bufSize, nLanes, iLane);
502 dstOffset += bufSize;
503 shmPos = 0;
505 }
506 }
507 }
508
509 compressorMemcpyBasic(reinterpret_cast<T*>(dstAligned + dstOffset), bufT, shmPos, nLanes, iLane);
511}
512
513template <typename T>
514GPUdi() uint32_t GPUTPCCompressionGatherKernels::calculateWarpOffsets(GPUSharedMemory& smem, T* nums, uint32_t start, uint32_t end, int32_t nWarps, int32_t iWarp, int32_t nLanes, int32_t iLane)
515{
516 uint32_t blockOffset = 0;
517 int32_t iThread = nLanes * iWarp + iLane;
518 int32_t nThreads = nLanes * nWarps;
519 uint32_t blockStart = work_group_broadcast(start, 0);
520 for (uint32_t i = iThread; i < blockStart; i += nThreads) {
521 blockOffset += nums[i];
522 }
523 blockOffset = work_group_reduce_add(blockOffset);
524
525 uint32_t offset = 0;
526 for (uint32_t i = start + iLane; i < end; i += nLanes) {
527 offset += nums[i];
528 }
529 offset = work_group_scan_inclusive_add(offset);
530 if (iWarp > -1 && iLane == nLanes - 1) {
531 smem.warpOffset[iWarp] = offset;
532 }
533 GPUbarrier();
534 offset = (iWarp <= 0) ? 0 : smem.warpOffset[iWarp - 1];
535 GPUbarrier();
536
537 return offset + blockOffset;
538}
539
540template <>
541GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::unbuffered>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
542{
543 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
544 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
545
546 int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
547 int32_t iWarp = iThread / GPUCA_WARP_SIZE;
548
549 int32_t nLanes = GPUCA_WARP_SIZE;
550 int32_t iLane = iThread % GPUCA_WARP_SIZE;
551
552 if (iBlock == 0) {
553
554 uint32_t nRows = compressor.NSECTORS * GPUTPCGeometry::NROWS;
555 uint32_t rowsPerWarp = (nRows + nWarps - 1) / nWarps;
556 uint32_t rowStart = rowsPerWarp * iWarp;
557 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
558 if (rowStart >= nRows) {
559 rowStart = 0;
560 rowEnd = 0;
561 }
562
563 uint32_t rowsOffset = calculateWarpOffsets(smem, compressor.mPtrs.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
564
565 compressorMemcpy(compressor.mOutput->nSliceRowClusters, compressor.mPtrs.nSliceRowClusters, compressor.NSECTORS * GPUTPCGeometry::NROWS, nThreads, iThread);
566 compressorMemcpy(compressor.mOutput->nTrackClusters, compressor.mPtrs.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
567 compressorMemcpy(compressor.mOutput->qPtA, compressor.mPtrs.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
568 compressorMemcpy(compressor.mOutput->rowA, compressor.mPtrs.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
569 compressorMemcpy(compressor.mOutput->sliceA, compressor.mPtrs.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
570 compressorMemcpy(compressor.mOutput->timeA, compressor.mPtrs.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
571 compressorMemcpy(compressor.mOutput->padA, compressor.mPtrs.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
572
573 uint32_t sectorStart = rowStart / GPUTPCGeometry::NROWS;
574 uint32_t sectorEnd = rowEnd / GPUTPCGeometry::NROWS;
575
576 uint32_t sectorRowStart = rowStart % GPUTPCGeometry::NROWS;
577 uint32_t sectorRowEnd = rowEnd % GPUTPCGeometry::NROWS;
578
579 for (uint32_t i = sectorStart; i <= sectorEnd && i < compressor.NSECTORS; i++) {
580 for (uint32_t j = ((i == sectorStart) ? sectorRowStart : 0); j < ((i == sectorEnd) ? sectorRowEnd : GPUTPCGeometry::NROWS); j++) {
581 uint32_t nClusters = compressor.mPtrs.nSliceRowClusters[i * GPUTPCGeometry::NROWS + j];
582 uint32_t clusterOffsetInCache = clusters->clusterOffset[i][j] * compressor.mMaxClusterFactorBase1024 / 1024;
583 compressorMemcpy(compressor.mOutput->qTotU + rowsOffset, compressor.mPtrs.qTotU + clusterOffsetInCache, nClusters, nLanes, iLane);
584 compressorMemcpy(compressor.mOutput->qMaxU + rowsOffset, compressor.mPtrs.qMaxU + clusterOffsetInCache, nClusters, nLanes, iLane);
585 compressorMemcpy(compressor.mOutput->flagsU + rowsOffset, compressor.mPtrs.flagsU + clusterOffsetInCache, nClusters, nLanes, iLane);
586 compressorMemcpy(compressor.mOutput->padDiffU + rowsOffset, compressor.mPtrs.padDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
587 compressorMemcpy(compressor.mOutput->timeDiffU + rowsOffset, compressor.mPtrs.timeDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
588 compressorMemcpy(compressor.mOutput->sigmaPadU + rowsOffset, compressor.mPtrs.sigmaPadU + clusterOffsetInCache, nClusters, nLanes, iLane);
589 compressorMemcpy(compressor.mOutput->sigmaTimeU + rowsOffset, compressor.mPtrs.sigmaTimeU + clusterOffsetInCache, nClusters, nLanes, iLane);
590 rowsOffset += nClusters;
591 }
592 }
593 }
594
595 if (iBlock == 1) {
596 uint32_t tracksPerWarp = (compressor.mMemory->nStoredTracks + nWarps - 1) / nWarps;
597 uint32_t trackStart = tracksPerWarp * iWarp;
598 uint32_t trackEnd = CAMath::Min(compressor.mMemory->nStoredTracks, trackStart + tracksPerWarp);
599 if (trackStart >= compressor.mMemory->nStoredTracks) {
600 trackStart = 0;
601 trackEnd = 0;
602 }
603
604 uint32_t tracksOffset = calculateWarpOffsets(smem, compressor.mPtrs.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
605
606 for (uint32_t i = trackStart; i < trackEnd; i += nLanes) {
607 uint32_t nTrackClusters = 0;
608 uint32_t srcOffset = 0;
609
610 if (i + iLane < trackEnd) {
611 nTrackClusters = compressor.mPtrs.nTrackClusters[i + iLane];
612 srcOffset = compressor.mAttachedClusterFirstIndex[i + iLane];
613 }
614 smem.unbuffered.sizes[iWarp][iLane] = nTrackClusters;
615 smem.unbuffered.srcOffsets[iWarp][iLane] = srcOffset;
616
617 uint32_t elems = (i + nLanes < trackEnd) ? nLanes : (trackEnd - i);
618
619 for (uint32_t j = 0; j < elems; j++) {
620 nTrackClusters = smem.unbuffered.sizes[iWarp][j];
621 srcOffset = smem.unbuffered.srcOffsets[iWarp][j];
622 uint32_t idx = i + j;
623 compressorMemcpy(compressor.mOutput->qTotA + tracksOffset, compressor.mPtrs.qTotA + srcOffset, nTrackClusters, nLanes, iLane);
624 compressorMemcpy(compressor.mOutput->qMaxA + tracksOffset, compressor.mPtrs.qMaxA + srcOffset, nTrackClusters, nLanes, iLane);
625 compressorMemcpy(compressor.mOutput->flagsA + tracksOffset, compressor.mPtrs.flagsA + srcOffset, nTrackClusters, nLanes, iLane);
626 compressorMemcpy(compressor.mOutput->sigmaPadA + tracksOffset, compressor.mPtrs.sigmaPadA + srcOffset, nTrackClusters, nLanes, iLane);
627 compressorMemcpy(compressor.mOutput->sigmaTimeA + tracksOffset, compressor.mPtrs.sigmaTimeA + srcOffset, nTrackClusters, nLanes, iLane);
628
629 // First index stored with track
630 compressorMemcpy(compressor.mOutput->rowDiffA + tracksOffset - idx, compressor.mPtrs.rowDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
631 compressorMemcpy(compressor.mOutput->sliceLegDiffA + tracksOffset - idx, compressor.mPtrs.sliceLegDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
632 compressorMemcpy(compressor.mOutput->padResA + tracksOffset - idx, compressor.mPtrs.padResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
633 compressorMemcpy(compressor.mOutput->timeResA + tracksOffset - idx, compressor.mPtrs.timeResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
634
635 tracksOffset += nTrackClusters;
636 }
637 }
638 }
639}
640
641template <typename V>
642GPUdii() void GPUTPCCompressionGatherKernels::gatherBuffered(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
643{
644
645 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
646 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
647
648 int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
649 int32_t iWarp = iThread / GPUCA_WARP_SIZE;
650
651 int32_t nGlobalWarps = nWarps * nBlocks;
652 int32_t iGlobalWarp = nWarps * iBlock + iWarp;
653
654 int32_t nLanes = GPUCA_WARP_SIZE;
655 int32_t iLane = iThread % GPUCA_WARP_SIZE;
656
657 auto& input = compressor.mPtrs;
658 auto* output = compressor.mOutput;
659
660 uint32_t nRows = compressor.NSECTORS * GPUTPCGeometry::NROWS;
661 uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
662 uint32_t rowStart = rowsPerWarp * iGlobalWarp;
663 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
664 if (rowStart >= nRows) {
665 rowStart = 0;
666 rowEnd = 0;
667 }
668 rowsPerWarp = rowEnd - rowStart;
669
670 uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
671
672 uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
673 uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
674 uint32_t trackStart = tracksPerWarp * iGlobalWarp;
675 uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
676 if (trackStart >= nStoredTracks) {
677 trackStart = 0;
678 trackEnd = 0;
679 }
680 tracksPerWarp = trackEnd - trackStart;
681
682 uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
683
684 if (iBlock == 0) {
685 compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUTPCGeometry::NROWS, nThreads, iThread);
686 compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
687 compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
688 compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
689 compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
690 compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
691 compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
692 }
693
694 const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
695 const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;
696
697 auto* buf = smem.getBuffer<V>(iWarp);
698
699 compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
700 compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
701 compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
702 compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
703 compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
704 compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
705 compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
706
707 const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
708 const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
709
710 compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
711 compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
712 compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
713 compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
714 compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
715
716 // First index stored with track
717 uint32_t tracksOffsetDiff = tracksOffset - trackStart;
718 compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
719 compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
720 compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
721 compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
722}
723
724GPUdii() void GPUTPCCompressionGatherKernels::gatherMulti(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
725{
726 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
727 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
728 const auto& input = compressor.mPtrs;
729 auto* output = compressor.mOutput;
730
731 const int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
732 const int32_t iWarp = iThread / GPUCA_WARP_SIZE;
733 const int32_t nLanes = GPUCA_WARP_SIZE;
734 const int32_t iLane = iThread % GPUCA_WARP_SIZE;
735 auto* buf = smem.getBuffer<Vec128>(iWarp);
736
737 if (iBlock == 0) {
738 compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUTPCGeometry::NROWS, nThreads, iThread);
739 compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
740 compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
741 compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
742 compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
743 compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
744 compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
745 } else if (iBlock & 1) {
746 const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
747 const uint32_t iGlobalWarp = nWarps * (iBlock - 1) / 2 + iWarp;
748
749 const uint32_t nRows = compressor.NSECTORS * GPUTPCGeometry::NROWS;
750 uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
751 uint32_t rowStart = rowsPerWarp * iGlobalWarp;
752 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
753 if (rowStart >= nRows) {
754 rowStart = 0;
755 rowEnd = 0;
756 }
757 rowsPerWarp = rowEnd - rowStart;
758
759 const uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
760 const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
761 const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;
762
763 compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
764 compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
765 compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
766 compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
767 compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
768 compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
769 compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
770 } else {
771 const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
772 const uint32_t iGlobalWarp = nWarps * (iBlock / 2 - 1) + iWarp;
773
774 const uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
775 uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
776 uint32_t trackStart = tracksPerWarp * iGlobalWarp;
777 uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
778 if (trackStart >= nStoredTracks) {
779 trackStart = 0;
780 trackEnd = 0;
781 }
782 tracksPerWarp = trackEnd - trackStart;
783
784 const uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
785 const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
786 const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
787
788 compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
789 compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
790 compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
791 compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
792 compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
793
794 // First index stored with track
795 uint32_t tracksOffsetDiff = tracksOffset - trackStart;
796 compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
797 compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
798 compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
799 compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
800 }
801}
802
803template <>
804GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered32>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
805{
806 gatherBuffered<Vec32>(nBlocks, nThreads, iBlock, iThread, smem, processors);
807}
808
809template <>
810GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered64>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
811{
812 gatherBuffered<Vec64>(nBlocks, nThreads, iBlock, iThread, smem, processors);
813}
814
815template <>
816GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered128>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
817{
818 gatherBuffered<Vec128>(nBlocks, nThreads, iBlock, iThread, smem, processors);
819}
820
821template <>
822GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::multiBlock>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
823{
824 gatherMulti(nBlocks, nThreads, iBlock, iThread, smem, processors);
825}
int16_t time
Definition RawEventData.h:4
int32_t i
#define get_local_size(dim)
#define get_local_id(dim)
#define GPUsharedref()
#define GPUbarrierWarp()
#define get_global_size(dim)
#define GPUbarrier()
#define GPUrestrict()
#define get_global_id(dim)
#define GPUCA_GET_THREAD_COUNT(...)
uint32_t iSector
int32_t lastRow
GPUdii() void GPUTPCCompressionKernels
void output(const std::map< std::string, ChannelStat > &channels)
Definition rawdump.cxx:197
uint32_t j
Definition RawData.h:0
uint32_t c
Definition RawData.h:2
TBranch * ptr
int nClusters
o2::tpc::CompressedClusters * mOutput
static constexpr uint32_t NSECTORS
o2::tpc::CompressedClustersPtrs mPtrs
static constexpr uint32_t NROWS
static constexpr uint32_t NSECTORS
GLint GLenum GLint x
Definition glcorearb.h:403
GLenum src
Definition glcorearb.h:1767
GLint GLsizei count
Definition glcorearb.h:399
GLsizeiptr size
Definition glcorearb.h:659
GLuint GLuint end
Definition glcorearb.h:469
GLuint GLsizei bufSize
Definition glcorearb.h:790
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLenum GLenum dst
Definition glcorearb.h:1767
GLintptr offset
Definition glcorearb.h:660
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLuint start
Definition glcorearb.h:469
GLenum GLfloat param
Definition glcorearb.h:271
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition glcorearb.h:2514
GLuint id
Definition glcorearb.h:650
GLdouble GLdouble GLdouble z
Definition glcorearb.h:843
Global TPC definitions and constants.
Definition SimTraits.h:168
GPUd() void PIDResponse
Definition PIDResponse.h:71
GPUdi() T BetheBlochAleph(T bg
constexpr std::array< int, nLayers > nRows
Definition Specs.h:59
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
GPUReconstruction * rec
const o2::tpc::ClusterNativeAccess * clustersNative
const GPUTPCGMMergedTrackHit * mergedTrackHits
const GPUTPCGMMergedTrack * mergedTracks
std::vector< std::byte > getBuffer(const char *filename)
std::vector< Cluster > clusters
for(int irof=0;irof< 1000;irof++)
std::vector< int > row