GPUTPCCompressionKernels.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
16#include "GPUConstantMem.h"
17#include "GPUO2DataTypes.h"
18#include "GPUParam.h"
19#include "GPUCommonAlgorithm.h"
22#include "GPUTPCCompressionKernels.inc"
23
24using namespace o2::gpu;
25using namespace o2::tpc;
26
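// step0attached: loop over the merged tracks; each cluster whose main attachment is the current track is either flagged as rejected (not stored at all) or, when track-model compression is enabled, encoded relative to a propagated track model and flagged so that step1unattached skips it.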
27template <>
28GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0attached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
29{
30 const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
32 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
33 const GPUParam& GPUrestrict() param = processors.param;
34
35 int32_t myTrack = 0;
36 for (uint32_t i = get_global_id(0); i < ioPtrs.nMergedTracks; i += get_global_size(0)) {
38 const GPUTPCGMMergedTrack& GPUrestrict() trk = ioPtrs.mergedTracks[i];
39 if (!trk.OK()) {
40 continue;
41 }
42 bool rejectTrk = CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper();
43 uint32_t nClustersStored = 0;
45 uint8_t lastRow = 0, lastSector = 0;
47 float zOffset = 0;
48 for (int32_t k = trk.NClusters() - 1; k >= 0; k--) {
49 const GPUTPCGMMergedTrackHit& GPUrestrict() hit = ioPtrs.mergedTrackHits[trk.FirstClusterRef() + k];
51 continue;
52 }
53
54 int32_t hitId = hit.num;
55 int32_t attach = ioPtrs.mergedTrackHitAttachment[hitId];
56 if ((attach & gputpcgmmergertypes::attachTrackMask) != i) {
57 continue; // Main attachment to different track
58 }
59 bool rejectCluster = processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA && (rejectTrk || GPUTPCClusterRejection::GetIsRejected(attach));
60 if (rejectCluster) {
61 compressor.mClusterStatus[hitId] = 1; // Cluster rejected, do not store
62 continue;
63 }
64
65 if (!(param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) {
66 continue; // No track model compression
67 }
68 const ClusterNative& GPUrestrict() orgCl = clusters->clusters[hit.sector][hit.row][hit.num - clusters->clusterOffset[hit.sector][hit.row]];
69 constexpr GPUTPCGeometry geo;
70 float x = geo.Row2X(hit.row);
71 float y = track.LinearPad2Y(hit.sector, orgCl.getPad(), geo.PadWidth(hit.row), geo.NPads(hit.row));
72 float z = geo.LinearTime2Z(hit.sector, orgCl.getTime());
73 if (nClustersStored) {
74 if ((hit.sector < GPUCA_NSECTORS) ^ (lastSector < GPUCA_NSECTORS)) {
75 break;
76 }
77 if (track.Propagate(geo.Row2X(hit.row), param.SectorParam[hit.sector].Alpha)) {
78 break;
79 }
80 }
81
82 compressor.mClusterStatus[hitId] = 1; // Cluster compressed in track model, do not store as difference
83
84 int32_t cidx = trk.FirstClusterRef() + nClustersStored++;
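      // The first stored cluster of a track keeps absolute coordinates (quantized q/pt, row, sector, packed time and pad); subsequent clusters store row/sector differences and pad/time residuals with respect to the propagated track model.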
85 if (nClustersStored == 1) {
86 uint8_t qpt = fabs(trk.GetParam().GetQPt()) < 20.f ? (trk.GetParam().GetQPt() * (127.f / 20.f) + 127.5f) : (trk.GetParam().GetQPt() > 0 ? 254 : 0);
87 zOffset = z;
88 track.Init(x, y, z - zOffset, param.SectorParam[hit.sector].Alpha, qpt, param);
89
90 myTrack = CAMath::AtomicAdd(&compressor.mMemory->nStoredTracks, 1u);
91 compressor.mAttachedClusterFirstIndex[myTrack] = trk.FirstClusterRef();
92 c.qPtA[myTrack] = qpt;
93 c.rowA[myTrack] = hit.row;
94 c.sliceA[myTrack] = hit.sector;
95 c.timeA[myTrack] = orgCl.getTimePacked();
96 c.padA[myTrack] = orgCl.padPacked;
97 } else {
98 uint32_t row = hit.row;
99 uint32_t sector = hit.sector;
100
101 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
102 if (lastRow > row) {
104 }
105 row -= lastRow;
106 if (lastSector > sector) {
107 sector += compressor.NSECTORS;
108 }
109 sector -= lastSector;
110 }
111 c.rowDiffA[cidx] = row;
112 c.sliceLegDiffA[cidx] = sector;
113 float pad = CAMath::Max(0.f, CAMath::Min((float)geo.NPads(GPUCA_ROW_COUNT - 1), track.LinearY2Pad(hit.sector, track.Y(), geo.PadWidth(hit.row), geo.NPads(hit.row))));
114 c.padResA[cidx] = orgCl.padPacked - orgCl.packPad(pad);
115 float time = CAMath::Max(0.f, geo.LinearZ2Time(hit.sector, track.Z() + zOffset));
116 c.timeResA[cidx] = (orgCl.getTimePacked() - orgCl.packTime(time)) & 0xFFFFFF;
117 }
118 uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
119 uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
120 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
121 compressor.truncateSignificantBitsChargeMax(qmax, param);
122 compressor.truncateSignificantBitsCharge(qtot, param);
123 compressor.truncateSignificantBitsWidth(sigmapad, param);
124 compressor.truncateSignificantBitsWidth(sigmatime, param);
125 }
126 c.qTotA[cidx] = qtot;
127 c.qMaxA[cidx] = qmax;
128 c.sigmaPadA[cidx] = sigmapad;
129 c.sigmaTimeA[cidx] = sigmatime;
130 c.flagsA[cidx] = orgCl.getFlags();
131 if (k && track.Filter(y, z - zOffset, hit.row)) {
132 break;
133 }
134 lastRow = hit.row;
135 lastSector = hit.sector;
136 }
137 if (nClustersStored) {
138 CAMath::AtomicAdd(&compressor.mMemory->nStoredAttachedClusters, nClustersStored);
139 c.nTrackClusters[myTrack] = nClustersStored;
140 }
141 }
142}
143
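// Comparison functors selecting the in-block sort order of unattached clusters: by packed time, by packed pad, or by a coarse time/pad key (three lowest bits ignored) with the other coordinate as tie-breaker; the specialization for 4 gives a fully deterministic ordering on time, pad and total charge.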
144template <>
145GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>::operator()(uint32_t a, uint32_t b) const
146{
147 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
148}
149
150template <>
151GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>::operator()(uint32_t a, uint32_t b) const
152{
153 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
154}
155
156template <>
157GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>::operator()(uint32_t a, uint32_t b) const
158{
159 if (mClsPtr[a].getTimePacked() >> 3 == mClsPtr[b].getTimePacked() >> 3) {
160 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
161 }
162 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
163}
164
165template <>
166GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>::operator()(uint32_t a, uint32_t b) const
167{
168 if (mClsPtr[a].padPacked >> 3 == mClsPtr[b].padPacked >> 3) {
169 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
170 }
171 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
172}
173
174template <> // Deterministic comparison
175GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::operator()(uint32_t a, uint32_t b) const
176{
177 if (mClsPtr[a].getTimePacked() != mClsPtr[b].getTimePacked()) {
178 return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
179 }
180 if (mClsPtr[a].padPacked != mClsPtr[b].padPacked) {
181 return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
182 }
183 return mClsPtr[a].qTot < mClsPtr[b].qTot;
184}
185
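// step1unattached: one sector row per block; indices of clusters that survive the rejection checks are collected into the shared-memory sort buffer in chunks of GPUCA_TPC_COMP_CHUNK_SIZE, optionally sorted according to the configured sort order, and encoded as differences to the preceding cluster, with optional charge/width bit truncation.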
186template <>
187GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
188{
189 const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
190 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;
191 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
192 GPUParam& GPUrestrict() param = processors.param;
193 uint32_t* sortBuffer = smem.sortBuffer;
194 for (int32_t iSectorRow = iBlock; iSectorRow < GPUCA_NSECTORS * GPUCA_ROW_COUNT; iSectorRow += nBlocks) {
195 const uint32_t iSector = iSectorRow / GPUCA_ROW_COUNT;
196 const uint32_t iRow = iSectorRow % GPUCA_ROW_COUNT;
197 const uint32_t idOffset = clusters->clusterOffset[iSector][iRow];
198 const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024; // 32 bit enough for number of clusters per row * 1024
199 const uint32_t idOffsetOutMax = ((const uint32_t*)clusters->clusterOffset[iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024; // Array out of bounds access is ok, since it goes to the correct nClustersTotal
200 if (iThread == nThreads - 1) {
201 smem.nCount = 0;
202 }
203 uint32_t totalCount = 0;
204 GPUbarrier();
205
206 CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;
207
208 const uint32_t nn = CAMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);
209 for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {
210 const int32_t idx = idOffset + i;
211 int32_t storeCluster = 0;
212 do {
213 if (i >= clusters->nClusters[iSector][iRow]) {
214 break;
215 }
216 if (compressor.mClusterStatus[idx]) {
217 break;
218 }
219 int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];
220 bool unattached = attach == 0;
221
222 if (unattached) {
223 if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {
224 break;
225 }
226 } else if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {
228 break;
229 }
230 int32_t id = attach & gputpcgmmergertypes::attachTrackMask;
231 auto& trk = ioPtrs.mergedTracks[id];
232 if (CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) {
233 break;
234 }
235 }
236 storeCluster = 1;
237 } while (false);
238
239 GPUbarrier();
240 int32_t myIndex = work_group_scan_inclusive_add(storeCluster);
241 int32_t storeLater = -1;
242 if (storeCluster) {
243 if (smem.nCount + myIndex <= GPUCA_TPC_COMP_CHUNK_SIZE) {
244 sortBuffer[smem.nCount + myIndex - 1] = i;
245 } else {
246 storeLater = smem.nCount + myIndex - 1 - GPUCA_TPC_COMP_CHUNK_SIZE;
247 }
248 }
249 GPUbarrier();
250 if (iThread == nThreads - 1) {
251 smem.nCount += myIndex;
252 }
253 GPUbarrier();
254
255 if (smem.nCount < GPUCA_TPC_COMP_CHUNK_SIZE && i < nn) {
256 continue;
257 }
258
259 uint32_t count = CAMath::Min(smem.nCount, (uint32_t)GPUCA_TPC_COMP_CHUNK_SIZE);
260 if (idOffsetOut + totalCount + count > idOffsetOutMax) {
261 if (iThread == nThreads - 1) {
262 compressor.raiseError(GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW, iSector * 1000 + iRow, idOffsetOut + totalCount + count, idOffsetOutMax);
263 }
264 break;
265 }
266 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
267#ifdef GPUCA_GPUCODE
268 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE);
269#endif
270#ifdef GPUCA_DETERMINISTIC_MODE // Not using GPUCA_DETERMINISTIC_CODE, which is enforced in TPC compression
271 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
272#else // GPUCA_DETERMINISTIC_MODE
273 if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
274 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
275 } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
276 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
277 } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
278 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
279 } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
280 CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
281 }
282#endif // GPUCA_DETERMINISTIC_MODE
283 GPUbarrier();
284 }
285
286 for (uint32_t j = get_local_id(0); j < count; j += get_local_size(0)) {
287 int32_t outidx = idOffsetOut + totalCount + j;
288 const ClusterNative& GPUrestrict() orgCl = clusters->clusters[iSector][iRow][sortBuffer[j]];
289
290 int32_t preId = j != 0 ? (int32_t)sortBuffer[j - 1] : (totalCount != 0 ? (int32_t)smem.lastIndex : -1);
291 GPUTPCCompression_EncodeUnattached(param.rec.tpc.compressionTypeMask, orgCl, c.timeDiffU[outidx], c.padDiffU[outidx], preId == -1 ? nullptr : &clusters->clusters[iSector][iRow][preId]);
292
293 uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
294 uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
295 if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
296 compressor.truncateSignificantBitsChargeMax(qmax, param);
297 compressor.truncateSignificantBitsCharge(qtot, param);
298 compressor.truncateSignificantBitsWidth(sigmapad, param);
299 compressor.truncateSignificantBitsWidth(sigmatime, param);
300 }
301 c.qTotU[outidx] = qtot;
302 c.qMaxU[outidx] = qmax;
303 c.sigmaPadU[outidx] = sigmapad;
304 c.sigmaTimeU[outidx] = sigmatime;
305 c.flagsU[outidx] = orgCl.getFlags();
306 }
307
308 GPUbarrier();
309 if (storeLater >= 0) {
310 sortBuffer[storeLater] = i;
311 }
312 totalCount += count;
313 if (iThread == nThreads - 1 && count) {
314 smem.lastIndex = sortBuffer[count - 1];
315 smem.nCount -= count;
316 }
317 }
318
319 if (iThread == nThreads - 1) {
320 c.nSliceRowClusters[iSector * GPUCA_ROW_COUNT + iRow] = totalCount;
321 CAMath::AtomicAdd(&compressor.mMemory->nStoredUnattachedClusters, totalCount);
322 }
323 GPUbarrier();
324 }
325}
326
327template <>
329{
330 return buf32[iWarp];
331}
332
333template <>
335{
336 return buf64[iWarp];
337}
338
339template <>
341{
342 return buf128[iWarp];
343}
344
345template <typename T, typename S>
346GPUdi() bool GPUTPCCompressionGatherKernels::isAlignedTo(const S* ptr)
347{
348 if constexpr (alignof(S) >= alignof(T)) {
349 static_cast<void>(ptr);
350 return true;
351 } else {
352 return reinterpret_cast<size_t>(ptr) % alignof(T) == 0;
353 }
354}
355
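// compressorMemcpy picks the widest vectorised copy for which every thread gets at least one full vector and falls back to the element-wise strided copy otherwise.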
356template <>
357GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint8_t>(uint8_t* GPUrestrict() dst, const uint8_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
358{
359 constexpr const int32_t vec128Elems = CpyVector<uint8_t, Vec128>::Size;
360 constexpr const int32_t vec64Elems = CpyVector<uint8_t, Vec64>::Size;
361 constexpr const int32_t vec32Elems = CpyVector<uint8_t, Vec32>::Size;
362 constexpr const int32_t vec16Elems = CpyVector<uint8_t, Vec16>::Size;
363
364 if (size >= uint32_t(nThreads * vec128Elems)) {
365 compressorMemcpyVectorised<uint8_t, Vec128>(dst, src, size, nThreads, iThread);
366 } else if (size >= uint32_t(nThreads * vec64Elems)) {
367 compressorMemcpyVectorised<uint8_t, Vec64>(dst, src, size, nThreads, iThread);
368 } else if (size >= uint32_t(nThreads * vec32Elems)) {
369 compressorMemcpyVectorised<uint8_t, Vec32>(dst, src, size, nThreads, iThread);
370 } else if (size >= uint32_t(nThreads * vec16Elems)) {
371 compressorMemcpyVectorised<uint8_t, Vec16>(dst, src, size, nThreads, iThread);
372 } else {
373 compressorMemcpyBasic(dst, src, size, nThreads, iThread);
374 }
375}
376
377template <>
378GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint16_t>(uint16_t* GPUrestrict() dst, const uint16_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
379{
380 constexpr const int32_t vec128Elems = CpyVector<uint16_t, Vec128>::Size;
381 constexpr const int32_t vec64Elems = CpyVector<uint16_t, Vec64>::Size;
382 constexpr const int32_t vec32Elems = CpyVector<uint16_t, Vec32>::Size;
383
384 if (size >= uint32_t(nThreads * vec128Elems)) {
385 compressorMemcpyVectorised<uint16_t, Vec128>(dst, src, size, nThreads, iThread);
386 } else if (size >= uint32_t(nThreads * vec64Elems)) {
387 compressorMemcpyVectorised<uint16_t, Vec64>(dst, src, size, nThreads, iThread);
388 } else if (size >= uint32_t(nThreads * vec32Elems)) {
389 compressorMemcpyVectorised<uint16_t, Vec32>(dst, src, size, nThreads, iThread);
390 } else {
391 compressorMemcpyBasic(dst, src, size, nThreads, iThread);
392 }
393}
394
395template <>
396GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint32_t>(uint32_t* GPUrestrict() dst, const uint32_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
397{
398 constexpr const int32_t vec128Elems = CpyVector<uint32_t, Vec128>::Size;
399 constexpr const int32_t vec64Elems = CpyVector<uint32_t, Vec64>::Size;
400
401 if (size >= uint32_t(nThreads * vec128Elems)) {
402 compressorMemcpyVectorised<uint32_t, Vec128>(dst, src, size, nThreads, iThread);
403 } else if (size >= uint32_t(nThreads * vec64Elems)) {
404 compressorMemcpyVectorised<uint32_t, Vec64>(dst, src, size, nThreads, iThread);
405 } else {
406 compressorMemcpyBasic(dst, src, size, nThreads, iThread);
407 }
408}
409
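// compressorMemcpyVectorised aligns the destination with an element-wise copy, then copies whole vectors (gathering element by element if the source is unaligned), and finishes the remainder element-wise.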
410template <typename Scalar, typename BaseVector>
411GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyVectorised(Scalar* dst, const Scalar* src, uint32_t size, int32_t nThreads, int32_t iThread)
412{
413 if (not isAlignedTo<BaseVector>(dst)) {
414 size_t dsti = reinterpret_cast<size_t>(dst);
415 int32_t offset = (alignof(BaseVector) - dsti % alignof(BaseVector)) / sizeof(Scalar);
416 compressorMemcpyBasic(dst, src, offset, nThreads, iThread);
417 src += offset;
418 dst += offset;
419 size -= offset;
420 }
421
422 BaseVector* GPUrestrict() dstAligned = reinterpret_cast<BaseVector*>(dst);
423
424 using CpyVec = CpyVector<Scalar, BaseVector>;
425 uint32_t sizeAligned = size / CpyVec::Size;
426
427 if (isAlignedTo<BaseVector>(src)) {
428 const BaseVector* GPUrestrict() srcAligned = reinterpret_cast<const BaseVector*>(src);
429 compressorMemcpyBasic(dstAligned, srcAligned, sizeAligned, nThreads, iThread);
430 } else {
431 for (uint32_t i = iThread; i < sizeAligned; i += nThreads) {
432 CpyVec buf;
433 for (uint32_t j = 0; j < CpyVec::Size; j++) {
434 buf.elems[j] = src[i * CpyVec::Size + j];
435 }
436 dstAligned[i] = buf.all;
437 }
438 }
439
440 int32_t leftovers = size % CpyVec::Size;
441 compressorMemcpyBasic(dst + size - leftovers, src + size - leftovers, leftovers, nThreads, iThread);
442}
443
444template <typename T>
445GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBasic(T* GPUrestrict() dst, const T* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread, int32_t nBlocks, int32_t iBlock)
446{
447 uint32_t start = (size + nBlocks - 1) / nBlocks * iBlock + iThread;
448 uint32_t end = CAMath::Min(size, (size + nBlocks - 1) / nBlocks * (iBlock + 1));
449 for (uint32_t i = start; i < end; i += nThreads) {
450 dst[i] = src[i];
451 }
452}
453
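// compressorMemcpyBuffered stages the scattered per-row / per-track source ranges through a per-warp shared-memory buffer so that the destination is written in aligned vector-sized chunks; diff skips the leading element of each range and scaleBase1024 rescales the source offsets (offset * scaleBase1024 / 1024).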
454template <typename V, typename T, typename S>
455GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict() dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const uint32_t* GPUrestrict() srcOffsets, uint32_t nEntries, int32_t nLanes, int32_t iLane, int32_t diff, size_t scaleBase1024)
456{
457 int32_t shmPos = 0;
458 uint32_t dstOffset = 0;
459 V* GPUrestrict() dstAligned = nullptr;
460
461 T* bufT = reinterpret_cast<T*>(buf);
462 constexpr const int32_t bufSize = GPUCA_WARP_SIZE;
463 constexpr const int32_t bufTSize = bufSize * sizeof(V) / sizeof(T);
464
465 for (uint32_t i = 0; i < nEntries; i++) {
466 uint32_t srcPos = 0;
467 uint32_t srcOffset = (srcOffsets[i] * scaleBase1024 / 1024) + diff;
468 uint32_t srcSize = nums[i] - diff;
469
470 if (dstAligned == nullptr) {
471 if (not isAlignedTo<V>(dst)) {
472 size_t dsti = reinterpret_cast<size_t>(dst);
473 uint32_t offset = (alignof(V) - dsti % alignof(V)) / sizeof(T);
474 offset = CAMath::Min<uint32_t>(offset, srcSize);
475 compressorMemcpyBasic(dst, src + srcOffset, offset, nLanes, iLane);
476 dst += offset;
477 srcPos += offset;
478 }
479 if (isAlignedTo<V>(dst)) {
480 dstAligned = reinterpret_cast<V*>(dst);
481 }
482 }
483 while (srcPos < srcSize) {
484 uint32_t shmElemsLeft = bufTSize - shmPos;
485 uint32_t srcElemsLeft = srcSize - srcPos;
486 uint32_t size = CAMath::Min(srcElemsLeft, shmElemsLeft);
487 compressorMemcpyBasic(bufT + shmPos, src + srcOffset + srcPos, size, nLanes, iLane);
488 srcPos += size;
489 shmPos += size;
491
492 if (shmPos >= bufTSize) {
493 compressorMemcpyBasic(dstAligned + dstOffset, buf, bufSize, nLanes, iLane);
494 dstOffset += bufSize;
495 shmPos = 0;
497 }
498 }
499 }
500
501 compressorMemcpyBasic(reinterpret_cast<T*>(dstAligned + dstOffset), bufT, shmPos, nLanes, iLane);
503}
504
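// calculateWarpOffsets returns the output offset of this warp's [start, end) range: the sum of all counts preceding the first entry handled by the block plus the counts assigned to the lower-numbered warps of the block, accumulated with an inclusive scan and exchanged through smem.warpOffset.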
505template <typename T>
506GPUdi() uint32_t GPUTPCCompressionGatherKernels::calculateWarpOffsets(GPUSharedMemory& smem, T* nums, uint32_t start, uint32_t end, int32_t nWarps, int32_t iWarp, int32_t nLanes, int32_t iLane)
507{
508 uint32_t blockOffset = 0;
509 int32_t iThread = nLanes * iWarp + iLane;
510 int32_t nThreads = nLanes * nWarps;
511 uint32_t blockStart = work_group_broadcast(start, 0);
512 for (uint32_t i = iThread; i < blockStart; i += nThreads) {
513 blockOffset += nums[i];
514 }
515 blockOffset = work_group_reduce_add(blockOffset);
516
517 uint32_t offset = 0;
518 for (uint32_t i = start + iLane; i < end; i += nLanes) {
519 offset += nums[i];
520 }
521 offset = work_group_scan_inclusive_add(offset);
522 if (iWarp > -1 && iLane == nLanes - 1) {
523 smem.warpOffset[iWarp] = offset;
524 }
525 GPUbarrier();
526 offset = (iWarp <= 0) ? 0 : smem.warpOffset[iWarp - 1];
527 GPUbarrier();
528
529 return offset + blockOffset;
530}
531
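// Gather kernels copy the compressed arrays from the intermediate buffers into the contiguous output. The unbuffered variant works without shared-memory staging: block 0 copies the header arrays and the unattached cluster data (rows split over its warps), block 1 copies the attached cluster data (tracks split over its warps).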
532template <>
533GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::unbuffered>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
534{
535 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
536 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
537
538 int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
539 int32_t iWarp = iThread / GPUCA_WARP_SIZE;
540
541 int32_t nLanes = GPUCA_WARP_SIZE;
542 int32_t iLane = iThread % GPUCA_WARP_SIZE;
543
544 if (iBlock == 0) {
545
546 uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
547 uint32_t rowsPerWarp = (nRows + nWarps - 1) / nWarps;
548 uint32_t rowStart = rowsPerWarp * iWarp;
549 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
550 if (rowStart >= nRows) {
551 rowStart = 0;
552 rowEnd = 0;
553 }
554
555 uint32_t rowsOffset = calculateWarpOffsets(smem, compressor.mPtrs.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
556
557 compressorMemcpy(compressor.mOutput->nSliceRowClusters, compressor.mPtrs.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
558 compressorMemcpy(compressor.mOutput->nTrackClusters, compressor.mPtrs.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
559 compressorMemcpy(compressor.mOutput->qPtA, compressor.mPtrs.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
560 compressorMemcpy(compressor.mOutput->rowA, compressor.mPtrs.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
561 compressorMemcpy(compressor.mOutput->sliceA, compressor.mPtrs.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
562 compressorMemcpy(compressor.mOutput->timeA, compressor.mPtrs.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
563 compressorMemcpy(compressor.mOutput->padA, compressor.mPtrs.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
564
565 uint32_t sectorStart = rowStart / GPUCA_ROW_COUNT;
566 uint32_t sectorEnd = rowEnd / GPUCA_ROW_COUNT;
567
568 uint32_t sectorRowStart = rowStart % GPUCA_ROW_COUNT;
569 uint32_t sectorRowEnd = rowEnd % GPUCA_ROW_COUNT;
570
571 for (uint32_t i = sectorStart; i <= sectorEnd && i < compressor.NSECTORS; i++) {
572 for (uint32_t j = ((i == sectorStart) ? sectorRowStart : 0); j < ((i == sectorEnd) ? sectorRowEnd : GPUCA_ROW_COUNT); j++) {
573 uint32_t nClusters = compressor.mPtrs.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
574 uint32_t clusterOffsetInCache = clusters->clusterOffset[i][j] * compressor.mMaxClusterFactorBase1024 / 1024;
575 compressorMemcpy(compressor.mOutput->qTotU + rowsOffset, compressor.mPtrs.qTotU + clusterOffsetInCache, nClusters, nLanes, iLane);
576 compressorMemcpy(compressor.mOutput->qMaxU + rowsOffset, compressor.mPtrs.qMaxU + clusterOffsetInCache, nClusters, nLanes, iLane);
577 compressorMemcpy(compressor.mOutput->flagsU + rowsOffset, compressor.mPtrs.flagsU + clusterOffsetInCache, nClusters, nLanes, iLane);
578 compressorMemcpy(compressor.mOutput->padDiffU + rowsOffset, compressor.mPtrs.padDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
579 compressorMemcpy(compressor.mOutput->timeDiffU + rowsOffset, compressor.mPtrs.timeDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
580 compressorMemcpy(compressor.mOutput->sigmaPadU + rowsOffset, compressor.mPtrs.sigmaPadU + clusterOffsetInCache, nClusters, nLanes, iLane);
581 compressorMemcpy(compressor.mOutput->sigmaTimeU + rowsOffset, compressor.mPtrs.sigmaTimeU + clusterOffsetInCache, nClusters, nLanes, iLane);
582 rowsOffset += nClusters;
583 }
584 }
585 }
586
587 if (iBlock == 1) {
588 uint32_t tracksPerWarp = (compressor.mMemory->nStoredTracks + nWarps - 1) / nWarps;
589 uint32_t trackStart = tracksPerWarp * iWarp;
590 uint32_t trackEnd = CAMath::Min(compressor.mMemory->nStoredTracks, trackStart + tracksPerWarp);
591 if (trackStart >= compressor.mMemory->nStoredTracks) {
592 trackStart = 0;
593 trackEnd = 0;
594 }
595
596 uint32_t tracksOffset = calculateWarpOffsets(smem, compressor.mPtrs.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
597
598 for (uint32_t i = trackStart; i < trackEnd; i += nLanes) {
599 uint32_t nTrackClusters = 0;
600 uint32_t srcOffset = 0;
601
602 if (i + iLane < trackEnd) {
603 nTrackClusters = compressor.mPtrs.nTrackClusters[i + iLane];
604 srcOffset = compressor.mAttachedClusterFirstIndex[i + iLane];
605 }
606 smem.unbuffered.sizes[iWarp][iLane] = nTrackClusters;
607 smem.unbuffered.srcOffsets[iWarp][iLane] = srcOffset;
608
609 uint32_t elems = (i + nLanes < trackEnd) ? nLanes : (trackEnd - i);
610
611 for (uint32_t j = 0; j < elems; j++) {
612 nTrackClusters = smem.unbuffered.sizes[iWarp][j];
613 srcOffset = smem.unbuffered.srcOffsets[iWarp][j];
614 uint32_t idx = i + j;
615 compressorMemcpy(compressor.mOutput->qTotA + tracksOffset, compressor.mPtrs.qTotA + srcOffset, nTrackClusters, nLanes, iLane);
616 compressorMemcpy(compressor.mOutput->qMaxA + tracksOffset, compressor.mPtrs.qMaxA + srcOffset, nTrackClusters, nLanes, iLane);
617 compressorMemcpy(compressor.mOutput->flagsA + tracksOffset, compressor.mPtrs.flagsA + srcOffset, nTrackClusters, nLanes, iLane);
618 compressorMemcpy(compressor.mOutput->sigmaPadA + tracksOffset, compressor.mPtrs.sigmaPadA + srcOffset, nTrackClusters, nLanes, iLane);
619 compressorMemcpy(compressor.mOutput->sigmaTimeA + tracksOffset, compressor.mPtrs.sigmaTimeA + srcOffset, nTrackClusters, nLanes, iLane);
620
621 // First index stored with track
622 compressorMemcpy(compressor.mOutput->rowDiffA + tracksOffset - idx, compressor.mPtrs.rowDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
623 compressorMemcpy(compressor.mOutput->sliceLegDiffA + tracksOffset - idx, compressor.mPtrs.sliceLegDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
624 compressorMemcpy(compressor.mOutput->padResA + tracksOffset - idx, compressor.mPtrs.padResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
625 compressorMemcpy(compressor.mOutput->timeResA + tracksOffset - idx, compressor.mPtrs.timeResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
626
627 tracksOffset += nTrackClusters;
628 }
629 }
630 }
631}
632
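// gatherBuffered distributes sector rows and tracks over the warps of all blocks and stages the copies through a per-warp shared-memory buffer of the chosen vector width (Vec32, Vec64 or Vec128); block 0 additionally copies the per-row and per-track header arrays.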
633template <typename V>
634GPUdii() void GPUTPCCompressionGatherKernels::gatherBuffered(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
635{
636
637 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
638 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
639
640 int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
641 int32_t iWarp = iThread / GPUCA_WARP_SIZE;
642
643 int32_t nGlobalWarps = nWarps * nBlocks;
644 int32_t iGlobalWarp = nWarps * iBlock + iWarp;
645
646 int32_t nLanes = GPUCA_WARP_SIZE;
647 int32_t iLane = iThread % GPUCA_WARP_SIZE;
648
649 auto& input = compressor.mPtrs;
650 auto* output = compressor.mOutput;
651
652 uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
653 uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
654 uint32_t rowStart = rowsPerWarp * iGlobalWarp;
655 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
656 if (rowStart >= nRows) {
657 rowStart = 0;
658 rowEnd = 0;
659 }
660 rowsPerWarp = rowEnd - rowStart;
661
662 uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
663
664 uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
665 uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
666 uint32_t trackStart = tracksPerWarp * iGlobalWarp;
667 uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
668 if (trackStart >= nStoredTracks) {
669 trackStart = 0;
670 trackEnd = 0;
671 }
672 tracksPerWarp = trackEnd - trackStart;
673
674 uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
675
676 if (iBlock == 0) {
677 compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
678 compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
679 compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
680 compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
681 compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
682 compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
683 compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
684 }
685
686 const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
687 const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;
688
689 auto* buf = smem.getBuffer<V>(iWarp);
690
691 compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
692 compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
693 compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
694 compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
695 compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
696 compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
697 compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
698
699 const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
700 const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
701
702 compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
703 compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
704 compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
705 compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
706 compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
707
708 // First index stored with track
709 uint32_t tracksOffsetDiff = tracksOffset - trackStart;
710 compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
711 compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
712 compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
713 compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
714}
715
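// gatherMulti: block 0 copies the header arrays, odd-numbered blocks copy the unattached cluster data, the remaining even-numbered blocks copy the attached cluster data.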
716GPUdii() void GPUTPCCompressionGatherKernels::gatherMulti(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
717{
718 GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
719 const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
720 const auto& input = compressor.mPtrs;
721 auto* output = compressor.mOutput;
722
723 const int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
724 const int32_t iWarp = iThread / GPUCA_WARP_SIZE;
725 const int32_t nLanes = GPUCA_WARP_SIZE;
726 const int32_t iLane = iThread % GPUCA_WARP_SIZE;
727 auto* buf = smem.getBuffer<Vec128>(iWarp);
728
729 if (iBlock == 0) {
730 compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
731 compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
732 compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
733 compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
734 compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
735 compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
736 compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
737 } else if (iBlock & 1) {
738 const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
739 const uint32_t iGlobalWarp = nWarps * (iBlock - 1) / 2 + iWarp;
740
741 const uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
742 uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
743 uint32_t rowStart = rowsPerWarp * iGlobalWarp;
744 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
745 if (rowStart >= nRows) {
746 rowStart = 0;
747 rowEnd = 0;
748 }
749 rowsPerWarp = rowEnd - rowStart;
750
751 const uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
752 const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
753 const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;
754
755 compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
756 compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
757 compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
758 compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
759 compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
760 compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
761 compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
762 } else {
763 const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
764 const uint32_t iGlobalWarp = nWarps * (iBlock / 2 - 1) + iWarp;
765
766 const uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
767 uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
768 uint32_t trackStart = tracksPerWarp * iGlobalWarp;
769 uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
770 if (trackStart >= nStoredTracks) {
771 trackStart = 0;
772 trackEnd = 0;
773 }
774 tracksPerWarp = trackEnd - trackStart;
775
776 const uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
777 const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
778 const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
779
780 compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
781 compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
782 compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
783 compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
784 compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
785
786 // First index stored with track
787 uint32_t tracksOffsetDiff = tracksOffset - trackStart;
788 compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
789 compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
790 compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
791 compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
792 }
793}
794
795template <>
796GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered32>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
797{
798 gatherBuffered<Vec32>(nBlocks, nThreads, iBlock, iThread, smem, processors);
799}
800
801template <>
802GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered64>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
803{
804 gatherBuffered<Vec64>(nBlocks, nThreads, iBlock, iThread, smem, processors);
805}
806
807template <>
808GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered128>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
809{
810 gatherBuffered<Vec128>(nBlocks, nThreads, iBlock, iThread, smem, processors);
811}
812
813template <>
814GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::multiBlock>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
815{
816 gatherMulti(nBlocks, nThreads, iBlock, iThread, smem, processors);
817}