GPUTPCCompressionKernels.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCCompressionKernels.cxx

#include "GPUTPCCompressionKernels.h"
#include "GPUConstantMem.h"
#include "GPUO2DataTypes.h"
#include "GPUParam.h"
#include "GPUCommonAlgorithm.h"
#include "GPUTPCCompressionTrackModel.h"
#include "GPUTPCGeometry.h"
#include "GPUTPCClusterRejection.h"
#include "GPUTPCCompressionKernels.inc"

using namespace o2::gpu;
using namespace o2::tpc;

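// step0attached: threads walk the merged tracks in a grid-stride loop. For each accepted track
// the attached clusters are traversed from the outermost inwards; rejected clusters are flagged
// in mClusterStatus, and (if track-model compression is enabled) a GPUTPCCompressionTrackModel
// is propagated through the clusters. The first stored cluster keeps absolute coordinates
// (qPtA/rowA/sliceA/timeA/padA), subsequent ones are stored as row/sector differences and
// pad/time residuals with respect to the propagated track, with optional bit truncation of
// charges and widths.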
template <>
GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0attached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
{
  const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;
  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
  const GPUParam& GPUrestrict() param = processors.param;

  uint8_t lastLeg = 0;
  int32_t myTrack = 0;
  for (uint32_t i = get_global_id(0); i < ioPtrs.nMergedTracks; i += get_global_size(0)) {
    GPUbarrierWarp();
    const GPUTPCGMMergedTrack& GPUrestrict() trk = ioPtrs.mergedTracks[i];
    if (!trk.OK()) {
      continue;
    }
    bool rejectTrk = CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper();
    uint32_t nClustersStored = 0;
    CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;
    uint8_t lastRow = 0, lastSector = 0;
    GPUTPCCompressionTrackModel track;
    float zOffset = 0;
    for (int32_t k = trk.NClusters() - 1; k >= 0; k--) {
      const GPUTPCGMMergedTrackHit& GPUrestrict() hit = ioPtrs.mergedTrackHits[trk.FirstClusterRef() + k];
      if (hit.state & GPUTPCGMMergedTrackHit::flagReject) {
        continue;
      }

      int32_t hitId = hit.num;
      int32_t attach = ioPtrs.mergedTrackHitAttachment[hitId];
      if ((attach & gputpcgmmergertypes::attachTrackMask) != i) {
        continue; // Main attachment to different track
      }
      bool rejectCluster = processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA && (rejectTrk || GPUTPCClusterRejection::GetIsRejected(attach));
      if (rejectCluster) {
        compressor.mClusterStatus[hitId] = 1; // Cluster rejected, do not store
        continue;
      }

      if (!(param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) {
        continue; // No track model compression
      }
      const ClusterNative& GPUrestrict() orgCl = clusters->clusters[hit.sector][hit.row][hit.num - clusters->clusterOffset[hit.sector][hit.row]];
      float x = param.tpcGeometry.Row2X(hit.row);
      float y = track.LinearPad2Y(hit.sector, orgCl.getPad(), param.tpcGeometry.PadWidth(hit.row), param.tpcGeometry.NPads(hit.row));
      float z = param.tpcGeometry.LinearTime2Z(hit.sector, orgCl.getTime());
      if (nClustersStored) {
        if ((hit.sector < GPUCA_NSECTORS) ^ (lastSector < GPUCA_NSECTORS)) {
          break;
        }
        if (lastLeg != hit.leg && track.Mirror()) {
          break;
        }
        if (track.Propagate(param.tpcGeometry.Row2X(hit.row), param.SectorParam[hit.sector].Alpha)) {
          break;
        }
      }

      compressor.mClusterStatus[hitId] = 1; // Cluster compressed in track model, do not store as difference

      int32_t cidx = trk.FirstClusterRef() + nClustersStored++;
      if (nClustersStored == 1) {
        uint8_t qpt = fabs(trk.GetParam().GetQPt()) < 20.f ? (trk.GetParam().GetQPt() * (127.f / 20.f) + 127.5f) : (trk.GetParam().GetQPt() > 0 ? 254 : 0);
        zOffset = z;
        track.Init(x, y, z - zOffset, param.SectorParam[hit.sector].Alpha, qpt, param);

        myTrack = CAMath::AtomicAdd(&compressor.mMemory->nStoredTracks, 1u);
        compressor.mAttachedClusterFirstIndex[myTrack] = trk.FirstClusterRef();
        lastLeg = hit.leg;
        c.qPtA[myTrack] = qpt;
        c.rowA[myTrack] = hit.row;
        c.sliceA[myTrack] = hit.sector;
        c.timeA[myTrack] = orgCl.getTimePacked();
        c.padA[myTrack] = orgCl.padPacked;
      } else {
        uint32_t row = hit.row;
        uint32_t sector = hit.sector;

        if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
          if (lastRow > row) {
            row += GPUCA_ROW_COUNT;
          }
          row -= lastRow;
          if (lastSector > sector) {
            sector += compressor.NSECTORS;
          }
          sector -= lastSector;
        }
        c.rowDiffA[cidx] = row;
        c.sliceLegDiffA[cidx] = (hit.leg == lastLeg ? 0 : compressor.NSECTORS) + sector;
        float pad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), track.LinearY2Pad(hit.sector, track.Y(), param.tpcGeometry.PadWidth(hit.row), param.tpcGeometry.NPads(hit.row))));
        c.padResA[cidx] = orgCl.padPacked - orgCl.packPad(pad);
        float time = CAMath::Max(0.f, param.tpcGeometry.LinearZ2Time(hit.sector, track.Z() + zOffset));
        c.timeResA[cidx] = (orgCl.getTimePacked() - orgCl.packTime(time)) & 0xFFFFFF;
        lastLeg = hit.leg;
      }
      uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
      uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
      if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
        compressor.truncateSignificantBitsChargeMax(qmax, param);
        compressor.truncateSignificantBitsCharge(qtot, param);
        compressor.truncateSignificantBitsWidth(sigmapad, param);
        compressor.truncateSignificantBitsWidth(sigmatime, param);
      }
      c.qTotA[cidx] = qtot;
      c.qMaxA[cidx] = qmax;
      c.sigmaPadA[cidx] = sigmapad;
      c.sigmaTimeA[cidx] = sigmatime;
      c.flagsA[cidx] = orgCl.getFlags();
      if (k && track.Filter(y, z - zOffset, hit.row)) {
        break;
      }
      lastRow = hit.row;
      lastSector = hit.sector;
    }
    if (nClustersStored) {
      CAMath::AtomicAdd(&compressor.mMemory->nStoredAttachedClusters, nClustersStored);
      c.nTrackClusters[myTrack] = nClustersStored;
    }
  }
}

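// Comparison functors selecting the sort order of unattached clusters within a pad row before
// difference encoding: by packed time, by packed pad, or by one coordinate within coarse
// (>> 3) groups of the other. The concrete order is chosen at runtime via
// param.rec.tpc.compressionSortOrder in the step1unattached kernel below.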
template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<0>::operator()(uint32_t a, uint32_t b) const
{
  return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<1>::operator()(uint32_t a, uint32_t b) const
{
  return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::operator()(uint32_t a, uint32_t b) const
{
  if (mClsPtr[a].getTimePacked() >> 3 == mClsPtr[b].getTimePacked() >> 3) {
    return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
  }
  return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
}

template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::operator()(uint32_t a, uint32_t b) const
{
  if (mClsPtr[a].padPacked >> 3 == mClsPtr[b].padPacked >> 3) {
    return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
  }
  return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
}

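// step1unattached: one block per (sector, row). Threads scan the native clusters of the row,
// keep those that are neither stored by the track model nor rejected by the configured
// rejection strategy, collect the surviving indices in a shared-memory sort buffer in chunks
// of GPUCA_TPC_COMP_CHUNK_SIZE, optionally sort each chunk according to compressionSortOrder,
// and emit pad/time differences plus charge, width, and flag fields for every stored cluster.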
template <>
GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;
  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
  GPUParam& GPUrestrict() param = processors.param;
  uint32_t* sortBuffer = smem.sortBuffer;
  for (int32_t iSectorRow = iBlock; iSectorRow < GPUCA_NSECTORS * GPUCA_ROW_COUNT; iSectorRow += nBlocks) {
    const uint32_t iSector = iSectorRow / GPUCA_ROW_COUNT;
    const uint32_t iRow = iSectorRow % GPUCA_ROW_COUNT;
    const uint32_t idOffset = clusters->clusterOffset[iSector][iRow];
    const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024;
    const uint32_t idOffsetOutMax = ((const uint32_t*)clusters->clusterOffset[iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024; // Array out of bounds access is ok, since it goes to the correct nClustersTotal
    if (iThread == nThreads - 1) {
      smem.nCount = 0;
    }
    uint32_t totalCount = 0;
    GPUbarrier();

    CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;

    const uint32_t nn = GPUCommonMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);
    for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {
      const int32_t idx = idOffset + i;
      int32_t cidx = 0;
      do {
        if (i >= clusters->nClusters[iSector][iRow]) {
          break;
        }
        if (compressor.mClusterStatus[idx]) {
          break;
        }
        int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];
        bool unattached = attach == 0;

        if (unattached) {
          if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {
            break;
          }
        } else if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {
          if (GPUTPCClusterRejection::GetIsRejected(attach)) {
            break;
          }
          int32_t id = attach & gputpcgmmergertypes::attachTrackMask;
          auto& trk = ioPtrs.mergedTracks[id];
          if (CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) {
            break;
          }
        }
        cidx = 1;
      } while (false);

      GPUbarrier();
      int32_t myIndex = work_group_scan_inclusive_add(cidx);
      int32_t storeLater = -1;
      if (cidx) {
        if (smem.nCount + myIndex <= GPUCA_TPC_COMP_CHUNK_SIZE) {
          sortBuffer[smem.nCount + myIndex - 1] = i;
        } else {
          storeLater = smem.nCount + myIndex - 1 - GPUCA_TPC_COMP_CHUNK_SIZE;
        }
      }
      GPUbarrier();
      if (iThread == nThreads - 1) {
        smem.nCount += myIndex;
      }
      GPUbarrier();

      if (smem.nCount < GPUCA_TPC_COMP_CHUNK_SIZE && i < nn) {
        continue;
      }

      uint32_t count = CAMath::Min(smem.nCount, (uint32_t)GPUCA_TPC_COMP_CHUNK_SIZE);
      if (idOffsetOut + totalCount + count > idOffsetOutMax) {
        if (iThread == nThreads - 1) {
          compressor.raiseError(GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW, iSector * 1000 + iRow, idOffsetOut + totalCount + count, idOffsetOutMax);
        }
        break;
      }
      if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {
        if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
        }
        GPUbarrier();
      }

      for (uint32_t j = get_local_id(0); j < count; j += get_local_size(0)) {
        int32_t outidx = idOffsetOut + totalCount + j;
        const ClusterNative& GPUrestrict() orgCl = clusters->clusters[iSector][iRow][sortBuffer[j]];

        int32_t preId = j != 0 ? (int32_t)sortBuffer[j - 1] : (totalCount != 0 ? (int32_t)smem.lastIndex : -1);
        GPUTPCCompression_EncodeUnattached(param.rec.tpc.compressionTypeMask, orgCl, c.timeDiffU[outidx], c.padDiffU[outidx], preId == -1 ? nullptr : &clusters->clusters[iSector][iRow][preId]);

        uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
        uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
        if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
          compressor.truncateSignificantBitsChargeMax(qmax, param);
          compressor.truncateSignificantBitsCharge(qtot, param);
          compressor.truncateSignificantBitsWidth(sigmapad, param);
          compressor.truncateSignificantBitsWidth(sigmatime, param);
        }
        c.qTotU[outidx] = qtot;
        c.qMaxU[outidx] = qmax;
        c.sigmaPadU[outidx] = sigmapad;
        c.sigmaTimeU[outidx] = sigmatime;
        c.flagsU[outidx] = orgCl.getFlags();
      }

      GPUbarrier();
      if (storeLater >= 0) {
        sortBuffer[storeLater] = i;
      }
      totalCount += count;
      if (iThread == nThreads - 1 && count) {
        smem.lastIndex = sortBuffer[count - 1];
        smem.nCount -= count;
      }
    }

    if (iThread == nThreads - 1) {
      c.nSliceRowClusters[iSector * GPUCA_ROW_COUNT + iRow] = totalCount;
      CAMath::AtomicAdd(&compressor.mMemory->nStoredUnattachedClusters, totalCount);
    }
    GPUbarrier();
  }
}

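// Shared-memory buffer accessors for the gather kernels: one specialization per vector width
// (Vec32/Vec64/Vec128), each returning the per-warp staging buffer that compressorMemcpyBuffered
// uses for vector-wide stores.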
template <>
GPUdi() GPUTPCCompressionGatherKernels::Vec32* GPUTPCCompressionGatherKernels::GPUSharedMemory::getBuffer<GPUTPCCompressionGatherKernels::Vec32>(int32_t iWarp)
{
  return buf32[iWarp];
}

template <>
GPUdi() GPUTPCCompressionGatherKernels::Vec64* GPUTPCCompressionGatherKernels::GPUSharedMemory::getBuffer<GPUTPCCompressionGatherKernels::Vec64>(int32_t iWarp)
{
  return buf64[iWarp];
}

template <>
GPUdi() GPUTPCCompressionGatherKernels::Vec128* GPUTPCCompressionGatherKernels::GPUSharedMemory::getBuffer<GPUTPCCompressionGatherKernels::Vec128>(int32_t iWarp)
{
  return buf128[iWarp];
}

template <typename T, typename S>
GPUdi() bool GPUTPCCompressionGatherKernels::isAlignedTo(const S* ptr)
{
  if constexpr (alignof(S) >= alignof(T)) {
    static_cast<void>(ptr);
    return true;
  } else {
    return reinterpret_cast<size_t>(ptr) % alignof(T) == 0;
  }
}

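// compressorMemcpy: per-type dispatch that picks the widest vectorised copy the transfer size
// allows (128/64/32/16-bit vectors per thread) and falls back to a plain element-wise copy for
// small transfers.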
template <>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint8_t>(uint8_t* GPUrestrict() dst, const uint8_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  constexpr const int32_t vec128Elems = CpyVector<uint8_t, Vec128>::Size;
  constexpr const int32_t vec64Elems = CpyVector<uint8_t, Vec64>::Size;
  constexpr const int32_t vec32Elems = CpyVector<uint8_t, Vec32>::Size;
  constexpr const int32_t vec16Elems = CpyVector<uint8_t, Vec16>::Size;

  if (size >= uint32_t(nThreads * vec128Elems)) {
    compressorMemcpyVectorised<uint8_t, Vec128>(dst, src, size, nThreads, iThread);
  } else if (size >= uint32_t(nThreads * vec64Elems)) {
    compressorMemcpyVectorised<uint8_t, Vec64>(dst, src, size, nThreads, iThread);
  } else if (size >= uint32_t(nThreads * vec32Elems)) {
    compressorMemcpyVectorised<uint8_t, Vec32>(dst, src, size, nThreads, iThread);
  } else if (size >= uint32_t(nThreads * vec16Elems)) {
    compressorMemcpyVectorised<uint8_t, Vec16>(dst, src, size, nThreads, iThread);
  } else {
    compressorMemcpyBasic(dst, src, size, nThreads, iThread);
  }
}

template <>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint16_t>(uint16_t* GPUrestrict() dst, const uint16_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  constexpr const int32_t vec128Elems = CpyVector<uint16_t, Vec128>::Size;
  constexpr const int32_t vec64Elems = CpyVector<uint16_t, Vec64>::Size;
  constexpr const int32_t vec32Elems = CpyVector<uint16_t, Vec32>::Size;

  if (size >= uint32_t(nThreads * vec128Elems)) {
    compressorMemcpyVectorised<uint16_t, Vec128>(dst, src, size, nThreads, iThread);
  } else if (size >= uint32_t(nThreads * vec64Elems)) {
    compressorMemcpyVectorised<uint16_t, Vec64>(dst, src, size, nThreads, iThread);
  } else if (size >= uint32_t(nThreads * vec32Elems)) {
    compressorMemcpyVectorised<uint16_t, Vec32>(dst, src, size, nThreads, iThread);
  } else {
    compressorMemcpyBasic(dst, src, size, nThreads, iThread);
  }
}

template <>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint32_t>(uint32_t* GPUrestrict() dst, const uint32_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  constexpr const int32_t vec128Elems = CpyVector<uint32_t, Vec128>::Size;
  constexpr const int32_t vec64Elems = CpyVector<uint32_t, Vec64>::Size;

  if (size >= uint32_t(nThreads * vec128Elems)) {
    compressorMemcpyVectorised<uint32_t, Vec128>(dst, src, size, nThreads, iThread);
  } else if (size >= uint32_t(nThreads * vec64Elems)) {
    compressorMemcpyVectorised<uint32_t, Vec64>(dst, src, size, nThreads, iThread);
  } else {
    compressorMemcpyBasic(dst, src, size, nThreads, iThread);
  }
}

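// compressorMemcpyVectorised: copies a scalar array through BaseVector-sized loads/stores.
// A scalar prefix is copied until dst is aligned to BaseVector; the aligned part is then copied
// vector-by-vector (directly if src is also aligned, otherwise assembling each vector element
// by element), and the scalar remainder is handled at the end.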
template <typename Scalar, typename BaseVector>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyVectorised(Scalar* dst, const Scalar* src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  if (not isAlignedTo<BaseVector>(dst)) {
    size_t dsti = reinterpret_cast<size_t>(dst);
    int32_t offset = (alignof(BaseVector) - dsti % alignof(BaseVector)) / sizeof(Scalar);
    compressorMemcpyBasic(dst, src, offset, nThreads, iThread);
    src += offset;
    dst += offset;
    size -= offset;
  }

  BaseVector* GPUrestrict() dstAligned = reinterpret_cast<BaseVector*>(dst);

  using CpyVec = CpyVector<Scalar, BaseVector>;
  uint32_t sizeAligned = size / CpyVec::Size;

  if (isAlignedTo<BaseVector>(src)) {
    const BaseVector* GPUrestrict() srcAligned = reinterpret_cast<const BaseVector*>(src);
    compressorMemcpyBasic(dstAligned, srcAligned, sizeAligned, nThreads, iThread);
  } else {
    for (uint32_t i = iThread; i < sizeAligned; i += nThreads) {
      CpyVec buf;
      for (uint32_t j = 0; j < CpyVec::Size; j++) {
        buf.elems[j] = src[i * CpyVec::Size + j];
      }
      dstAligned[i] = buf.all;
    }
  }

  int32_t leftovers = size % CpyVec::Size;
  compressorMemcpyBasic(dst + size - leftovers, src + size - leftovers, leftovers, nThreads, iThread);
}

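// compressorMemcpyBasic: plain strided element copy. The range is split evenly across blocks,
// and each thread of a block copies every nThreads-th element of its block's share.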
template <typename T>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBasic(T* GPUrestrict() dst, const T* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread, int32_t nBlocks, int32_t iBlock)
{
  uint32_t start = (size + nBlocks - 1) / nBlocks * iBlock + iThread;
  uint32_t end = CAMath::Min(size, (size + nBlocks - 1) / nBlocks * (iBlock + 1));
  for (uint32_t i = start; i < end; i += nThreads) {
    dst[i] = src[i];
  }
}

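// compressorMemcpyBuffered: gathers nEntries variable-length source ranges (lengths in nums,
// offsets in srcOffsets, optionally scaled by scaleBase1024/1024 and shifted by diff) into one
// contiguous destination. Data is staged warp-cooperatively in the shared-memory buffer buf and
// flushed to dst with vector-wide stores once the buffer is full; the remaining tail is flushed
// after the last entry.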
template <typename V, typename T, typename S>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict() dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const uint32_t* GPUrestrict() srcOffsets, uint32_t nEntries, int32_t nLanes, int32_t iLane, int32_t diff, size_t scaleBase1024)
{
  int32_t shmPos = 0;
  uint32_t dstOffset = 0;
  V* GPUrestrict() dstAligned = nullptr;

  T* bufT = reinterpret_cast<T*>(buf);
  constexpr const int32_t bufSize = GPUCA_WARP_SIZE;
  constexpr const int32_t bufTSize = bufSize * sizeof(V) / sizeof(T);

  for (uint32_t i = 0; i < nEntries; i++) {
    uint32_t srcPos = 0;
    uint32_t srcOffset = (srcOffsets[i] * scaleBase1024 / 1024) + diff;
    uint32_t srcSize = nums[i] - diff;

    if (dstAligned == nullptr) {
      if (not isAlignedTo<V>(dst)) {
        size_t dsti = reinterpret_cast<size_t>(dst);
        uint32_t offset = (alignof(V) - dsti % alignof(V)) / sizeof(T);
        offset = CAMath::Min<uint32_t>(offset, srcSize);
        compressorMemcpyBasic(dst, src + srcOffset, offset, nLanes, iLane);
        dst += offset;
        srcPos += offset;
      }
      if (isAlignedTo<V>(dst)) {
        dstAligned = reinterpret_cast<V*>(dst);
      }
    }
    while (srcPos < srcSize) {
      uint32_t shmElemsLeft = bufTSize - shmPos;
      uint32_t srcElemsLeft = srcSize - srcPos;
      uint32_t size = CAMath::Min(srcElemsLeft, shmElemsLeft);
      compressorMemcpyBasic(bufT + shmPos, src + srcOffset + srcPos, size, nLanes, iLane);
      srcPos += size;
      shmPos += size;
      GPUbarrierWarp();

      if (shmPos >= bufTSize) {
        compressorMemcpyBasic(dstAligned + dstOffset, buf, bufSize, nLanes, iLane);
        dstOffset += bufSize;
        shmPos = 0;
        GPUbarrierWarp();
      }
    }
  }

  compressorMemcpyBasic(reinterpret_cast<T*>(dstAligned + dstOffset), bufT, shmPos, nLanes, iLane);
  GPUbarrierWarp();
}

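// calculateWarpOffsets: computes where this warp's output starts. blockOffset accumulates the
// counts of all entries before the block's first entry; a work-group inclusive scan over the
// per-thread partial sums, written to smem.warpOffset by the last lane of each warp, then
// yields the exclusive offset of each warp within the block.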
template <typename T>
GPUdi() uint32_t GPUTPCCompressionGatherKernels::calculateWarpOffsets(GPUSharedMemory& smem, T* nums, uint32_t start, uint32_t end, int32_t nWarps, int32_t iWarp, int32_t nLanes, int32_t iLane)
{
  uint32_t blockOffset = 0;
  int32_t iThread = nLanes * iWarp + iLane;
  int32_t nThreads = nLanes * nWarps;
  uint32_t blockStart = work_group_broadcast(start, 0);
  for (uint32_t i = iThread; i < blockStart; i += nThreads) {
    blockOffset += nums[i];
  }
  blockOffset = work_group_reduce_add(blockOffset);

  uint32_t offset = 0;
  for (uint32_t i = start + iLane; i < end; i += nLanes) {
    offset += nums[i];
  }
  offset = work_group_scan_inclusive_add(offset);
  if (iWarp > -1 && iLane == nLanes - 1) {
    smem.warpOffset[iWarp] = offset;
  }
  GPUbarrier();
  offset = (iWarp <= 0) ? 0 : smem.warpOffset[iWarp - 1];
  GPUbarrier();

  return offset + blockOffset;
}

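// Gather kernel, unbuffered variant: block 0 copies the per-row cluster counts, the per-track
// header arrays, and all unattached-cluster columns row by row; block 1 copies the
// attached-cluster columns track by track. Output offsets come from calculateWarpOffsets; the
// first attached cluster of each track is skipped in the difference/residual arrays because
// its absolute coordinates are stored with the track itself.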
template <>
GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::unbuffered>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;

  int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
  int32_t iWarp = iThread / GPUCA_WARP_SIZE;

  int32_t nLanes = GPUCA_WARP_SIZE;
  int32_t iLane = iThread % GPUCA_WARP_SIZE;

  if (iBlock == 0) {

    uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
    uint32_t rowsPerWarp = (nRows + nWarps - 1) / nWarps;
    uint32_t rowStart = rowsPerWarp * iWarp;
    uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
    if (rowStart >= nRows) {
      rowStart = 0;
      rowEnd = 0;
    }

    uint32_t rowsOffset = calculateWarpOffsets(smem, compressor.mPtrs.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);

    compressorMemcpy(compressor.mOutput->nSliceRowClusters, compressor.mPtrs.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
    compressorMemcpy(compressor.mOutput->nTrackClusters, compressor.mPtrs.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpy(compressor.mOutput->qPtA, compressor.mPtrs.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpy(compressor.mOutput->rowA, compressor.mPtrs.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpy(compressor.mOutput->sliceA, compressor.mPtrs.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpy(compressor.mOutput->timeA, compressor.mPtrs.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpy(compressor.mOutput->padA, compressor.mPtrs.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);

    uint32_t sectorStart = rowStart / GPUCA_ROW_COUNT;
    uint32_t sectorEnd = rowEnd / GPUCA_ROW_COUNT;

    uint32_t sectorRowStart = rowStart % GPUCA_ROW_COUNT;
    uint32_t sectorRowEnd = rowEnd % GPUCA_ROW_COUNT;

    for (uint32_t i = sectorStart; i <= sectorEnd && i < compressor.NSECTORS; i++) {
      for (uint32_t j = ((i == sectorStart) ? sectorRowStart : 0); j < ((i == sectorEnd) ? sectorRowEnd : GPUCA_ROW_COUNT); j++) {
        uint32_t nClusters = compressor.mPtrs.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
        uint32_t clusterOffsetInCache = clusters->clusterOffset[i][j] * compressor.mMaxClusterFactorBase1024 / 1024;
        compressorMemcpy(compressor.mOutput->qTotU + rowsOffset, compressor.mPtrs.qTotU + clusterOffsetInCache, nClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->qMaxU + rowsOffset, compressor.mPtrs.qMaxU + clusterOffsetInCache, nClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->flagsU + rowsOffset, compressor.mPtrs.flagsU + clusterOffsetInCache, nClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->padDiffU + rowsOffset, compressor.mPtrs.padDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->timeDiffU + rowsOffset, compressor.mPtrs.timeDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->sigmaPadU + rowsOffset, compressor.mPtrs.sigmaPadU + clusterOffsetInCache, nClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->sigmaTimeU + rowsOffset, compressor.mPtrs.sigmaTimeU + clusterOffsetInCache, nClusters, nLanes, iLane);
        rowsOffset += nClusters;
      }
    }
  }

  if (iBlock == 1) {
    uint32_t tracksPerWarp = (compressor.mMemory->nStoredTracks + nWarps - 1) / nWarps;
    uint32_t trackStart = tracksPerWarp * iWarp;
    uint32_t trackEnd = CAMath::Min(compressor.mMemory->nStoredTracks, trackStart + tracksPerWarp);
    if (trackStart >= compressor.mMemory->nStoredTracks) {
      trackStart = 0;
      trackEnd = 0;
    }

    uint32_t tracksOffset = calculateWarpOffsets(smem, compressor.mPtrs.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);

    for (uint32_t i = trackStart; i < trackEnd; i += nLanes) {
      uint32_t nTrackClusters = 0;
      uint32_t srcOffset = 0;

      if (i + iLane < trackEnd) {
        nTrackClusters = compressor.mPtrs.nTrackClusters[i + iLane];
        srcOffset = compressor.mAttachedClusterFirstIndex[i + iLane];
      }
      smem.unbuffered.sizes[iWarp][iLane] = nTrackClusters;
      smem.unbuffered.srcOffsets[iWarp][iLane] = srcOffset;

      uint32_t elems = (i + nLanes < trackEnd) ? nLanes : (trackEnd - i);

      for (uint32_t j = 0; j < elems; j++) {
        nTrackClusters = smem.unbuffered.sizes[iWarp][j];
        srcOffset = smem.unbuffered.srcOffsets[iWarp][j];
        uint32_t idx = i + j;
        compressorMemcpy(compressor.mOutput->qTotA + tracksOffset, compressor.mPtrs.qTotA + srcOffset, nTrackClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->qMaxA + tracksOffset, compressor.mPtrs.qMaxA + srcOffset, nTrackClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->flagsA + tracksOffset, compressor.mPtrs.flagsA + srcOffset, nTrackClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->sigmaPadA + tracksOffset, compressor.mPtrs.sigmaPadA + srcOffset, nTrackClusters, nLanes, iLane);
        compressorMemcpy(compressor.mOutput->sigmaTimeA + tracksOffset, compressor.mPtrs.sigmaTimeA + srcOffset, nTrackClusters, nLanes, iLane);

        // First index stored with track
        compressorMemcpy(compressor.mOutput->rowDiffA + tracksOffset - idx, compressor.mPtrs.rowDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
        compressorMemcpy(compressor.mOutput->sliceLegDiffA + tracksOffset - idx, compressor.mPtrs.sliceLegDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
        compressorMemcpy(compressor.mOutput->padResA + tracksOffset - idx, compressor.mPtrs.padResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
        compressorMemcpy(compressor.mOutput->timeResA + tracksOffset - idx, compressor.mPtrs.timeResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);

        tracksOffset += nTrackClusters;
      }
    }
  }
}

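// gatherBuffered: work is distributed over all blocks by global warp index. Each warp copies
// its share of rows (unattached columns) and tracks (attached columns) into the output through
// the shared-memory staging buffer of vector type V, while block 0 additionally copies the
// per-row counts and per-track header arrays.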
template <typename V>
GPUdii() void GPUTPCCompressionGatherKernels::gatherBuffered(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{

  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;

  int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
  int32_t iWarp = iThread / GPUCA_WARP_SIZE;

  int32_t nGlobalWarps = nWarps * nBlocks;
  int32_t iGlobalWarp = nWarps * iBlock + iWarp;

  int32_t nLanes = GPUCA_WARP_SIZE;
  int32_t iLane = iThread % GPUCA_WARP_SIZE;

  auto& input = compressor.mPtrs;
  auto* output = compressor.mOutput;

  uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
  uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
  uint32_t rowStart = rowsPerWarp * iGlobalWarp;
  uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
  if (rowStart >= nRows) {
    rowStart = 0;
    rowEnd = 0;
  }
  rowsPerWarp = rowEnd - rowStart;

  uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);

  uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
  uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
  uint32_t trackStart = tracksPerWarp * iGlobalWarp;
  uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
  if (trackStart >= nStoredTracks) {
    trackStart = 0;
    trackEnd = 0;
  }
  tracksPerWarp = trackEnd - trackStart;

  uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);

  if (iBlock == 0) {
    compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
    compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
  }

  const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
  const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;

  auto* buf = smem.getBuffer<V>(iWarp);

  compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);

  const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
  const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;

  compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
  compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
  compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
  compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
  compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);

  // First index stored with track
  uint32_t tracksOffsetDiff = tracksOffset - trackStart;
  compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
  compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
  compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
  compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
}

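// gatherMulti: multi-block variant of the gather. Block 0 copies counts and track headers,
// odd-numbered blocks copy the unattached-cluster columns, and the remaining even-numbered
// blocks copy the attached-cluster columns, each split by global warp index as in
// gatherBuffered.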
GPUdii() void GPUTPCCompressionGatherKernels::gatherMulti(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
  const auto& input = compressor.mPtrs;
  auto* output = compressor.mOutput;

  const int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
  const int32_t iWarp = iThread / GPUCA_WARP_SIZE;
  const int32_t nLanes = GPUCA_WARP_SIZE;
  const int32_t iLane = iThread % GPUCA_WARP_SIZE;
  auto* buf = smem.getBuffer<Vec128>(iWarp);

  if (iBlock == 0) {
    compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
    compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
  } else if (iBlock & 1) {
    const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
    const uint32_t iGlobalWarp = nWarps * (iBlock - 1) / 2 + iWarp;

    const uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
    uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
    uint32_t rowStart = rowsPerWarp * iGlobalWarp;
    uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
    if (rowStart >= nRows) {
      rowStart = 0;
      rowEnd = 0;
    }
    rowsPerWarp = rowEnd - rowStart;

    const uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
    const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
    const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;

    compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  } else {
    const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
    const uint32_t iGlobalWarp = nWarps * (iBlock / 2 - 1) + iWarp;

    const uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
    uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
    uint32_t trackStart = tracksPerWarp * iGlobalWarp;
    uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
    if (trackStart >= nStoredTracks) {
      trackStart = 0;
      trackEnd = 0;
    }
    tracksPerWarp = trackEnd - trackStart;

    const uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
    const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
    const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;

    compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);

    // First index stored with track
    uint32_t tracksOffsetDiff = tracksOffset - trackStart;
    compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
    compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
    compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
    compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
  }
}

template <>
GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered32>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  gatherBuffered<Vec32>(nBlocks, nThreads, iBlock, iThread, smem, processors);
}

template <>
GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered64>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  gatherBuffered<Vec64>(nBlocks, nThreads, iBlock, iThread, smem, processors);
}

template <>
GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered128>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  gatherBuffered<Vec128>(nBlocks, nThreads, iBlock, iThread, smem, processors);
}

template <>
GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::multiBlock>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  gatherMulti(nBlocks, nThreads, iBlock, iThread, smem, processors);
}