22#include "GPUTPCCompressionKernels.inc"
42 bool rejectTrk = GPUTPCClusterRejection::IsTrackRejected(trk,
param);
43 uint32_t nClustersStored = 0;
45 uint8_t
lastRow = 0, lastSector = 0;
48 for (int32_t k = trk.NClusters() - 1; k >= 0; k--) {
54 int32_t hitId = hit.
num;
72 float x = geo.Row2X(hit.
row);
73 float y = track.LinearPad2Y(hit.
sector, orgCl.getPad(), geo.PadWidth(hit.
row), geo.NPads(hit.
row));
74 float z = geo.LinearTime2Z(hit.
sector, orgCl.getTime());
75 if (nClustersStored) {
79 if (track.Propagate(geo.Row2X(hit.
row), GPUTPCGeometry::SectorAlpha(hit.
sector))) {
86 int32_t cidx = trk.FirstClusterRef() + nClustersStored++;
87 if (nClustersStored == 1) {
88 uint8_t qpt = fabs(trk.GetParam().GetQPt()) < 20.f ? (trk.GetParam().GetQPt() * (127.f / 20.f) + 127.5f) : (trk.GetParam().GetQPt() > 0 ? 254 : 0);
90 track.Init(
x,
y,
z - zOffset, GPUTPCGeometry::SectorAlpha(hit.
sector), qpt,
param);
94 c.qPtA[myTrack] = qpt;
95 c.rowA[myTrack] = hit.
row;
97 c.timeA[myTrack] = orgCl.getTimePacked();
101 uint32_t sector = hit.
sector;
108 if (lastSector > sector) {
111 sector -= lastSector;
113 c.rowDiffA[cidx] =
row;
114 c.sliceLegDiffA[cidx] = sector;
115 float pad = CAMath::Max(0.f, CAMath::Min((
float)geo.NPads(
GPUTPCGeometry::NROWS - 1), track.LinearY2Pad(hit.
sector, track.Y(), geo.PadWidth(hit.
row), geo.NPads(hit.
row))));
116 c.padResA[cidx] = orgCl.
padPacked - orgCl.packPad(pad);
117 float time = CAMath::Max(0.f, geo.LinearZ2Time(hit.
sector, track.Z() + zOffset));
118 c.timeResA[cidx] = (orgCl.getTimePacked() - orgCl.packTime(
time)) & 0xFFFFFF;
120 uint16_t qtot = orgCl.
qTot, qmax = orgCl.
qMax;
123 compressor.truncateSignificantBitsChargeMax(qmax,
param);
124 compressor.truncateSignificantBitsWidth(sigmapad,
param);
125 if (!orgCl.isSaturated()) [[likely]] {
126 compressor.truncateSignificantBitsCharge(qtot,
param);
127 compressor.truncateSignificantBitsWidth(sigmatime,
param);
130 c.qTotA[cidx] = qtot;
131 c.qMaxA[cidx] = qmax;
132 c.sigmaPadA[cidx] = sigmapad;
133 c.sigmaTimeA[cidx] = sigmatime;
134 c.flagsA[cidx] = orgCl.getFlags();
135 if (k && track.Filter(
y,
z - zOffset, hit.
row)) {
141 if (nClustersStored) {
143 c.nTrackClusters[myTrack] = nClustersStored;
151 return mClsPtr[
a].getTimePacked() < mClsPtr[
b].getTimePacked();
157 return mClsPtr[
a].padPacked < mClsPtr[
b].padPacked;
163 if (mClsPtr[
a].getTimePacked() >> 3 == mClsPtr[
b].getTimePacked() >> 3) {
164 return mClsPtr[
a].padPacked < mClsPtr[
b].padPacked;
166 return mClsPtr[
a].getTimePacked() < mClsPtr[
b].getTimePacked();
172 if (mClsPtr[
a].padPacked >> 3 == mClsPtr[
b].padPacked >> 3) {
173 return mClsPtr[
a].getTimePacked() < mClsPtr[
b].getTimePacked();
175 return mClsPtr[
a].padPacked < mClsPtr[
b].padPacked;
181 if (mClsPtr[
a].getTimePacked() != mClsPtr[
b].getTimePacked()) {
182 return mClsPtr[
a].getTimePacked() < mClsPtr[
b].getTimePacked();
184 if (mClsPtr[
a].padPacked != mClsPtr[
b].padPacked) {
185 return mClsPtr[
a].padPacked < mClsPtr[
b].padPacked;
187 return mClsPtr[
a].qTot < mClsPtr[
b].qTot;
192 if (mClusterStatus[idx]) {
195 int32_t attach = ioPtrs.mergedTrackHitAttachment[
idx];
196 bool unattached = attach == 0;
203 if (GPUTPCClusterRejection::GetIsRejected(attach)) {
210 auto& trk = ioPtrs.mergedTracks[
id];
211 if (GPUTPCClusterRejection::IsTrackRejected(trk,
param)) {
225 uint32_t* sortBuffer = smem.sortBuffer;
230 const uint32_t idOffsetOut =
clusters->clusterOffset[
iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024;
231 const uint32_t idOffsetOutMax = ((
const uint32_t*)
clusters->clusterOffset[
iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024;
232 if (iThread == nThreads - 1) {
235 uint32_t totalCount = 0;
241 for (uint32_t
i = iThread;
i < nn + nThreads;
i += nThreads) {
242 const int32_t
idx = idOffset +
i;
243 int32_t storeCluster =
i <
clusters->nClusters[
iSector][iRow] && !compressor.rejectCluster(idx,
param, ioPtrs);
246 int32_t myIndex = work_group_scan_inclusive_add(storeCluster);
247 int32_t storeLater = -1;
249 if (smem.nCount + myIndex <= constants::TPC_COMP_CHUNK_SIZE) {
250 sortBuffer[smem.nCount + myIndex - 1] =
i;
252 storeLater = smem.nCount + myIndex - 1 - constants::TPC_COMP_CHUNK_SIZE;
256 if (iThread == nThreads - 1) {
257 smem.nCount += myIndex;
261 if (smem.nCount < constants::TPC_COMP_CHUNK_SIZE &&
i < nn) {
265 uint32_t
count = CAMath::Min(smem.nCount, (uint32_t)constants::TPC_COMP_CHUNK_SIZE);
266 if (idOffsetOut + totalCount +
count > idOffsetOutMax) {
267 if (iThread == nThreads - 1) {
268 compressor.raiseError(GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW,
iSector * 1000 + iRow, idOffsetOut + totalCount +
count, idOffsetOutMax);
274 static_assert(
GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= constants::TPC_COMP_CHUNK_SIZE);
276#ifdef GPUCA_DETERMINISTIC_MODE
277 CAAlgo::sortInBlock(sortBuffer, sortBuffer +
count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(
clusters->clusters[
iSector][iRow]));
280 CAAlgo::sortInBlock(sortBuffer, sortBuffer +
count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(
clusters->clusters[
iSector][iRow]));
282 CAAlgo::sortInBlock(sortBuffer, sortBuffer +
count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(
clusters->clusters[
iSector][iRow]));
284 CAAlgo::sortInBlock(sortBuffer, sortBuffer +
count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(
clusters->clusters[
iSector][iRow]));
286 CAAlgo::sortInBlock(sortBuffer, sortBuffer +
count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(
clusters->clusters[
iSector][iRow]));
293 int32_t outidx = idOffsetOut + totalCount +
j;
296 int32_t preId =
j != 0 ? (int32_t)sortBuffer[
j - 1] : (totalCount != 0 ? (int32_t)smem.lastIndex : -1);
297 GPUTPCCompression_EncodeUnattached(
param.
rec.tpc.compressionTypeMask, orgCl,
c.timeDiffU[outidx],
c.padDiffU[outidx], preId == -1 ?
nullptr : &
clusters->
clusters[
iSector][iRow][preId]);
299 uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;
300 uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
302 compressor.truncateSignificantBitsChargeMax(qmax,
param);
303 compressor.truncateSignificantBitsWidth(sigmapad,
param);
304 if (!orgCl.isSaturated()) [[likely]] {
305 compressor.truncateSignificantBitsCharge(qtot,
param);
306 compressor.truncateSignificantBitsWidth(sigmatime,
param);
309 c.qTotU[outidx] = qtot;
310 c.qMaxU[outidx] = qmax;
311 c.sigmaPadU[outidx] = sigmapad;
312 c.sigmaTimeU[outidx] = sigmatime;
313 c.flagsU[outidx] = orgCl.getFlags();
317 if (storeLater >= 0) {
318 sortBuffer[storeLater] =
i;
321 if (iThread == nThreads - 1 &&
count) {
322 smem.lastIndex = sortBuffer[
count - 1];
323 smem.nCount -=
count;
327 if (iThread == nThreads - 1) {
329 CAMath::AtomicAdd(&compressor.mMemory->nStoredUnattachedClusters, totalCount);
350 return buf128[iWarp];
353template <
typename T,
typename S>
356 if constexpr (
alignof(
S) >=
alignof(T)) {
357 static_cast<void>(
ptr);
360 return reinterpret_cast<size_t>(
ptr) %
alignof(T) == 0;
367 constexpr const int32_t vec128Elems = CpyVector<uint8_t, Vec128>::Size;
368 constexpr const int32_t vec64Elems = CpyVector<uint8_t, Vec64>::Size;
369 constexpr const int32_t vec32Elems = CpyVector<uint8_t, Vec32>::Size;
370 constexpr const int32_t vec16Elems = CpyVector<uint8_t, Vec16>::Size;
372 if (
size >= uint32_t(nThreads * vec128Elems)) {
373 compressorMemcpyVectorised<uint8_t, Vec128>(
dst,
src,
size, nThreads, iThread);
374 }
else if (
size >= uint32_t(nThreads * vec64Elems)) {
375 compressorMemcpyVectorised<uint8_t, Vec64>(
dst,
src,
size, nThreads, iThread);
376 }
else if (
size >= uint32_t(nThreads * vec32Elems)) {
377 compressorMemcpyVectorised<uint8_t, Vec32>(
dst,
src,
size, nThreads, iThread);
378 }
else if (
size >= uint32_t(nThreads * vec16Elems)) {
379 compressorMemcpyVectorised<uint8_t, Vec16>(
dst,
src,
size, nThreads, iThread);
381 compressorMemcpyBasic(
dst,
src,
size, nThreads, iThread);
388 constexpr const int32_t vec128Elems = CpyVector<uint16_t, Vec128>::Size;
389 constexpr const int32_t vec64Elems = CpyVector<uint16_t, Vec64>::Size;
390 constexpr const int32_t vec32Elems = CpyVector<uint16_t, Vec32>::Size;
392 if (
size >= uint32_t(nThreads * vec128Elems)) {
393 compressorMemcpyVectorised<uint16_t, Vec128>(
dst,
src,
size, nThreads, iThread);
394 }
else if (
size >= uint32_t(nThreads * vec64Elems)) {
395 compressorMemcpyVectorised<uint16_t, Vec64>(
dst,
src,
size, nThreads, iThread);
396 }
else if (
size >= uint32_t(nThreads * vec32Elems)) {
397 compressorMemcpyVectorised<uint16_t, Vec32>(
dst,
src,
size, nThreads, iThread);
399 compressorMemcpyBasic(
dst,
src,
size, nThreads, iThread);
406 constexpr const int32_t vec128Elems = CpyVector<uint32_t, Vec128>::Size;
407 constexpr const int32_t vec64Elems = CpyVector<uint32_t, Vec64>::Size;
409 if (
size >= uint32_t(nThreads * vec128Elems)) {
410 compressorMemcpyVectorised<uint32_t, Vec128>(
dst,
src,
size, nThreads, iThread);
411 }
else if (
size >= uint32_t(nThreads * vec64Elems)) {
412 compressorMemcpyVectorised<uint32_t, Vec64>(
dst,
src,
size, nThreads, iThread);
414 compressorMemcpyBasic(
dst,
src,
size, nThreads, iThread);
418template <
typename Scalar,
typename BaseVector>
421 if (not isAlignedTo<BaseVector>(
dst)) {
422 size_t dsti =
reinterpret_cast<size_t>(
dst);
423 int32_t
offset = (
alignof(BaseVector) - dsti %
alignof(BaseVector)) /
sizeof(Scalar);
424 compressorMemcpyBasic(
dst,
src,
offset, nThreads, iThread);
430 BaseVector*
GPUrestrict() dstAligned = reinterpret_cast<BaseVector*>(
dst);
432 using CpyVec = CpyVector<Scalar, BaseVector>;
433 uint32_t sizeAligned =
size / CpyVec::
Size;
435 if (isAlignedTo<BaseVector>(
src)) {
436 const BaseVector*
GPUrestrict() srcAligned = reinterpret_cast<const BaseVector*>(
src);
437 compressorMemcpyBasic(dstAligned, srcAligned, sizeAligned, nThreads, iThread);
439 for (uint32_t
i = iThread;
i < sizeAligned;
i += nThreads) {
441 for (uint32_t
j = 0;
j < CpyVec::Size;
j++) {
442 buf.elems[
j] =
src[
i * CpyVec::Size +
j];
444 dstAligned[
i] =
buf.all;
448 int32_t leftovers =
size % CpyVec::Size;
449 compressorMemcpyBasic(
dst +
size - leftovers,
src +
size - leftovers, leftovers, nThreads, iThread);
455 uint32_t
start = (
size + nBlocks - 1) / nBlocks * iBlock + iThread;
456 uint32_t
end = CAMath::Min(
size, (
size + nBlocks - 1) / nBlocks * (iBlock + 1));
462template <
typename V,
typename T,
typename S>
466 uint32_t dstOffset = 0;
469 T* bufT =
reinterpret_cast<T*
>(
buf);
470 constexpr const int32_t
bufSize = GPUCA_WARP_SIZE;
471 constexpr const int32_t bufTSize =
bufSize *
sizeof(V) /
sizeof(T);
473 for (uint32_t
i = 0;
i < nEntries;
i++) {
475 uint32_t srcOffset = (srcOffsets[
i] * scaleBase1024 / 1024) + diff;
476 uint32_t srcSize = nums[
i] - diff;
478 if (dstAligned ==
nullptr) {
479 if (not isAlignedTo<V>(
dst)) {
480 size_t dsti =
reinterpret_cast<size_t>(
dst);
481 uint32_t
offset = (
alignof(V) - dsti %
alignof(V)) /
sizeof(T);
483 compressorMemcpyBasic(
dst,
src + srcOffset,
offset, nLanes, iLane);
487 if (isAlignedTo<V>(
dst)) {
488 dstAligned =
reinterpret_cast<V*
>(
dst);
491 while (srcPos < srcSize) {
492 uint32_t shmElemsLeft = bufTSize - shmPos;
493 uint32_t srcElemsLeft = srcSize - srcPos;
494 uint32_t
size = CAMath::Min(srcElemsLeft, shmElemsLeft);
495 compressorMemcpyBasic(bufT + shmPos,
src + srcOffset + srcPos,
size, nLanes, iLane);
500 if (shmPos >= bufTSize) {
501 compressorMemcpyBasic(dstAligned + dstOffset,
buf,
bufSize, nLanes, iLane);
509 compressorMemcpyBasic(
reinterpret_cast<T*
>(dstAligned + dstOffset), bufT, shmPos, nLanes, iLane);
516 uint32_t blockOffset = 0;
517 int32_t iThread = nLanes * iWarp + iLane;
518 int32_t nThreads = nLanes * nWarps;
519 uint32_t blockStart = work_group_broadcast(
start, 0);
520 for (uint32_t
i = iThread;
i < blockStart;
i += nThreads) {
521 blockOffset += nums[
i];
523 blockOffset = work_group_reduce_add(blockOffset);
526 for (uint32_t
i =
start + iLane;
i <
end;
i += nLanes) {
530 if (iWarp > -1 && iLane == nLanes - 1) {
531 smem.warpOffset[iWarp] =
offset;
534 offset = (iWarp <= 0) ? 0 : smem.warpOffset[iWarp - 1];
537 return offset + blockOffset;
546 int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
547 int32_t iWarp = iThread / GPUCA_WARP_SIZE;
549 int32_t nLanes = GPUCA_WARP_SIZE;
550 int32_t iLane = iThread % GPUCA_WARP_SIZE;
555 uint32_t rowsPerWarp = (nRows + nWarps - 1) / nWarps;
556 uint32_t rowStart = rowsPerWarp * iWarp;
557 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
558 if (rowStart >= nRows) {
563 uint32_t rowsOffset = calculateWarpOffsets(smem, compressor.
mPtrs.
nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
579 for (uint32_t
i = sectorStart;
i <= sectorEnd &&
i < compressor.
NSECTORS;
i++) {
580 for (uint32_t
j = ((
i == sectorStart) ? sectorRowStart : 0);
j < ((
i == sectorEnd) ? sectorRowEnd :
GPUTPCGeometry::NROWS);
j++) {
597 uint32_t trackStart = tracksPerWarp * iWarp;
604 uint32_t tracksOffset = calculateWarpOffsets(smem, compressor.
mPtrs.
nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
606 for (uint32_t
i = trackStart;
i < trackEnd;
i += nLanes) {
607 uint32_t nTrackClusters = 0;
608 uint32_t srcOffset = 0;
610 if (
i + iLane < trackEnd) {
614 smem.unbuffered.sizes[iWarp][iLane] = nTrackClusters;
615 smem.unbuffered.srcOffsets[iWarp][iLane] = srcOffset;
617 uint32_t elems = (
i + nLanes < trackEnd) ? nLanes : (trackEnd -
i);
619 for (uint32_t
j = 0;
j < elems;
j++) {
620 nTrackClusters = smem.unbuffered.sizes[iWarp][
j];
621 srcOffset = smem.unbuffered.srcOffsets[iWarp][
j];
622 uint32_t idx =
i +
j;
623 compressorMemcpy(compressor.
mOutput->
qTotA + tracksOffset, compressor.
mPtrs.
qTotA + srcOffset, nTrackClusters, nLanes, iLane);
624 compressorMemcpy(compressor.
mOutput->
qMaxA + tracksOffset, compressor.
mPtrs.
qMaxA + srcOffset, nTrackClusters, nLanes, iLane);
625 compressorMemcpy(compressor.
mOutput->
flagsA + tracksOffset, compressor.
mPtrs.
flagsA + srcOffset, nTrackClusters, nLanes, iLane);
630 compressorMemcpy(compressor.
mOutput->
rowDiffA + tracksOffset - idx, compressor.
mPtrs.
rowDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
632 compressorMemcpy(compressor.
mOutput->
padResA + tracksOffset - idx, compressor.
mPtrs.
padResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
633 compressorMemcpy(compressor.
mOutput->
timeResA + tracksOffset - idx, compressor.
mPtrs.
timeResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);
635 tracksOffset += nTrackClusters;
648 int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
649 int32_t iWarp = iThread / GPUCA_WARP_SIZE;
651 int32_t nGlobalWarps = nWarps * nBlocks;
652 int32_t iGlobalWarp = nWarps * iBlock + iWarp;
654 int32_t nLanes = GPUCA_WARP_SIZE;
655 int32_t iLane = iThread % GPUCA_WARP_SIZE;
657 auto& input = compressor.
mPtrs;
661 uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
662 uint32_t rowStart = rowsPerWarp * iGlobalWarp;
663 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
664 if (rowStart >= nRows) {
668 rowsPerWarp = rowEnd - rowStart;
670 uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
673 uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
674 uint32_t trackStart = tracksPerWarp * iGlobalWarp;
675 uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
676 if (trackStart >= nStoredTracks) {
680 tracksPerWarp = trackEnd - trackStart;
682 uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
694 const uint32_t* clusterOffsets = &
clusters->clusterOffset[0][0] + rowStart;
695 const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;
697 auto*
buf = smem.getBuffer<V>(iWarp);
699 compressorMemcpyBuffered(
buf,
output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
700 compressorMemcpyBuffered(
buf,
output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
701 compressorMemcpyBuffered(
buf,
output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
702 compressorMemcpyBuffered(
buf,
output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
703 compressorMemcpyBuffered(
buf,
output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
704 compressorMemcpyBuffered(
buf,
output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
705 compressorMemcpyBuffered(
buf,
output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.
mMaxClusterFactorBase1024);
707 const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
710 compressorMemcpyBuffered(
buf,
output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
711 compressorMemcpyBuffered(
buf,
output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
712 compressorMemcpyBuffered(
buf,
output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
713 compressorMemcpyBuffered(
buf,
output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
714 compressorMemcpyBuffered(
buf,
output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
717 uint32_t tracksOffsetDiff = tracksOffset - trackStart;
718 compressorMemcpyBuffered(
buf,
output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
719 compressorMemcpyBuffered(
buf,
output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
720 compressorMemcpyBuffered(
buf,
output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
721 compressorMemcpyBuffered(
buf,
output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
728 const auto& input = compressor.mPtrs;
729 auto*
output = compressor.mOutput;
731 const int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
732 const int32_t iWarp = iThread / GPUCA_WARP_SIZE;
733 const int32_t nLanes = GPUCA_WARP_SIZE;
734 const int32_t iLane = iThread % GPUCA_WARP_SIZE;
738 compressorMemcpyBasic(
output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS *
GPUTPCGeometry::NROWS, nThreads, iThread);
739 compressorMemcpyBasic(
output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
740 compressorMemcpyBasic(
output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
741 compressorMemcpyBasic(
output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
742 compressorMemcpyBasic(
output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
743 compressorMemcpyBasic(
output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
744 compressorMemcpyBasic(
output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
745 }
else if (iBlock & 1) {
746 const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
747 const uint32_t iGlobalWarp = nWarps * (iBlock - 1) / 2 + iWarp;
750 uint32_t rowsPerWarp = (
nRows + nGlobalWarps - 1) / nGlobalWarps;
751 uint32_t rowStart = rowsPerWarp * iGlobalWarp;
752 uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
753 if (rowStart >= nRows) {
757 rowsPerWarp = rowEnd - rowStart;
759 const uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
760 const uint32_t* clusterOffsets = &
clusters->clusterOffset[0][0] + rowStart;
761 const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;
763 compressorMemcpyBuffered(
buf,
output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
764 compressorMemcpyBuffered(
buf,
output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
765 compressorMemcpyBuffered(
buf,
output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
766 compressorMemcpyBuffered(
buf,
output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
767 compressorMemcpyBuffered(
buf,
output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
768 compressorMemcpyBuffered(
buf,
output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
769 compressorMemcpyBuffered(
buf,
output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
771 const uint32_t nGlobalWarps = nWarps * (nBlocks - 1) / 2;
772 const uint32_t iGlobalWarp = nWarps * (iBlock / 2 - 1) + iWarp;
774 const uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
775 uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
776 uint32_t trackStart = tracksPerWarp * iGlobalWarp;
777 uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
778 if (trackStart >= nStoredTracks) {
782 tracksPerWarp = trackEnd - trackStart;
784 const uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
785 const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
786 const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;
788 compressorMemcpyBuffered(
buf,
output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
789 compressorMemcpyBuffered(
buf,
output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
790 compressorMemcpyBuffered(
buf,
output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
791 compressorMemcpyBuffered(
buf,
output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
792 compressorMemcpyBuffered(
buf,
output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
795 uint32_t tracksOffsetDiff = tracksOffset - trackStart;
796 compressorMemcpyBuffered(
buf,
output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
797 compressorMemcpyBuffered(
buf,
output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
798 compressorMemcpyBuffered(
buf,
output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
799 compressorMemcpyBuffered(
buf,
output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
806 gatherBuffered<Vec32>(nBlocks, nThreads, iBlock, iThread, smem, processors);
812 gatherBuffered<Vec64>(nBlocks, nThreads, iBlock, iThread, smem, processors);
818 gatherBuffered<Vec128>(nBlocks, nThreads, iBlock, iThread, smem, processors);
824 gatherMulti(nBlocks, nThreads, iBlock, iThread, smem, processors);
#define get_local_size(dim)
#define get_local_id(dim)
#define get_global_size(dim)
#define get_global_id(dim)
#define GPUCA_GET_THREAD_COUNT(...)
GPUdii() void GPUTPCCompressionKernels
uint32_t * mAttachedClusterFirstIndex
o2::tpc::CompressedClusters * mOutput
static constexpr uint32_t NSECTORS
o2::tpc::CompressedClustersPtrs mPtrs
size_t mMaxClusterFactorBase1024
static constexpr uint32_t NROWS
static constexpr uint32_t NSECTORS
GLboolean GLboolean GLboolean b
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLenum GLuint GLenum GLsizei const GLchar * buf
GLdouble GLdouble GLdouble z
Global TPC definitions and constants.
GPUdi() T BetheBlochAleph(T bg
constexpr std::array< int, nLayers > nRows
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
uint32_t nStoredAttachedClusters
const o2::tpc::ClusterNativeAccess * clustersNative
const uint32_t * mergedTrackHitAttachment
const GPUTPCGMMergedTrackHit * mergedTrackHits
const GPUTPCGMMergedTrack * mergedTracks
std::vector< std::byte > getBuffer(const char *filename)
std::vector< Cluster > clusters
for(int irof=0;irof< 1000;irof++)