// d9/d57/GPUTPCCompressionKernels_8cxx_source.html

// Copyright 2019-2020 CERN and copyright holders of ALICE O2.

// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.

// All rights not expressly granted are reserved.

//

// This software is distributed under the terms of the GNU General Public

// License v3 (GPL Version 3), copied verbatim in the file "COPYING".

//

// In applying this license CERN does not waive the privileges and immunities

// granted to it by virtue of its status as an Intergovernmental Organization

// or submit itself to any jurisdiction.


#include "GPUTPCCompressionKernels.h"

#include "GPUConstantMem.h"

#include "GPUO2DataTypes.h"

#include "GPUParam.h"

#include "GPUCommonAlgorithm.h"

#include "GPUTPCCompressionTrackModel.h"

#include "GPUTPCClusterRejection.h"

#include "GPUTPCCompressionKernels.inc"


using namespace o2::gpu;

using namespace o2::tpc;


/// Kernel step0attached: stores the clusters attached to merged tracks in the
/// track-model ("...A") arrays of compressor.mPtrs.
///
/// Each thread processes whole tracks (global thread id, stepping by the global size).
/// For every accepted track it walks the track's hits from the last to the first,
/// skips hits that are flagged as rejected or whose main attachment is another track,
/// and marks every handled cluster in compressor.mClusterStatus. The first stored
/// cluster of a track is written to the per-track arrays (qPtA, rowA, sliceA, timeA, padA);
/// later clusters are written as row / sector differences and as pad / time residuals
/// relative to the position taken from the GPUTPCCompressionTrackModel.
/// The counters nStoredTracks and nStoredAttachedClusters in compressor.mMemory are
/// updated atomically.
template <>


GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0attached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)

{

  const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;

  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;

  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;

  const GPUParam& GPUrestrict() param = processors.param;


  // Both are only meaningful once the first cluster of the current track has been stored.
  uint8_t lastLeg = 0;

  int32_t myTrack = 0;

  // One merged track per thread and iteration.
  for (uint32_t i = get_global_id(0); i < ioPtrs.nMergedTracks; i += get_global_size(0)) {

    GPUbarrierWarp();

    const GPUTPCGMMergedTrack& GPUrestrict() trk = ioPtrs.mergedTracks[i];

    if (!trk.OK()) {

      continue;

    }

    // Track-level rejection: scaled |q/pt| above the configured threshold, or a merged looper.
    bool rejectTrk = CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper();

    uint32_t nClustersStored = 0;

    CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;

    uint8_t lastRow = 0, lastSector = 0;

    GPUTPCCompressionTrackModel track;

    // z of the first stored cluster; the track model works with z relative to it.
    float zOffset = 0;

    // Walk the hits of the track in reverse order.
    for (int32_t k = trk.NClusters() - 1; k >= 0; k--) {

      const GPUTPCGMMergedTrackHit& GPUrestrict() hit = ioPtrs.mergedTrackHits[trk.FirstClusterRef() + k];

      if (hit.state & GPUTPCGMMergedTrackHit::flagReject) {

        continue;

      }


      int32_t hitId = hit.num;

      int32_t attach = ioPtrs.mergedTrackHitAttachment[hitId];

      if ((attach & gputpcgmmergertypes::attachTrackMask) != i) {

        continue; // Main attachment to different track

      }

      // Cluster rejection only applies from RejectionStrategyA upwards.
      bool rejectCluster = processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA && (rejectTrk || GPUTPCClusterRejection::GetIsRejected(attach));

      if (rejectCluster) {

        compressor.mClusterStatus[hitId] = 1; // Cluster rejected, do not store

        continue;

      }


      if (!(param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) {

        continue; // No track model compression

      }

      // hit.num is used as an index over all clusters; subtracting the row's offset gives the index inside the row.
      const ClusterNative& GPUrestrict() orgCl = clusters -> clusters[hit.sector][hit.row][hit.num - clusters->clusterOffset[hit.sector][hit.row]];

      constexpr GPUTPCGeometry geo;

      float x = geo.Row2X(hit.row);

      float y = track.LinearPad2Y(hit.sector, orgCl.getPad(), geo.PadWidth(hit.row), geo.NPads(hit.row));

      float z = geo.LinearTime2Z(hit.sector, orgCl.getTime());

      // From the second stored cluster on, bring the track model to this cluster first.
      // Any of the three conditions below ends the storage of this track.
      if (nClustersStored) {

        // Exactly one of the two sector numbers is below GPUCA_NSECTORS.
        if ((hit.sector < GPUCA_NSECTORS) ^ (lastSector < GPUCA_NSECTORS)) {

          break;

        }

        // Leg changed: mirror the model; a non-zero return stops the track (presumably an error code - TODO confirm).
        if (lastLeg != hit.leg && track.Mirror()) {

          break;

        }

        // A non-zero return from Propagate stops the track as well (presumably an error code - TODO confirm).
        if (track.Propagate(geo.Row2X(hit.row), param.SectorParam[hit.sector].Alpha)) {

          break;

        }

      }


      compressor.mClusterStatus[hitId] = 1; // Cluster compressed in track model, do not store as difference


      // Slot in the per-cluster arrays: consecutive entries starting at the track's first cluster reference.
      int32_t cidx = trk.FirstClusterRef() + nClustersStored++;

      if (nClustersStored == 1) {

        // Quantise q/pt to 8 bits: |q/pt| < 20 is scaled linearly around 127, larger values saturate at 254 (positive) or 0.
        uint8_t qpt = fabs(trk.GetParam().GetQPt()) < 20.f ? (trk.GetParam().GetQPt() * (127.f / 20.f) + 127.5f) : (trk.GetParam().GetQPt() > 0 ? 254 : 0);

        zOffset = z;

        track.Init(x, y, z - zOffset, param.SectorParam[hit.sector].Alpha, qpt, param);


        // Reserve the next free entry of the per-track arrays.
        myTrack = CAMath::AtomicAdd(&compressor.mMemory->nStoredTracks, 1u);

        compressor.mAttachedClusterFirstIndex[myTrack] = trk.FirstClusterRef();

        lastLeg = hit.leg;

        c.qPtA[myTrack] = qpt;

        c.rowA[myTrack] = hit.row;

        c.sliceA[myTrack] = hit.sector;

        c.timeA[myTrack] = orgCl.getTimePacked();

        c.padA[myTrack] = orgCl.padPacked;

      } else {

        uint32_t row = hit.row;

        uint32_t sector = hit.sector;


        // With CompressionDifferences, row and sector are stored relative to the previous stored cluster,
        // wrapped into the non-negative range by adding the row / sector count.
        if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {

          if (lastRow > row) {

            row += GPUCA_ROW_COUNT;

          }

          row -= lastRow;

          if (lastSector > sector) {

            sector += compressor.NSECTORS;

          }

          sector -= lastSector;

        }

        c.rowDiffA[cidx] = row;

        // A change of leg is encoded by adding NSECTORS to the sector value.
        c.sliceLegDiffA[cidx] = (hit.leg == lastLeg ? 0 : compressor.NSECTORS) + sector;

        // Pad derived from the track model's Y, clamped to [0, NPads(GPUCA_ROW_COUNT - 1)].
        float pad = CAMath::Max(0.f, CAMath::Min((float)geo.NPads(GPUCA_ROW_COUNT - 1), track.LinearY2Pad(hit.sector, track.Y(), geo.PadWidth(hit.row), geo.NPads(hit.row))));

        c.padResA[cidx] = orgCl.padPacked - orgCl.packPad(pad);

        // Time derived from the track model's Z (shifted back by zOffset), not below 0.
        float time = CAMath::Max(0.f, geo.LinearZ2Time(hit.sector, track.Z() + zOffset));

        // Only the low 24 bits of the time residual are kept.
        c.timeResA[cidx] = (orgCl.getTimePacked() - orgCl.packTime(time)) & 0xFFFFFF;

        lastLeg = hit.leg;

      }

      // Charge and width are copied from the cluster, optionally with truncated precision.
      uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;

      uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;

      if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {

        compressor.truncateSignificantBitsChargeMax(qmax, param);

        compressor.truncateSignificantBitsCharge(qtot, param);

        compressor.truncateSignificantBitsWidth(sigmapad, param);

        compressor.truncateSignificantBitsWidth(sigmatime, param);

      }

      c.qTotA[cidx] = qtot;

      c.qMaxA[cidx] = qmax;

      c.sigmaPadA[cidx] = sigmapad;

      c.sigmaTimeA[cidx] = sigmatime;

      c.flagsA[cidx] = orgCl.getFlags();

      // Update the model with this cluster unless it is the last hit to visit (k == 0);
      // a non-zero return from Filter stops the track.
      if (k && track.Filter(y, z - zOffset, hit.row)) {

        break;

      }

      lastRow = hit.row;

      lastSector = hit.sector;

    }

    // Publish the number of clusters stored for this track.
    if (nClustersStored) {

      CAMath::AtomicAdd(&compressor.mMemory->nStoredAttachedClusters, nClustersStored);

      c.nTrackClusters[myTrack] = nClustersStored;

    }

  }

}


/// Comparator specialisation 0: orders two cluster indices by ascending packed time.
template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<0>::operator()(uint32_t a, uint32_t b) const
{
  const auto timeA = mClsPtr[a].getTimePacked();
  const auto timeB = mClsPtr[b].getTimePacked();
  return timeA < timeB;
}


/// Comparator specialisation 1: orders two cluster indices by ascending packed pad.
template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<1>::operator()(uint32_t a, uint32_t b) const
{
  const auto padA = mClsPtr[a].padPacked;
  const auto padB = mClsPtr[b].padPacked;
  return padA < padB;
}


/// Comparator specialisation 2: clusters whose packed time agrees after dropping the
/// three lowest bits are ordered by packed pad; all others by packed time.
template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::operator()(uint32_t a, uint32_t b) const
{
  const auto& lhs = mClsPtr[a];
  const auto& rhs = mClsPtr[b];
  const bool sameTimeBin = (lhs.getTimePacked() >> 3) == (rhs.getTimePacked() >> 3);
  return sameTimeBin ? (lhs.padPacked < rhs.padPacked) : (lhs.getTimePacked() < rhs.getTimePacked());
}


/// Comparator specialisation 3: clusters whose packed pad agrees after dropping the
/// three lowest bits are ordered by packed time; all others by packed pad.
template <>
GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::operator()(uint32_t a, uint32_t b) const
{
  const auto& lhs = mClsPtr[a];
  const auto& rhs = mClsPtr[b];
  const bool samePadBin = (lhs.padPacked >> 3) == (rhs.padPacked >> 3);
  return samePadBin ? (lhs.getTimePacked() < rhs.getTimePacked()) : (lhs.padPacked < rhs.padPacked);
}


/// Kernel step1unattached: stores the clusters that step0attached did not handle in the
/// "...U" arrays of compressor.mPtrs.
///
/// Each block processes whole (sector, row) pairs. The threads of the block select the
/// clusters of the row to keep (status not set, not rejected), collect their in-row
/// indices in the shared sortBuffer in chunks of at most GPUCA_TPC_COMP_CHUNK_SIZE,
/// optionally sort each chunk, and encode it via GPUTPCCompression_EncodeUnattached.
/// The per-row count goes to nSliceRowClusters and is added atomically to
/// nStoredUnattachedClusters.
template <>

GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)

{

  const GPUTrackingInOutPointers& GPUrestrict() ioPtrs = processors.ioPtrs;

  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = ioPtrs.clustersNative;

  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;

  GPUParam& GPUrestrict() param = processors.param;

  uint32_t* sortBuffer = smem.sortBuffer;

  // One (sector, row) pair per block and iteration.
  for (int32_t iSectorRow = iBlock; iSectorRow < GPUCA_NSECTORS * GPUCA_ROW_COUNT; iSectorRow += nBlocks) {

    const uint32_t iSector = iSectorRow / GPUCA_ROW_COUNT;

    const uint32_t iRow = iSectorRow % GPUCA_ROW_COUNT;

    // Index of the row's first cluster in the input.
    const uint32_t idOffset = clusters->clusterOffset[iSector][iRow];

    // Start of this row's region in the output arrays: the input offset scaled by mMaxClusterFactorBase1024 / 1024.
    const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024;

    // End of this row's output region, derived from the offset of the following row.
    const uint32_t idOffsetOutMax = ((const uint32_t*)clusters->clusterOffset[iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024; // Array out of bounds access is ok, since it goes to the correct nClustersTotal

    // The last thread owns the shared bookkeeping (nCount, lastIndex).
    if (iThread == nThreads - 1) {

      smem.nCount = 0;

    }

    // Clusters of this row written to the output so far.
    uint32_t totalCount = 0;

    GPUbarrier();


    CompressedClustersPtrs& GPUrestrict() c = compressor.mPtrs;


    // Cluster count of the row rounded up to a multiple of the kernel's thread count.
    const uint32_t nn = CAMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);

    // The bound nn + nThreads adds one more pass after the input is exhausted
    // (presumably to flush what is still buffered - TODO confirm).
    for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {

      const int32_t idx = idOffset + i;

      // Set to 1 below if cluster i of this row has to be stored.
      int32_t cidx = 0;

      // Single-pass block: every "break" means "do not store this cluster".
      do {

        // Index beyond the clusters of this row.
        if (i >= clusters->nClusters[iSector][iRow]) {

          break;

        }

        // Status already set for this cluster (step0attached sets it for stored and for rejected clusters).
        if (compressor.mClusterStatus[idx]) {

          break;

        }

        int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];

        bool unattached = attach == 0;


        if (unattached) {

          // Unattached clusters are dropped from RejectionStrategyB upwards.
          if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {

            break;

          }

        } else if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {

          if (GPUTPCClusterRejection::GetIsRejected(attach)) {

            break;

          }

          // Same track-level rejection criterion as in step0attached.
          int32_t id = attach & gputpcgmmergertypes::attachTrackMask;

          auto& trk = ioPtrs.mergedTracks[id];

          if (CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) {

            break;

          }

        }

        cidx = 1;

      } while (false);


      GPUbarrier();

      // Inclusive scan over the block: for a selected cluster, myIndex is its 1-based position in this pass.
      int32_t myIndex = work_group_scan_inclusive_add(cidx);

      // Position to use after the flush for an entry that does not fit into the current chunk; -1 if none.
      int32_t storeLater = -1;

      if (cidx) {

        if (smem.nCount + myIndex <= GPUCA_TPC_COMP_CHUNK_SIZE) {

          sortBuffer[smem.nCount + myIndex - 1] = i;

        } else {

          storeLater = smem.nCount + myIndex - 1 - GPUCA_TPC_COMP_CHUNK_SIZE;

        }

      }

      GPUbarrier();

      // The scan result of the last thread is the number of clusters selected in this pass.
      if (iThread == nThreads - 1) {

        smem.nCount += myIndex;

      }

      GPUbarrier();


      // Keep collecting while the chunk is not full and input passes remain.
      if (smem.nCount < GPUCA_TPC_COMP_CHUNK_SIZE && i < nn) {

        continue;

      }


      // Number of entries written out in this flush.
      uint32_t count = CAMath::Min(smem.nCount, (uint32_t)GPUCA_TPC_COMP_CHUNK_SIZE);

      // The flush would run past the output region of this row: report the error once and give up on the row.
      if (idOffsetOut + totalCount + count > idOffsetOutMax) {

        if (iThread == nThreads - 1) {

          compressor.raiseError(GPUErrors::ERROR_COMPRESSION_ROW_HIT_OVERFLOW, iSector * 1000 + iRow, idOffsetOut + totalCount + count, idOffsetOutMax);

        }

        break;

      }

      // With difference encoding, sort the chunk with the comparator selected by compressionSortOrder.
      if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionDifferences) {

        if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {

          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));

        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {

          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));

        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {

          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));

        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {

          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));

        }

        GPUbarrier();

      }


      // Encode the chunk; the threads of the block share the entries.
      for (uint32_t j = get_local_id(0); j < count; j += get_local_size(0)) {

        int32_t outidx = idOffsetOut + totalCount + j;

        const ClusterNative& GPUrestrict() orgCl = clusters -> clusters[iSector][iRow][sortBuffer[j]];


        // Predecessor handed to the encoder: the previous entry of the chunk, else the last entry of the
        // previous chunk of this row, else none (-1) for the first cluster of the row.
        int32_t preId = j != 0 ? (int32_t)sortBuffer[j - 1] : (totalCount != 0 ? (int32_t)smem.lastIndex : -1);

        GPUTPCCompression_EncodeUnattached(param.rec.tpc.compressionTypeMask, orgCl, c.timeDiffU[outidx], c.padDiffU[outidx], preId == -1 ? nullptr : &clusters->clusters[iSector][iRow][preId]);


        // Charge and width are copied from the cluster, optionally with truncated precision.
        uint16_t qtot = orgCl.qTot, qmax = orgCl.qMax;

        uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;

        if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {

          compressor.truncateSignificantBitsChargeMax(qmax, param);

          compressor.truncateSignificantBitsCharge(qtot, param);

          compressor.truncateSignificantBitsWidth(sigmapad, param);

          compressor.truncateSignificantBitsWidth(sigmatime, param);

        }

        c.qTotU[outidx] = qtot;

        c.qMaxU[outidx] = qmax;

        c.sigmaPadU[outidx] = sigmapad;

        c.sigmaTimeU[outidx] = sigmatime;

        c.flagsU[outidx] = orgCl.getFlags();

      }


      GPUbarrier();

      // Entries that did not fit before the flush now go to the front of the buffer.
      if (storeLater >= 0) {

        sortBuffer[storeLater] = i;

      }

      totalCount += count;

      // Remember the last flushed index as predecessor for the next chunk and reduce the pending count.
      if (iThread == nThreads - 1 && count) {

        smem.lastIndex = sortBuffer[count - 1];

        smem.nCount -= count;

      }

    }


    // Publish the number of clusters stored for this row.
    if (iThread == nThreads - 1) {

      c.nSliceRowClusters[iSector * GPUCA_ROW_COUNT + iRow] = totalCount;

      CAMath::AtomicAdd(&compressor.mMemory->nStoredUnattachedClusters, totalCount);

    }

    GPUbarrier();

  }

}


/// Specialisation for Vec32: returns the buf32 entry indexed by iWarp.
template <>
GPUdi() GPUTPCCompressionGatherKernels::Vec32* GPUTPCCompressionGatherKernels::GPUSharedMemory::getBuffer<GPUTPCCompressionGatherKernels::Vec32>(int32_t iWarp)
{
  auto* const warpBuffer = buf32[iWarp];
  return warpBuffer;
}


/// Specialisation for Vec64: returns the buf64 entry indexed by iWarp.
template <>
GPUdi() GPUTPCCompressionGatherKernels::Vec64* GPUTPCCompressionGatherKernels::GPUSharedMemory::getBuffer<GPUTPCCompressionGatherKernels::Vec64>(int32_t iWarp)
{
  auto* const warpBuffer = buf64[iWarp];
  return warpBuffer;
}


/// Specialisation for Vec128: returns the buf128 entry indexed by iWarp.
template <>
GPUdi() GPUTPCCompressionGatherKernels::Vec128* GPUTPCCompressionGatherKernels::GPUSharedMemory::getBuffer<GPUTPCCompressionGatherKernels::Vec128>(int32_t iWarp)
{
  auto* const warpBuffer = buf128[iWarp];
  return warpBuffer;
}


/// Tells whether ptr satisfies the alignment requirement of T.
/// If S is already at least as strictly aligned as T the answer is known at compile
/// time; otherwise the address is checked against alignof(T).
template <typename T, typename S>
GPUdi() bool GPUTPCCompressionGatherKernels::isAlignedTo(const S* ptr)
{
  if constexpr (alignof(S) < alignof(T)) {
    const size_t address = reinterpret_cast<size_t>(ptr);
    return (address % alignof(T)) == 0;
  } else {
    // Pointer value is irrelevant here; the cast only silences the unused-parameter warning.
    static_cast<void>(ptr);
    return true;
  }
}


/// Copy of `size` uint8_t elements shared among nThreads threads.
/// Picks the widest vector type (128, 64, 32, then 16 bit) for which `size` is at least
/// one vector per thread, and falls back to the element-wise copy otherwise.
template <>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint8_t>(uint8_t* GPUrestrict() dst, const uint8_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  // Number of uint8_t elements held by one vector of each width.
  constexpr const int32_t elemsPer128 = CpyVector<uint8_t, Vec128>::Size;
  constexpr const int32_t elemsPer64 = CpyVector<uint8_t, Vec64>::Size;
  constexpr const int32_t elemsPer32 = CpyVector<uint8_t, Vec32>::Size;
  constexpr const int32_t elemsPer16 = CpyVector<uint8_t, Vec16>::Size;

  if (size >= uint32_t(nThreads * elemsPer128)) {
    compressorMemcpyVectorised<uint8_t, Vec128>(dst, src, size, nThreads, iThread);
    return;
  }
  if (size >= uint32_t(nThreads * elemsPer64)) {
    compressorMemcpyVectorised<uint8_t, Vec64>(dst, src, size, nThreads, iThread);
    return;
  }
  if (size >= uint32_t(nThreads * elemsPer32)) {
    compressorMemcpyVectorised<uint8_t, Vec32>(dst, src, size, nThreads, iThread);
    return;
  }
  if (size >= uint32_t(nThreads * elemsPer16)) {
    compressorMemcpyVectorised<uint8_t, Vec16>(dst, src, size, nThreads, iThread);
    return;
  }
  compressorMemcpyBasic(dst, src, size, nThreads, iThread);
}


/// Copy of `size` uint16_t elements shared among nThreads threads.
/// Picks the widest vector type (128, 64, then 32 bit) for which `size` is at least
/// one vector per thread, and falls back to the element-wise copy otherwise.
template <>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint16_t>(uint16_t* GPUrestrict() dst, const uint16_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  // Number of uint16_t elements held by one vector of each width.
  constexpr const int32_t elemsPer128 = CpyVector<uint16_t, Vec128>::Size;
  constexpr const int32_t elemsPer64 = CpyVector<uint16_t, Vec64>::Size;
  constexpr const int32_t elemsPer32 = CpyVector<uint16_t, Vec32>::Size;

  if (size >= uint32_t(nThreads * elemsPer128)) {
    compressorMemcpyVectorised<uint16_t, Vec128>(dst, src, size, nThreads, iThread);
    return;
  }
  if (size >= uint32_t(nThreads * elemsPer64)) {
    compressorMemcpyVectorised<uint16_t, Vec64>(dst, src, size, nThreads, iThread);
    return;
  }
  if (size >= uint32_t(nThreads * elemsPer32)) {
    compressorMemcpyVectorised<uint16_t, Vec32>(dst, src, size, nThreads, iThread);
    return;
  }
  compressorMemcpyBasic(dst, src, size, nThreads, iThread);
}


/// Copy of `size` uint32_t elements shared among nThreads threads.
/// Picks the widest vector type (128, then 64 bit) for which `size` is at least one
/// vector per thread, and falls back to the element-wise copy otherwise.
template <>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpy<uint32_t>(uint32_t* GPUrestrict() dst, const uint32_t* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  // Number of uint32_t elements held by one vector of each width.
  constexpr const int32_t elemsPer128 = CpyVector<uint32_t, Vec128>::Size;
  constexpr const int32_t elemsPer64 = CpyVector<uint32_t, Vec64>::Size;

  if (size >= uint32_t(nThreads * elemsPer128)) {
    compressorMemcpyVectorised<uint32_t, Vec128>(dst, src, size, nThreads, iThread);
    return;
  }
  if (size >= uint32_t(nThreads * elemsPer64)) {
    compressorMemcpyVectorised<uint32_t, Vec64>(dst, src, size, nThreads, iThread);
    return;
  }
  compressorMemcpyBasic(dst, src, size, nThreads, iThread);
}


/// Copies `size` Scalar elements from src to dst, writing the bulk as BaseVector stores.
///
/// Three phases:
///  1. if dst is not aligned for BaseVector, copy leading elements one by one up to the
///     next alignment boundary;
///  2. copy the whole vectors - directly vector by vector when src is aligned as well,
///     otherwise by assembling each vector from individual source elements;
///  3. copy the elements left over after the last whole vector one by one.
/// The work of every phase is shared among nThreads threads; iThread is the caller's index.
template <typename Scalar, typename BaseVector>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyVectorised(Scalar* dst, const Scalar* src, uint32_t size, int32_t nThreads, int32_t iThread)
{
  using CpyVec = CpyVector<Scalar, BaseVector>;

  // Phase 1: unaligned head of the destination.
  if (!isAlignedTo<BaseVector>(dst)) {
    const size_t dstAddress = reinterpret_cast<size_t>(dst);
    int32_t headElems = (alignof(BaseVector) - dstAddress % alignof(BaseVector)) / sizeof(Scalar);
    compressorMemcpyBasic(dst, src, headElems, nThreads, iThread);
    dst += headElems;
    src += headElems;
    size -= headElems;
  }

  // Phase 2: whole vectors.
  BaseVector* GPUrestrict() dstVectors = reinterpret_cast<BaseVector*>(dst);
  uint32_t nVectors = size / CpyVec::Size;

  if (!isAlignedTo<BaseVector>(src)) {
    // Source is misaligned: gather the scalars of each vector, then store the vector at once.
    for (uint32_t iVec = iThread; iVec < nVectors; iVec += nThreads) {
      const Scalar* firstElem = src + iVec * CpyVec::Size;
      CpyVec staged;
      for (uint32_t e = 0; e < CpyVec::Size; e++) {
        staged.elems[e] = firstElem[e];
      }
      dstVectors[iVec] = staged.all;
    }
  } else {
    const BaseVector* GPUrestrict() srcVectors = reinterpret_cast<const BaseVector*>(src);
    compressorMemcpyBasic(dstVectors, srcVectors, nVectors, nThreads, iThread);
  }

  // Phase 3: elements behind the last whole vector.
  int32_t tailElems = size % CpyVec::Size;
  compressorMemcpyBasic(dst + size - tailElems, src + size - tailElems, tailElems, nThreads, iThread);
}


/// Element-wise copy of `size` elements from src to dst.
/// The index range is cut into nBlocks consecutive slices of ceil(size / nBlocks)
/// elements; inside slice iBlock, the calling thread starts at offset iThread and
/// advances by nThreads.
template <typename T>
GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBasic(T* GPUrestrict() dst, const T* GPUrestrict() src, uint32_t size, int32_t nThreads, int32_t iThread, int32_t nBlocks, int32_t iBlock)
{
  const uint32_t sliceLength = (size + nBlocks - 1) / nBlocks;
  const uint32_t sliceEnd = CAMath::Min(size, sliceLength * (iBlock + 1));

  uint32_t pos = sliceLength * iBlock + iThread;
  while (pos < sliceEnd) {
    dst[pos] = src[pos];
    pos += nThreads;
  }
}


/// Concatenates nEntries runs of T elements from src into dst, staging the data in buf
/// so that the bulk of the output is written as elements of vector type V.
///
/// @param buf           staging buffer, used as GPUCA_WARP_SIZE elements of V
/// @param dst           start of the output
/// @param src           base of the input
/// @param nums          nums[i] - diff is the number of elements taken for entry i
/// @param srcOffsets    srcOffsets[i] * scaleBase1024 / 1024 + diff is the first source element of entry i
/// @param nEntries      number of entries
/// @param nLanes        number of cooperating threads (passed on as thread count to compressorMemcpyBasic)
/// @param iLane         index of the calling thread among them
/// @param diff          number of leading elements skipped in every entry
/// @param scaleBase1024 scale factor for srcOffsets, in units of 1/1024
template <typename V, typename T, typename S>


GPUdi() void GPUTPCCompressionGatherKernels::compressorMemcpyBuffered(V* buf, T* GPUrestrict() dst, const T* GPUrestrict() src, const S* GPUrestrict() nums, const uint32_t* GPUrestrict() srcOffsets, uint32_t nEntries, int32_t nLanes, int32_t iLane, int32_t diff, size_t scaleBase1024)

{

  // Fill level of the staging buffer, counted in elements of T.
  int32_t shmPos = 0;

  // Number of V vectors already written behind dstAligned.
  uint32_t dstOffset = 0;

  // Aligned start of the vector output; stays nullptr until dst has reached V alignment.
  V* GPUrestrict() dstAligned = nullptr;


  T* bufT = reinterpret_cast<T*>(buf);

  constexpr const int32_t bufSize = GPUCA_WARP_SIZE;

  // Capacity of the staging buffer in elements of T.
  constexpr const int32_t bufTSize = bufSize * sizeof(V) / sizeof(T);


  for (uint32_t i = 0; i < nEntries; i++) {

    // Elements of this entry consumed so far.
    uint32_t srcPos = 0;

    uint32_t srcOffset = (srcOffsets[i] * scaleBase1024 / 1024) + diff;

    uint32_t srcSize = nums[i] - diff;


    // Until dst is aligned for V, copy elements directly to dst, at most up to the next
    // alignment boundary and never more than this entry holds.
    if (dstAligned == nullptr) {

      if (not isAlignedTo<V>(dst)) {

        size_t dsti = reinterpret_cast<size_t>(dst);

        uint32_t offset = (alignof(V) - dsti % alignof(V)) / sizeof(T);

        offset = CAMath::Min<uint32_t>(offset, srcSize);

        compressorMemcpyBasic(dst, src + srcOffset, offset, nLanes, iLane);

        dst += offset;

        srcPos += offset;

      }

      // If the entry was too short to reach the boundary, the next entry tries again.
      if (isAlignedTo<V>(dst)) {

        dstAligned = reinterpret_cast<V*>(dst);

      }

    }

    // Move the rest of the entry through the staging buffer.
    while (srcPos < srcSize) {

      uint32_t shmElemsLeft = bufTSize - shmPos;

      uint32_t srcElemsLeft = srcSize - srcPos;

      uint32_t size = CAMath::Min(srcElemsLeft, shmElemsLeft);

      compressorMemcpyBasic(bufT + shmPos, src + srcOffset + srcPos, size, nLanes, iLane);

      srcPos += size;

      shmPos += size;

      GPUbarrierWarp();


      // Staging buffer full: write it out as bufSize vectors of V and start over.
      if (shmPos >= bufTSize) {

        compressorMemcpyBasic(dstAligned + dstOffset, buf, bufSize, nLanes, iLane);

        dstOffset += bufSize;

        shmPos = 0;

        GPUbarrierWarp();

      }

    }

  }


  // Write what is left in the staging buffer element by element.
  compressorMemcpyBasic(reinterpret_cast<T*>(dstAligned + dstOffset), bufT, shmPos, nLanes, iLane);

  GPUbarrierWarp();

}


/// Computes the output offset at which the calling warp has to start writing.
///
/// The result is the sum of two parts:
///  - the sum of nums[0 .. blockStart), where blockStart is the `start` value obtained
///    through work_group_broadcast(start, 0), accumulated cooperatively by all threads
///    and combined with work_group_reduce_add;
///  - the value that the last lane of the preceding warp got from the inclusive
///    work-group scan over the per-lane sums of nums[start .. end); zero for warp 0.
/// smem.warpOffset is used to pass the scan values between warps.
template <typename T>
GPUdi() uint32_t GPUTPCCompressionGatherKernels::calculateWarpOffsets(GPUSharedMemory& smem, T* nums, uint32_t start, uint32_t end, int32_t nWarps, int32_t iWarp, int32_t nLanes, int32_t iLane)
{
  int32_t iThread = nLanes * iWarp + iLane;
  int32_t nThreads = nLanes * nWarps;

  // Part 1: everything in front of the range handled by this block.
  uint32_t precedingSum = 0;
  uint32_t blockStart = work_group_broadcast(start, 0);
  for (uint32_t idx = iThread; idx < blockStart; idx += nThreads) {
    precedingSum += nums[idx];
  }
  precedingSum = work_group_reduce_add(precedingSum);

  // Part 2: per-lane partial sums of this warp's own range, scanned over the work group.
  uint32_t laneSum = 0;
  for (uint32_t idx = start + iLane; idx < end; idx += nLanes) {
    laneSum += nums[idx];
  }
  uint32_t scanned = work_group_scan_inclusive_add(laneSum);

  // The last lane of every warp publishes the scan value reached at the end of its warp.
  const bool isLastLane = (iLane == nLanes - 1);
  if (iWarp > -1 && isLastLane) {
    smem.warpOffset[iWarp] = scanned;
  }
  GPUbarrier();

  // A warp starts where the previous warp ended.
  uint32_t warpStart = 0;
  if (iWarp > 0) {
    warpStart = smem.warpOffset[iWarp - 1];
  }
  GPUbarrier();

  return warpStart + precedingSum;
}


/// Gather kernel (unbuffered variant): copies the compressed data from the working
/// arrays in compressor.mPtrs into the contiguous arrays of compressor.mOutput.
///
/// Only blocks 0 and 1 do work:
///  - block 0 copies the per-row counts and the per-track arrays with all threads of the
///    block, then each warp copies the unattached-cluster ("...U") arrays of its share of rows;
///  - block 1 has each warp copy the attached-cluster ("...A") arrays of its share of tracks.
/// The output position of each warp comes from calculateWarpOffsets.
template <>


GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::unbuffered>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)

{

  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;

  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;


  // Split the block into warps of GPUCA_WARP_SIZE threads.
  int32_t nWarps = nThreads / GPUCA_WARP_SIZE;

  int32_t iWarp = iThread / GPUCA_WARP_SIZE;


  int32_t nLanes = GPUCA_WARP_SIZE;

  int32_t iLane = iThread % GPUCA_WARP_SIZE;


  if (iBlock == 0) {


    // Give every warp a contiguous range [rowStart, rowEnd) of (sector, row) indices.
    uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;

    uint32_t rowsPerWarp = (nRows + nWarps - 1) / nWarps;

    uint32_t rowStart = rowsPerWarp * iWarp;

    uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);

    // Warps beyond the last row get an empty range.
    if (rowStart >= nRows) {

      rowStart = 0;

      rowEnd = 0;

    }


    // Position in the output arrays where this warp's first row goes.
    uint32_t rowsOffset = calculateWarpOffsets(smem, compressor.mPtrs.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);


    // Arrays that need no compaction are copied by all threads of the block together.
    compressorMemcpy(compressor.mOutput->nSliceRowClusters, compressor.mPtrs.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);

    compressorMemcpy(compressor.mOutput->nTrackClusters, compressor.mPtrs.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);

    compressorMemcpy(compressor.mOutput->qPtA, compressor.mPtrs.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);

    compressorMemcpy(compressor.mOutput->rowA, compressor.mPtrs.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);

    compressorMemcpy(compressor.mOutput->sliceA, compressor.mPtrs.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);

    compressorMemcpy(compressor.mOutput->timeA, compressor.mPtrs.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);

    compressorMemcpy(compressor.mOutput->padA, compressor.mPtrs.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);


    // Translate the flat row range of this warp into sector / row-in-sector bounds.
    uint32_t sectorStart = rowStart / GPUCA_ROW_COUNT;

    uint32_t sectorEnd = rowEnd / GPUCA_ROW_COUNT;


    uint32_t sectorRowStart = rowStart % GPUCA_ROW_COUNT;

    uint32_t sectorRowEnd = rowEnd % GPUCA_ROW_COUNT;


    for (uint32_t i = sectorStart; i <= sectorEnd && i < compressor.NSECTORS; i++) {

      // The first sector starts at sectorRowStart, the last one stops before sectorRowEnd.
      for (uint32_t j = ((i == sectorStart) ? sectorRowStart : 0); j < ((i == sectorEnd) ? sectorRowEnd : GPUCA_ROW_COUNT); j++) {

        uint32_t nClusters = compressor.mPtrs.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];

        // Start of this row in the working arrays: the input cluster offset scaled by mMaxClusterFactorBase1024 / 1024.
        uint32_t clusterOffsetInCache = clusters->clusterOffset[i][j] * compressor.mMaxClusterFactorBase1024 / 1024;

        // Per-row copies are done by the lanes of this warp only.
        compressorMemcpy(compressor.mOutput->qTotU + rowsOffset, compressor.mPtrs.qTotU + clusterOffsetInCache, nClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->qMaxU + rowsOffset, compressor.mPtrs.qMaxU + clusterOffsetInCache, nClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->flagsU + rowsOffset, compressor.mPtrs.flagsU + clusterOffsetInCache, nClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->padDiffU + rowsOffset, compressor.mPtrs.padDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->timeDiffU + rowsOffset, compressor.mPtrs.timeDiffU + clusterOffsetInCache, nClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->sigmaPadU + rowsOffset, compressor.mPtrs.sigmaPadU + clusterOffsetInCache, nClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->sigmaTimeU + rowsOffset, compressor.mPtrs.sigmaTimeU + clusterOffsetInCache, nClusters, nLanes, iLane);

        rowsOffset += nClusters;

      }

    }

  }


  if (iBlock == 1) {

    // Give every warp a contiguous range [trackStart, trackEnd) of stored tracks.
    uint32_t tracksPerWarp = (compressor.mMemory->nStoredTracks + nWarps - 1) / nWarps;

    uint32_t trackStart = tracksPerWarp * iWarp;

    uint32_t trackEnd = CAMath::Min(compressor.mMemory->nStoredTracks, trackStart + tracksPerWarp);

    // Warps beyond the last track get an empty range.
    if (trackStart >= compressor.mMemory->nStoredTracks) {

      trackStart = 0;

      trackEnd = 0;

    }


    // Position in the output arrays where this warp's first track goes.
    uint32_t tracksOffset = calculateWarpOffsets(smem, compressor.mPtrs.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);


    // Handle the tracks in groups of nLanes.
    for (uint32_t i = trackStart; i < trackEnd; i += nLanes) {

      uint32_t nTrackClusters = 0;

      uint32_t srcOffset = 0;


      // Each lane loads size and source offset of one track of the group ...
      if (i + iLane < trackEnd) {

        nTrackClusters = compressor.mPtrs.nTrackClusters[i + iLane];

        srcOffset = compressor.mAttachedClusterFirstIndex[i + iLane];

      }

      // ... and makes them available to the other lanes through shared memory.
      // NOTE(review): no warp barrier is visible between these writes and the reads by
      // other lanes in the loop below - confirm that this is safe on all targeted backends.
      smem.unbuffered.sizes[iWarp][iLane] = nTrackClusters;

      smem.unbuffered.srcOffsets[iWarp][iLane] = srcOffset;


      // Number of tracks in this group (the last group may be partial).
      uint32_t elems = (i + nLanes < trackEnd) ? nLanes : (trackEnd - i);


      // The whole warp copies the tracks of the group one after another.
      for (uint32_t j = 0; j < elems; j++) {

        nTrackClusters = smem.unbuffered.sizes[iWarp][j];

        srcOffset = smem.unbuffered.srcOffsets[iWarp][j];

        // Index of the track among all stored tracks.
        uint32_t idx = i + j;

        compressorMemcpy(compressor.mOutput->qTotA + tracksOffset, compressor.mPtrs.qTotA + srcOffset, nTrackClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->qMaxA + tracksOffset, compressor.mPtrs.qMaxA + srcOffset, nTrackClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->flagsA + tracksOffset, compressor.mPtrs.flagsA + srcOffset, nTrackClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->sigmaPadA + tracksOffset, compressor.mPtrs.sigmaPadA + srcOffset, nTrackClusters, nLanes, iLane);

        compressorMemcpy(compressor.mOutput->sigmaTimeA + tracksOffset, compressor.mPtrs.sigmaTimeA + srcOffset, nTrackClusters, nLanes, iLane);


        // First index stored with track
        // These arrays therefore hold one entry less per track: the source skips the first
        // cluster (+ 1, nTrackClusters - 1 entries) and the destination is shifted back by
        // idx, i.e. by one entry for every track in front of this one.

        compressorMemcpy(compressor.mOutput->rowDiffA + tracksOffset - idx, compressor.mPtrs.rowDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);

        compressorMemcpy(compressor.mOutput->sliceLegDiffA + tracksOffset - idx, compressor.mPtrs.sliceLegDiffA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);

        compressorMemcpy(compressor.mOutput->padResA + tracksOffset - idx, compressor.mPtrs.padResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);

        compressorMemcpy(compressor.mOutput->timeResA + tracksOffset - idx, compressor.mPtrs.timeResA + srcOffset + 1, (nTrackClusters - 1), nLanes, iLane);


        tracksOffset += nTrackClusters;

      }

    }

  }

}


// Gather step using a per-warp staging buffer: copies the compressed-cluster arrays from the
// working pointers (tpcCompressor.mPtrs) into the output structure (tpcCompressor.mOutput).
//
// All warps of the launch share the work: the NSECTORS * GPUCA_ROW_COUNT sector-rows and the stored
// tracks are each cut into contiguous slices of ceil(total / warps) entries, one slice per warp.
// A warp whose slice would begin past the end gets an empty slice. Block 0 additionally copies the
// per-row / per-track arrays with all of its threads.
//
// V        : element type of the staging buffer requested via smem.getBuffer<V>()
// nBlocks  : number of blocks in the launch       iBlock  : index of this block
// nThreads : number of threads per block          iThread : index of this thread inside its block
// smem     : scratch storage handed to the helpers and used for the staging buffer
// processors : provides tpcCompressor (source / destination arrays) and ioPtrs.clustersNative
template <typename V>
GPUdii() void GPUTPCCompressionGatherKernels::gatherBuffered(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  GPUTPCCompression& GPUrestrict() comp = processors.tpcCompressor;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() nativeClusters = processors.ioPtrs.clustersNative;
  auto& src = comp.mPtrs;
  auto* dst = comp.mOutput;

  // Position of this thread: lane inside its warp, warp inside its block, warp inside the launch.
  const int32_t laneCount = GPUCA_WARP_SIZE;
  const int32_t laneId = iThread % GPUCA_WARP_SIZE;
  const int32_t warpsInBlock = nThreads / GPUCA_WARP_SIZE;
  const int32_t warpInBlock = iThread / GPUCA_WARP_SIZE;
  const int32_t warpsInGrid = warpsInBlock * nBlocks;
  const int32_t warpInGrid = warpsInBlock * iBlock + warpInBlock;

  // Slice of sector-rows owned by this warp (empty when the slice would start past the last row).
  const uint32_t nRows = comp.NSECTORS * GPUCA_ROW_COUNT;
  const uint32_t rowChunk = (nRows + warpsInGrid - 1) / warpsInGrid;
  const uint32_t rowChunkBegin = rowChunk * warpInGrid;
  const bool ownsRows = rowChunkBegin < nRows;
  const uint32_t rowStart = ownsRows ? rowChunkBegin : 0u;
  const uint32_t rowEnd = ownsRows ? CAMath::Min(nRows, rowChunkBegin + rowChunk) : 0u;
  const uint32_t rowsPerWarp = rowEnd - rowStart;

  // Offset into the per-row output arrays for this warp's slice, derived from the per-row cluster counts.
  const uint32_t rowsOffset = calculateWarpOffsets(smem, src.nSliceRowClusters, rowStart, rowEnd, warpsInBlock, warpInBlock, laneCount, laneId);

  // Slice of stored tracks owned by this warp, built the same way as the row slice.
  const uint32_t nStoredTracks = comp.mMemory->nStoredTracks;
  const uint32_t trackChunk = (nStoredTracks + warpsInGrid - 1) / warpsInGrid;
  const uint32_t trackChunkBegin = trackChunk * warpInGrid;
  const bool ownsTracks = trackChunkBegin < nStoredTracks;
  const uint32_t trackStart = ownsTracks ? trackChunkBegin : 0u;
  const uint32_t trackEnd = ownsTracks ? CAMath::Min(nStoredTracks, trackChunkBegin + trackChunk) : 0u;
  const uint32_t tracksPerWarp = trackEnd - trackStart;

  // Offset into the per-track output arrays for this warp's slice, derived from the per-track cluster counts.
  const uint32_t tracksOffset = calculateWarpOffsets(smem, src.nTrackClusters, trackStart, trackEnd, warpsInBlock, warpInBlock, laneCount, laneId);

  // Only block 0 copies the arrays that hold one entry per sector-row / per stored track.
  if (iBlock == 0) {
    compressorMemcpyBasic(dst->nSliceRowClusters, src.nSliceRowClusters, nRows, nThreads, iThread);
    compressorMemcpyBasic(dst->nTrackClusters, src.nTrackClusters, nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(dst->qPtA, src.qPtA, nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(dst->rowA, src.rowA, nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(dst->sliceA, src.sliceA, nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(dst->timeA, src.timeA, nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(dst->padA, src.padA, nStoredTracks, nThreads, iThread);
  }

  // Per-row source offsets and cluster counts, both starting at this warp's first row.
  const uint32_t* rowClusterOffsets = &nativeClusters->clusterOffset[0][0] + rowStart;
  const uint32_t* rowClusterCounts = src.nSliceRowClusters + rowStart;

  // Staging buffer of this warp.
  auto* stage = smem.getBuffer<V>(warpInBlock);

  // Per-row (...U) arrays. mMaxClusterFactorBase1024 is forwarded only here; the per-track copies
  // further down omit it and therefore use the helper's default.
  compressorMemcpyBuffered(stage, dst->qTotU + rowsOffset, src.qTotU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(stage, dst->qMaxU + rowsOffset, src.qMaxU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(stage, dst->flagsU + rowsOffset, src.flagsU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(stage, dst->padDiffU + rowsOffset, src.padDiffU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(stage, dst->timeDiffU + rowsOffset, src.timeDiffU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(stage, dst->sigmaPadU + rowsOffset, src.sigmaPadU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);
  compressorMemcpyBuffered(stage, dst->sigmaTimeU + rowsOffset, src.sigmaTimeU, rowClusterCounts, rowClusterOffsets, rowsPerWarp, laneCount, laneId, 0, comp.mMaxClusterFactorBase1024);

  // Per-track cluster counts and first-cluster indices, both starting at this warp's first track.
  const uint16_t* trackClusterCounts = src.nTrackClusters + trackStart;
  const uint32_t* trackFirstCluster = comp.mAttachedClusterFirstIndex + trackStart;

  // Per-track (...A) arrays that carry an entry for every cluster of a track.
  compressorMemcpyBuffered(stage, dst->qTotA + tracksOffset, src.qTotA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 0);
  compressorMemcpyBuffered(stage, dst->qMaxA + tracksOffset, src.qMaxA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 0);
  compressorMemcpyBuffered(stage, dst->flagsA + tracksOffset, src.flagsA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 0);
  compressorMemcpyBuffered(stage, dst->sigmaPadA + tracksOffset, src.sigmaPadA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 0);
  compressorMemcpyBuffered(stage, dst->sigmaTimeA + tracksOffset, src.sigmaTimeA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 0);

  // First index stored with track: the destination offset is reduced by one entry per preceding
  // track (tracksOffset - trackStart) and the helper is called with 1 instead of 0.
  // NOTE(review): the 1 presumably makes the helper skip the first entry of every track - confirm in compressorMemcpyBuffered.
  const uint32_t tracksOffsetDiff = tracksOffset - trackStart;
  compressorMemcpyBuffered(stage, dst->rowDiffA + tracksOffsetDiff, src.rowDiffA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 1);
  compressorMemcpyBuffered(stage, dst->sliceLegDiffA + tracksOffsetDiff, src.sliceLegDiffA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 1);
  compressorMemcpyBuffered(stage, dst->padResA + tracksOffsetDiff, src.padResA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 1);
  compressorMemcpyBuffered(stage, dst->timeResA + tracksOffsetDiff, src.timeResA, trackClusterCounts, trackFirstCluster, tracksPerWarp, laneCount, laneId, 1);
}


// Multi-block gather step: copies the compressed-cluster arrays from the working pointers
// (tpcCompressor.mPtrs) into the output structure (tpcCompressor.mOutput), with the blocks of the
// launch split by role:
//   block 0            : copies the arrays holding one entry per sector-row / per stored track
//   odd blocks         : gather the per-row (...U) arrays
//   even blocks (>= 2) : gather the per-track (...A) arrays
// Inside a role, the warps of all blocks with that role cut the rows (or tracks) into contiguous
// slices of ceil(total / warps) entries; a warp whose slice would begin past the end gets an empty one.
//
// The number of warps per role is derived from the number of blocks that actually have that role:
// nBlocks / 2 odd blocks and (nBlocks - 1) / 2 even non-zero blocks. For an odd nBlocks both equal
// (nBlocks - 1) / 2. Using one shared count for both roles would, for an even nBlocks, overstate
// the warps on the track side and leave the trailing tracks uncopied.
//
// nBlocks  : number of blocks in the launch       iBlock  : index of this block
// nThreads : number of threads per block          iThread : index of this thread inside its block
// smem     : scratch storage handed to the helpers and used for the per-warp staging buffer
// processors : provides tpcCompressor (source / destination arrays) and ioPtrs.clustersNative
GPUdii() void GPUTPCCompressionGatherKernels::gatherMulti(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
{
  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
  const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.ioPtrs.clustersNative;
  const auto& input = compressor.mPtrs;
  auto* output = compressor.mOutput;

  const int32_t nWarps = nThreads / GPUCA_WARP_SIZE;
  const int32_t iWarp = iThread / GPUCA_WARP_SIZE;
  const int32_t nLanes = GPUCA_WARP_SIZE;
  const int32_t iLane = iThread % GPUCA_WARP_SIZE;
  // Staging buffer of this warp.
  auto* buf = smem.getBuffer<Vec128>(iWarp);

  if (iBlock == 0) {
    // Arrays with one entry per sector-row / per stored track, copied by all threads of block 0.
    compressorMemcpyBasic(output->nSliceRowClusters, input.nSliceRowClusters, compressor.NSECTORS * GPUCA_ROW_COUNT, nThreads, iThread);
    compressorMemcpyBasic(output->nTrackClusters, input.nTrackClusters, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->qPtA, input.qPtA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->rowA, input.rowA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->sliceA, input.sliceA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->timeA, input.timeA, compressor.mMemory->nStoredTracks, nThreads, iThread);
    compressorMemcpyBasic(output->padA, input.padA, compressor.mMemory->nStoredTracks, nThreads, iThread);
  } else if (iBlock & 1) {
    // Odd blocks 1, 3, 5, ...: there are nBlocks / 2 of them, and block iBlock is number (iBlock - 1) / 2.
    const uint32_t nGlobalWarps = nWarps * (nBlocks / 2);
    const uint32_t iGlobalWarp = nWarps * ((iBlock - 1) / 2) + iWarp;

    // Slice of sector-rows owned by this warp.
    const uint32_t nRows = compressor.NSECTORS * GPUCA_ROW_COUNT;
    uint32_t rowsPerWarp = (nRows + nGlobalWarps - 1) / nGlobalWarps;
    uint32_t rowStart = rowsPerWarp * iGlobalWarp;
    uint32_t rowEnd = CAMath::Min(nRows, rowStart + rowsPerWarp);
    if (rowStart >= nRows) {
      rowStart = 0;
      rowEnd = 0;
    }
    rowsPerWarp = rowEnd - rowStart;

    // Output offset of the slice, plus per-row source offsets and cluster counts starting at rowStart.
    const uint32_t rowsOffset = calculateWarpOffsets(smem, input.nSliceRowClusters, rowStart, rowEnd, nWarps, iWarp, nLanes, iLane);
    const uint32_t* clusterOffsets = &clusters->clusterOffset[0][0] + rowStart;
    const uint32_t* nSectorRowClusters = input.nSliceRowClusters + rowStart;

    compressorMemcpyBuffered(buf, output->qTotU + rowsOffset, input.qTotU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->qMaxU + rowsOffset, input.qMaxU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->flagsU + rowsOffset, input.flagsU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->padDiffU + rowsOffset, input.padDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->timeDiffU + rowsOffset, input.timeDiffU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->sigmaPadU + rowsOffset, input.sigmaPadU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
    compressorMemcpyBuffered(buf, output->sigmaTimeU + rowsOffset, input.sigmaTimeU, nSectorRowClusters, clusterOffsets, rowsPerWarp, nLanes, iLane, 0, compressor.mMaxClusterFactorBase1024);
  } else {
    // Even blocks 2, 4, 6, ...: there are (nBlocks - 1) / 2 of them, and block iBlock is number iBlock / 2 - 1.
    const uint32_t nGlobalWarps = nWarps * ((nBlocks - 1) / 2);
    const uint32_t iGlobalWarp = nWarps * (iBlock / 2 - 1) + iWarp;

    // Slice of stored tracks owned by this warp.
    const uint32_t nStoredTracks = compressor.mMemory->nStoredTracks;
    uint32_t tracksPerWarp = (nStoredTracks + nGlobalWarps - 1) / nGlobalWarps;
    uint32_t trackStart = tracksPerWarp * iGlobalWarp;
    uint32_t trackEnd = CAMath::Min(nStoredTracks, trackStart + tracksPerWarp);
    if (trackStart >= nStoredTracks) {
      trackStart = 0;
      trackEnd = 0;
    }
    tracksPerWarp = trackEnd - trackStart;

    // Output offset of the slice, plus per-track cluster counts and first-cluster indices starting at trackStart.
    const uint32_t tracksOffset = calculateWarpOffsets(smem, input.nTrackClusters, trackStart, trackEnd, nWarps, iWarp, nLanes, iLane);
    const uint16_t* nTrackClustersPtr = input.nTrackClusters + trackStart;
    const uint32_t* aClsFstIdx = compressor.mAttachedClusterFirstIndex + trackStart;

    compressorMemcpyBuffered(buf, output->qTotA + tracksOffset, input.qTotA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->qMaxA + tracksOffset, input.qMaxA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->flagsA + tracksOffset, input.flagsA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->sigmaPadA + tracksOffset, input.sigmaPadA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);
    compressorMemcpyBuffered(buf, output->sigmaTimeA + tracksOffset, input.sigmaTimeA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 0);

    // First index stored with track: the destination offset is reduced by one entry per preceding
    // track (tracksOffset - trackStart) and the helper is called with 1 instead of 0.
    // NOTE(review): the 1 presumably makes the helper skip the first entry of every track - confirm in compressorMemcpyBuffered.
    uint32_t tracksOffsetDiff = tracksOffset - trackStart;
    compressorMemcpyBuffered(buf, output->rowDiffA + tracksOffsetDiff, input.rowDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
    compressorMemcpyBuffered(buf, output->sliceLegDiffA + tracksOffsetDiff, input.sliceLegDiffA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
    compressorMemcpyBuffered(buf, output->padResA + tracksOffsetDiff, input.padResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
    compressorMemcpyBuffered(buf, output->timeResA + tracksOffsetDiff, input.timeResA, nTrackClustersPtr, aClsFstIdx, tracksPerWarp, nLanes, iLane, 1);
  }
}


// Kernel entry point for the buffered32 gather step: forwards every argument unchanged to
// gatherBuffered with V = Vec32, the element type of the staging buffer that gatherBuffered
// requests through smem.getBuffer<V>().
template <>


GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered32>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)

{

  gatherBuffered<Vec32>(nBlocks, nThreads, iBlock, iThread, smem, processors);

}


// Kernel entry point for the buffered64 gather step: forwards every argument unchanged to
// gatherBuffered with V = Vec64, the element type of the staging buffer that gatherBuffered
// requests through smem.getBuffer<V>().
template <>


GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered64>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)

{

  gatherBuffered<Vec64>(nBlocks, nThreads, iBlock, iThread, smem, processors);

}


// Kernel entry point for the buffered128 gather step: forwards every argument unchanged to
// gatherBuffered with V = Vec128, the element type of the staging buffer that gatherBuffered
// requests through smem.getBuffer<V>().
template <>


GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::buffered128>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)

{

  gatherBuffered<Vec128>(nBlocks, nThreads, iBlock, iThread, smem, processors);

}


// Kernel entry point for the multiBlock gather step: forwards every argument unchanged to
// gatherMulti, which assigns a role to each block based on its index (block 0, odd, even).
template <>


GPUdii() void GPUTPCCompressionGatherKernels::Thread<GPUTPCCompressionGatherKernels::multiBlock>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)

{

  gatherMulti(nBlocks, nThreads, iBlock, iThread, smem, processors);

}


time
int16_t time
Definition RawEventData.h:4

GPUCommonAlgorithm.h

i
int32_t i
Definition GPUCommonAlgorithm.h:431

get_local_size
#define get_local_size(dim)
Definition GPUCommonDefAPI.h:229

get_local_id
#define get_local_id(dim)
Definition GPUCommonDefAPI.h:228

GPUsharedref
#define GPUsharedref()
Definition GPUCommonDefAPI.h:56

GPUbarrierWarp
#define GPUbarrierWarp()
Definition GPUCommonDefAPI.h:54

get_global_size
#define get_global_size(dim)
Definition GPUCommonDefAPI.h:226

GPUbarrier
#define GPUbarrier()
Definition GPUCommonDefAPI.h:53

GPUrestrict
#define GPUrestrict()
Definition GPUCommonDefAPI.h:212

get_global_id
#define get_global_id(dim)
Definition GPUCommonDefAPI.h:225

GPUConstantMem.h

GPUCA_TPC_COMP_CHUNK_SIZE
#define GPUCA_TPC_COMP_CHUNK_SIZE
Definition GPUDefConstantsAndSettings.h:47

GPUCA_GET_THREAD_COUNT
#define GPUCA_GET_THREAD_COUNT(...)
Definition GPUDefParametersDefault.h:520

GPUCA_WARP_SIZE
#define GPUCA_WARP_SIZE
Definition GPUDefParametersDefault.h:579

GPUO2DataTypes.h

GPUParam.h

GPUTPCClusterRejection.h

GPUdii
GPUdii() void GPUTPCCompressionKernels
Definition GPUTPCCompressionKernels.cxx:28

GPUTPCCompressionKernels.h

GPUTPCCompressionTrackModel.h

GPUCA_NSECTORS
#define GPUCA_NSECTORS
Definition GPUTPCGeometry.h:22

GPUCA_ROW_COUNT
#define GPUCA_ROW_COUNT
Definition GPUTPCGeometry.h:23

output
void output(const std::map< std::string, ChannelStat > &channels)
Definition rawdump.cxx:197

j
uint32_t j
Definition RawData.h:0

c
uint32_t c
Definition RawData.h:2

ptr
TBranch * ptr
Definition TTreePlugin.cxx:836

nClusters
int nClusters
Definition bench_Clusterizer.cxx:120

Measurement::Size
@ Size

int

o2::gpu::GPUCommonMath
Definition GPUCommonMath.h:49

o2::gpu::GPUSettings
Definition GPUSettings.h:32

o2::gpu::GPUSettings::SortTime
@ SortTime
Definition GPUSettings.h:38

o2::gpu::GPUSettings::SortZTimePad
@ SortZTimePad
Definition GPUSettings.h:40

o2::gpu::GPUSettings::SortPad
@ SortPad
Definition GPUSettings.h:39

o2::gpu::GPUSettings::SortZPadTime
@ SortZPadTime
Definition GPUSettings.h:41

o2::gpu::GPUSettings::RejectionStrategyB
@ RejectionStrategyB
Definition GPUSettings.h:45

o2::gpu::GPUSettings::RejectionStrategyA
@ RejectionStrategyA
Definition GPUSettings.h:44

o2::gpu::GPUSettings::CompressionDifferences
@ CompressionDifferences
Definition GPUSettings.h:35

o2::gpu::GPUSettings::CompressionTruncate
@ CompressionTruncate
Definition GPUSettings.h:34

o2::gpu::GPUSettings::CompressionTrackModel
@ CompressionTrackModel
Definition GPUSettings.h:36

o2::gpu::GPUTPCCompressionGatherKernels
Definition GPUTPCCompressionKernels.h:59

o2::gpu::GPUTPCCompressionKernels
Definition GPUTPCCompressionKernels.h:28

o2::gpu::GPUTPCCompressionTrackModel
Definition GPUTPCCompressionTrackModel.h:45

o2::gpu::GPUTPCCompression
Definition GPUTPCCompression.h:29

o2::gpu::GPUTPCCompression::mAttachedClusterFirstIndex
uint32_t * mAttachedClusterFirstIndex
Definition GPUTPCCompression.h:75

o2::gpu::GPUTPCCompression::mClusterStatus
uint8_t * mClusterStatus
Definition GPUTPCCompression.h:76

o2::gpu::GPUTPCCompression::mOutput
o2::tpc::CompressedClusters * mOutput
Definition GPUTPCCompression.h:70

o2::gpu::GPUTPCCompression::NSECTORS
static constexpr uint32_t NSECTORS
Definition GPUTPCCompression.h:67

o2::gpu::GPUTPCCompression::mPtrs
o2::tpc::CompressedClustersPtrs mPtrs
Definition GPUTPCCompression.h:69

o2::gpu::GPUTPCCompression::mMemory
memory * mMemory
Definition GPUTPCCompression.h:74

o2::gpu::GPUTPCCompression::mMaxClusterFactorBase1024
size_t mMaxClusterFactorBase1024
Definition GPUTPCCompression.h:82

o2::gpu::GPUTPCGMMergedTrack
Definition GPUTPCGMMergedTrack.h:29

o2::gpu::GPUTPCGeometry
Definition GPUTPCGeometry.h:84

x
GLint GLenum GLint x
Definition glcorearb.h:403

src
GLenum src
Definition glcorearb.h:1767

count
GLint GLsizei count
Definition glcorearb.h:399

size
GLsizeiptr size
Definition glcorearb.h:659

end
GLuint GLuint end
Definition glcorearb.h:469

bufSize
GLuint GLsizei bufSize
Definition glcorearb.h:790

b
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233

dst
GLenum GLenum dst
Definition glcorearb.h:1767

offset
GLintptr offset
Definition glcorearb.h:660

void
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)

start
GLuint start
Definition glcorearb.h:469

param
GLenum GLfloat param
Definition glcorearb.h:271

a
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233

buf
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition glcorearb.h:2514

id
GLuint id
Definition glcorearb.h:650

z
GLdouble GLdouble GLdouble z
Definition glcorearb.h:843

o2::gpu::gputpcgmmergertypes::attachTrackMask
@ attachTrackMask
Definition GPUTPCGMMergerTypes.h:29

o2::gpu
Definition TrackTRD.h:35

o2::its3::constants::pixelarray::nRows
constexpr int nRows
Definition SpecsV2.h:34

o2::rans::idx
auto idx
Definition DenseHistogram.h:610

o2::tpc
Global TPC definitions and constants.
Definition SimTraits.h:167

o2::tpc::GPUd
GPUd() void PIDResponse
Definition PIDResponse.h:71

o2::tpc::GPUdi
GPUdi() T BetheBlochAleph(T bg
Definition GPUTPCCompressionKernels.cxx:493

o2
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
Definition BitstreamReader.h:24

rec
GPUReconstruction * rec
Definition standalone.cxx:69

S
Definition cxx14-test-aggregate-initialization.cxx:18

o2::gpu::GPUParam
Definition GPUParam.h:77

o2::gpu::GPUTPCClusterRejection::GetIsRejected
static constexpr bool GetIsRejected(int32_t attach)
Definition GPUTPCClusterRejection.h:62

o2::gpu::GPUTPCCompression::memory::nStoredAttachedClusters
uint32_t nStoredAttachedClusters
Definition GPUTPCCompression.h:63

o2::gpu::GPUTPCCompression::memory::nStoredTracks
uint32_t nStoredTracks
Definition GPUTPCCompression.h:62

o2::gpu::GPUTPCGMMergedTrackHit
Definition GPUTPCGMMergedTrackHit.h:22

o2::gpu::GPUTPCGMMergedTrackHit::state
uint8_t state
Definition GPUTPCGMMergedTrackHit.h:24

o2::gpu::GPUTPCGMMergedTrackHit::row
uint8_t row
Definition GPUTPCGMMergedTrackHit.h:24

o2::gpu::GPUTPCGMMergedTrackHit::num
uint32_t num
Definition GPUTPCGMMergedTrackHit.h:23

o2::gpu::GPUTPCGMMergedTrackHit::leg
uint8_t leg
Definition GPUTPCGMMergedTrackHit.h:24

o2::gpu::GPUTPCGMMergedTrackHit::sector
uint8_t sector
Definition GPUTPCGMMergedTrackHit.h:24

o2::gpu::GPUTPCGMMergedTrackHit::flagReject
@ flagReject
Definition GPUTPCGMMergedTrackHit.h:37

o2::gpu::GPUTrackingInOutPointers
Definition GPUDataTypes.h:215

o2::gpu::GPUTrackingInOutPointers::clustersNative
const o2::tpc::ClusterNativeAccess * clustersNative
Definition GPUDataTypes.h:226

o2::gpu::GPUTrackingInOutPointers::nMergedTracks
uint32_t nMergedTracks
Definition GPUDataTypes.h:238

o2::gpu::GPUTrackingInOutPointers::mergedTrackHitAttachment
const uint32_t * mergedTrackHitAttachment
Definition GPUDataTypes.h:242

o2::gpu::GPUTrackingInOutPointers::mergedTrackHits
const GPUTPCGMMergedTrackHit * mergedTrackHits
Definition GPUDataTypes.h:239

o2::gpu::GPUTrackingInOutPointers::mergedTracks
const GPUTPCGMMergedTrack * mergedTracks
Definition GPUDataTypes.h:237

o2::tpc::ClusterNativeAccess
Definition ClusterNative.h:174

o2::tpc::ClusterNative
Definition ClusterNative.h:54

o2::tpc::ClusterNative::qMax
uint16_t qMax
Definition ClusterNative.h:71

o2::tpc::ClusterNative::qTot
uint16_t qTot
Definition ClusterNative.h:72

o2::tpc::ClusterNative::sigmaPadPacked
uint8_t sigmaPadPacked
Definition ClusterNative.h:70

o2::tpc::ClusterNative::sigmaTimePacked
uint8_t sigmaTimePacked
Definition ClusterNative.h:69

o2::tpc::ClusterNative::padPacked
uint16_t padPacked
Definition ClusterNative.h:68

o2::tpc::CompressedClustersPtrs_x::nTrackClusters
TSHORT nTrackClusters
Definition CompressedClusters.h:65

o2::tpc::CompressedClustersPtrs_x::sigmaPadA
TCHAR sigmaPadA
Definition CompressedClusters.h:48

o2::tpc::CompressedClustersPtrs_x::qMaxU
TSHORT qMaxU
Definition CompressedClusters.h:58

o2::tpc::CompressedClustersPtrs_x::qTotU
TSHORT qTotU
Definition CompressedClusters.h:57

o2::tpc::CompressedClustersPtrs_x::rowA
TCHAR rowA
Definition CompressedClusters.h:52

o2::tpc::CompressedClustersPtrs_x::timeA
TINT timeA
Definition CompressedClusters.h:54

o2::tpc::CompressedClustersPtrs_x::rowDiffA
TCHAR rowDiffA
Definition CompressedClusters.h:44

o2::tpc::CompressedClustersPtrs_x::padDiffU
TSHORT padDiffU
Definition CompressedClusters.h:60

o2::tpc::CompressedClustersPtrs_x::sigmaPadU
TCHAR sigmaPadU
Definition CompressedClusters.h:62

o2::tpc::CompressedClustersPtrs_x::padA
TSHORT padA
Definition CompressedClusters.h:55

o2::tpc::CompressedClustersPtrs_x::timeResA
TINT timeResA
Definition CompressedClusters.h:47

o2::tpc::CompressedClustersPtrs_x::padResA
TSHORT padResA
Definition CompressedClusters.h:46

o2::tpc::CompressedClustersPtrs_x::sliceA
TCHAR sliceA
Definition CompressedClusters.h:53

o2::tpc::CompressedClustersPtrs_x::qPtA
TCHAR qPtA
Definition CompressedClusters.h:51

o2::tpc::CompressedClustersPtrs_x::sigmaTimeU
TCHAR sigmaTimeU
Definition CompressedClusters.h:63

o2::tpc::CompressedClustersPtrs_x::qTotA
TSHORT qTotA
Definition CompressedClusters.h:41

o2::tpc::CompressedClustersPtrs_x::flagsA
TCHAR flagsA
Definition CompressedClusters.h:43

o2::tpc::CompressedClustersPtrs_x::sigmaTimeA
TCHAR sigmaTimeA
Definition CompressedClusters.h:49

o2::tpc::CompressedClustersPtrs_x::timeDiffU
TINT timeDiffU
Definition CompressedClusters.h:61

o2::tpc::CompressedClustersPtrs_x::sliceLegDiffA
TCHAR sliceLegDiffA
Definition CompressedClusters.h:45

o2::tpc::CompressedClustersPtrs_x::nSliceRowClusters
TINT nSliceRowClusters
Definition CompressedClusters.h:66

o2::tpc::CompressedClustersPtrs_x::qMaxA
TSHORT qMaxA
Definition CompressedClusters.h:42

o2::tpc::CompressedClustersPtrs_x::flagsU
TCHAR flagsU
Definition CompressedClusters.h:59

o2::tpc::CompressedClustersPtrs
Definition CompressedClusters.h:71

getBuffer
std::vector< std::byte > getBuffer(const char *filename)
Definition testClosureCoDecDigit.cxx:111

clusters
std::vector< Cluster > clusters
Definition test_ctf_io_cpv.cxx:41

for
for(int irof=0;irof< 1000;irof++)
Definition test_ctf_io_cpv.cxx:46

row
std::vector< int > row
Definition test_ctf_io_itsmft.cxx:48