38 RecoStep myStep = RecoStep::TPCCompression;
48 if (gatherMode == 3) {
56 runKernel<GPUTPCCompressionKernels, GPUTPCCompressionKernels::step0attached>(
GetGridAuto(0));
58 WriteReducedClusters();
60 runKernel<GPUTPCCompressionKernels, GPUTPCCompressionKernels::step1unattached>(
GetGridAuto(0));
62#ifdef GPUCA_TPC_GEOMETRY_O2
66 foreignChain->RunTPCClusterizer_prepare(
false);
72 memset((
void*)O, 0,
sizeof(*O));
85 if (gatherMode == 3) {
91 int32_t outputStream = 0;
94 outputStream = OutputStream();
96 if (gatherMode >= 2) {
97 if (gatherMode == 2) {
101 for (uint32_t
i = 0;
i <
sizeof(ptrs) /
sizeof(
void*);
i++) {
102 reinterpret_cast<char**
>(&ptrs)[
i] =
reinterpret_cast<char**
>(&ptrs)[
i] + (
reinterpret_cast<char*
>(devicePtr) -
reinterpret_cast<char*
>(Compressor.
mOutputFlat));
107 constexpr uint32_t nBlocksDefault = 2;
108 constexpr uint32_t nBlocksMulti = 1 + 2 * 200;
110 switch (gatherModeKernel) {
112 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(
GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
113 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(RecoStep::TPCCompression, 0, outputSize,
false);
116 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered32>(
GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
117 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered32>(RecoStep::TPCCompression, 0, outputSize,
false);
120 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered64>(
GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
121 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered64>(RecoStep::TPCCompression, 0, outputSize,
false);
124 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered128>(
GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
125 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered128>(RecoStep::TPCCompression, 0, outputSize,
false);
128 static_assert((nBlocksMulti & 1) && nBlocksMulti >= 3);
129 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(
GetGridBlkStep(nBlocksMulti, outputStream, RecoStep::TPCCompression));
130 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(RecoStep::TPCCompression, 0, outputSize,
false);
133 GPUError(
"Invalid compression kernel %d selected.", (int32_t)gatherModeKernel);
136 if (gatherMode == 3) {
140 const size_t blockSize = CAMath::nextMultipleOf<1024>(copySize / 30);
141 const uint32_t
n = (copySize + blockSize - 1) / blockSize;
142 for (uint32_t
i = 0;
i <
n;
i++) {
143 GPUMemCpy(myStep, hostFlatPtr +
i * blockSize, deviceFlatPts +
i * blockSize, CAMath::Min(blockSize, copySize -
i * blockSize), outputStream,
false);
146 GPUMemCpy(myStep, hostFlatPtr, deviceFlatPts, copySize, outputStream,
false);
150 int8_t direction = 0;
151 if (gatherMode == 0) {
152 P = &CompressorShadow.
mPtrs;
153 }
else if (gatherMode == 1) {
156 gatherTimer = &getTimer<GPUTPCCompressionKernels>(
"GPUTPCCompression_GatherOnCPU", 0);
157 gatherTimer->
Start();
197 if (gatherMode == 1) {
201 if (gatherMode == 3) {
206 if (mPipelineFinalizationCtx ==
nullptr) {
225 GPUFatal(
"tpcApplyCFCutsAtDecoding, tpcApplyClusterFilterOnCPU and tpcCutTimeBin currently require tpcUseOldCPUDecoding");
230 std::unique_ptr<GPUParam> tmpParam;
231 int32_t inputStream = 0;
233 if (useTemporaryBz) {
234 tmpParam = std::make_unique<GPUParam>(
param());
240 const bool runFiltering = needFullFiltering || runTimeBinCutFiltering;
243 auto allocatorFinal = [
this](
size_t size) {
248 std::unique_ptr<ClusterNative[]> tmpBuffer;
249 auto allocatorTmp = [&tmpBuffer](
size_t size) {
252 auto& decompressTimer = getTimer<TPCClusterDecompressor>(
"TPCDecompression", 0);
253 auto allocatorUse = runFiltering ? std::function<
ClusterNative*(size_t)>{allocatorTmp} : std::function<
ClusterNative*(size_t)>{allocatorFinal};
254 decompressTimer.Start();
256 GPUError(
"Error decompressing clusters");
262 decompressTimer.Stop();
270 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
277 RecoStep myStep = RecoStep::TPCDecompression;
293 inputGPU = cmprClsHost;
296 inputGPU = cmprClsHost;
303 for (int32_t iStream = 0; iStream < nStreams; iStream++) {
304 uint32_t startTrack = cmprClsHost.
nTracks / nStreams * iStream;
305 uint32_t endTrack = cmprClsHost.
nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.
nTracks % nStreams);
306 uint32_t numTracks = endTrack - startTrack;
309 uint32_t numClustersRed = numClusters - numTracks;
321 GPUMemCpy(myStep, inputGPUShadow.
qPtA + startTrack, cmprClsHost.
qPtA + startTrack, numTracks *
sizeof(cmprClsHost.
qPtA[0]), iStream, toGPU);
322 GPUMemCpy(myStep, inputGPUShadow.
rowA + startTrack, cmprClsHost.
rowA + startTrack, numTracks *
sizeof(cmprClsHost.
rowA[0]), iStream, toGPU);
323 GPUMemCpy(myStep, inputGPUShadow.
sliceA + startTrack, cmprClsHost.
sliceA + startTrack, numTracks *
sizeof(cmprClsHost.
sliceA[0]), iStream, toGPU);
324 GPUMemCpy(myStep, inputGPUShadow.
timeA + startTrack, cmprClsHost.
timeA + startTrack, numTracks *
sizeof(cmprClsHost.
timeA[0]), iStream, toGPU);
325 GPUMemCpy(myStep, inputGPUShadow.
padA + startTrack, cmprClsHost.
padA + startTrack, numTracks *
sizeof(cmprClsHost.
padA[0]), iStream, toGPU);
341 uint32_t decodedAttachedClusters = 0;
348 offset += unattachedOffset;
356 if (runTimeBinCutFiltering) {
388 uint32_t batchSize = doGPU ? 6 :
NSECTORS;
389 for (uint32_t iSector = 0; iSector <
NSECTORS; iSector = iSector + batchSize) {
390 int32_t iStream = (iSector / batchSize) %
mRec->
NStreams();
393 if (!runTimeBinCutFiltering) {
399 if (runTimeBinCutFiltering) {
403 runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(
GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
429 runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(
GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
434 runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(
GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
449 if (useTemporaryBz) {