Project
Loading...
Searching...
No Matches
GPUChainTrackingCompression.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "GPUChainTracking.h"
17#include "GPULogging.h"
18#include "GPUO2DataTypes.h"
19#include "GPUTPCExtraADC.h"
23#include "GPUDefParametersRuntime.h"
24#include "GPUConstantMem.h" // TODO: Try to get rid of as many GPUConstantMem includes as possible!
29#include "utils/strtag.h"
30
31#include <numeric>
32
33using namespace o2::gpu;
34using namespace o2::tpc;
35
37{
39 RecoStep myStep = RecoStep::TPCCompression;
40 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCCompression;
41 int32_t gatherMode = mRec->GetProcessingSettings().tpcCompressionGatherMode == -1 ? mRec->getGPUParameters(doGPU).par_COMP_GATHER_MODE : mRec->GetProcessingSettings().tpcCompressionGatherMode;
43 GPUTPCCompression& CompressorShadow = doGPU ? processorsShadow()->tpcCompressor : Compressor;
44 const auto& threadContext = GetThreadContext();
45 if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
47 }
48
49 if (gatherMode == 3) {
51 }
52 SetupGPUProcessor(&Compressor, true);
53 new (Compressor.mMemory) GPUTPCCompression::memory;
54 WriteToConstantMemory(myStep, (char*)&processors()->tpcCompressor - (char*)processors(), &CompressorShadow, sizeof(CompressorShadow), 0);
55 TransferMemoryResourcesToGPU(myStep, &Compressor, 0);
56 runKernel<GPUMemClean16>(GetGridAutoStep(0, RecoStep::TPCCompression), CompressorShadow.mClusterStatus, Compressor.mMaxClusters * sizeof(CompressorShadow.mClusterStatus[0]));
57 runKernel<GPUTPCCompressionKernels, GPUTPCCompressionKernels::step0attached>(GetGridAuto(0));
58 if (GetProcessingSettings().tpcWriteClustersAfterRejection) {
59 WriteReducedClusters();
60 }
61 runKernel<GPUTPCCompressionKernels, GPUTPCCompressionKernels::step1unattached>(GetGridAuto(0));
62 TransferMemoryResourcesToHost(myStep, &Compressor, 0);
63#ifndef GPUCA_RUN2
64 if (mPipelineFinalizationCtx && GetProcessingSettings().doublePipelineClusterizer) {
66 auto* foreignChain = (GPUChainTracking*)GetNextChainInQueue();
67 foreignChain->RunTPCClusterizer_prepare(false, {});
68 foreignChain->mCFContext->ptrClusterNativeSave = processorsShadow()->ioPtrs.clustersNative;
69 }
70#endif
73 memset((void*)O, 0, sizeof(*O));
74 O->nTracks = Compressor.mMemory->nStoredTracks;
79 O->nComppressionModes = param().rec.tpc.compressionTypeMask;
80 O->solenoidBz = param().bzkG;
83 Compressor.mOutputFlat->set(outputSize, *Compressor.mOutput);
84 char* hostFlatPtr = (char*)Compressor.mOutput->qTotU; // First array as allocated in GPUTPCCompression::SetPointersCompressedClusters
85 size_t copySize = 0;
86 if (gatherMode == 3) {
87 CompressorShadow.mOutputA = Compressor.mOutput;
88 copySize = AllocateRegisteredMemory(Compressor.mMemoryResOutputGPU); // We overwrite Compressor.mOutput with the allocated output pointers on the GPU
89 }
90 const o2::tpc::CompressedClustersPtrs* P = nullptr;
91 HighResTimer* gatherTimer = nullptr;
92 int32_t outputStream = 0;
93 if (GetProcessingSettings().doublePipeline) {
94 SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes.
95 outputStream = OutputStream();
96 }
97 if (gatherMode >= 2) {
98 if (gatherMode == 2) {
99 void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat);
100 if (devicePtr != Compressor.mOutputFlat) {
101 CompressedClustersPtrs& ptrs = *Compressor.mOutput; // We need to update the ptrs with the gpu-mapped version of the host address space
102 for (uint32_t i = 0; i < sizeof(ptrs) / sizeof(void*); i++) {
103 reinterpret_cast<char**>(&ptrs)[i] = reinterpret_cast<char**>(&ptrs)[i] + (reinterpret_cast<char*>(devicePtr) - reinterpret_cast<char*>(Compressor.mOutputFlat));
104 }
105 }
106 }
107 TransferMemoryResourcesToGPU(myStep, &Compressor, outputStream);
108 constexpr uint32_t nBlocksDefault = 2;
109 constexpr uint32_t nBlocksMulti = 1 + 2 * 200;
// -1 means "not set": take par_COMP_GATHER_KERNEL from the GPU parameters; otherwise use the explicitly configured kernel.
// The override must be read from tpcCompressionGatherModeKernel (the setting tested here), not from tpcCompressionGatherMode.
110 int32_t gatherModeKernel = mRec->GetProcessingSettings().tpcCompressionGatherModeKernel == -1 ? mRec->getGPUParameters(doGPU).par_COMP_GATHER_KERNEL : mRec->GetProcessingSettings().tpcCompressionGatherModeKernel;
111 switch (gatherModeKernel) {
112 case 0:
113 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
114 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(RecoStep::TPCCompression, 0, outputSize, false);
115 break;
116 case 1:
117 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered32>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
118 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered32>(RecoStep::TPCCompression, 0, outputSize, false);
119 break;
120 case 2:
121 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered64>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
122 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered64>(RecoStep::TPCCompression, 0, outputSize, false);
123 break;
124 case 3:
125 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered128>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
126 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::buffered128>(RecoStep::TPCCompression, 0, outputSize, false);
127 break;
128 case 4:
129 static_assert((nBlocksMulti & 1) && nBlocksMulti >= 3);
130 runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(GetGridBlkStep(nBlocksMulti, outputStream, RecoStep::TPCCompression));
131 getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(RecoStep::TPCCompression, 0, outputSize, false);
132 break;
133 default:
134 GPUError("Invalid compression kernel %d selected.", (int32_t)gatherModeKernel);
135 return 1;
136 }
137 if (gatherMode == 3) {
138 RecordMarker(&mEvents->stream[outputStream], outputStream);
139 char* deviceFlatPts = (char*)Compressor.mOutput->qTotU;
140 if (GetProcessingSettings().doublePipeline) {
141 const size_t blockSize = CAMath::nextMultipleOf<1024>(copySize / 30);
142 const uint32_t n = (copySize + blockSize - 1) / blockSize;
143 for (uint32_t i = 0; i < n; i++) {
144 GPUMemCpy(myStep, hostFlatPtr + i * blockSize, deviceFlatPts + i * blockSize, CAMath::Min(blockSize, copySize - i * blockSize), outputStream, false);
145 }
146 } else {
147 GPUMemCpy(myStep, hostFlatPtr, deviceFlatPts, copySize, outputStream, false);
148 }
149 }
150 } else {
151 int8_t direction = 0;
152 if (gatherMode == 0) {
153 P = &CompressorShadow.mPtrs;
154 } else if (gatherMode == 1) {
155 P = &Compressor.mPtrs;
156 direction = -1;
157 gatherTimer = &getTimer<GPUTPCCompressionKernels>("GPUTPCCompression_GatherOnCPU", 0);
158 gatherTimer->Start();
159 }
160 GPUMemCpyAlways(myStep, O->nSliceRowClusters, P->nSliceRowClusters, NSECTORS * GPUTPCGeometry::NROWS * sizeof(O->nSliceRowClusters[0]), outputStream, direction);
161 GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction);
162 SynchronizeStream(outputStream);
163 uint32_t offset = 0;
164 for (uint32_t i = 0; i < NSECTORS; i++) {
165 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
166 uint32_t srcOffset = mIOPtrs.clustersNative->clusterOffset[i][j] * Compressor.mMaxClusterFactorBase1024 / 1024;
167 GPUMemCpyAlways(myStep, O->qTotU + offset, P->qTotU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->qTotU[0]), outputStream, direction);
168 GPUMemCpyAlways(myStep, O->qMaxU + offset, P->qMaxU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->qMaxU[0]), outputStream, direction);
169 GPUMemCpyAlways(myStep, O->flagsU + offset, P->flagsU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->flagsU[0]), outputStream, direction);
170 GPUMemCpyAlways(myStep, O->padDiffU + offset, P->padDiffU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->padDiffU[0]), outputStream, direction);
171 GPUMemCpyAlways(myStep, O->timeDiffU + offset, P->timeDiffU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->timeDiffU[0]), outputStream, direction);
172 GPUMemCpyAlways(myStep, O->sigmaPadU + offset, P->sigmaPadU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->sigmaPadU[0]), outputStream, direction);
173 GPUMemCpyAlways(myStep, O->sigmaTimeU + offset, P->sigmaTimeU + srcOffset, O->nSliceRowClusters[i * GPUTPCGeometry::NROWS + j] * sizeof(O->sigmaTimeU[0]), outputStream, direction);
175 }
176 }
177 offset = 0;
178 for (uint32_t i = 0; i < O->nTracks; i++) {
179 GPUMemCpyAlways(myStep, O->qTotA + offset, P->qTotA + Compressor.mAttachedClusterFirstIndex[i], O->nTrackClusters[i] * sizeof(O->qTotA[0]), outputStream, direction);
180 GPUMemCpyAlways(myStep, O->qMaxA + offset, P->qMaxA + Compressor.mAttachedClusterFirstIndex[i], O->nTrackClusters[i] * sizeof(O->qMaxA[0]), outputStream, direction);
181 GPUMemCpyAlways(myStep, O->flagsA + offset, P->flagsA + Compressor.mAttachedClusterFirstIndex[i], O->nTrackClusters[i] * sizeof(O->flagsA[0]), outputStream, direction);
182 GPUMemCpyAlways(myStep, O->sigmaPadA + offset, P->sigmaPadA + Compressor.mAttachedClusterFirstIndex[i], O->nTrackClusters[i] * sizeof(O->sigmaPadA[0]), outputStream, direction);
183 GPUMemCpyAlways(myStep, O->sigmaTimeA + offset, P->sigmaTimeA + Compressor.mAttachedClusterFirstIndex[i], O->nTrackClusters[i] * sizeof(O->sigmaTimeA[0]), outputStream, direction);
184
185 // First index stored with track
186 GPUMemCpyAlways(myStep, O->rowDiffA + offset - i, P->rowDiffA + Compressor.mAttachedClusterFirstIndex[i] + 1, (O->nTrackClusters[i] - 1) * sizeof(O->rowDiffA[0]), outputStream, direction);
187 GPUMemCpyAlways(myStep, O->sliceLegDiffA + offset - i, P->sliceLegDiffA + Compressor.mAttachedClusterFirstIndex[i] + 1, (O->nTrackClusters[i] - 1) * sizeof(O->sliceLegDiffA[0]), outputStream, direction);
188 GPUMemCpyAlways(myStep, O->padResA + offset - i, P->padResA + Compressor.mAttachedClusterFirstIndex[i] + 1, (O->nTrackClusters[i] - 1) * sizeof(O->padResA[0]), outputStream, direction);
189 GPUMemCpyAlways(myStep, O->timeResA + offset - i, P->timeResA + Compressor.mAttachedClusterFirstIndex[i] + 1, (O->nTrackClusters[i] - 1) * sizeof(O->timeResA[0]), outputStream, direction);
190 offset += O->nTrackClusters[i];
191 }
192 GPUMemCpyAlways(myStep, O->qPtA, P->qPtA, O->nTracks * sizeof(O->qPtA[0]), outputStream, direction);
193 GPUMemCpyAlways(myStep, O->rowA, P->rowA, O->nTracks * sizeof(O->rowA[0]), outputStream, direction);
194 GPUMemCpyAlways(myStep, O->sliceA, P->sliceA, O->nTracks * sizeof(O->sliceA[0]), outputStream, direction);
195 GPUMemCpyAlways(myStep, O->timeA, P->timeA, O->nTracks * sizeof(O->timeA[0]), outputStream, direction);
196 GPUMemCpyAlways(myStep, O->padA, P->padA, O->nTracks * sizeof(O->padA[0]), outputStream, direction);
197 }
198 if (gatherMode == 1) {
199 gatherTimer->Stop();
200 }
202 if (gatherMode == 3) {
205 }
206
207 if (mPipelineFinalizationCtx == nullptr) {
208 SynchronizeStream(outputStream);
209 } else {
211 }
212 mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
213 if (GetProcessingSettings().deterministicGPUReconstruction) {
216 }
218 return 0;
219}
220
222{
223 const bool needFullFiltering = GetProcessingSettings().tpcApplyCFCutsAtDecoding || (GetProcessingSettings().tpcApplyClusterFilterOnCPU > 0);
224 const bool runTimeBinCutFiltering = param().tpcCutTimeBin > 0;
225 if (needFullFiltering && !GetProcessingSettings().tpcUseOldCPUDecoding) {
226 GPUFatal("tpcApplyCFCutsAtDecoding, tpcApplyClusterFilterOnCPU and tpcCutTimeBin currently require tpcUseOldCPUDecoding");
227 }
228
230 const bool useTemporaryBz = cmprClsHost.nTracks && cmprClsHost.solenoidBz != -1e6f && cmprClsHost.solenoidBz != param().bzkG && !GetProcessingSettings().doublePipeline;
231 std::unique_ptr<GPUParam> tmpParam;
232 int32_t inputStream = 0;
233
234 if (useTemporaryBz) {
235 tmpParam = std::make_unique<GPUParam>(param());
238 WriteConstantParams(inputStream);
239 }
240 if (GetProcessingSettings().tpcUseOldCPUDecoding) {
241 const bool runFiltering = needFullFiltering || runTimeBinCutFiltering;
242 const auto& threadContext = GetThreadContext();
244 auto allocatorFinal = [this](size_t size) {
245 this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
246 this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
247 return this->mInputsHost->mPclusterNativeOutput;
248 };
249 std::unique_ptr<ClusterNative[]> tmpBuffer;
250 auto allocatorTmp = [&tmpBuffer](size_t size) {
251 return ((tmpBuffer = std::make_unique<ClusterNative[]>(size))).get();
252 };
253 auto& decompressTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
254 auto allocatorUse = runFiltering ? std::function<ClusterNative*(size_t)>{allocatorTmp} : std::function<ClusterNative*(size_t)>{allocatorFinal};
255 decompressTimer.Start();
256 if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocatorUse, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
257 GPUError("Error decompressing clusters");
258 return 1;
259 }
260 if (runFiltering) {
261 RunTPCClusterFilter(mClusterNativeAccess.get(), allocatorFinal, GetProcessingSettings().tpcApplyCFCutsAtDecoding);
262 }
263 decompressTimer.Stop();
265 if (mRec->IsGPU()) {
266 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
267 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
268 WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
269 *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
270 mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
271 mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
272 GPUMemCpy(RecoStep::TPCDecompression, mInputsShadow->mPclusterNativeBuffer, mIOPtrs.clustersNative->clustersLinear, sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * mIOPtrs.clustersNative->nClustersTotal, 0, true);
273 TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0);
275 }
276 } else {
278 RecoStep myStep = RecoStep::TPCDecompression;
279 bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
281 GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
282 const auto& threadContext = GetThreadContext();
283 CompressedClusters& inputGPU = Decompressor.mInputGPU;
284 CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
285
286 if (cmprClsHost.nTracks && cmprClsHost.solenoidBz != -1e6f && cmprClsHost.solenoidBz != param().bzkG) {
287 throw std::runtime_error("Configured solenoid Bz " + std::to_string(param().bzkG) + " does not match value used for track model encoding " + std::to_string(cmprClsHost.solenoidBz));
288 }
289 if (cmprClsHost.nTracks && cmprClsHost.maxTimeBin != -1e6 && cmprClsHost.maxTimeBin != param().continuousMaxTimeBin) {
290 throw std::runtime_error("Configured max time bin " + std::to_string(param().continuousMaxTimeBin) + " does not match value used for track model encoding " + std::to_string(cmprClsHost.maxTimeBin));
291 }
292
293 int32_t unattachedStream = mRec->NStreams() - 1;
294 inputGPU = cmprClsHost;
295 SetupGPUProcessor(&Decompressor, true);
296 WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
297 inputGPU = cmprClsHost;
298
299 bool toGPU = true;
300 runKernel<GPUMemClean16>({GetGridAutoStep(inputStream, RecoStep::TPCDecompression), krnlRunRangeNone, &mEvents->init}, DecompressorShadow.mNativeClustersIndex, NSECTORS * GPUTPCGeometry::NROWS * sizeof(DecompressorShadow.mNativeClustersIndex[0]));
301 int32_t nStreams = doGPU ? mRec->NStreams() - 1 : 1;
302 if (cmprClsHost.nAttachedClusters != 0) {
303 std::exclusive_scan(cmprClsHost.nTrackClusters, cmprClsHost.nTrackClusters + cmprClsHost.nTracks, Decompressor.mAttachedClustersOffsets, 0u); // computing clusters offsets for first kernel
304 for (int32_t iStream = 0; iStream < nStreams; iStream++) {
305 uint32_t startTrack = cmprClsHost.nTracks / nStreams * iStream;
306 uint32_t endTrack = cmprClsHost.nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.nTracks % nStreams); // index of last track (excluded from computation)
307 uint32_t numTracks = endTrack - startTrack;
308 uint32_t* offsets = Decompressor.mAttachedClustersOffsets;
309 uint32_t numClusters = (endTrack == cmprClsHost.nTracks ? offsets[endTrack - 1] + cmprClsHost.nTrackClusters[endTrack - 1] : offsets[endTrack]) - offsets[startTrack];
310 uint32_t numClustersRed = numClusters - numTracks;
311 GPUMemCpy(myStep, DecompressorShadow.mAttachedClustersOffsets + startTrack, Decompressor.mAttachedClustersOffsets + startTrack, numTracks * sizeof(Decompressor.mAttachedClustersOffsets[0]), iStream, toGPU);
312 GPUMemCpy(myStep, inputGPUShadow.nTrackClusters + startTrack, cmprClsHost.nTrackClusters + startTrack, numTracks * sizeof(cmprClsHost.nTrackClusters[0]), iStream, toGPU);
313 GPUMemCpy(myStep, inputGPUShadow.qTotA + offsets[startTrack], cmprClsHost.qTotA + offsets[startTrack], numClusters * sizeof(cmprClsHost.qTotA[0]), iStream, toGPU);
314 GPUMemCpy(myStep, inputGPUShadow.qMaxA + offsets[startTrack], cmprClsHost.qMaxA + offsets[startTrack], numClusters * sizeof(cmprClsHost.qMaxA[0]), iStream, toGPU);
315 GPUMemCpy(myStep, inputGPUShadow.flagsA + offsets[startTrack], cmprClsHost.flagsA + offsets[startTrack], numClusters * sizeof(cmprClsHost.flagsA[0]), iStream, toGPU);
316 GPUMemCpy(myStep, inputGPUShadow.rowDiffA + offsets[startTrack] - startTrack, cmprClsHost.rowDiffA + offsets[startTrack] - startTrack, numClustersRed * sizeof(cmprClsHost.rowDiffA[0]), iStream, toGPU);
317 GPUMemCpy(myStep, inputGPUShadow.sliceLegDiffA + offsets[startTrack] - startTrack, cmprClsHost.sliceLegDiffA + offsets[startTrack] - startTrack, numClustersRed * sizeof(cmprClsHost.sliceLegDiffA[0]), iStream, toGPU);
318 GPUMemCpy(myStep, inputGPUShadow.padResA + offsets[startTrack] - startTrack, cmprClsHost.padResA + offsets[startTrack] - startTrack, numClustersRed * sizeof(cmprClsHost.padResA[0]), iStream, toGPU);
319 GPUMemCpy(myStep, inputGPUShadow.timeResA + offsets[startTrack] - startTrack, cmprClsHost.timeResA + offsets[startTrack] - startTrack, numClustersRed * sizeof(cmprClsHost.timeResA[0]), iStream, toGPU);
320 GPUMemCpy(myStep, inputGPUShadow.sigmaPadA + offsets[startTrack], cmprClsHost.sigmaPadA + offsets[startTrack], numClusters * sizeof(cmprClsHost.sigmaPadA[0]), iStream, toGPU);
321 GPUMemCpy(myStep, inputGPUShadow.sigmaTimeA + offsets[startTrack], cmprClsHost.sigmaTimeA + offsets[startTrack], numClusters * sizeof(cmprClsHost.sigmaTimeA[0]), iStream, toGPU);
322 GPUMemCpy(myStep, inputGPUShadow.qPtA + startTrack, cmprClsHost.qPtA + startTrack, numTracks * sizeof(cmprClsHost.qPtA[0]), iStream, toGPU);
323 GPUMemCpy(myStep, inputGPUShadow.rowA + startTrack, cmprClsHost.rowA + startTrack, numTracks * sizeof(cmprClsHost.rowA[0]), iStream, toGPU);
324 GPUMemCpy(myStep, inputGPUShadow.sliceA + startTrack, cmprClsHost.sliceA + startTrack, numTracks * sizeof(cmprClsHost.sliceA[0]), iStream, toGPU);
325 GPUMemCpy(myStep, inputGPUShadow.timeA + startTrack, cmprClsHost.timeA + startTrack, numTracks * sizeof(cmprClsHost.timeA[0]), iStream, toGPU);
326 GPUMemCpy(myStep, inputGPUShadow.padA + startTrack, cmprClsHost.padA + startTrack, numTracks * sizeof(cmprClsHost.padA[0]), iStream, toGPU);
327 runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step0attached>({GetGridAuto(iStream), krnlRunRangeNone, {&mEvents->stream[iStream], &mEvents->init}}, startTrack, endTrack);
328 }
329 }
330 GPUMemCpy(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSECTORS * GPUTPCGeometry::NROWS * sizeof(cmprClsHost.nSliceRowClusters[0]), unattachedStream, toGPU);
331 GPUMemCpy(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), unattachedStream, toGPU);
332 GPUMemCpy(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), unattachedStream, toGPU);
333 GPUMemCpy(myStep, inputGPUShadow.flagsU, cmprClsHost.flagsU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.flagsU[0]), unattachedStream, toGPU);
334 GPUMemCpy(myStep, inputGPUShadow.padDiffU, cmprClsHost.padDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.padDiffU[0]), unattachedStream, toGPU);
335 GPUMemCpy(myStep, inputGPUShadow.timeDiffU, cmprClsHost.timeDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.timeDiffU[0]), unattachedStream, toGPU);
336 GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
337 GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);
338
339 TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
340 SynchronizeStream(inputStream);
341 uint32_t offset = 0;
342 uint32_t decodedAttachedClusters = 0;
343 for (uint32_t i = 0; i < NSECTORS; i++) {
344 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
345 uint32_t linearIndex = i * GPUTPCGeometry::NROWS + j;
346 uint32_t unattachedOffset = (linearIndex >= cmprClsHost.nSliceRows) ? 0 : cmprClsHost.nSliceRowClusters[linearIndex];
347 (mClusterNativeAccess->nClusters)[i][j] = Decompressor.mNativeClustersIndex[linearIndex] + unattachedOffset;
348 Decompressor.mUnattachedClustersOffsets[linearIndex] = offset;
349 offset += unattachedOffset;
350 decodedAttachedClusters += Decompressor.mNativeClustersIndex[linearIndex];
351 }
352 }
353 TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceTmpClustersOffsets, inputStream);
354 if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
355 GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
356 }
357 if (runTimeBinCutFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
358 Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
361 mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
362 mClusterNativeAccess->setOffsetPtrs();
364 WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
365 TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
366 } else { // If not filtering, directly allocate the final buffers
367 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
369 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
370 DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
371 Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
372 DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
373 Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
374 WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
375 if (doGPU) {
376 mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
377 mClusterNativeAccess->setOffsetPtrs();
378 *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
379 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
380 WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
381 TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
382 }
384 mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
385 mClusterNativeAccess->setOffsetPtrs();
386 *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
387 }
388
389 uint32_t batchSize = doGPU ? 6 : NSECTORS;
390 for (uint32_t iSector = 0; iSector < NSECTORS; iSector = iSector + batchSize) {
391 int32_t iStream = (iSector / batchSize) % mRec->NStreams();
392 runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSector, batchSize);
393 uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSector, mClusterNativeAccess->nClustersSector + iSector + batchSize, 0u);
394 if (!runTimeBinCutFiltering) {
395 GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSector][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSector][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
396 }
397 }
399
400 if (runTimeBinCutFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
402 WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
403 runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSECTORS * GPUTPCGeometry::NROWS * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
404 runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
405 TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
406 SynchronizeStream(unattachedStream);
407 uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
408 mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
410 AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
411 DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
412 Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
413 WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
414 for (uint32_t i = 0; i < NSECTORS; i++) {
415 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
416 mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUTPCGeometry::NROWS + j];
417 }
418 }
419 if (doGPU) {
420 mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
421 mClusterNativeAccess->setOffsetPtrs();
422 *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
423 processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
424 WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
425 TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
426 }
428 mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
429 mClusterNativeAccess->setOffsetPtrs();
430 runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
431 GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
432 SynchronizeStream(unattachedStream);
433 }
434 if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
435 runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
437 if (doGPU) {
438 for (uint32_t i = 0; i < NSECTORS; i++) {
439 for (uint32_t j = 0; j < GPUTPCGeometry::NROWS; j++) {
440 ClusterNative* begin = mInputsHost->mPclusterNativeOutput + decoded->clusterOffset[i][j];
441 ClusterNative* end = begin + decoded->nClusters[i][j];
442 std::sort(begin, end);
443 }
444 }
445 }
446 SynchronizeStream(unattachedStream);
447 }
448 mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
449 }
450 if (useTemporaryBz) {
452 param() = *tmpParam;
453 tmpParam.reset();
455 }
457 return 0;
458}
459
// Builds mClusterNativeAccessReduced: a ClusterNativeAccess that holds only the input clusters
// for which Compressor.rejectCluster() returns false and, in the label branch, their MC labels.
// Throws std::runtime_error when a required output allocator is not available.
// NOTE(review): this listing omits several source lines (the numbering jumps, e.g. 461->463,
// 474->476, 483->486); Compressor, clOutput, labelOutput, labelContainer, tmpContainer and some
// of the guards around the label handling are declared on lines that are not shown here.
460void GPUChainTracking::WriteReducedClusters()
461{
463 mClusterNativeAccessReduced = std::make_unique<ClusterNativeAccess>();
464 uint32_t nOutput = 0;
// Pass 1: count the non-rejected clusters per sector/row and accumulate the total in nOutput.
465 for (uint32_t iSec = 0; iSec < GPUTPCGeometry::NSECTORS; iSec++) {
466 for (uint32_t iRow = 0; iRow < GPUTPCGeometry::NROWS; iRow++) {
467 mClusterNativeAccessReduced->nClusters[iSec][iRow] = 0;
468 for (uint32_t i = 0; i < mIOPtrs.clustersNative->nClusters[iSec][iRow]; i++) {
// Adds 1 for every cluster that is kept (rejectCluster() is indexed by the global cluster index).
469 mClusterNativeAccessReduced->nClusters[iSec][iRow] += !Compressor.rejectCluster(mIOPtrs.clustersNative->clusterOffset[iSec][iRow] + i, param(), mIOPtrs);
470 }
471 nOutput += mClusterNativeAccessReduced->nClusters[iSec][iRow];
472 }
473 }
474
// Allocate room for nOutput clusters through clOutput->allocator and attach the buffer as clustersLinear.
476 if (!clOutput || !clOutput->allocator) {
477 throw std::runtime_error("No output allocator for clusterNative available");
478 }
479 auto* clBuffer = (ClusterNative*)clOutput->allocator(nOutput * sizeof(ClusterNative));
480 mClusterNativeAccessReduced->clustersLinear = clBuffer;
481 mClusterNativeAccessReduced->setOffsetPtrs();
482
// Output container / view pair for the MC labels; filled in only inside the label branch below.
483 std::pair<o2::dataformats::ConstMCLabelContainer*, o2::dataformats::ConstMCLabelContainerView*> labelBuffer;
486 if (!labelOutput || !labelOutput->allocator) {
487 throw std::runtime_error("No output allocator for clusterNative labels available");
488 }
490 labelBuffer = {&labelContainer->first, &labelContainer->second};
491 }
492
// Pass 2: copy every non-rejected cluster into clBuffer. nOutput is reused as the running
// output index, which is also the index under which the cluster's labels are added to tmpContainer.
493 nOutput = 0;
495 for (uint32_t i = 0; i < mIOPtrs.clustersNative->nClustersTotal; i++) {
496 if (!Compressor.rejectCluster(i, param(), mIOPtrs)) {
498 for (const auto& element : mIOPtrs.clustersNative->clustersMCTruth->getLabels(i)) {
499 tmpContainer.addElement(nOutput, element);
500 }
501 }
502 clBuffer[nOutput++] = mIOPtrs.clustersNative->clustersLinear[i];
503 }
504 }
// Flatten the collected labels into the output container, assign the view from it,
// and publish the view as clustersMCTruth of the reduced cluster access.
507 tmpContainer.flatten_to(*labelBuffer.first);
508 *labelBuffer.second = *labelBuffer.first;
509 mClusterNativeAccessReduced->clustersMCTruth = labelBuffer.second;
510 }
511}
A const (read-only) version of MCTruthContainer.
atype::type element
int32_t i
uint32_t iSector
uint32_t j
Definition RawData.h:0
void Start()
Definition timer.cxx:64
void Stop()
Definition timer.cxx:76
void addElement(uint32_t dataindex, TruthElement const &element, bool noElement=false)
size_t flatten_to(ContainerType &container) const
static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat *cls)
std::unique_ptr< o2::tpc::ClusterNativeAccess > mClusterNativeAccess
static void DumpClusters(std::ostream &out, const o2::tpc::ClusterNativeAccess *clusters)
std::unique_ptr< GPUTrackingInputProvider > mInputsHost
std::array< GPUOutputControl *, GPUTrackingOutputs::count()> mSubOutputControls
std::unique_ptr< std::ofstream > mDebugFile
GPUTrackingInOutPointers & mIOPtrs
std::unique_ptr< GPUTrackingInputProvider > mInputsShadow
std::unique_ptr< o2::tpc::ClusterNativeAccess > mClusterNativeAccessReduced
void RecordMarker(deviceEvent *ev, int32_t stream)
Definition GPUChain.h:108
void TransferMemoryResourceLinkToGPU(RecoStep step, int16_t res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:124
void GPUMemCpyAlways(RecoStep step, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:130
void GPUMemCpy(RecoStep step, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:129
bool DoDebugAndDump(RecoStep step, uint32_t mask, T &processor, S T::*func, Args &&... args)
Definition GPUChain.h:239
void SynchronizeGPU()
Definition GPUChain.h:110
GPUReconstruction::RecoStepField GetRecoStepsGPU() const
Definition GPUChain.h:72
void WriteToConstantMemory(RecoStep step, size_t offset, const void *src, size_t size, int32_t stream=-1, deviceEvent *ev=nullptr)
Definition GPUChain.h:128
krnlExec GetGridAuto(int32_t stream, GPUReconstruction::krnlDeviceType d=GPUReconstruction::krnlDeviceType::Auto, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
Definition GPUChain.cxx:42
GPUChain * GetNextChainInQueue()
Definition GPUChain.h:236
size_t AllocateRegisteredMemory(GPUProcessor *proc)
Definition GPUChain.h:228
virtual std::unique_ptr< GPUReconstructionProcessing::threadContext > GetThreadContext()
Definition GPUChain.h:109
GPUConstantMem * processors()
Definition GPUChain.h:84
static constexpr krnlRunRange krnlRunRangeNone
Definition GPUChain.h:41
bool DoDebugDump(uint32_t mask, std::function< void(Args &...)> func, Args &... args)
krnlExec GetGridAutoStep(int32_t stream, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
Definition GPUChain.cxx:47
GPUParam & param()
Definition GPUChain.h:87
void SetupGPUProcessor(T *proc, bool allocate)
Definition GPUChain.h:231
krnlExec GetGridBlkStep(uint32_t nBlocks, int32_t stream, gpudatatypes::RecoStep st=gpudatatypes::RecoStep::NoRecoStep)
Definition GPUChain.cxx:37
const GPUSettingsProcessing & GetProcessingSettings() const
Definition GPUChain.h:76
void SynchronizeStream(int32_t stream)
Definition GPUChain.h:89
GPUReconstructionCPU * mRec
Definition GPUChain.h:79
GPUConstantMem * processorsShadow()
Definition GPUChain.h:85
static constexpr int32_t NSECTORS
Definition GPUChain.h:58
void TransferMemoryResourceLinkToHost(RecoStep step, int16_t res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
Definition GPUChain.h:125
void TransferMemoryResourcesToHost(RecoStep step, GPUProcessor *proc, int32_t stream=-1, bool all=false)
Definition GPUChain.h:123
void WriteConstantParams(int32_t stream=-1)
Definition GPUChain.h:127
void SynchronizeEventAndRelease(deviceEvent &ev, bool doGPU=true)
Definition GPUChain.h:92
void TransferMemoryResourcesToGPU(RecoStep step, GPUProcessor *proc, int32_t stream=-1, bool all=false)
Definition GPUChain.h:122
const GPUDefParameters & getGPUParameters(bool doGPU) const override
virtual void * getGPUPointer(void *ptr)
void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor *proc=nullptr)
void PushNonPersistentMemory(uint64_t tag)
void BlockStackedMemory(GPUReconstruction *rec)
const GPUSettingsProcessing & GetProcessingSettings() const
const GPUSettingsGRP & GetGRPSettings() const
void DumpCompressedClusters(std::ostream &out)
o2::tpc::CompressedClusters * mOutput
o2::tpc::CompressedClusters * mOutputA
o2::tpc::CompressedClustersPtrs mPtrs
o2::tpc::CompressedClustersFlat * mOutputFlat
o2::tpc::CompressedClusters mInputGPU
o2::tpc::ClusterNative * mNativeClustersBuffer
o2::tpc::ClusterNativeAccess * mClusterNativeAccess
static constexpr uint32_t NROWS
static constexpr uint32_t NSECTORS
static int32_t decompress(const o2::tpc::CompressedClustersFlat *clustersCompressed, o2::tpc::ClusterNativeAccess &clustersNative, std::function< o2::tpc::ClusterNative *(size_t)> allocator, const GPUParam &param, bool deterministicRec)
GLdouble n
Definition glcorearb.h:1982
GLsizeiptr size
Definition glcorearb.h:659
GLuint GLsizei const GLuint const GLintptr * offsets
Definition glcorearb.h:2595
GLuint GLuint end
Definition glcorearb.h:469
GLintptr offset
Definition glcorearb.h:660
GLenum GLfloat param
Definition glcorearb.h:271
std::unique_ptr< const o2::dataformats::MCTruthContainer< MCLabel > > getLabels(framework::ProcessingContext &pc, std::string_view dataBind, EventType eventType=EventType::Standard)
Global TPC definitions and constants.
Definition SimTraits.h:168
Enum< T >::Iterator begin(Enum< T >)
Definition Defs.h:156
std::string to_string(gsl::span< T, Size > span)
Definition common.h:52
constexpr T qStr2Tag(const char(&str)[N])
Definition strtag.h:24
deviceEvent stream[constants::GPU_MAX_STREAMS]
GPUTPCDecompression tpcDecompressor
GPUTrackingInOutPointers ioPtrs
GPUTPCCompression tpcCompressor
std::function< void *(size_t)> allocator
void UpdateBzOnly(float newSolenoidBz, bool assumeConstantBz=false)
Definition GPUParam.cxx:118
const o2::tpc::ClusterNativeAccess * clustersNative
const o2::tpc::CompressedClustersFlat * tpcCompressedClusters
const o2::tpc::ClusterNativeAccess * clustersNativeReduced
size_t getIndex(const GPUOutputControl &v)
GPUOutputControl compressedClusters
unsigned int nClusters[constants::MAXSECTOR][constants::MAXGLOBALPADROW]
const o2::dataformats::ConstMCTruthContainerView< o2::MCCompLabel > * clustersMCTruth
std::pair< ConstMCLabelContainer, ConstMCLabelContainerView > ConstMCLabelContainerViewWithBuffer
unsigned int clusterOffset[constants::MAXSECTOR][constants::MAXGLOBALPADROW]
const ClusterNative * clustersLinear
void set(size_t bufferSize, const CompressedClusters &v)