Project
Loading...
Searching...
No Matches
GPUReconstructionCPU.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
18#include "GPUChain.h"
19#include "GPUDefParametersRuntime.h"
20#include "GPUTPCGMMergedTrack.h"
22#include "GPUTRDTrackletWord.h"
24#include "GPUTPCMCInfo.h"
25#include "GPUTRDTrack.h"
26#include "GPUTRDTracker.h"
27#include "AliHLTTPCRawCluster.h"
29#include "GPUMemoryResource.h"
30#include "GPUConstantMem.h"
31#include "GPULogging.h"
33#include "GPUReconstructionProcessingKernels.inc"
35
36#include <atomic>
37#include <ctime>
38#include <string>
39
40#ifndef _WIN32
41#include <unistd.h>
42#endif
43
44using namespace o2::gpu;
45
48
50
// Destructor body (signature line elided in this extraction view — presumably
// GPUReconstructionCPU::~GPUReconstructionCPU(); confirm against the header).
52{
53 Exit(); // Needs to be identical to GPU backend behavior in order to avoid calling abstract methods later in the destructor
54}
55
// Host-side backend that executes kernel class T (variant I) on the CPU.
// The kernel's blocks (x.nBlocks) are iterated on the host; each block runs with a
// single thread (x.nThreads must be 1) and its own per-block GPUSharedMemory instance.
// When more than one host kernel thread is configured, blocks are distributed over a
// TBB task arena; otherwise they run sequentially on the calling thread.
// Throws std::runtime_error if the launch parameters are not host-compatible.
56template <class T, int32_t I, typename... Args>
57inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
58{
59 auto& x = _xyz.x;
60 auto& y = _xyz.y;
// Reject launches that explicitly request device execution or per-block multi-threading.
61 if (x.device == krnlDeviceType::Device) {
62 throw std::runtime_error("Cannot run device kernel on host");
63 }
64 if (x.nThreads != 1) {
65 throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
66 }
67 int32_t nThreads = getNKernelHostThreads(false);
68 if (nThreads > 1) {
69 if (GetProcessingSettings().debugLevel >= 5) {
70 GPUInfo("Running %d Threads", mThreading->activeThreads->max_concurrency());
71 }
// isolate() keeps this parallel loop from stealing unrelated tasks of the outer
// arena; execute() runs it inside the dedicated activeThreads arena.
72 tbb::this_task_arena::isolate([&] {
73 mThreading->activeThreads->execute([&] {
// Grain size 1: each "GPU block" is an independent work item.
74 tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
75 typename T::GPUSharedMemory smem;
76 for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
// Thread<I>(nBlocks, nThreads=1, blockId, threadId=0, smem, processor, args...)
77 T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
78 }
79 });
80 });
81 });
82 } else {
// Single-threaded fallback: run all blocks sequentially.
83 for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
84 typename T::GPUSharedMemory smem;
85 T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
86 }
87 }
88}
89
90template <>
91inline void GPUReconstructionCPU::runKernelBackend<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
92{
93 int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
94 if (nThreads > 1) {
95 tbb::parallel_for(0, nThreads, [&](int iThread) {
96 size_t threadSize = size / nThreads;
97 if (threadSize % 4096) {
98 threadSize += 4096 - threadSize % 4096;
99 }
100 size_t offset = threadSize * iThread;
101 size_t mySize = std::min<size_t>(threadSize, size - offset);
102 if (mySize) {
103 memset((char*)ptr + offset, 0, mySize);
104 } // clang-format off
105 }, tbb::static_partitioner()); // clang-format on
106 } else {
107 memset(ptr, 0, size);
108 }
109}
110
// Return the launch properties (max threads, min blocks, forced block count) for
// kernel S (variant I), read from the per-kernel parameter tables. With gpu == -1
// the backend type is auto-detected via IsGPU(); the signature line itself is elided
// in this extraction view (header declares: krnlProperties getKernelProperties(int gpu = -1)).
111template <class S, int32_t I>
113{
114 if (gpu == -1) {
115 gpu = IsGPU();
116 }
117 const auto num = GetKernelNum<S, I>();
// Pick the device or CPU parameter table depending on the target backend.
118 const auto* p = gpu ? mParDevice : mParCPU;
119 GPUReconstructionProcessing::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
// 0 entries mean "unset": default thread count to the GPU thread count (or 1 on CPU),
// and require at least one block.
120 if (ret.nThreads == 0) {
121 ret.nThreads = gpu ? mThreadCount : 1u;
122 }
123 if (ret.minBlocks == 0) {
124 ret.minBlocks = 1;
125 }
126 return ret;
127}
128
// Explicitly instantiate getKernelProperties for every kernel: GPUCA_KRNL is
// redefined to emit one template instantiation per entry of
// GPUReconstructionKernelList.h, then undefined again.
129#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
130 template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
131#include "GPUReconstructionKernelList.h"
132#undef GPUCA_KRNL
133
134size_t GPUReconstructionCPU::TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst) { return 0; }
135size_t GPUReconstructionCPU::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) { return 0; }
136size_t GPUReconstructionCPU::GPUMemCpyAlways(bool onGpu, void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents)
137{
138 memcpy(dst, src, size);
139 return 0;
140}
141size_t GPUReconstructionCPU::WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream, deviceEvent* ev) { return 0; }
142int32_t GPUReconstructionCPU::GPUDebug(const char* state, int32_t stream, bool force) { return 0; }
// Transfer all matching memory resources of one processor (or of all processors when
// proc == nullptr) to/from the GPU, accumulating the number of bytes moved in n.
// NOTE(review): several lines (145-146, 149, 156, 163, 165 of the original file) are
// elided in this extraction view — presumably the inc/exc type-flag setup, the local
// `res` reference into mMemoryResources[i], one more filter condition, and the actual
// TransferMemoryResourceToGPU/ToHost calls; confirm against the full source.
143size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, int32_t stream, bool all, bool toGPU)
144{
147 size_t n = 0;
148 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
// Skip unallocated resources and resources belonging to other processors.
150 if (res.mPtr == nullptr) {
151 continue;
152 }
153 if (proc && res.mProcessor != proc) {
154 continue;
155 }
157 continue;
158 }
// Honor the inclusion/exclusion type flags unless keepAllMemory or `all` overrides them.
159 if (!GetProcessingSettings().keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
160 continue;
161 }
162 if (toGPU) {
164 } else {
166 }
167 }
168 return n;
169}
170
// Return an OS-level identifier of the calling thread (signature line elided in this
// extraction view — presumably int32_t GPUReconstructionCPU::GetThread()).
172{
173// Get Thread ID
174#if defined(__APPLE__)
175 return (0); // syscall is deprecated on MacOS..., only needed for GPU support which we don't do on Mac anyway
176#elif defined(_WIN32)
// Windows: use the pseudo handle of the current thread, truncated to 32 bit.
177 return ((int32_t)(size_t)GetCurrentThread());
178#else
// Linux: gettid has no glibc wrapper on older systems, so go through syscall().
179 return ((int32_t)syscall(SYS_gettid));
180#endif
181}
182
// Backend initialization for the CPU "device" (signature line elided in this
// extraction view — presumably int32_t GPUReconstructionCPU::InitDevice()).
// Sets up the TBB task arena for host kernels and, with the GLOBAL allocation
// strategy, allocates the aligned host memory pool (only on the master instance).
// NOTE(review): lines 185, 189-190, 194-195, 198, 200 of the original file are elided
// here — confirm the skipped logic against the full source.
184{
186 mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
187 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
// Only the master owns the pool; slaves share the master's buffer.
188 if (mMaster == nullptr) {
191 }
// Over-aligned allocation so GPU-style buffer alignment also holds on the host.
192 mHostMemoryBase = operator new(mHostMemorySize, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
193 }
196 }
197 if (GetProcessingSettings().inKernelParallel) {
199 }
201 return 0;
202}
203
// Backend teardown (signature line elided in this extraction view — presumably
// int32_t GPUReconstructionCPU::ExitDevice()). Releases the global host memory pool
// with the matching aligned operator delete; only the master instance owns the buffer.
205{
206 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
207 if (mMaster == nullptr) {
// Must mirror the aligned operator new in the init path.
208 operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
209 }
211 mHostMemorySize = 0;
212 }
213 return 0;
214}
215
// Run all registered processing chains for one event and print/reset timing and
// memory statistics (signature line elided in this extraction view — presumably
// int32_t GPUReconstructionCPU::RunChains()). Returns the last chain's return value
// (0 and 2 are treated as success for the statistics path).
// NOTE(review): numerous lines of the original (228, 232, 235, 239, 251, 253, 258,
// 261, 311-317, 333, 337, 339) are elided here — confirm against the full source.
217{
// Optionally fuzz the memory size scaling factors for allocation-robustness testing.
218 mMemoryScalers->temporaryFactor = 1.;
219 if (GetProcessingSettings().memoryScalingFuzz) {
220 static std::mt19937 rng;
221 static std::uniform_int_distribution<uint64_t> dist(0, 1000000);
// memoryScalingFuzz == 1 means "random seed"; any other value is used directly.
222 uint64_t fuzzFactor = GetProcessingSettings().memoryScalingFuzz == 1 ? dist(rng) : GetProcessingSettings().memoryScalingFuzz;
223 GPUInfo("Fuzzing memory scaling factor with %lu", fuzzFactor);
224 mMemoryScalers->fuzzScalingFactor(fuzzFactor);
225 }
226
227 mStatNEvents++;
229
230 if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
231 GPUInfo("Allocated memory when starting processing %34s", "");
233 }
234
// Run the chains, measuring the consumed CPU time across all of them.
236 const std::clock_t cpuTimerStart = std::clock();
237 int32_t retVal = 0;
238 if (GetProcessingSettings().doublePipeline) {
240 } else {
// With master/slave instances the constant parameters must be rewritten each run.
241 if (mSlaves.size() || mMaster) {
242 WriteConstantParams(); // Reinitialize // TODO: Get this in sync with GPUChainTracking::DoQueuedUpdates, and consider the doublePipeline
243 }
244 for (uint32_t i = 0; i < mChains.size(); i++) {
245 retVal = mChains[i]->RunChain();
246 }
247 }
// retVal == 2 is apparently a non-fatal status; only other non-zero values abort.
248 if (retVal != 0 && retVal != 2) {
249 return retVal;
250 }
252 if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
254 }
255 mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
256 if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
257 GPUInfo("Allocated memory when ending processing %36s", "");
259 }
260
// Assemble the timing report. NOTE(review): "avergage" below is a typo ("average")
// in a user-visible string — fix in a dedicated change, not in this doc pass.
262 std::string nEventReport;
263 if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) {
264 nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)";
265 }
266 double kernelTotal = 0;
267 std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);
268
269 debugWriter writer(GetProcessingSettings().debugCSV, GetProcessingSettings().debugMarkdown, mStatNEvents);
270
// Per-kernel timers: sum per-kernel time, attribute it to its reco step, and emit one row each.
271 if (GetProcessingSettings().debugLevel >= 1) {
272 writer.header();
273 for (uint32_t i = 0; i < mTimers.size(); i++) {
274 double time = 0;
275 if (mTimers[i] == nullptr) {
276 continue;
277 }
278 for (int32_t j = 0; j < mTimers[i]->num; j++) {
279 HighResTimer& timer = mTimers[i]->timer[j];
280 time += timer.GetElapsedTime();
281 if (GetProcessingSettings().resetTimers) {
282 timer.Reset();
283 }
284 }
285
// type 0 timers are kernel timers and contribute to the per-step kernel totals.
286 uint32_t type = mTimers[i]->type;
287 if (type == 0) {
288 kernelTotal += time;
289 int32_t stepNum = getRecoStepNum(mTimers[i]->step);
290 kernelStepTimes[stepNum] += time;
291 }
292 writer.row('K', mTimers[i]->count, mTimers[i]->name.c_str(), time, -1.0, -1.0, mTimers[i]->memSize);
293 if (GetProcessingSettings().resetTimers) {
294 mTimers[i]->count = 0;
295 mTimers[i]->memSize = 0;
296 }
297 }
298 }
// Per-reco-step rows: task time, DMA to GPU, DMA to host; then the general steps and a Wall total.
299 if (GetProcessingSettings().recoTaskTiming) {
300 for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
301 if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
302 writer.row(' ', 0, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)", kernelStepTimes[i], mTimersRecoSteps[i].timerCPU, mTimersRecoSteps[i].timerTotal.GetElapsedTime(), 0);
303 }
304 if (mTimersRecoSteps[i].bytesToGPU) {
305 writer.row('D', mTimersRecoSteps[i].countToGPU, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)", mTimersRecoSteps[i].timerToGPU.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToGPU);
306 }
307 if (mTimersRecoSteps[i].bytesToHost) {
308 writer.row('D', mTimersRecoSteps[i].countToHost, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)", mTimersRecoSteps[i].timerToHost.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToHost);
309 }
310 if (GetProcessingSettings().resetTimers) {
318 }
319 }
320 for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
321 if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
322 writer.row(' ', 0, gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime(), -1.0, -1.0, 0);
323 }
324 }
325 double gpu_time = GetProcessingSettings().debugLevel >= 1 ? kernelTotal : -1.0;
326 writer.row(' ', 0, "Wall", gpu_time, mStatCPUTime, mTimerTotal.GetElapsedTime(), 0, nEventReport);
327 } else if (GetProcessingSettings().debugLevel >= 0) {
328 GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
329 }
// Reset the accumulated event statistics if requested.
330 if (GetProcessingSettings().resetTimers) {
331 mStatNEvents = 0;
332 mStatCPUTime = 0;
334 }
335
336 if (GetProcessingSettings().memoryStat) {
338 } else if (GetProcessingSettings().debugLevel >= 2) {
340 }
341
342 return retVal;
343}
344
// Re-initialize the device-side counterpart of every host processor (signature line
// elided in this extraction view — presumably
// void GPUReconstructionCPU::ResetDeviceProcessorTypes()).
346{
347 for (uint32_t i = 0; i < mProcessors.size(); i++) {
// Only host-type processors that have a linked device processor need re-initialization.
348 if (mProcessors[i].proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && mProcessors[i].proc->mLinkedProcessor) {
349 mProcessors[i].proc->mLinkedProcessor->InitGPUProcessor(this, GPUProcessor::PROCESSOR_TYPE_DEVICE);
350 }
351 }
352}
353
354void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream, deviceEvent* ev)
355{
356 if (mapHost && mapSize != GPUTPCClusterOccupancyMapBin::getNBins(param())) {
357 throw std::runtime_error("Updating occupancy map with object of invalid size");
358 }
359 param().occupancyMap = mapHost;
360 param().occupancyMapSize = mapSize;
361 param().occupancyTotal = occupancyTotal;
362 if (IsGPU()) {
363 if (!((size_t)&param().occupancyMapSize - (size_t)&param().occupancyMap == sizeof(param().occupancyMap) + sizeof(param().occupancyTotal) && sizeof(param().occupancyMap) == sizeof(void*) && sizeof(param().occupancyTotal) == sizeof(uint32_t))) { // TODO: Make static assert, and check alignment
364 throw std::runtime_error("occupancy data not consecutive in GPUParam");
365 }
366 struct tmpOccuapncyParam {
367 const void* ptr;
368 uint32_t total;
369 uint32_t size;
370 };
371 tmpOccuapncyParam tmp = {mapGPU, occupancyTotal, mapSize};
372 const auto holdContext = GetThreadContext();
373 WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(tmp), stream, ev);
374 }
375}
int16_t time
Definition RawEventData.h:4
int32_t i
#define GPUCA_BUFFER_ALIGNMENT
int32_t retVal
Online TRD tracker based on extrapolated TPC tracks.
Used for storing the MC labels for the TRD tracklets.
TRD Tracklet word for GPU tracker - 32bit tracklet info + half chamber ID + index.
uint32_t j
Definition RawData.h:0
uint32_t res
Definition RawData.h:0
TBranch * ptr
double num
void Reset()
Definition timer.cxx:108
void Start()
Definition timer.cxx:64
double GetElapsedTime()
Definition timer.cxx:115
void Stop()
Definition timer.cxx:76
ProcessorType mGPUProcessorType
virtual size_t GPUMemCpy(void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
void runKernelBackend(const krnlSetupTime &_xyz, const Args &... args)
virtual size_t GPUMemCpyAlways(bool onGpu, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
static constexpr krnlRunRange krnlRunRangeNone
size_t TransferMemoryResourceToHost(GPUMemoryResource *res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
void UpdateParamOccupancyMap(const uint32_t *mapHost, const uint32_t *mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream=-1, deviceEvent *ev=nullptr)
size_t TransferMemoryResourceToGPU(GPUMemoryResource *res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
GPUProcessorProcessors mProcShadow
krnlProperties getKernelProperties(int gpu=-1)
virtual int32_t GPUDebug(const char *state="UNKNOWN", int32_t stream=-1, bool force=false)
static constexpr krnlEvent krnlEventNone
size_t WriteToConstantMemory(size_t offset, const void *src, size_t size, int32_t stream=-1, deviceEvent *ev=nullptr) override
virtual size_t TransferMemoryInternal(GPUMemoryResource *res, int32_t stream, deviceEvent *ev, deviceEvent *evList, int32_t nEvents, bool toGPU, const void *src, void *dst)
RecoStepTimerMeta mTimersRecoSteps[gpudatatypes::N_RECO_STEPS]
std::vector< std::unique_ptr< timerMeta > > mTimers
HighResTimer mTimersGeneralSteps[gpudatatypes::N_GENERAL_STEPS]
virtual std::unique_ptr< threadContext > GetThreadContext() override
std::vector< std::unique_ptr< GPUChain > > mChains
std::unique_ptr< GPUMemorySizeScalers > mMemoryScalers
std::vector< GPUReconstruction * > mSlaves
std::vector< GPUMemoryResource > mMemoryResources
std::vector< ProcessorData > mProcessors
void WriteConstantParams(int32_t stream=-1)
static GPUReconstruction * GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend &cfg)
void ClearAllocatedMemory(bool clearOutputs=true)
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
const GPUSettingsProcessing & GetProcessingSettings() const
int32_t EnqueuePipeline(bool terminate=false)
std::shared_ptr< GPUReconstructionThreading > mThreading
GLdouble n
Definition glcorearb.h:1982
GLint GLenum GLint x
Definition glcorearb.h:403
GLenum src
Definition glcorearb.h:1767
GLint GLsizei count
Definition glcorearb.h:399
GLsizeiptr size
Definition glcorearb.h:659
GLuint const GLchar * name
Definition glcorearb.h:781
GLint GLint GLsizei GLint GLenum GLenum type
Definition glcorearb.h:275
GLenum GLenum dst
Definition glcorearb.h:1767
GLintptr offset
Definition glcorearb.h:660
GLboolean r
Definition glcorearb.h:1233
GLenum GLfloat param
Definition glcorearb.h:271
GLuint GLuint stream
Definition glcorearb.h:1806
std::string to_string(gsl::span< T, Size > span)
Definition common.h:52
const uint32_t * occupancyMap
Definition GPUParam.h:64
const int nEvents
Definition test_Fifo.cxx:27