GPUReconstructionCPU.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUReconstructionCPU.cxx
/// \author David Rohr

#include "GPUReconstructionCPU.h"
#include "GPUChain.h"
#include "GPUDefParameters.h"
#include "GPUTPCClusterData.h"
#include "GPUTPCGMMergedTrack.h"
#include "GPUTRDTrackletWord.h"
#include "GPUTRDTrackletLabels.h"
#include "GPUTPCMCInfo.h"
#include "GPUTRDTrack.h"
#include "GPUTRDTracker.h"
#include "AliHLTTPCRawCluster.h"
#include "GPUMemoryResource.h"
#include "GPUConstantMem.h"
#include "GPUReconstructionThreading.h"
#include <atomic>
#include <ctime>

#define GPUCA_LOGGING_PRINTF
#include "GPULogging.h"

#ifndef _WIN32
#include <unistd.h>
#endif

using namespace o2::gpu;
using namespace o2::gpu::gpu_reconstruction_kernels;

constexpr GPUReconstruction::krnlRunRange GPUReconstruction::krnlRunRangeNone;
constexpr GPUReconstruction::krnlEvent GPUReconstruction::krnlEventNone;

GPUReconstruction* GPUReconstruction::GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionCPU(cfg); }

GPUReconstructionCPU::~GPUReconstructionCPU()
{
  Exit(); // Needs to be identical to GPU backend behavior in order to avoid calling abstract methods later in the destructor
}

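// Host-side execution of a GPU kernel: each GPU "block" becomes one iteration of a loop
// (or one TBB task when several host kernel threads are active), and per-block shared
// memory is emulated by a stack-allocated GPUSharedMemory instance. nThreads must be 1,
// since a CPU "block" has exactly one thread.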
template <class T, int32_t I, typename... Args>
inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
{
  auto& x = _xyz.x;
  auto& y = _xyz.y;
  if (x.device == krnlDeviceType::Device) {
    throw std::runtime_error("Cannot run device kernel on host");
  }
  if (x.nThreads != 1) {
    throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
  }
  int32_t nThreads = getNKernelHostThreads(false);
  if (nThreads > 1) {
    if (mProcessingSettings.debugLevel >= 5) {
      printf("Running %d Threads\n", nThreads);
    }
    tbb::this_task_arena::isolate([&] {
      mThreading->activeThreads->execute([&] {
        tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
          typename T::GPUSharedMemory smem;
          for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
            T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
          }
        });
      });
    });
  } else {
    for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
      typename T::GPUSharedMemory smem;
      T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
    }
  }
}
90
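// Specialization for the GPUMemClean16 kernel: large buffers are zeroed in parallel,
// roughly one 16 MB chunk per thread with chunk boundaries rounded up to 4 KiB,
// while small buffers fall back to a single memset.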
template <>
inline void GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
  int32_t nnThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
  if (nnThreads > 1) {
    tbb::parallel_for(0, nnThreads, [&](int iThread) {
      size_t threadSize = size / nnThreads;
      if (threadSize % 4096) {
        threadSize += 4096 - threadSize % 4096;
      }
      size_t offset = threadSize * iThread;
      size_t mySize = std::min<size_t>(threadSize, size - offset);
      if (mySize) {
        memset((char*)ptr + offset, 0, mySize);
      } // clang-format off
    }, tbb::static_partitioner()); // clang-format on
  } else {
    memset(ptr, 0, size);
  }
}
111
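// Entry point called by the kernel dispatcher: unpacks the argument tuple stored in
// krnlSetupArgs and forwards it together with the launch setup to runKernelBackendInternal.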
template <class T, int32_t I, typename... Args>
void GPUReconstructionCPUBackend::runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs<T, I, Args...>& args)
{
#pragma GCC diagnostic push
#if defined(__clang__)
#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not always captured below
#endif
  std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
#pragma GCC diagnostic pop
}
122
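// Returns the launch bounds (threads per block, minimum blocks) for kernel S/I from the
// device or CPU parameter table, substituting sensible defaults where the table holds 0.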
template <class S, int32_t I>
gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
{
  if (gpu == -1) {
    gpu = IsGPU();
  }
  const auto num = GetKernelNum<S, I>();
  const auto* p = gpu ? mParDevice : mParCPU;
  gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
  if (ret.nThreads == 0) {
    ret.nThreads = gpu ? mThreadCount : 1u;
  }
  if (ret.minBlocks == 0) {
    ret.minBlocks = 1;
  }
  return ret;
}
140
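// Instantiate runKernelBackend / getKernelProperties for every kernel listed in
// GPUReconstructionKernelList.h. For illustration (kernel name chosen arbitrarily),
// one expansion yields for example:
//   template void GPUReconstructionCPUBackend::runKernelBackend<GPUTPCTrackletConstructor>(...);
//   template krnlProperties GPUReconstructionCPU::getKernelProperties<GPUTPCTrackletConstructor>(int gpu);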
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...)                                                                                                   \
  template void GPUReconstructionCPUBackend::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
  template krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
146
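// On the pure CPU backend, host and "device" memory coincide, so the transfer and debug
// hooks below are no-ops (or a plain memcpy) that report zero bytes moved.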
size_t GPUReconstructionCPU::TransferMemoryInternal(GPUMemoryResource* res, int32_t stream, deviceEvent* ev, deviceEvent* evList, int32_t nEvents, bool toGPU, const void* src, void* dst) { return 0; }
size_t GPUReconstructionCPU::GPUMemCpy(void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents) { return 0; }
size_t GPUReconstructionCPU::GPUMemCpyAlways(bool onGpu, void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev, deviceEvent* evList, int32_t nEvents)
{
  memcpy(dst, src, size);
  return 0;
}
size_t GPUReconstructionCPU::WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream, deviceEvent* ev) { return 0; }
int32_t GPUReconstructionCPU::GPUDebug(const char* state, int32_t stream, bool force) { return 0; }
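// Transfers all registered memory resources of one processor (or of all processors if
// proc is nullptr) in the requested direction, honoring the input/output flags unless
// "all" or keepAllMemory is set.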
size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, int32_t stream, bool all, bool toGPU)
{
  int32_t inc = toGPU ? GPUMemoryResource::MEMORY_INPUT_FLAG : GPUMemoryResource::MEMORY_OUTPUT_FLAG;
  int32_t exc = toGPU ? GPUMemoryResource::MEMORY_OUTPUT_FLAG : GPUMemoryResource::MEMORY_INPUT_FLAG;
  size_t n = 0;
  for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
    GPUMemoryResource& res = mMemoryResources[i];
    if (res.mPtr == nullptr) {
      continue;
    }
    if (proc && res.mProcessor != proc) {
      continue;
    }
    if (!(res.mType & GPUMemoryResource::MEMORY_GPU) || (res.mType & GPUMemoryResource::MEMORY_CUSTOM_TRANSFER)) {
      continue;
    }
    if (!mProcessingSettings.keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
      continue;
    }
    if (toGPU) {
      n += TransferMemoryResourceToGPU(&mMemoryResources[i], stream);
    } else {
      n += TransferMemoryResourceToHost(&mMemoryResources[i], stream);
    }
  }
  return n;
}
183
int32_t GPUReconstructionCPU::GetThread()
{
// Get Thread ID
#if defined(__APPLE__)
  return (0); // syscall is deprecated on MacOS..., only needed for GPU support which we don't do on Mac anyway
#elif defined(_WIN32)
  return ((int32_t)(size_t)GetCurrentThread());
#else
  return ((int32_t)syscall(SYS_gettid));
#endif
}
195
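// Host-backend initialization: creates the TBB task arena for kernel execution and, with
// the global allocation strategy, reserves the host memory pool up front.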
int32_t GPUReconstructionCPU::InitDevice()
{
  mActiveHostKernelThreads = mMaxHostThreads;
  mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
  if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
    if (mMaster == nullptr) {
      if (mDeviceMemorySize > mHostMemorySize) {
        mHostMemorySize = mDeviceMemorySize;
      }
      mHostMemoryBase = operator new(mHostMemorySize GPUCA_OPERATOR_NEW_ALIGNMENT);
    }
    mHostMemoryPermanent = mHostMemoryBase;
    ClearAllocatedMemory();
  }
  if (mProcessingSettings.inKernelParallel) {
    mBlockCount = mActiveHostKernelThreads;
  }
  return 0;
}
216
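// Host-backend shutdown: the master instance releases the global host memory pool.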
int32_t GPUReconstructionCPU::ExitDevice()
{
  if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
    if (mMaster == nullptr) {
      operator delete(mHostMemoryBase GPUCA_OPERATOR_NEW_ALIGNMENT);
    }
    mHostMemoryBase = nullptr;
    mHostMemorySize = 0;
  }
  return 0;
}
228
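// Top-level event processing: runs all registered chains (or enqueues them into the
// double pipeline), accumulates wall and CPU time, and prints per-kernel, per-step,
// and DMA timing statistics when enabled.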
int32_t GPUReconstructionCPU::RunChains()
{
  mMemoryScalers->temporaryFactor = 1.;
  mStatNEvents++;

  if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
    printf("Allocated memory when starting processing %34s", "");
    PrintMemoryOverview();
  }

  mTimerTotal.Start();
  const std::clock_t cpuTimerStart = std::clock();
  if (mProcessingSettings.doublePipeline) {
    int32_t retVal = EnqueuePipeline();
    if (retVal) {
      return retVal;
    }
  } else {
    if (mSlaves.size() || mMaster) {
      WriteConstantParams(); // Reinitialize // TODO: Get this in sync with GPUChainTracking::DoQueuedUpdates, and consider the doublePipeline
    }
    for (uint32_t i = 0; i < mChains.size(); i++) {
      int32_t retVal = mChains[i]->RunChain();
      if (retVal) {
        return retVal;
      }
    }
    if (GetProcessingSettings().tpcFreeAllocatedMemoryAfterProcessing) {
      ClearAllocatedMemory(true);
    }
  }
  mTimerTotal.Stop();
  mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
  if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
    printf("Allocated memory when ending processing %36s", "");
    PrintMemoryOverview();
  }

  mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents);
  std::string nEventReport;
  if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) {
    nEventReport += " (average of " + std::to_string(mStatNEvents) + " runs)";
  }
  double kernelTotal = 0;
  std::vector<double> kernelStepTimes(GPUDataTypes::N_RECO_STEPS, 0.);

  if (GetProcessingSettings().debugLevel >= 1) {
    for (uint32_t i = 0; i < mTimers.size(); i++) {
      double time = 0;
      if (mTimers[i] == nullptr) {
        continue;
      }
      for (int32_t j = 0; j < mTimers[i]->num; j++) {
        HighResTimer& timer = mTimers[i]->timer[j];
        time += timer.GetElapsedTime();
        if (mProcessingSettings.resetTimers) {
          timer.Reset();
        }
      }

      uint32_t type = mTimers[i]->type;
      if (type == 0) {
        kernelTotal += time;
        int32_t stepNum = getRecoStepNum(mTimers[i]->step);
        kernelStepTimes[stepNum] += time;
      }
      char bandwidth[256] = "";
      if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
        snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
      }
      printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
      if (mProcessingSettings.resetTimers) {
        mTimers[i]->count = 0;
        mTimers[i]->memSize = 0;
      }
    }
  }
  if (GetProcessingSettings().recoTaskTiming) {
    for (int32_t i = 0; i < GPUDataTypes::N_RECO_STEPS; i++) {
      if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
        printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
               GPUDataTypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
      }
      if (mTimersRecoSteps[i].bytesToGPU) {
        printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
               mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU);
      }
      if (mTimersRecoSteps[i].bytesToHost) {
        printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
               mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
      }
      if (mProcessingSettings.resetTimers) {
        mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
        mTimersRecoSteps[i].timerToGPU.Reset();
        mTimersRecoSteps[i].timerToHost.Reset();
        mTimersRecoSteps[i].timerTotal.Reset();
        mTimersRecoSteps[i].timerCPU = 0;
        mTimersRecoSteps[i].countToGPU = 0;
        mTimersRecoSteps[i].countToHost = 0;
      }
    }
    for (int32_t i = 0; i < GPUDataTypes::N_GENERAL_STEPS; i++) {
      if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
        printf("Execution Time: General Step : %50s Time: %'10.0f us\n", GPUDataTypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
      }
    }
    if (GetProcessingSettings().debugLevel >= 1) {
      mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
      printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
    }
    printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
  } else if (GetProcessingSettings().debugLevel >= 0) {
    GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
  }
  if (mProcessingSettings.resetTimers) {
    mStatNEvents = 0;
    mStatCPUTime = 0;
    mTimerTotal.Reset();
  }

  return 0;
}
351
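// Re-initializes the linked device-side counterpart of every host processor, so that it
// is flagged as PROCESSOR_TYPE_DEVICE and points back to this reconstruction instance.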
void GPUReconstructionCPU::ResetDeviceProcessorTypes()
{
  for (uint32_t i = 0; i < mProcessors.size(); i++) {
    if (mProcessors[i].proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && mProcessors[i].proc->mLinkedProcessor) {
      mProcessors[i].proc->mLinkedProcessor->InitGPUProcessor(this, GPUProcessor::PROCESSOR_TYPE_DEVICE);
    }
  }
}
360
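// Publishes the TPC occupancy map in GPUParam. On a GPU backend both fields are patched
// with a single constant-memory write, which requires occupancyMap and occupancyTotal to
// be laid out consecutively (verified by the check below).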
void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)
{
  param().occupancyMap = mapHost;
  param().occupancyTotal = occupancyTotal;
  if (IsGPU()) {
    if (!((size_t)&param().occupancyTotal - (size_t)&param().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) {
      throw std::runtime_error("occupancy data not consecutive in GPUParam");
    }
    const auto threadContext = GetThreadContext();
    size_t tmp[2] = {(size_t)mapGPU, 0};
    memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal));
    WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream);
  }
}