Project
Loading...
Searching...
No Matches
GPUReconstruction.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include <cstring>
16#include <cstdio>
17#include <iostream>
18#include <mutex>
19#include <string>
20#include <map>
21#include <queue>
22#include <mutex>
23#include <condition_variable>
24#include <array>
25
26#include "GPUReconstruction.h"
29#include "GPUReconstructionIO.h"
30#include "GPUROOTDumpCore.h"
31#include "GPUConfigDump.h"
32#include "GPUChainTracking.h"
33#include "GPUConstantMem.h"
34#include "GPUCommonHelpers.h"
35#include "GPUSettings.h"
36
37#include "GPUMemoryResource.h"
38#include "GPUChain.h"
40
41#include "GPULogging.h"
42#include "utils/strtag.h"
43#include "utils/stdspinlock.h"
44
45#ifndef GPUCA_STANDALONE
47#endif
48
50
51namespace o2::gpu
52{
53namespace // anonymous
54{
55struct GPUReconstructionPipelineQueue {
56 uint32_t op = 0; // For now, 0 = process, 1 = terminate
57 GPUChain* chain = nullptr;
58 std::mutex m;
59 std::condition_variable c;
60 bool done = false;
61 int32_t retVal = 0;
62};
63} // namespace
64
66 std::queue<GPUReconstructionPipelineQueue*> pipelineQueue;
67 std::mutex mutex;
68 std::condition_variable cond;
69 bool terminate = false;
70};
71} // namespace o2::gpu
72
73using namespace o2::gpu;
74
75constexpr const char* const GPUReconstruction::GEOMETRY_TYPE_NAMES[];
76constexpr const char* const GPUReconstruction::IOTYPENAMES[];
78
// Signed distance in bytes between two addresses, computed as (a - b).
static ptrdiff_t ptrDiff(void* a, void* b)
{
  char* const first = static_cast<char*>(a);
  char* const second = static_cast<char*>(b);
  return first - second;
}
80
81GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mGRPSettings(new GPUSettingsGRP), mDeviceBackendSettings(new GPUSettingsDeviceBackend(cfg)), mProcessingSettings(new GPUSettingsProcessing)
82{
83 if (cfg.master) {
85 throw std::invalid_argument("device type of master and slave GPUReconstruction does not match");
86 }
87 if (cfg.master->mMaster) {
88 throw std::invalid_argument("Cannot be slave to a slave");
89 }
90 mMaster = cfg.master;
91 mSlaveId = cfg.master->mSlaves.size();
92 cfg.master->mSlaves.emplace_back(this);
93 }
96 for (uint32_t i = 0; i < NSECTORS; i++) {
97 processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place
99#ifdef GPUCA_HAS_ONNX
100 processors()->tpcNNClusterer[i].mISector = i;
101#endif
102 }
103}
104
106{
107 if (mInitialized) {
108 GPUError("GPU Reconstruction not properly deinitialized!");
109 }
110}
111
112void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits<7>>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits<7>>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame<7>>* timeFrame)
113{
114 if (trackerTraits) {
115 trackerTraits->reset(new o2::its::TrackerTraits<7>);
116 }
117 if (vertexerTraits) {
118 vertexerTraits->reset(new o2::its::VertexerTraits<7>);
119 }
120 if (timeFrame) {
121 timeFrame->reset(new o2::its::TimeFrame<7>);
122 }
123}
124
126{
127 return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
128}
129
131{
132 if (mMaster) {
133 throw std::runtime_error("Must not call init on slave!");
134 }
135#ifndef GPUCA_NO_ROOT
136 if (!mROOTDump) {
137 mROOTDump = GPUROOTDumpCore::getAndCreate(GetProcessingSettings().ROOTDumpFile.c_str());
138 }
139#endif
140 int32_t retVal = InitPhaseBeforeDevice();
141 if (retVal) {
142 return retVal;
143 }
144 for (uint32_t i = 0; i < mSlaves.size(); i++) {
145 retVal = mSlaves[i]->InitPhaseBeforeDevice();
146 if (retVal) {
147 GPUError("Error initialization slave (before deviceinit)");
148 return retVal;
149 }
150 mNStreams = std::max(mNStreams, mSlaves[i]->mNStreams);
153 }
154 if (InitDevice()) {
155 return 1;
156 }
157 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
160 } else {
162 }
164 return 1;
165 }
166 for (uint32_t i = 0; i < mSlaves.size(); i++) {
167 mSlaves[i]->mDeviceMemoryBase = mDeviceMemoryPermanent;
168 mSlaves[i]->mHostMemoryBase = mHostMemoryPermanent;
169 mSlaves[i]->mDeviceMemorySize = mDeviceMemorySize - ptrDiff(mSlaves[i]->mDeviceMemoryBase, mDeviceMemoryBase);
170 mSlaves[i]->mHostMemorySize = mHostMemorySize - ptrDiff(mSlaves[i]->mHostMemoryBase, mHostMemoryBase);
171 mSlaves[i]->mHostMemoryPoolEnd = mHostMemoryPoolEnd;
172 mSlaves[i]->mDeviceMemoryPoolEnd = mDeviceMemoryPoolEnd;
173 if (mSlaves[i]->InitDevice()) {
174 GPUError("Error initialization slave (deviceinit)");
175 return 1;
176 }
178 GPUError("Error initialization slave (permanent memory)");
179 return 1;
180 }
181 mDeviceMemoryPermanent = mSlaves[i]->mDeviceMemoryPermanent;
182 mHostMemoryPermanent = mSlaves[i]->mHostMemoryPermanent;
183 }
185 if (retVal) {
186 return retVal;
187 }
189 for (uint32_t i = 0; i < mSlaves.size(); i++) {
190 mSlaves[i]->mDeviceMemoryPermanent = mDeviceMemoryPermanent;
191 mSlaves[i]->mHostMemoryPermanent = mHostMemoryPermanent;
192 retVal = mSlaves[i]->InitPhaseAfterDevice();
193 if (retVal) {
194 GPUError("Error initialization slave (after device init)");
195 return retVal;
196 }
197 mSlaves[i]->ClearAllocatedMemory();
198 }
199 debugInit();
200 return 0;
201}
202
203namespace o2::gpu::internal
204{
205static uint32_t getDefaultNThreads()
206{
207 const char* tbbEnv = getenv("TBB_NUM_THREADS");
208 uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
209 if (tbbNum) {
210 return tbbNum;
211 }
212 const char* ompEnv = getenv("OMP_NUM_THREADS");
213 uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
214 if (ompNum) {
215 return ompNum;
216 }
217 return tbb::info::default_concurrency();
218}
219} // namespace o2::gpu::internal
220
222{
223 if (GetProcessingSettings().printSettings) {
224 if (mSlaves.size() || mMaster) {
225 printf("\nConfig Dump %s\n", mMaster ? "Slave" : "Master");
226 }
227 const GPUChainTracking* chTrk;
228 for (uint32_t i = 0; i < mChains.size(); i++) {
229 if ((chTrk = dynamic_cast<GPUChainTracking*>(mChains[i].get()))) {
230 break;
231 }
232 }
233 GPUConfigDump::dumpConfig(&param().rec, mProcessingSettings.get(), chTrk ? chTrk->GetQAConfig() : nullptr, chTrk ? chTrk->GetEventDisplayConfig() : nullptr, mDeviceBackendSettings.get(), &mRecoSteps);
234 }
237 if (!IsGPU()) {
238 mRecoSteps.stepsGPUMask.set((uint8_t)0);
239 }
240
241 if (GetProcessingSettings().forceMemoryPoolSize >= 1024 || GetProcessingSettings().forceHostMemoryPoolSize >= 1024) {
243 }
244 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) {
246 }
247 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
248 mProcessingSettings->forceMemoryPoolSize = mProcessingSettings->forceHostMemoryPoolSize = 0;
249 }
250 if (GetProcessingSettings().debugLevel >= 4) {
251 mProcessingSettings->keepAllMemory = true;
252 }
253 if (GetProcessingSettings().debugLevel >= 5 && GetProcessingSettings().allocDebugLevel < 2) {
254 mProcessingSettings->allocDebugLevel = 2;
255 }
257 mProcessingSettings->keepDisplayMemory = true;
258 }
259 if (GetProcessingSettings().debugLevel < 6) {
260 mProcessingSettings->debugMask = 0;
261 }
262 if (GetProcessingSettings().debugLevel < 1) {
263 mProcessingSettings->deviceTimers = false;
264 }
265 if (GetProcessingSettings().debugLevel > 0) {
266 mProcessingSettings->recoTaskTiming = true;
267 }
268 bool detMode = false;
269#ifdef GPUCA_DETERMINISTIC_MODE
270 detMode = true;
271#endif
272 if (GetProcessingSettings().deterministicGPUReconstruction == -1) {
273 mProcessingSettings->deterministicGPUReconstruction = detMode ? 1 : (GetProcessingSettings().debugLevel >= 6);
274 }
275 if (GetProcessingSettings().deterministicGPUReconstruction) {
276 if (!detMode) {
277 GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
278 }
279 if (mProcessingSettings->debugLevel >= 6 && ((mProcessingSettings->debugMask + 1) & mProcessingSettings->debugMask)) {
280 GPUError("WARNING: debugMask %d - debug output might not be deterministic with intermediate steps missing", mProcessingSettings->debugMask);
281 }
282 mProcessingSettings->overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
283 if (GetProcessingSettings().createO2Output > 1) {
284 mProcessingSettings->createO2Output = 1;
285 }
286 mProcessingSettings->rtc.deterministic = 1;
287 } else {
288 if (detMode) {
289 GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
290 }
291 }
292 if (GetProcessingSettings().deterministicGPUReconstruction && GetProcessingSettings().debugLevel >= 6) {
293 mProcessingSettings->nTPCClustererLanes = 1;
294 }
295 if (GetProcessingSettings().createO2Output > 1 && GetProcessingSettings().runQA && GetProcessingSettings().qcRunFraction == 100.f) {
296 mProcessingSettings->createO2Output = 1;
297 }
298 if (!GetProcessingSettings().createO2Output || !IsGPU()) {
299 mProcessingSettings->clearO2OutputFromGPU = false;
300 }
302 mProcessingSettings->mergerSortTracks = false;
303 }
304 if (GetProcessingSettings().debugLevel > 3 || !IsGPU() || GetProcessingSettings().deterministicGPUReconstruction) {
305 mProcessingSettings->delayedOutput = false;
306 }
307 if (!GetProcessingSettings().rtc.enable) {
308 mProcessingSettings->rtc.optConstexpr = false;
309 }
310 if (GetProcessingSettings().allSanityChecks) {
311 mProcessingSettings->clusterizerZSSanityCheck = mProcessingSettings->mergerSanityCheck = mProcessingSettings->outputSanityCheck = true;
312 }
313
314 static_cast<GPUSettingsProcessingScaling&>(*mMemoryScalers) = GetProcessingSettings().scaling;
315 mMemoryScalers->scalingFactor = GetProcessingSettings().memoryScalingFactor;
316 mMemoryScalers->returnMaxVal = GetProcessingSettings().forceMaxMemScalers != 0;
317 if (GetProcessingSettings().forceMaxMemScalers > 1) {
318 mMemoryScalers->rescaleMaxMem(GetProcessingSettings().forceMaxMemScalers);
319 }
320
321 if (GetProcessingSettings().nHostThreads != -1 && GetProcessingSettings().ompThreads != -1) {
322 GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
323 } else if (GetProcessingSettings().ompThreads != -1) {
324 mProcessingSettings->nHostThreads = GetProcessingSettings().ompThreads;
325 GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
326 }
327
328 if (GetProcessingSettings().nHostThreads <= 0) {
329 mProcessingSettings->nHostThreads = internal::getDefaultNThreads();
330 } else {
331 mProcessingSettings->autoAdjustHostThreads = false;
332 }
333 mMaxHostThreads = GetProcessingSettings().nHostThreads;
334 if (mMaster == nullptr) {
335 mThreading = std::make_shared<GPUReconstructionThreading>();
336 mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
337 mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
338 mThreading->activeThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
339 } else {
341 }
343 if (IsGPU()) {
344 mNStreams = std::max<int32_t>(GetProcessingSettings().nStreams, 3);
345 }
346
347 if (GetProcessingSettings().nTPCClustererLanes == -1) {
348 mProcessingSettings->nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUTPCGeometry::NSECTORS, GetProcessingSettings().inKernelParallel ? (mMaxHostThreads >= 4 ? std::min<int32_t>(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUTPCGeometry::NSECTORS : 4) : 1) : mMaxHostThreads));
349 }
350 if (GetProcessingSettings().overrideClusterizerFragmentLen == -1) {
351 mProcessingSettings->overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / GetProcessingSettings().nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
352 }
353 if ((uint32_t)GetProcessingSettings().nTPCClustererLanes > GPUTPCGeometry::NSECTORS) {
354 GPUError("Invalid value for nTPCClustererLanes: %d", GetProcessingSettings().nTPCClustererLanes);
355 mProcessingSettings->nTPCClustererLanes = GPUTPCGeometry::NSECTORS;
356 }
357
358 if (GetProcessingSettings().doublePipeline) {
359 mProcessingSettings->rtctech.allowOptimizedSlaveReconstruction = true;
360 }
361 if (GetProcessingSettings().doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) {
362 GPUError("Must use double pipeline mode only with exactly one chain that must support it");
363 return 1;
364 }
365 if (mMaster == nullptr && GetProcessingSettings().doublePipeline) {
367 }
368
369 if (mMaster && GetProcessingSettings().rtc.enable && (GetProcessingSettings().rtc.optConstexpr || GetProcessingSettings().rtc.optSpecialCode) && !GetProcessingSettings().rtctech.allowOptimizedSlaveReconstruction) {
370 GPUError("Not allowed to create optimized RTC code with more than one GPUReconstruction instances");
371 return 1;
372 }
373
375 for (uint32_t i = 0; i < mChains.size(); i++) {
376 if (mChains[i]->EarlyConfigure()) {
377 return 1;
378 }
379 mChains[i]->RegisterPermanentMemoryAndProcessors();
380 size_t memPrimary, memPageLocked;
381 mChains[i]->MemorySize(memPrimary, memPageLocked);
382 if (!IsGPU() || mOutputControl.useInternal()) {
383 memPageLocked = memPrimary;
384 }
385 mDeviceMemorySize += memPrimary;
386 mHostMemorySize += memPageLocked;
387 }
388 if (GetProcessingSettings().forceMemoryPoolSize && GetProcessingSettings().forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) {
389 mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize;
390 } else if (GetProcessingSettings().forceMemoryPoolSize > 2) {
391 mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize;
392 if (!IsGPU() || mOutputControl.useInternal()) {
394 }
395 }
396 if (GetProcessingSettings().forceHostMemoryPoolSize) {
397 mHostMemorySize = GetProcessingSettings().forceHostMemoryPoolSize;
398 }
399
400 for (uint32_t i = 0; i < mProcessors.size(); i++) {
401 (mProcessors[i].proc->*(mProcessors[i].RegisterMemoryAllocation))();
402 }
403
404 return 0;
405}
406
408{
409 if (IsGPU()) {
410 for (uint32_t i = 0; i < mChains.size(); i++) {
411 mChains[i]->RegisterGPUProcessors();
412 }
413 }
415 return 0;
416}
417
419{
420 if (GetProcessingSettings().forceMaxMemScalers <= 1 && GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
422 }
423 for (uint32_t i = 0; i < mChains.size(); i++) {
424 if (mChains[i]->Init()) {
425 return 1;
426 }
427 }
428 for (uint32_t i = 0; i < mProcessors.size(); i++) {
429 (mProcessors[i].proc->*(mProcessors[i].InitializeProcessor))();
430 }
431
432 WriteConstantParams(); // Initialize with initial values, can optionally be updated later
433
434 mInitialized = true;
435 return 0;
436}
437
439{
440 if (IsGPU()) {
441 const auto threadContext = GetThreadContext();
442 WriteToConstantMemory(ptrDiff(&processors()->param, processors()), &param(), sizeof(param()), stream);
443 }
444}
445
447{
448 for (uint32_t i = 0; i < mChains.size(); i++) {
449 mChains[i]->Finalize();
450 }
451 return 0;
452}
453
455{
456 if (!mInitialized) {
457 return 1;
458 }
459 for (uint32_t i = 0; i < mSlaves.size(); i++) {
460 if (mSlaves[i]->Exit()) {
461 GPUError("Error exiting slave");
462 }
463 }
464
465 mChains.clear(); // Make sure we destroy a possible ITS GPU tracker before we call the destructors
466 mHostConstantMem.reset(); // Reset these explicitly before the destruction of other members unloads the library
467 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
468 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
469 if (mMemoryResources[i].mReuse >= 0) {
470 continue;
471 }
473 mMemoryResources[i].mPtr = mMemoryResources[i].mPtrDevice = nullptr;
474 }
475 }
476 mMemoryResources.clear();
477 if (mInitialized) {
478 ExitDevice();
479 }
480 debugExit();
481 mInitialized = false;
482 return 0;
483}
484
487
489{
490 for (auto it = mMemoryReuse1to1.begin(); it != mMemoryReuse1to1.end(); it++) {
491 auto& re = it->second;
492 if (proc == nullptr || re.proc == proc) {
493 GPUMemoryResource& resMain = mMemoryResources[re.res[0]];
494 resMain.mOverrideSize = 0;
495 for (uint32_t i = 0; i < re.res.size(); i++) {
497 resMain.mOverrideSize = std::max<size_t>(resMain.mOverrideSize, ptrDiff(res.SetPointers((void*)1), (char*)1));
498 }
499 }
500 }
501}
502
504{
506 if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !GetProcessingSettings().keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory
508 } else {
510 }
511 }
513 type &= ~GPUMemoryResource::MEMORY_GPU;
514 }
515 mMemoryResources.emplace_back(proc, setPtr, (GPUMemoryResource::MemoryType)type, name);
516 if (mMemoryResources.size() >= 32768) {
517 throw std::bad_alloc();
518 }
519 uint16_t retVal = mMemoryResources.size() - 1;
520 if (re.type != GPUMemoryReuse::NONE && !GetProcessingSettings().disableMemoryReuse) {
521 const auto& it = mMemoryReuse1to1.find(re.id);
522 if (it == mMemoryReuse1to1.end()) {
523 mMemoryReuse1to1[re.id] = {proc, retVal};
524 } else {
525 mMemoryResources[retVal].mReuse = it->second.res[0];
526 it->second.res.emplace_back(retVal);
527 }
528 }
529 return retVal;
530}
531
533{
534 if (GetProcessingSettings().debugLevel >= 5) {
535 GPUInfo("Allocating memory %p", (void*)proc);
536 }
537 size_t total = 0;
538 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
539 if (proc == nullptr ? !mMemoryResources[i].mProcessor->mAllocateAndInitializeLate : mMemoryResources[i].mProcessor == proc) {
541 total += AllocateRegisteredMemory(i);
542 } else if (resetCustom && (mMemoryResources[i].mPtr || mMemoryResources[i].mPtrDevice)) {
544 }
545 }
546 }
547 if (GetProcessingSettings().debugLevel >= 5) {
548 GPUInfo("Allocating memory done");
549 }
550 return total;
551}
552
554{
555 if (GetProcessingSettings().debugLevel >= 5) {
556 GPUInfo("Allocating Permanent Memory");
557 }
559 GPUError("Must not allocate permanent memory while volatile chunks are allocated");
560 throw std::bad_alloc();
561 }
562 int32_t total = 0;
563 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
564 if ((mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT) && mMemoryResources[i].mPtr == nullptr) {
565 total += AllocateRegisteredMemory(i);
566 }
567 }
570 if (GetProcessingSettings().debugLevel >= 5) {
571 GPUInfo("Permanent Memory Done");
572 }
573 return total;
574}
575
576size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, void*& ptr, void*& memorypool, void* memorybase, size_t memorysize, void* (GPUMemoryResource::*setPtr)(void*) const, void*& memorypoolend, const char* device)
577{
578 if (res->mReuse >= 0) {
579 ptr = (&ptr == &res->mPtrDevice) ? mMemoryResources[res->mReuse].mPtrDevice : mMemoryResources[res->mReuse].mPtr;
580 if (ptr == nullptr) {
581 GPUError("Invalid reuse ptr (%s)", res->mName);
582 throw std::bad_alloc();
583 }
584 size_t retVal = ptrDiff((res->*setPtr)(ptr), ptr);
585 if (retVal > mMemoryResources[res->mReuse].mSize) {
586 GPUError("Insufficient reuse memory %lu < %lu (%s) (%s)", mMemoryResources[res->mReuse].mSize, retVal, res->mName, device);
587 throw std::bad_alloc();
588 }
589 if (GetProcessingSettings().allocDebugLevel >= 2) {
590 std::cout << "Reused (" << device << ") " << res->mName << ": " << retVal << "\n";
591 }
592 return retVal;
593 }
594 if (memorypool == nullptr) {
595 GPUError("Cannot allocate memory from uninitialized pool");
596 throw std::bad_alloc();
597 }
598 size_t retVal;
599 stdspinlock spinlock(mMemoryMutex);
600 if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {
601 retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));
602 memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod<constants::GPU_MEMALIGN>(memorypoolend));
603 if (retVal < res->mOverrideSize) {
604 retVal = res->mOverrideSize;
605 }
606 retVal += GPUProcessor::getAlignment<constants::GPU_MEMALIGN>(retVal);
607 memorypoolend = (char*)memorypoolend - retVal;
608 ptr = memorypoolend;
609 retVal = std::max<size_t>(ptrDiff((res->*setPtr)(ptr), ptr), res->mOverrideSize);
610 } else {
611 ptr = memorypool;
612 memorypool = (char*)((res->*setPtr)(ptr));
613 retVal = ptrDiff(memorypool, ptr);
614 if (retVal < res->mOverrideSize) {
615 retVal = res->mOverrideSize;
616 memorypool = (char*)ptr + res->mOverrideSize;
617 }
618 memorypool = (void*)((char*)memorypool + GPUProcessor::getAlignment<constants::GPU_MEMALIGN>(memorypool));
619 }
620 if (memorypoolend ? (memorypool > memorypoolend) : ((size_t)ptrDiff(memorypool, memorybase) > memorysize)) {
621 std::cerr << "Memory pool size exceeded (" << device << ") (" << res->mName << ": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) << " > " << memorysize << "\n";
622 throw std::bad_alloc();
623 }
624 if (GetProcessingSettings().allocDebugLevel >= 2) {
625 std::cout << "Allocated (" << device << ") " << res->mName << ": " << retVal << " - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) << "\n";
626 }
627 return retVal;
628}
629
631{
632 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) {
633 if (!(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
634 if (res->mPtrDevice && res->mReuse < 0) {
635 alignedDefaultBufferDeleter()(res->mPtrDevice);
636 }
637 res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
638 if (res->mReuse >= 0) {
639 if (res->mSize > mMemoryResources[res->mReuse].mSize) {
640 GPUError("Invalid reuse, insufficient size: %ld < %ld", (int64_t)mMemoryResources[res->mReuse].mSize, (int64_t)res->mSize);
641 throw std::bad_alloc();
642 }
643 res->mPtrDevice = mMemoryResources[res->mReuse].mPtrDevice;
644 } else {
645 res->mPtrDevice = alignedDefaultBufferAllocator<char>(res->mSize + constants::GPU_BUFFER_ALIGNMENT);
646 }
647 res->mPtr = GPUProcessor::alignPointer<constants::GPU_BUFFER_ALIGNMENT>(res->mPtrDevice);
648 res->SetPointers(res->mPtr);
649 if (GetProcessingSettings().allocDebugLevel >= 2) {
650 std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";
651 }
653 stdspinlock spinlock(mMemoryMutex);
655 }
656 if ((size_t)res->mPtr % constants::GPU_BUFFER_ALIGNMENT) {
657 GPUError("Got buffer with insufficient alignment");
658 throw std::bad_alloc();
659 }
660 }
661 } else {
662 if (res->mPtr != nullptr) {
663 GPUError("Double allocation! (%s)", res->mName);
664 throw std::bad_alloc();
665 }
666 if (IsGPU() && res->mOverrideSize < constants::GPU_BUFFER_ALIGNMENT) {
667 res->mOverrideSize = constants::GPU_BUFFER_ALIGNMENT;
668 }
669 if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || GetProcessingSettings().keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory
670 if (control && control->useExternal()) {
671 if (control->allocator) {
672 res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
673 res->mPtr = control->allocator(CAMath::nextMultipleOf<constants::GPU_BUFFER_ALIGNMENT>(res->mSize));
674 res->mSize = std::max<size_t>(ptrDiff(res->SetPointers(res->mPtr), res->mPtr), res->mOverrideSize);
675 if (GetProcessingSettings().allocDebugLevel >= 2) {
676 std::cout << "Allocated (from callback) " << res->mName << ": " << res->mSize << "\n";
677 }
678 } else {
679 void* dummy = nullptr;
680 res->mSize = AllocateRegisteredMemoryHelper(res, res->mPtr, control->ptrCurrent, control->ptrBase, control->size, &GPUMemoryResource::SetPointers, dummy, "host");
681 }
682 } else {
684 }
685 if ((size_t)res->mPtr % constants::GPU_BUFFER_ALIGNMENT) {
686 GPUError("Got buffer with insufficient alignment");
687 throw std::bad_alloc();
688 }
689 }
690 if (IsGPU() && (res->mType & GPUMemoryResource::MEMORY_GPU)) {
691 if (res->mProcessor->mLinkedProcessor == nullptr) {
692 GPUError("Device Processor not set (%s)", res->mName);
693 throw std::bad_alloc();
694 }
696 GPUError("Must not allocate non-stacked device memory while volatile chunks are allocated");
697 throw std::bad_alloc();
698 }
700
702 res->mSize = size;
703 } else if (size != res->mSize) {
704 GPUError("Inconsistent device memory allocation (%s: device %lu vs %lu)", res->mName, size, res->mSize);
705 throw std::bad_alloc();
706 }
707 if ((size_t)res->mPtrDevice % constants::GPU_BUFFER_ALIGNMENT) {
708 GPUError("Got buffer with insufficient alignment");
709 throw std::bad_alloc();
710 }
711 }
713 }
714}
715
720
722{
724 if ((res->mType & GPUMemoryResource::MEMORY_PERMANENT) && res->mPtr != nullptr) {
726 } else {
728 }
729 return res->mReuse >= 0 ? 0 : res->mSize;
730}
731
733{
734 stdspinlock spinlock(mMemoryMutex);
735 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
736 char* retVal = alignedDefaultBufferAllocator<char>(size);
739 } else {
741 }
742 return retVal;
743 }
744
746 throw std::runtime_error("Requested invalid memory typo for direct allocation");
747 }
749 GPUError("Must not allocate direct memory while volatile chunks are allocated");
750 throw std::bad_alloc();
751 }
752
755 char* retVal;
757 poolend = (char*)poolend - size; // TODO: Implement overflow check
758 poolend = (char*)poolend - GPUProcessor::getAlignmentMod<constants::GPU_MEMALIGN>(poolend);
759 retVal = (char*)poolend;
760 } else {
762 }
763 if (pool > poolend) {
764 GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
765 throw std::bad_alloc();
766 }
768 if (GetProcessingSettings().allocDebugLevel >= 2) {
769 std::cout << "Allocated (unmanaged " << ((type & GPUMemoryResource::MEMORY_GPU) ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n";
770 }
771 return retVal;
772}
773
775{
776 stdspinlock spinlock(mMemoryMutex);
777 if (mVolatileMemoryStart == nullptr) {
779 }
780 if (size == 0) {
781 return nullptr; // Future GPU memory allocation is volatile
782 }
783 char* retVal;
786 GPUError("Insufficient volatile device memory: missing %ld", ptrDiff(mDeviceMemoryPool, mDeviceMemoryPoolEnd));
787 throw std::bad_alloc();
788 }
790 if (GetProcessingSettings().allocDebugLevel >= 2) {
791 std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
792 }
793 return retVal;
794}
795
797{
798 if (device) {
800 }
801 char* retVal = alignedDefaultBufferAllocator<char>(size);
802 stdspinlock spinlock(mMemoryMutex);
804 return retVal;
805}
806
812
814{
818 mVolatileMemoryStart = nullptr;
819 }
820 if (GetProcessingSettings().allocDebugLevel >= 2) {
821 std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
822 }
823}
824
830
832{
833 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
834 if (proc == nullptr || mMemoryResources[i].mProcessor == proc) {
836 }
837 }
838}
839
841{
844 void* basePtr = res->mReuse >= 0 ? mMemoryResources[res->mReuse].mPtr : res->mPtr;
845 size_t size = ptrDiff(res->SetPointers(basePtr), basePtr);
846 if (basePtr && size > std::max(res->mSize, res->mOverrideSize)) {
847 std::cerr << "Updated pointers exceed available memory size: " << size << " > " << std::max(res->mSize, res->mOverrideSize) << " - host - " << res->mName << "\n";
848 throw std::bad_alloc();
849 }
850 }
851 if (IsGPU() && (res->mType & GPUMemoryResource::MEMORY_GPU)) {
852 void* basePtr = res->mReuse >= 0 ? mMemoryResources[res->mReuse].mPtrDevice : res->mPtrDevice;
853 size_t size = ptrDiff(res->SetDevicePointers(basePtr), basePtr);
854 if (basePtr && size > std::max(res->mSize, res->mOverrideSize)) {
855 std::cerr << "Updated pointers exceed available memory size: " << size << " > " << std::max(res->mSize, res->mOverrideSize) << " - GPU - " << res->mName << "\n";
856 throw std::bad_alloc();
857 }
858 }
859}
860
861void GPUReconstruction::FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom, bool freePermanent)
862{
863 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
864 if ((proc == nullptr || mMemoryResources[i].mProcessor == proc) && (freeCustom || !(mMemoryResources[i].mType & GPUMemoryResource::MEMORY_CUSTOM)) && (freePermanent || !(mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT))) {
866 }
867 }
868}
869
874
876{
877 if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
878 std::cout << "Freeing " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
879 }
880 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) {
881 alignedDefaultBufferDeleter()(res->mPtrDevice);
882 }
883 res->mPtr = nullptr;
884 res->mPtrDevice = nullptr;
885}
886
891
893{
894 if (proc && GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
895 GPUFatal("Processor-depending memory-free works only with allocation strategy ALLOCATION_INDIVIDUAL");
896 }
897 if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) {
898 return;
899 }
900 if (mNonPersistentMemoryStack.size() == 0) {
901 GPUFatal("Trying to pop memory state from empty stack");
902 }
903 if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
904 GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
905 }
906 if (!proc && (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
907 printf("Allocated memory after %30s (%8s) (Stack %zu): ", gpudatatypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
909 printf("%76s", "");
911 }
912 for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) {
914 if (proc && res->mProcessor != proc) {
915 continue;
916 }
917 if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
918 std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
919 }
920 if (res->mReuse < 0) {
921 alignedDefaultBufferDeleter()(res->mPtrDevice);
922 }
923 res->mPtr = nullptr;
924 res->mPtrDevice = nullptr;
925 }
926 if (!proc) {
927 stdspinlock spinlock(mMemoryMutex);
928 mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
932 mNonPersistentMemoryStack.pop_back();
933 }
934}
935
937{
939 throw std::runtime_error("temporary memory stack already blocked");
940 }
943}
944
946{
947 if (mNonPersistentMemoryStack.size()) {
948 throw std::runtime_error("cannot unblock while there is stacked memory");
949 }
952 mHostMemoryPoolBlocked = nullptr;
953 mDeviceMemoryPoolBlocked = nullptr;
954}
955
957{
958 mMemoryResources[res].mPtr = ptr;
959}
960
// Releases bookkeeping of allocated memory and resets the pool pointers.
// NOTE(review): several lines of this function (loop body, else branch) are not visible in this listing.
962{
// Walk all registered memory resources (per-resource action not visible here).
963 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
966 }
967 }
// Drop all direct and volatile chunks and forget the volatile start pointer.
970 mDirectMemoryChunks.clear();
972 mVolatileChunks.clear();
973 mVolatileMemoryStart = nullptr;
// With the global allocation strategy, both pools restart at the GPU_MEMALIGN-aligned permanent pointers.
974 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
975 mHostMemoryPool = GPUProcessor::alignPointer<constants::GPU_MEMALIGN>(mHostMemoryPermanent);
976 mDeviceMemoryPool = GPUProcessor::alignPointer<constants::GPU_MEMALIGN>(mDeviceMemoryPermanent);
979 } else {
981 }
982}
983
989
// Prints mHostMemoryUsedMax and mDeviceMemoryUsedMax to stdout.
// The ' flag in the format asks printf for locale-dependent digit grouping (a POSIX extension).
991{
992 printf("Maximum Memory Allocation: Host %'zu / Device %'zu\n", mHostMemoryUsedMax, mDeviceMemoryUsedMax);
993}
994
// Prints a one-line host/device memory overview; produces output only for the global allocation strategy.
// NOTE(review): most printf arguments are not visible in this listing; the last one is the number of memory resources ("chunks").
996{
997 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
998 printf("Memory Allocation: Host %'13zd / %'13zu (Permanent %'13zd, Data %'13zd, Scratch %'13zd), Device %'13zd / %'13zu, (Permanent %'13zd, Data %'13zd, Scratch %'13zd) %zu chunks\n",
1001 mMemoryResources.size());
1002 }
1003}
1004
// Prints a per-resource-name table of allocated host and device memory, then lets every chain print its own statistics.
1006{
// Per resource name: [0] summed host size, [1] summed device size, [2] flag shown as "P" in the table.
1007 std::map<std::string, std::array<size_t, 3>> sizes;
1008 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
1009 auto& res = mMemoryResources[i];
// Resources with mReuse >= 0 are excluded from the sums.
1010 if (res.mReuse >= 0) {
1011 continue;
1012 }
// operator[] creates a zero-initialized entry the first time a name is seen.
1013 auto& x = sizes[res.mName];
1014 if (res.mPtr) {
1015 x[0] += res.mSize;
1016 }
1017 if (res.mPtrDevice) {
1018 x[1] += res.mSize;
1019 }
// NOTE(review): the condition guarding the flag below is not visible in this listing.
1021 x[2] = 1;
1022 }
1023 }
// Header row, then one row per name (std::map iterates in key order).
1024 printf("%59s CPU / %9s GPU\n", "", "");
1025 for (auto it = sizes.begin(); it != sizes.end(); it++) {
1026 printf("Allocation %50s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
1027 }
1029 for (uint32_t i = 0; i < mChains.size(); i++) {
1030 mChains[i]->PrintMemoryStatistics();
1031 }
1032}
1033
// Registers a memory region and remembers its pointer on success.
// Returns 0 immediately when noGPUMemoryRegistration is set; otherwise returns retVal.
// NOTE(review): the line computing retVal is not visible in this listing.
1035{
1036 if (GetProcessingSettings().noGPUMemoryRegistration) {
1037 return 0;
1038 }
// Only a zero retVal records ptr in mRegisteredMemoryPtrs.
1040 if (retVal == 0) {
1041 mRegisteredMemoryPtrs.emplace(ptr);
1042 }
1043 return retVal;
1044}
1045
// Counterpart of the registration call: looks ptr up in mRegisteredMemoryPtrs.
// Returns 0 immediately when noGPUMemoryRegistration is set.
// NOTE(review): the body of the "found" branch is not visible in this listing.
1047{
1048 if (GetProcessingSettings().noGPUMemoryRegistration) {
1049 return 0;
1050 }
1051 const auto& pos = mRegisteredMemoryPtrs.find(ptr);
1052 if (pos != mRegisteredMemoryPtrs.end()) {
1055 }
// Presumably reached only when ptr was never registered - confirm against the hidden branch above.
1056 return 1;
1057}
1058
1059namespace o2::gpu::internal
1060{
1061namespace // anonymous
1062{
1063template <class T>
1064constexpr static inline int32_t getStepNum(T step, bool validCheck, int32_t N, const char* err = "Invalid step num")
1065{
1066 static_assert(sizeof(step) == sizeof(uint32_t), "Invalid step enum size");
1067 int32_t retVal = 8 * sizeof(uint32_t) - 1 - CAMath::Clz((uint32_t)step);
1068 if ((uint32_t)step == 0 || retVal >= N) {
1069 if (!validCheck) {
1070 return -1;
1071 }
1072 throw std::runtime_error("Invalid General Step");
1073 }
1074 return retVal;
1075}
1076} // anonymous namespace
1077} // namespace o2::gpu::internal
1078
1079int32_t GPUReconstruction::getRecoStepNum(RecoStep step, bool validCheck) { return internal::getStepNum(step, validCheck, gpudatatypes::N_RECO_STEPS, "Invalid Reco Step"); }
1080int32_t GPUReconstruction::getGeneralStepNum(GeneralStep step, bool validCheck) { return internal::getStepNum(step, validCheck, gpudatatypes::N_GENERAL_STEPS, "Invalid General Step"); }
1081
// Worker loop of the double pipeline: takes queue entries one by one and runs their chain until a terminate entry arrives.
// Throws std::invalid_argument unless this instance is initialized, has doublePipeline enabled, has no master and has at least one slave.
1083{
1084 if (!mInitialized || !GetProcessingSettings().doublePipeline || mMaster != nullptr || !mSlaves.size()) {
1085 throw std::invalid_argument("Cannot start double pipeline mode");
1086 }
1087 if (GetProcessingSettings().debugLevel >= 3) {
1088 GPUInfo("Pipeline worker started");
1089 }
1090 bool terminate = false;
1091 while (!terminate) {
1092 {
// Sleep until the queue holds at least one entry.
1093 std::unique_lock<std::mutex> lk(mPipelineContext->mutex);
1094 mPipelineContext->cond.wait(lk, [this] { return this->mPipelineContext->pipelineQueue.size() > 0; });
1095 }
1096 GPUReconstructionPipelineQueue* q;
1097 {
// NOTE(review): the mutex is released between the wait above and this pop; assumes this is the only consumer - TODO confirm.
1098 std::lock_guard<std::mutex> lk(mPipelineContext->mutex);
1099 q = mPipelineContext->pipelineQueue.front();
1100 mPipelineContext->pipelineQueue.pop();
1101 }
// op == 1 ends the loop after this iteration; any other op runs the chain and stores its return value in the entry.
1102 if (q->op == 1) {
1103 terminate = 1;
1104 } else {
1105 q->retVal = q->chain->RunChain();
1106 }
1107 {
// Mark the entry done under its own mutex, then wake the thread waiting on it.
1108 std::lock_guard<std::mutex> lk(q->m);
1109 q->done = true;
1110 }
1111 q->c.notify_one();
1112 }
1113 if (GetProcessingSettings().debugLevel >= 3) {
1114 GPUInfo("Pipeline worker ended");
1115 }
1116}
1117
1122
// Pushes one work item (or a terminate request) into the pipeline queue of `rec` and blocks until it is marked done.
// NOTE(review): the lines defining `rec` are not visible in this listing.
1124{
// The entry is owned by this call; the queue only stores the raw pointer.
1127 std::unique_ptr<GPUReconstructionPipelineQueue> qu(new GPUReconstructionPipelineQueue);
1128 GPUReconstructionPipelineQueue* q = qu.get();
1129 q->chain = terminate ? nullptr : mChains[0].get();
1130 q->op = terminate ? 1 : 0;
// Hold the entry's mutex before the entry becomes visible in the queue.
1131 std::unique_lock<std::mutex> lkdone(q->m);
1132 {
1133 std::lock_guard<std::mutex> lkpipe(rec->mPipelineContext->mutex);
// Once a terminate request has been queued, no further entries are accepted.
1134 if (rec->mPipelineContext->terminate) {
1135 throw std::runtime_error("Must not enqueue work after termination request");
1136 }
1137 rec->mPipelineContext->pipelineQueue.push(q);
1138 rec->mPipelineContext->terminate = terminate;
1139 rec->mPipelineContext->cond.notify_one();
1140 }
// Wait until the entry has been flagged done.
1141 q->c.wait(lkdone, [&q]() { return q->done; });
// Return right away for terminate requests and for non-zero results, except result 3 when ignoreNonFatalGPUErrors is set.
1142 if (terminate || (q->retVal && (q->retVal != 3 || !GetProcessingSettings().ignoreNonFatalGPUErrors))) {
1143 return q->retVal;
1144 }
// A non-zero finalization result takes precedence over the chain's own result.
1145 int32_t retVal2 = mChains[0]->FinalizePipelinedProcessing();
1146 return retVal2 ? retVal2 : q->retVal;
1147}
1148
// Peeks at the pipeline queue of `rec` without removing anything.
// Returns the chain of the front entry if the queue is non-empty and that entry has op == 0, otherwise nullptr.
// NOTE(review): the line defining `rec` is not visible in this listing.
1150{
1152 std::lock_guard<std::mutex> lk(rec->mPipelineContext->mutex);
1153 return rec->mPipelineContext->pipelineQueue.size() && rec->mPipelineContext->pipelineQueue.front()->op == 0 ? rec->mPipelineContext->pipelineQueue.front()->chain : nullptr;
1154}
1155
// Per-event preparation: forwards to all chains, updates the maximum data sizes of the processors and (re)allocates registered memory.
// NOTE(review): one line at the start of the body is not visible in this listing.
1156void GPUReconstruction::PrepareEvent() // TODO: Clean this up, this should not be called from chainTracking but before
1157{
1159 for (uint32_t i = 0; i < mChains.size(); i++) {
1160 mChains[i]->PrepareEvent();
1161 }
1162 for (uint32_t i = 0; i < mProcessors.size(); i++) {
// Processors flagged mAllocateAndInitializeLate are skipped here.
1163 if (mProcessors[i].proc->mAllocateAndInitializeLate) {
1164 continue;
1165 }
// Invoke the processor's registered SetMaxData member with the current IO pointers.
1166 (mProcessors[i].proc->*(mProcessors[i].SetMaxData))(mHostConstantMem->ioPtrs);
// Apply the same call to the linked processor, unless this processor is itself of device type.
1167 if (mProcessors[i].proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && mProcessors[i].proc->mLinkedProcessor) {
1168 (mProcessors[i].proc->mLinkedProcessor->*(mProcessors[i].SetMaxData))(mHostConstantMem->ioPtrs);
1169 }
1170 }
// nullptr is passed as the processor argument to both calls.
1171 ComputeReuseMax(nullptr);
1172 AllocateRegisteredMemory(nullptr);
1173}
1174
1175int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
1176{
1177 int32_t retVal = 0;
1178 for (uint32_t i = 0; i < mChains.size(); i++) {
1179 if (mChains[i]->CheckErrorCodes(cpuOnly, forceShowErrors, fillErrors)) {
1180 retVal++;
1181 }
1182 }
1183 return retVal;
1184}
1185
1186int32_t GPUReconstruction::GPUChkErrA(const int64_t error, const char* file, int32_t line, bool failOnError)
1187{
1188 if (error == 0 || !GPUChkErrInternal(error, file, line)) {
1189 return 0;
1190 }
1191 if (failOnError) {
1192 if (mInitialized && mInErrorHandling == false) {
1193 mInErrorHandling = true;
1194 CheckErrorCodes(false, true);
1195 }
1196 throw std::runtime_error("GPU Backend Failure");
1197 }
1198 return 1;
1199}
1200
// Writes the GRP settings to "<dir>settings.dump" and asks every chain to dump its settings to the same directory.
// No path separator is inserted between dir and the file name.
1202{
1203 std::string f;
1204 f = dir;
1205 f += "settings.dump";
1206 DumpStructToFile(mGRPSettings.get(), f.c_str());
1207 for (uint32_t i = 0; i < mChains.size(); i++) {
1208 mChains[i]->DumpSettings(dir);
1209 }
1210}
1211
1212void GPUReconstruction::UpdateDynamicSettings(const GPUSettingsRecDynamic* d)
1213{
1214 UpdateSettings(nullptr, nullptr, d);
1215}
1216
// Applies new settings; each of g, p and d may be nullptr to leave that part untouched here.
// NOTE(review): two lines of this function (the condition selecting `w` and the body of the final branch) are not visible in this listing.
1217void GPUReconstruction::UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p, const GPUSettingsRecDynamic* d)
1218{
// GRP settings are replaced as a whole.
1219 if (g) {
1220 *mGRPSettings = *g;
1221 }
// Of the processing settings, only debugLevel and resetTimers are taken over.
1222 if (p) {
1223 mProcessingSettings->debugLevel = p->debugLevel;
1224 mProcessingSettings->resetTimers = p->resetTimers;
1225 }
1226 GPURecoStepConfiguration* w = nullptr;
1228 w = &mRecoSteps;
1229 }
// Forward everything, including the optional reco-step configuration, to the parameter object.
1230 param().UpdateSettings(g, p, w, d);
1231 if (mInitialized) {
1233 }
1234}
1235
// Reads the GRP settings from "<dir>settings.dump" and lets every chain read its settings from the same directory.
// Returns 1 if reading the GRP settings reported an error, 0 otherwise.
// NOTE(review): one line between the error check and the chain loop is not visible in this listing.
1236int32_t GPUReconstruction::ReadSettings(const char* dir)
1237{
1238 std::string f;
1239 f = dir;
1240 f += "settings.dump";
// Placement new: default-construct a fresh GPUSettingsGRP in the existing storage before reading into it.
1241 new (mGRPSettings.get()) GPUSettingsGRP;
// `error` is not initialized here; presumably ReadStructFromFile always writes it - TODO confirm.
1242 bool error;
1243 ReadStructFromFile(f.c_str(), mGRPSettings.get(), &error, true);
1244 if (error) {
1245 return 1;
1246 }
1248 for (uint32_t i = 0; i < mChains.size(); i++) {
1249 mChains[i]->ReadSettings(dir);
1250 }
1251 return 0;
1252}
1253
// Convenience overload: derives the settings from a nominal solenoid field value and forwards to the four-argument SetSettings.
// NOTE(review): the declaration of `config` in the non-standalone branch is not visible in this listing.
1254void GPUReconstruction::SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow)
1255{
1256#ifdef GPUCA_STANDALONE
// Standalone build: default-constructed GRP settings with only the field value overridden; no rec/proc settings are passed.
1257 GPUSettingsGRP grp;
1258 grp.solenoidBzNominalGPU = solenoidBzNominalGPU;
1259 SetSettings(&grp, nullptr, nullptr, workflow);
1260#else
// Otherwise: settings come from `config`, with the field value overridden afterwards.
1262 config.ReadConfigurableParam(config);
1263 config.configGRP.solenoidBzNominalGPU = solenoidBzNominalGPU;
1264 SetSettings(&config.configGRP, &config.configReconstruction, &config.configProcessing, workflow);
1265#endif
1266}
1267
// Installs the initial settings; must be called before initialization.
// grp is dereferenced unconditionally and therefore must not be nullptr; proc and workflow are optional.
// Throws std::runtime_error if the instance is already initialized.
// NOTE(review): one line inside the workflow branch is not visible in this listing.
1268void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec, const GPUSettingsProcessing* proc, const GPURecoStepConfiguration* workflow)
1269{
1270 if (mInitialized) {
1271 GPUError("Cannot update settings while initialized");
1272 throw std::runtime_error("Settings updated while initialized");
1273 }
1274 *mGRPSettings = *grp;
1275 if (proc) {
1276 *mProcessingSettings = *proc;
1277 }
// Copy the visible workflow fields (steps, inputs, outputs) into mRecoSteps.
1278 if (workflow) {
1279 mRecoSteps.steps = workflow->steps;
1281 mRecoSteps.inputs = workflow->inputs;
1282 mRecoSteps.outputs = workflow->outputs;
1283 }
// rec is not stored here; it is only forwarded to the parameter object.
1284 param().SetDefaults(mGRPSettings.get(), rec, proc, workflow);
1285}
1286
// Wraps a raw buffer (ptr, size) in a GPUOutputControl and passes it to SetOutputControl.
1288{
1289 GPUOutputControl outputControl;
1290 outputControl.set(ptr, size);
1291 SetOutputControl(outputControl);
1292}
1293
1297void GPUReconstruction::SetResetTimers(bool reset) { mProcessingSettings->resetTimers = reset; }
1302
1303ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r)
1304{
1305 mAlloc = [&r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); };
1306}
int32_t i
uint32_t op
bool done
int32_t retVal
GPUChain * chain
uint16_t pos
Definition RawData.h:3
uint32_t res
Definition RawData.h:0
TBranch * ptr
void set(S v)
Definition bitfield.h:55
bool isSet(const bitfield &v) const
Definition bitfield.h:66
const GPUSettingsDisplay * GetEventDisplayConfig() const
const GPUSettingsQA * GetQAConfig() const
static void dumpConfig(const GPUSettingsRec *rec, const GPUSettingsProcessing *proc, const GPUSettingsQA *qa, const GPUSettingsDisplay *display, const GPUSettingsDeviceBackend *device, const GPURecoStepConfiguration *workflow)
void * SetDevicePointers(void *ptr) const
void * SetPointers(void *ptr) const
static void computePointerWithAlignment(T *&basePtr, S *&objPtr, size_t nEntries=1)
void InitGPUProcessor(GPUReconstruction *rec, ProcessorType type=PROCESSOR_TYPE_CPU, GPUProcessor *slaveProcessor=nullptr)
ProcessorType mGPUProcessorType
GPURecoStepConfiguration mRecoSteps
std::unordered_set< const void * > mRegisteredMemoryPtrs
int16_t RegisterMemoryAllocationHelper(GPUProcessor *proc, void *(GPUProcessor::*setPtr)(void *), int32_t type, const char *name, const GPUMemoryReuse &re)
std::vector< std::unique_ptr< GPUChain > > mChains
void * AllocateVolatileMemory(size_t size, bool device)
ThrustVolatileAllocator getThrustVolatileDeviceAllocator()
std::unique_ptr< GPUMemorySizeScalers > mMemoryScalers
void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction *rec, GPUOutputControl *control=nullptr)
std::unique_ptr< T > ReadStructFromFile(const char *file, T *obj=nullptr, bool *errorOnMissing=nullptr, bool allowSmaller=false)
void SetInputControl(void *ptr, size_t size)
GPUConstantMem * mDeviceConstantMem
void ConstructGPUProcessor(GPUProcessor *proc)
std::shared_ptr< GPUROOTDumpCore > mROOTDump
const GPUSettingsDeviceBackend & GetDeviceBackendSettings() const
void ComputeReuseMax(GPUProcessor *proc)
void SetMemoryExternalInput(int16_t res, void *ptr)
int32_t getGeneralStepNum(GeneralStep step, bool validCheck=true)
static constexpr uint32_t NSECTORS
RecoStepField GetRecoStepsGPU() const
void RegisterGPUDeviceProcessor(GPUProcessor *proc, GPUProcessor *slaveProcessor)
std::vector< GPUReconstruction * > mSlaves
std::vector< std::tuple< void *, void *, size_t, size_t, uint64_t > > mNonPersistentMemoryStack
alignedDeleter< char, constants::GPU_BUFFER_ALIGNMENT > alignedDefaultBufferDeleter
void UpdateDynamicSettings(const GPUSettingsRecDynamic *d)
std::unique_ptr< GPUSettingsDeviceBackend > mDeviceBackendSettings
std::vector< GPUMemoryResource > mMemoryResources
std::unique_ptr< GPUReconstructionPipelineContext > mPipelineContext
std::unique_ptr< GPUConstantMem > mHostConstantMem
void ResetRegisteredMemoryPointers(GPUProcessor *proc)
void DumpStructToFile(const T *obj, const char *file)
void AllocateRegisteredMemoryInternal(GPUMemoryResource *res, GPUOutputControl *control, GPUReconstruction *recPool)
std::vector< std::unique_ptr< char[], alignedDefaultBufferDeleter > > mVolatileChunks
virtual int32_t registerMemoryForGPU_internal(const void *ptr, size_t size)=0
virtual size_t WriteToConstantMemory(size_t offset, const void *src, size_t size, int32_t stream=-1, gpu_reconstruction_kernels::deviceEvent *ev=nullptr)=0
std::unordered_map< GPUMemoryReuse::ID, MemoryReuseMeta > mMemoryReuse1to1
std::vector< ProcessorData > mProcessors
void * AllocateVolatileDeviceMemory(size_t size)
virtual int32_t InitDevice()=0
void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration *workflow=nullptr)
const GPUCalibObjectsConst & GetCalib() const
const GPUTrackingInOutPointers GetIOPtrs() const
virtual std::unique_ptr< gpu_reconstruction_kernels::threadContext > GetThreadContext()=0
GPUReconstruction(const GPUReconstruction &)=delete
static constexpr GeometryType geometryType
void WriteConstantParams(int32_t stream=-1)
void FreeRegisteredMemory(GPUProcessor *proc, bool freeCustom=false, bool freePermanent=false)
virtual RecoStepField AvailableGPURecoSteps()
static constexpr const char *const IOTYPENAMES[]
void PopNonPersistentMemory(RecoStep step, uint64_t tag, const GPUProcessor *proc=nullptr)
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPUSettingsRecDynamic *d=nullptr)
int32_t CheckErrorCodes(bool cpuOnly=false, bool forceShowErrors=false, std::vector< std::array< uint32_t, 4 > > *fillErrors=nullptr)
const GPUParam & GetParam() const
void ClearAllocatedMemory(bool clearOutputs=true)
static constexpr const char *const GEOMETRY_TYPE_NAMES[]
virtual int32_t ExitDevice()=0
std::unique_ptr< GPUSettingsGRP > mGRPSettings
std::unique_ptr< GPUSettingsProcessing > mProcessingSettings
void PushNonPersistentMemory(uint64_t tag)
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
virtual int32_t unregisterMemoryForGPU_internal(const void *ptr)=0
void BlockStackedMemory(GPUReconstruction *rec)
const GPUSettingsProcessing & GetProcessingSettings() const
void DumpSettings(const char *dir="")
std::vector< std::unique_ptr< char[], alignedDefaultBufferDeleter > > mNonPersistentIndividualDirectAllocations
void * AllocateDirectMemory(size_t size, int32_t type)
virtual void GetITSTraits(std::unique_ptr< o2::its::TrackerTraits< 7 > > *trackerTraits, std::unique_ptr< o2::its::VertexerTraits< 7 > > *vertexerTraits, std::unique_ptr< o2::its::TimeFrame< 7 > > *timeFrame)
int32_t unregisterMemoryForGPU(const void *ptr)
int32_t registerMemoryForGPU(const void *ptr, size_t size)
void SetDebugLevelTmp(int32_t level)
size_t AllocateRegisteredMemoryHelper(GPUMemoryResource *res, void *&ptr, void *&memorypool, void *memorybase, size_t memorysize, void *(GPUMemoryResource::*SetPointers)(void *) const, void *&memorypoolend, const char *device)
std::vector< std::unique_ptr< char[], alignedDefaultBufferDeleter > > mDirectMemoryChunks
int32_t EnqueuePipeline(bool terminate=false)
std::shared_ptr< GPUReconstructionThreading > mThreading
std::vector< GPUMemoryResource * > mNonPersistentIndividualAllocations
virtual int32_t GPUChkErrInternal(const int64_t error, const char *file, int32_t line) const
int32_t GPUChkErrA(const int64_t error, const char *file, int32_t line, bool failOnError)
size_t AllocateRegisteredMemory(GPUProcessor *proc, bool resetCustom=false)
int32_t ReadSettings(const char *dir="")
void SetOutputControl(const GPUOutputControl &v)
static constexpr uint32_t NSECTORS
void SetSector(int32_t iSector)
#define TPC_MAX_FRAGMENT_LEN_GPU
#define TPC_MAX_FRAGMENT_LEN_HOST
GLdouble n
Definition glcorearb.h:1982
GLint GLenum GLint x
Definition glcorearb.h:403
const GLfloat * m
Definition glcorearb.h:4066
GLsizeiptr size
Definition glcorearb.h:659
GLuint GLsizei const GLuint const GLintptr const GLsizeiptr * sizes
Definition glcorearb.h:2595
GLuint const GLchar * name
Definition glcorearb.h:781
GLdouble f
Definition glcorearb.h:310
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLint GLint GLsizei GLint GLenum GLenum type
Definition glcorearb.h:275
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean g
Definition glcorearb.h:1233
GLint level
Definition glcorearb.h:275
GLboolean r
Definition glcorearb.h:1233
GLenum GLfloat param
Definition glcorearb.h:271
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
GLuint GLuint stream
Definition glcorearb.h:1806
GLubyte GLubyte GLubyte GLubyte w
Definition glcorearb.h:852
std::unique_ptr< GPUDisplayFrontendInterface > eventDisplay
GPUReconstruction * rec
std::string qTag2Str(const T tag)
Definition strtag.h:36
GPUTPCTracker tpcTrackers[GPUTPCGeometry::NSECTORS]
GPUCalibObjectsConst calibObjects
GPUTPCClusterFinder tpcClusterer[GPUTPCGeometry::NSECTORS]
void set(void *p, size_t s)
std::function< void *(size_t)> allocator
void SetDefaults(float solenoidBz, bool assumeConstantBz=false)
Definition GPUParam.cxx:33
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPURecoStepConfiguration *w=nullptr, const GPUSettingsRecDynamic *d=nullptr)
Definition GPUParam.cxx:95
gpudatatypes::RecoStepField steps
gpudatatypes::InOutTypeField inputs
gpudatatypes::RecoStepField stepsGPUMask
gpudatatypes::InOutTypeField outputs
std::queue< GPUReconstructionPipelineQueue * > pipelineQueue