GPUReconstruction.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include <cstring>
16#include <cstdio>
17#include <iostream>
18#include <mutex>
19#include <string>
20#include <map>
21#include <queue>
22#include <mutex>
23#include <condition_variable>
24#include <array>
25
26#include "GPUReconstruction.h"
29#include "GPUReconstructionIO.h"
30#include "GPUROOTDumpCore.h"
31#include "GPUConfigDump.h"
32#include "GPUChainTracking.h"
33#include "GPUConstantMem.h"
34#include "GPUCommonHelpers.h"
35#include "GPUSettings.h"
36
37#include "GPUMemoryResource.h"
38#include "GPUChain.h"
40
41#include "GPULogging.h"
42#include "utils/strtag.h"
43#include "utils/stdspinlock.h"
44
45#ifdef GPUCA_O2_LIB
47#endif
48
50
51namespace o2::gpu
52{
53namespace // anonymous
54{
55struct GPUReconstructionPipelineQueue {
56 uint32_t op = 0; // For now, 0 = process, 1 = terminate
57 GPUChain* chain = nullptr;
58 std::mutex m;
59 std::condition_variable c;
60 bool done = false;
61 int32_t retVal = 0;
62};
63} // namespace
64
65struct GPUReconstructionPipelineContext {
66 std::queue<GPUReconstructionPipelineQueue*> queue;
67 std::mutex mutex;
68 std::condition_variable cond;
69 bool terminate = false;
70};
71} // namespace o2::gpu
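// These two structs form the producer/consumer handshake of the double-pipeline mode:
// EnqueuePipeline() (further below) pushes a GPUReconstructionPipelineQueue entry into the
// context's queue and blocks on the entry's own condition variable, while RunPipelineWorker()
// pops entries, runs the chain, and signals completion. Simplified sketch of the enqueue side
// (illustrative only, not part of this file):
//   {
//     std::lock_guard<std::mutex> lk(ctx.mutex);
//     ctx.queue.push(&q);                    // q.op == 0 -> process, q.op == 1 -> terminate
//   }
//   ctx.cond.notify_one();                   // wake the worker thread
//   std::unique_lock<std::mutex> lk(q.m);
//   q.c.wait(lk, [&] { return q.done; });    // wait until the worker has processed the entry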
72
73using namespace o2::gpu;
74
75constexpr const char* const GPUReconstruction::GEOMETRY_TYPE_NAMES[];
76constexpr const char* const GPUReconstruction::IOTYPENAMES[];
78
79static ptrdiff_t ptrDiff(void* a, void* b) { return (char*)a - (char*)b; }
80
81GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mGRPSettings(new GPUSettingsGRP), mDeviceBackendSettings(new GPUSettingsDeviceBackend(cfg)), mProcessingSettings(new GPUSettingsProcessing)
82{
83 if (cfg.master) {
84 if (cfg.master->mDeviceBackendSettings->deviceType != cfg.deviceType) {
85 throw std::invalid_argument("device type of master and slave GPUReconstruction does not match");
86 }
87 if (cfg.master->mMaster) {
88 throw std::invalid_argument("Cannot be slave to a slave");
89 }
90 mMaster = cfg.master;
91 mSlaveId = cfg.master->mSlaves.size();
92 cfg.master->mSlaves.emplace_back(this);
93 }
96 for (uint32_t i = 0; i < NSECTORS; i++) {
97 processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place
99#ifdef GPUCA_HAS_ONNX
100 processors()->tpcNNClusterer[i].mISector = i;
101#endif
102 }
103#ifndef GPUCA_NO_ROOT
104 mROOTDump = GPUROOTDumpCore::getAndCreate();
105#endif
106}
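// Note on master/slave instances: a GPUReconstruction constructed with cfg.master set registers
// itself as a slave of that master; slaves must use the same device type, cannot themselves have
// slaves, and later share the master's host/device memory pools during Init(). Hedged usage
// sketch (assuming creation goes through a factory such as CreateInstance(); simplified):
//   GPUSettingsDeviceBackend cfgSlave = cfgMaster;              // same device type as the master
//   cfgSlave.master = masterRec;                                // register as slave of masterRec
//   GPUReconstruction* slaveRec = GPUReconstruction::CreateInstance(cfgSlave);  // hypothetical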
107
108GPUReconstruction::~GPUReconstruction()
109{
110 if (mInitialized) {
111 GPUError("GPU Reconstruction not properly deinitialized!");
112 }
113}
114
115void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits<7>>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits<7>>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame<7>>* timeFrame)
116{
117 if (trackerTraits) {
118 trackerTraits->reset(new o2::its::TrackerTraits<7>);
119 }
120 if (vertexerTraits) {
121 vertexerTraits->reset(new o2::its::VertexerTraits<7>);
122 }
123 if (timeFrame) {
124 timeFrame->reset(new o2::its::TimeFrame<7>);
125 }
126}
127
129{
130 return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
131}
132
133int32_t GPUReconstruction::Init()
134{
135 if (mMaster) {
136 throw std::runtime_error("Must not call init on slave!");
137 }
138 int32_t retVal = InitPhaseBeforeDevice();
139 if (retVal) {
140 return retVal;
141 }
142 for (uint32_t i = 0; i < mSlaves.size(); i++) {
143 retVal = mSlaves[i]->InitPhaseBeforeDevice();
144 if (retVal) {
146 GPUError("Error initializing slave (before device init)");
146 return retVal;
147 }
148 mNStreams = std::max(mNStreams, mSlaves[i]->mNStreams);
151 }
152 if (InitDevice()) {
153 return 1;
154 }
155 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
158 } else {
160 }
162 return 1;
163 }
164 for (uint32_t i = 0; i < mSlaves.size(); i++) {
165 mSlaves[i]->mDeviceMemoryBase = mDeviceMemoryPermanent;
166 mSlaves[i]->mHostMemoryBase = mHostMemoryPermanent;
167 mSlaves[i]->mDeviceMemorySize = mDeviceMemorySize - ptrDiff(mSlaves[i]->mDeviceMemoryBase, mDeviceMemoryBase);
168 mSlaves[i]->mHostMemorySize = mHostMemorySize - ptrDiff(mSlaves[i]->mHostMemoryBase, mHostMemoryBase);
169 mSlaves[i]->mHostMemoryPoolEnd = mHostMemoryPoolEnd;
170 mSlaves[i]->mDeviceMemoryPoolEnd = mDeviceMemoryPoolEnd;
171 if (mSlaves[i]->InitDevice()) {
172 GPUError("Error initializing slave (device init)");
173 return 1;
174 }
176 GPUError("Error initializing slave (permanent memory)");
177 return 1;
178 }
179 mDeviceMemoryPermanent = mSlaves[i]->mDeviceMemoryPermanent;
180 mHostMemoryPermanent = mSlaves[i]->mHostMemoryPermanent;
181 }
183 if (retVal) {
184 return retVal;
185 }
187 for (uint32_t i = 0; i < mSlaves.size(); i++) {
188 mSlaves[i]->mDeviceMemoryPermanent = mDeviceMemoryPermanent;
189 mSlaves[i]->mHostMemoryPermanent = mHostMemoryPermanent;
190 retVal = mSlaves[i]->InitPhaseAfterDevice();
191 if (retVal) {
192 GPUError("Error initializing slave (after device init)");
193 return retVal;
194 }
195 mSlaves[i]->ClearAllocatedMemory();
196 }
197 debugInit();
198 return 0;
199}
200
201namespace o2::gpu::internal
202{
203static uint32_t getDefaultNThreads()
204{
205 const char* tbbEnv = getenv("TBB_NUM_THREADS");
206 uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
207 if (tbbNum) {
208 return tbbNum;
209 }
210 const char* ompEnv = getenv("OMP_NUM_THREADS");
211 uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
212 if (ompNum) {
213 return ompNum;
214 }
215 return tbb::info::default_concurrency();
216}
217} // namespace o2::gpu::internal
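// getDefaultNThreads() resolves the default host thread count in this order: TBB_NUM_THREADS if
// set, otherwise OMP_NUM_THREADS (kept for backwards compatibility), otherwise TBB's default
// concurrency. For example, running with TBB_NUM_THREADS=8 and OMP_NUM_THREADS=4 yields 8 host
// threads.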
218
219int32_t GPUReconstruction::InitPhaseBeforeDevice()
220{
221 if (GetProcessingSettings().printSettings) {
222 if (mSlaves.size() || mMaster) {
223 printf("\nConfig Dump %s\n", mMaster ? "Slave" : "Master");
224 }
225 const GPUChainTracking* chTrk = nullptr;
226 for (uint32_t i = 0; i < mChains.size(); i++) {
227 if ((chTrk = dynamic_cast<GPUChainTracking*>(mChains[i].get()))) {
228 break;
229 }
230 }
231 GPUConfigDump::dumpConfig(&param().rec, mProcessingSettings.get(), chTrk ? chTrk->GetQAConfig() : nullptr, chTrk ? chTrk->GetEventDisplayConfig() : nullptr, mDeviceBackendSettings.get(), &mRecoSteps);
232 }
235 if (!IsGPU()) {
236 mRecoSteps.stepsGPUMask.set((uint8_t)0);
237 }
238
239 if (GetProcessingSettings().forceMemoryPoolSize >= 1024 || GetProcessingSettings().forceHostMemoryPoolSize >= 1024) {
241 }
242 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) {
244 }
245 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
246 mProcessingSettings->forceMemoryPoolSize = mProcessingSettings->forceHostMemoryPoolSize = 0;
247 }
248 if (GetProcessingSettings().debugLevel >= 4) {
249 mProcessingSettings->keepAllMemory = true;
250 }
251 if (GetProcessingSettings().debugLevel >= 5 && GetProcessingSettings().allocDebugLevel < 2) {
252 mProcessingSettings->allocDebugLevel = 2;
253 }
255 mProcessingSettings->keepDisplayMemory = true;
256 }
257 if (GetProcessingSettings().debugLevel < 6) {
258 mProcessingSettings->debugMask = 0;
259 }
260 if (GetProcessingSettings().debugLevel < 1) {
261 mProcessingSettings->deviceTimers = false;
262 }
263 if (GetProcessingSettings().debugLevel > 0) {
264 mProcessingSettings->recoTaskTiming = true;
265 }
266 if (GetProcessingSettings().deterministicGPUReconstruction == -1) {
267#ifdef GPUCA_DETERMINISTIC_MODE
268 mProcessingSettings->deterministicGPUReconstruction = 1;
269#else
270 mProcessingSettings->deterministicGPUReconstruction = GetProcessingSettings().debugLevel >= 6;
271#endif
272 }
273 if (GetProcessingSettings().deterministicGPUReconstruction) {
274#ifndef GPUCA_DETERMINISTIC_MODE
275 GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
276#endif
277 if (mProcessingSettings->debugLevel >= 6 && ((mProcessingSettings->debugMask + 1) & mProcessingSettings->debugMask)) {
278 GPUError("WARNING: debugMask %d - debug output might not be deterministic with intermediate steps missing", mProcessingSettings->debugMask);
279 }
280 mProcessingSettings->overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
281 if (GetProcessingSettings().createO2Output > 1) {
282 mProcessingSettings->createO2Output = 1;
283 }
284 mProcessingSettings->rtc.deterministic = 1;
285 } else {
286#ifdef GPUCA_DETERMINISTIC_MODE
287 GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
288#endif
289 }
290 if (GetProcessingSettings().deterministicGPUReconstruction && GetProcessingSettings().debugLevel >= 6) {
291 mProcessingSettings->nTPCClustererLanes = 1;
292 }
293 if (GetProcessingSettings().createO2Output > 1 && GetProcessingSettings().runQA && GetProcessingSettings().qcRunFraction == 100.f) {
294 mProcessingSettings->createO2Output = 1;
295 }
296 if (!GetProcessingSettings().createO2Output || !IsGPU()) {
297 mProcessingSettings->clearO2OutputFromGPU = false;
298 }
300 mProcessingSettings->mergerSortTracks = false;
301 }
302 if (GetProcessingSettings().debugLevel > 3 || !IsGPU() || GetProcessingSettings().deterministicGPUReconstruction) {
303 mProcessingSettings->delayedOutput = false;
304 }
305 if (!GetProcessingSettings().rtc.enable) {
306 mProcessingSettings->rtc.optConstexpr = false;
307 }
308 if (GetProcessingSettings().allSanityChecks) {
309 mProcessingSettings->clusterizerZSSanityCheck = mProcessingSettings->mergerSanityCheck = mProcessingSettings->outputSanityCheck = true;
310 }
311
312 mMemoryScalers->scalingFactor = GetProcessingSettings().memoryScalingFactor;
313 mMemoryScalers->conservative = GetProcessingSettings().conservativeMemoryEstimate;
314 mMemoryScalers->returnMaxVal = GetProcessingSettings().forceMaxMemScalers != 0;
315 if (GetProcessingSettings().forceMaxMemScalers > 1) {
316 mMemoryScalers->rescaleMaxMem(GetProcessingSettings().forceMaxMemScalers);
317 }
318
319 if (GetProcessingSettings().nHostThreads != -1 && GetProcessingSettings().ompThreads != -1) {
320 GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
321 } else if (GetProcessingSettings().ompThreads != -1) {
322 mProcessingSettings->nHostThreads = GetProcessingSettings().ompThreads;
323 GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
324 }
325
326 if (GetProcessingSettings().nHostThreads <= 0) {
327 mProcessingSettings->nHostThreads = internal::getDefaultNThreads();
328 } else {
329 mProcessingSettings->autoAdjustHostThreads = false;
330 }
331 mMaxHostThreads = GetProcessingSettings().nHostThreads;
332 if (mMaster == nullptr) {
333 mThreading = std::make_shared<GPUReconstructionThreading>();
334 mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
335 mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
336 mThreading->activeThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
337 } else {
339 }
341 if (IsGPU()) {
342 mNStreams = std::max<int32_t>(GetProcessingSettings().nStreams, 3);
343 }
344
345 if (GetProcessingSettings().nTPCClustererLanes == -1) {
346 mProcessingSettings->nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSECTORS, GetProcessingSettings().inKernelParallel ? (mMaxHostThreads >= 4 ? std::min<int32_t>(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSECTORS : 4) : 1) : mMaxHostThreads));
347 }
348 if (GetProcessingSettings().overrideClusterizerFragmentLen == -1) {
349 mProcessingSettings->overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / GetProcessingSettings().nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
350 }
351 if (GetProcessingSettings().nTPCClustererLanes > GPUCA_NSECTORS) {
352 GPUError("Invalid value for nTPCClustererLanes: %d", GetProcessingSettings().nTPCClustererLanes);
353 mProcessingSettings->nTPCClustererLanes = GPUCA_NSECTORS;
354 }
355
356 if (GetProcessingSettings().doublePipeline) {
357 mProcessingSettings->rtctech.allowOptimizedSlaveReconstruction = true;
358 }
359 if (GetProcessingSettings().doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) {
360 GPUError("Must use double pipeline mode only with exactly one chain that must support it");
361 return 1;
362 }
363 if (mMaster == nullptr && GetProcessingSettings().doublePipeline) {
365 }
366
367 if (mMaster && GetProcessingSettings().rtc.enable && (GetProcessingSettings().rtc.optConstexpr || GetProcessingSettings().rtc.optSpecialCode) && !GetProcessingSettings().rtctech.allowOptimizedSlaveReconstruction) {
368 GPUError("Not allowed to create optimized RTC code with more than one GPUReconstruction instances");
369 return 1;
370 }
371
373 for (uint32_t i = 0; i < mChains.size(); i++) {
374 if (mChains[i]->EarlyConfigure()) {
375 return 1;
376 }
377 mChains[i]->RegisterPermanentMemoryAndProcessors();
378 size_t memPrimary, memPageLocked;
379 mChains[i]->MemorySize(memPrimary, memPageLocked);
380 if (!IsGPU() || mOutputControl.useInternal()) {
381 memPageLocked = memPrimary;
382 }
383 mDeviceMemorySize += memPrimary;
384 mHostMemorySize += memPageLocked;
385 }
386 if (GetProcessingSettings().forceMemoryPoolSize && GetProcessingSettings().forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) {
387 mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize;
388 } else if (GetProcessingSettings().forceMemoryPoolSize > 2) {
389 mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize;
390 if (!IsGPU() || mOutputControl.useInternal()) {
392 }
393 }
394 if (GetProcessingSettings().forceHostMemoryPoolSize) {
395 mHostMemorySize = GetProcessingSettings().forceHostMemoryPoolSize;
396 }
397
398 for (uint32_t i = 0; i < mProcessors.size(); i++) {
399 (mProcessors[i].proc->*(mProcessors[i].RegisterMemoryAllocation))();
400 }
401
402 return 0;
403}
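// Note on the pool sizing above: a forceMemoryPoolSize larger than 2 is taken as an absolute
// device pool size in bytes, while the small values (<= 2) are passed through unchanged when
// CanQueryMaxMemory() is available, presumably acting as "use the maximum queryable memory"
// flags interpreted by the device backend; forceHostMemoryPoolSize overrides the host pool size
// directly. Without overrides, the pool sizes are the sum of the per-chain MemorySize() estimates.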
404
406{
407 if (IsGPU()) {
408 for (uint32_t i = 0; i < mChains.size(); i++) {
409 mChains[i]->RegisterGPUProcessors();
410 }
411 }
413 return 0;
414}
415
417{
418 if (GetProcessingSettings().forceMaxMemScalers <= 1 && GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
420 }
421 for (uint32_t i = 0; i < mChains.size(); i++) {
422 if (mChains[i]->Init()) {
423 return 1;
424 }
425 }
426 for (uint32_t i = 0; i < mProcessors.size(); i++) {
427 (mProcessors[i].proc->*(mProcessors[i].InitializeProcessor))();
428 }
429
430 WriteConstantParams(); // Initialize with initial values, can optionally be updated later
431
432 mInitialized = true;
433 return 0;
434}
435
437{
438 if (IsGPU()) {
439 const auto threadContext = GetThreadContext();
440 WriteToConstantMemory(ptrDiff(&processors()->param, processors()), &param(), sizeof(param()), -1);
441 }
442}
443
445{
446 for (uint32_t i = 0; i < mChains.size(); i++) {
447 mChains[i]->Finalize();
448 }
449 return 0;
450}
451
453{
454 if (!mInitialized) {
455 return 1;
456 }
457 for (uint32_t i = 0; i < mSlaves.size(); i++) {
458 if (mSlaves[i]->Exit()) {
459 GPUError("Error exiting slave");
460 }
461 }
462
463 mChains.clear(); // Make sure we destroy a possible ITS GPU tracker before we call the destructors
464 mHostConstantMem.reset(); // Reset these explicitly before the destruction of other members unloads the library
465 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
466 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
467 if (mMemoryResources[i].mReuse >= 0) {
468 continue;
469 }
470 operator delete(mMemoryResources[i].mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
471 mMemoryResources[i].mPtr = mMemoryResources[i].mPtrDevice = nullptr;
472 }
473 }
474 mMemoryResources.clear();
475 if (mInitialized) {
476 ExitDevice();
477 }
478 debugExit();
479 mInitialized = false;
480 return 0;
481}
482
485
487{
488 for (auto it = mMemoryReuse1to1.begin(); it != mMemoryReuse1to1.end(); it++) {
489 auto& re = it->second;
490 if (proc == nullptr || re.proc == proc) {
491 GPUMemoryResource& resMain = mMemoryResources[re.res[0]];
492 resMain.mOverrideSize = 0;
493 for (uint32_t i = 0; i < re.res.size(); i++) {
494 GPUMemoryResource& res = mMemoryResources[re.res[i]];
495 resMain.mOverrideSize = std::max<size_t>(resMain.mOverrideSize, ptrDiff(res.SetPointers((void*)1), (char*)1));
496 }
497 }
498 }
499}
500
502{
504 if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !GetProcessingSettings().keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory
506 } else {
508 }
509 }
511 type &= ~GPUMemoryResource::MEMORY_GPU;
512 }
513 mMemoryResources.emplace_back(proc, setPtr, (GPUMemoryResource::MemoryType)type, name);
514 if (mMemoryResources.size() >= 32768) {
515 throw std::bad_alloc();
516 }
517 uint16_t retVal = mMemoryResources.size() - 1;
518 if (re.type != GPUMemoryReuse::NONE && !GetProcessingSettings().disableMemoryReuse) {
519 const auto& it = mMemoryReuse1to1.find(re.id);
520 if (it == mMemoryReuse1to1.end()) {
521 mMemoryReuse1to1[re.id] = {proc, retVal};
522 } else {
523 mMemoryResources[retVal].mReuse = it->second.res[0];
524 it->second.res.emplace_back(retVal);
525 }
526 }
527 return retVal;
528}
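// Memory-reuse bookkeeping: the first resource registered with a given GPUMemoryReuse id becomes
// the owner entry in mMemoryReuse1to1; later resources registered with the same id get mReuse
// pointing at that owner and will share its buffer. ComputeReuseMax() (above) then enlarges the
// owner's mOverrideSize to the largest size requested by any of its reusers.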
529
531{
532 if (GetProcessingSettings().debugLevel >= 5) {
533 GPUInfo("Allocating memory %p", (void*)proc);
534 }
535 size_t total = 0;
536 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
537 if (proc == nullptr ? !mMemoryResources[i].mProcessor->mAllocateAndInitializeLate : mMemoryResources[i].mProcessor == proc) {
539 total += AllocateRegisteredMemory(i);
540 } else if (resetCustom && (mMemoryResources[i].mPtr || mMemoryResources[i].mPtrDevice)) {
542 }
543 }
544 }
545 if (GetProcessingSettings().debugLevel >= 5) {
546 GPUInfo("Allocating memory done");
547 }
548 return total;
549}
550
552{
553 if (GetProcessingSettings().debugLevel >= 5) {
554 GPUInfo("Allocating Permanent Memory");
555 }
557 GPUError("Must not allocate permanent memory while volatile chunks are allocated");
558 throw std::bad_alloc();
559 }
560 int32_t total = 0;
561 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
562 if ((mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT) && mMemoryResources[i].mPtr == nullptr) {
563 total += AllocateRegisteredMemory(i);
564 }
565 }
568 if (GetProcessingSettings().debugLevel >= 5) {
569 GPUInfo("Permanent Memory Done");
570 }
571 return total;
572}
573
574size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, void*& ptr, void*& memorypool, void* memorybase, size_t memorysize, void* (GPUMemoryResource::*setPtr)(void*) const, void*& memorypoolend, const char* device)
575{
576 if (res->mReuse >= 0) {
577 ptr = (&ptr == &res->mPtrDevice) ? mMemoryResources[res->mReuse].mPtrDevice : mMemoryResources[res->mReuse].mPtr;
578 if (ptr == nullptr) {
579 GPUError("Invalid reuse ptr (%s)", res->mName);
580 throw std::bad_alloc();
581 }
582 size_t retVal = ptrDiff((res->*setPtr)(ptr), ptr);
583 if (retVal > mMemoryResources[res->mReuse].mSize) {
584 GPUError("Insufficient reuse memory %lu < %lu (%s) (%s)", mMemoryResources[res->mReuse].mSize, retVal, res->mName, device);
585 throw std::bad_alloc();
586 }
587 if (GetProcessingSettings().allocDebugLevel >= 2) {
588 std::cout << "Reused (" << device << ") " << res->mName << ": " << retVal << "\n";
589 }
590 return retVal;
591 }
592 if (memorypool == nullptr) {
593 GPUError("Cannot allocate memory from uninitialized pool");
594 throw std::bad_alloc();
595 }
596 size_t retVal;
597 stdspinlock spinlock(mMemoryMutex);
598 if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {
599 retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));
600 memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));
601 if (retVal < res->mOverrideSize) {
602 retVal = res->mOverrideSize;
603 }
604 retVal += GPUProcessor::getAlignment<GPUCA_MEMALIGN>(retVal);
605 memorypoolend = (char*)memorypoolend - retVal;
606 ptr = memorypoolend;
607 retVal = std::max<size_t>(ptrDiff((res->*setPtr)(ptr), ptr), res->mOverrideSize);
608 } else {
609 ptr = memorypool;
610 memorypool = (char*)((res->*setPtr)(ptr));
611 retVal = ptrDiff(memorypool, ptr);
612 if (retVal < res->mOverrideSize) {
613 retVal = res->mOverrideSize;
614 memorypool = (char*)ptr + res->mOverrideSize;
615 }
616 memorypool = (void*)((char*)memorypool + GPUProcessor::getAlignment<GPUCA_MEMALIGN>(memorypool));
617 }
618 if (memorypoolend ? (memorypool > memorypoolend) : ((size_t)ptrDiff(memorypool, memorybase) > memorysize)) {
619 std::cerr << "Memory pool size exceeded (" << device << ") (" << res->mName << ": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) << " > " << memorysize << ")\n";
620 throw std::bad_alloc();
621 }
622 if (GetProcessingSettings().allocDebugLevel >= 2) {
623 std::cout << "Allocated (" << device << ") " << res->mName << ": " << retVal << " - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) << "\n";
624 }
625 return retVal;
626}
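// Pool layout used by the helper above: regular resources are carved from the front of the pool
// (memorypool moves upwards), while MEMORY_STACK resources are carved from the back
// (memorypoolend moves downwards), so both ends grow towards each other and the stacked part can
// later be popped without fragmenting the front. An overflow is detected when the two pointers
// cross, or, if no stack end is in use, when the front pointer exceeds memorybase + memorysize.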
627
629{
630 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) {
631 if (!(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
632 if (res->mPtrDevice && res->mReuse < 0) {
633 operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
634 }
635 res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
636 if (res->mReuse >= 0) {
637 if (res->mSize > mMemoryResources[res->mReuse].mSize) {
638 GPUError("Invalid reuse, insufficient size: %ld < %ld", (int64_t)mMemoryResources[res->mReuse].mSize, (int64_t)res->mSize);
639 throw std::bad_alloc();
640 }
641 res->mPtrDevice = mMemoryResources[res->mReuse].mPtrDevice;
642 } else {
643 res->mPtrDevice = operator new(res->mSize + GPUCA_BUFFER_ALIGNMENT, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
644 }
645 res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(res->mPtrDevice);
646 res->SetPointers(res->mPtr);
647 if (GetProcessingSettings().allocDebugLevel >= 2) {
648 std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";
649 }
651 stdspinlock spinlock(mMemoryMutex);
653 }
654 if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
655 GPUError("Got buffer with insufficient alignment");
656 throw std::bad_alloc();
657 }
658 }
659 } else {
660 if (res->mPtr != nullptr) {
661 GPUError("Double allocation! (%s)", res->mName);
662 throw std::bad_alloc();
663 }
664 if (IsGPU() && res->mOverrideSize < GPUCA_BUFFER_ALIGNMENT) {
665 res->mOverrideSize = GPUCA_BUFFER_ALIGNMENT;
666 }
667 if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || GetProcessingSettings().keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory
668 if (control && control->useExternal()) {
669 if (control->allocator) {
670 res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
671 res->mPtr = control->allocator(CAMath::nextMultipleOf<GPUCA_BUFFER_ALIGNMENT>(res->mSize));
672 res->mSize = std::max<size_t>(ptrDiff(res->SetPointers(res->mPtr), res->mPtr), res->mOverrideSize);
673 if (GetProcessingSettings().allocDebugLevel >= 2) {
674 std::cout << "Allocated (from callback) " << res->mName << ": " << res->mSize << "\n";
675 }
676 } else {
677 void* dummy = nullptr;
678 res->mSize = AllocateRegisteredMemoryHelper(res, res->mPtr, control->ptrCurrent, control->ptrBase, control->size, &GPUMemoryResource::SetPointers, dummy, "host");
679 }
680 } else {
682 }
683 if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
684 GPUError("Got buffer with insufficient alignment");
685 throw std::bad_alloc();
686 }
687 }
688 if (IsGPU() && (res->mType & GPUMemoryResource::MEMORY_GPU)) {
689 if (res->mProcessor->mLinkedProcessor == nullptr) {
690 GPUError("Device Processor not set (%s)", res->mName);
691 throw std::bad_alloc();
692 }
694 GPUError("Must not allocate non-stacked device memory while volatile chunks are allocated");
695 throw std::bad_alloc();
696 }
698
700 res->mSize = size;
701 } else if (size != res->mSize) {
702 GPUError("Inconsistent device memory allocation (%s: device %lu vs %lu)", res->mName, size, res->mSize);
703 throw std::bad_alloc();
704 }
705 if ((size_t)res->mPtrDevice % GPUCA_BUFFER_ALIGNMENT) {
706 GPUError("Got buffer with insufficient alignment");
707 throw std::bad_alloc();
708 }
709 }
711 }
712}
713
718
720{
722 if ((res->mType & GPUMemoryResource::MEMORY_PERMANENT) && res->mPtr != nullptr) {
724 } else {
726 }
727 return res->mReuse >= 0 ? 0 : res->mSize;
728}
729
731{
732 stdspinlock spinlock(mMemoryMutex);
733 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
734 char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
737 } else {
739 }
740 return retVal;
741 }
742
744 throw std::runtime_error("Requested invalid memory type for direct allocation");
745 }
747 GPUError("Must not allocate direct memory while volatile chunks are allocated");
748 throw std::bad_alloc();
749 }
750
753 char* retVal;
755 poolend = (char*)poolend - size; // TODO: Implement overflow check
756 poolend = (char*)poolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(poolend);
757 retVal = (char*)poolend;
758 } else {
760 }
761 if (pool > poolend) {
762 GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
763 throw std::bad_alloc();
764 }
766 if (GetProcessingSettings().allocDebugLevel >= 2) {
767 std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n";
768 }
769 return retVal;
770}
771
773{
774 stdspinlock spinlock(mMemoryMutex);
775 if (mVolatileMemoryStart == nullptr) {
777 }
778 if (size == 0) {
779 return nullptr; // Future GPU memory allocation is volatile
780 }
781 char* retVal;
784 GPUError("Insufficient volatile device memory: missing %ld", ptrDiff(mDeviceMemoryPool, mDeviceMemoryPoolEnd));
785 throw std::bad_alloc();
786 }
788 if (GetProcessingSettings().allocDebugLevel >= 2) {
789 std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
790 }
791 return retVal;
792}
793
795{
796 if (device) {
798 }
799 char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
800 stdspinlock spinlock(mMemoryMutex);
801 mVolatileChunks.emplace_back(retVal, alignedDeleter());
802 return retVal;
803}
804
810
812{
816 mVolatileMemoryStart = nullptr;
817 }
818 if (GetProcessingSettings().allocDebugLevel >= 2) {
819 std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
820 }
821}
822
828
830{
831 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
832 if (proc == nullptr || mMemoryResources[i].mProcessor == proc) {
834 }
835 }
836}
837
839{
842 void* basePtr = res->mReuse >= 0 ? mMemoryResources[res->mReuse].mPtr : res->mPtr;
843 size_t size = ptrDiff(res->SetPointers(basePtr), basePtr);
844 if (basePtr && size > std::max(res->mSize, res->mOverrideSize)) {
845 std::cerr << "Updated pointers exceed available memory size: " << size << " > " << std::max(res->mSize, res->mOverrideSize) << " - host - " << res->mName << "\n";
846 throw std::bad_alloc();
847 }
848 }
849 if (IsGPU() && (res->mType & GPUMemoryResource::MEMORY_GPU)) {
850 void* basePtr = res->mReuse >= 0 ? mMemoryResources[res->mReuse].mPtrDevice : res->mPtrDevice;
851 size_t size = ptrDiff(res->SetDevicePointers(basePtr), basePtr);
852 if (basePtr && size > std::max(res->mSize, res->mOverrideSize)) {
853 std::cerr << "Updated pointers exceed available memory size: " << size << " > " << std::max(res->mSize, res->mOverrideSize) << " - GPU - " << res->mName << "\n";
854 throw std::bad_alloc();
855 }
856 }
857}
858
859void GPUReconstruction::FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom, bool freePermanent)
860{
861 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
862 if ((proc == nullptr || mMemoryResources[i].mProcessor == proc) && (freeCustom || !(mMemoryResources[i].mType & GPUMemoryResource::MEMORY_CUSTOM)) && (freePermanent || !(mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT))) {
864 }
865 }
866}
867
872
874{
875 if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
876 std::cout << "Freeing " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
877 }
878 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) {
879 operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
880 }
881 res->mPtr = nullptr;
882 res->mPtrDevice = nullptr;
883}
884
889
891{
892 if (proc && GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
893 GPUFatal("Processor-depending memory-free works only with allocation strategy ALLOCATION_INDIVIDUAL");
894 }
895 if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) {
896 return;
897 }
898 if (mNonPersistentMemoryStack.size() == 0) {
899 GPUFatal("Trying to pop memory state from empty stack");
900 }
901 if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
902 GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
903 }
904 if (!proc && (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
905 printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
907 printf("%76s", "");
909 }
910 for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) {
912 if (proc && res->mProcessor != proc) {
913 continue;
914 }
915 if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
916 std::cout << "Freeing NonPersistent " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
917 }
918 if (res->mReuse < 0) {
919 operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
920 }
921 res->mPtr = nullptr;
922 res->mPtrDevice = nullptr;
923 }
924 if (!proc) {
925 stdspinlock spinlock(mMemoryMutex);
926 mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
930 mNonPersistentMemoryStack.pop_back();
931 }
932}
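// Push/pop of non-persistent memory acts as a tagged stack of pool states:
// PushNonPersistentMemory(tag) records the current pool pointers, and
// PopNonPersistentMemory(step, tag) restores them, with the tag comparison above catching
// mismatched push/pop pairs. Hedged usage sketch (simplified):
//   rec->PushNonPersistentMemory(tag);          // tag: any uint64_t marker chosen by the caller
//   ... allocate per-step scratch buffers ...
//   rec->PopNonPersistentMemory(step, tag);     // must pass the same tag, otherwise GPUFatal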
933
935{
937 throw std::runtime_error("temporary memory stack already blocked");
938 }
941}
942
944{
945 if (mNonPersistentMemoryStack.size()) {
946 throw std::runtime_error("cannot unblock while there is stacked memory");
947 }
950 mHostMemoryPoolBlocked = nullptr;
951 mDeviceMemoryPoolBlocked = nullptr;
952}
953
955{
956 mMemoryResources[res].mPtr = ptr;
957}
958
960{
961 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
964 }
965 }
968 mDirectMemoryChunks.clear();
970 mVolatileChunks.clear();
971 mVolatileMemoryStart = nullptr;
972 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
973 mHostMemoryPool = GPUProcessor::alignPointer<GPUCA_MEMALIGN>(mHostMemoryPermanent);
974 mDeviceMemoryPool = GPUProcessor::alignPointer<GPUCA_MEMALIGN>(mDeviceMemoryPermanent);
977 } else {
979 }
980}
981
987
989{
990 printf("Maximum Memory Allocation: Host %'zu / Device %'zu\n", mHostMemoryUsedMax, mDeviceMemoryUsedMax);
991}
992
994{
995 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
996 printf("Memory Allocation: Host %'13zd / %'13zu (Permanent %'13zd, Data %'13zd, Scratch %'13zd), Device %'13zd / %'13zu, (Permanent %'13zd, Data %'13zd, Scratch %'13zd) %zu chunks\n",
999 mMemoryResources.size());
1000 }
1001}
1002
1004{
1005 std::map<std::string, std::array<size_t, 3>> sizes;
1006 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
1007 auto& res = mMemoryResources[i];
1008 if (res.mReuse >= 0) {
1009 continue;
1010 }
1011 auto& x = sizes[res.mName];
1012 if (res.mPtr) {
1013 x[0] += res.mSize;
1014 }
1015 if (res.mPtrDevice) {
1016 x[1] += res.mSize;
1017 }
1019 x[2] = 1;
1020 }
1021 }
1022 printf("%59s CPU / %9s GPU\n", "", "");
1023 for (auto it = sizes.begin(); it != sizes.end(); it++) {
1024 printf("Allocation %50s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
1025 }
1027 for (uint32_t i = 0; i < mChains.size(); i++) {
1028 mChains[i]->PrintMemoryStatistics();
1029 }
1030}
1031
1033{
1034 if (GetProcessingSettings().noGPUMemoryRegistration) {
1035 return 0;
1036 }
1038 if (retVal == 0) {
1039 mRegisteredMemoryPtrs.emplace(ptr);
1040 }
1041 return retVal;
1042}
1043
1045{
1046 if (GetProcessingSettings().noGPUMemoryRegistration) {
1047 return 0;
1048 }
1049 const auto& pos = mRegisteredMemoryPtrs.find(ptr);
1050 if (pos != mRegisteredMemoryPtrs.end()) {
1053 }
1054 return 1;
1055}
1056
1057namespace o2::gpu::internal
1058{
1059namespace // anonymous
1060{
1061template <class T>
1062constexpr static inline int32_t getStepNum(T step, bool validCheck, int32_t N, const char* err = "Invalid step num")
1063{
1064 static_assert(sizeof(step) == sizeof(uint32_t), "Invalid step enum size");
1065 int32_t retVal = 8 * sizeof(uint32_t) - 1 - CAMath::Clz((uint32_t)step);
1066 if ((uint32_t)step == 0 || retVal >= N) {
1067 if (!validCheck) {
1068 return -1;
1069 }
1070 throw std::runtime_error(err);
1071 }
1072 return retVal;
1073}
1074} // anonymous namespace
1075} // namespace o2::gpu::internal
1076
1077int32_t GPUReconstruction::getRecoStepNum(RecoStep step, bool validCheck) { return internal::getStepNum(step, validCheck, GPUDataTypes::N_RECO_STEPS, "Invalid Reco Step"); }
1078int32_t GPUReconstruction::getGeneralStepNum(GeneralStep step, bool validCheck) { return internal::getStepNum(step, validCheck, GPUDataTypes::N_GENERAL_STEPS, "Invalid General Step"); }
1079
1081{
1082 if (!mInitialized || !GetProcessingSettings().doublePipeline || mMaster != nullptr || !mSlaves.size()) {
1083 throw std::invalid_argument("Cannot start double pipeline mode");
1084 }
1085 if (GetProcessingSettings().debugLevel >= 3) {
1086 GPUInfo("Pipeline worker started");
1087 }
1088 bool terminate = false;
1089 while (!terminate) {
1090 {
1091 std::unique_lock<std::mutex> lk(mPipelineContext->mutex);
1092 mPipelineContext->cond.wait(lk, [this] { return this->mPipelineContext->queue.size() > 0; });
1093 }
1094 GPUReconstructionPipelineQueue* q;
1095 {
1096 std::lock_guard<std::mutex> lk(mPipelineContext->mutex);
1097 q = mPipelineContext->queue.front();
1098 mPipelineContext->queue.pop();
1099 }
1100 if (q->op == 1) {
1101 terminate = 1;
1102 } else {
1103 q->retVal = q->chain->RunChain();
1104 }
1105 {
1106 std::lock_guard<std::mutex> lk(q->m);
1107 q->done = true;
1108 }
1109 q->c.notify_one();
1110 }
1111 if (GetProcessingSettings().debugLevel >= 3) {
1112 GPUInfo("Pipeline worker ended");
1113 }
1114}
1115
1120
1122{
1125 std::unique_ptr<GPUReconstructionPipelineQueue> qu(new GPUReconstructionPipelineQueue);
1126 GPUReconstructionPipelineQueue* q = qu.get();
1127 q->chain = terminate ? nullptr : mChains[0].get();
1128 q->op = terminate ? 1 : 0;
1129 std::unique_lock<std::mutex> lkdone(q->m);
1130 {
1131 std::lock_guard<std::mutex> lkpipe(rec->mPipelineContext->mutex);
1132 if (rec->mPipelineContext->terminate) {
1133 throw std::runtime_error("Must not enqueue work after termination request");
1134 }
1135 rec->mPipelineContext->queue.push(q);
1136 rec->mPipelineContext->terminate = terminate;
1137 rec->mPipelineContext->cond.notify_one();
1138 }
1139 q->c.wait(lkdone, [&q]() { return q->done; });
1140 if (q->retVal) {
1141 return q->retVal;
1142 }
1143 if (terminate) {
1144 return 0;
1145 } else {
1146 return mChains[0]->FinalizePipelinedProcessing();
1147 }
1148}
1149
1151{
1153 std::lock_guard<std::mutex> lk(rec->mPipelineContext->mutex);
1154 return rec->mPipelineContext->queue.size() && rec->mPipelineContext->queue.front()->op == 0 ? rec->mPipelineContext->queue.front()->chain : nullptr;
1155}
1156
1157void GPUReconstruction::PrepareEvent() // TODO: Clean this up, this should not be called from chainTracking but before
1158{
1160 for (uint32_t i = 0; i < mChains.size(); i++) {
1161 mChains[i]->PrepareEvent();
1162 }
1163 for (uint32_t i = 0; i < mProcessors.size(); i++) {
1164 if (mProcessors[i].proc->mAllocateAndInitializeLate) {
1165 continue;
1166 }
1167 (mProcessors[i].proc->*(mProcessors[i].SetMaxData))(mHostConstantMem->ioPtrs);
1168 if (mProcessors[i].proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && mProcessors[i].proc->mLinkedProcessor) {
1169 (mProcessors[i].proc->mLinkedProcessor->*(mProcessors[i].SetMaxData))(mHostConstantMem->ioPtrs);
1170 }
1171 }
1172 ComputeReuseMax(nullptr);
1173 AllocateRegisteredMemory(nullptr);
1174}
1175
1176int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
1177{
1178 int32_t retVal = 0;
1179 for (uint32_t i = 0; i < mChains.size(); i++) {
1180 if (mChains[i]->CheckErrorCodes(cpuOnly, forceShowErrors, fillErrors)) {
1181 retVal++;
1182 }
1183 }
1184 return retVal;
1185}
1186
1187int32_t GPUReconstruction::GPUChkErrA(const int64_t error, const char* file, int32_t line, bool failOnError)
1188{
1189 if (error == 0 || !GPUChkErrInternal(error, file, line)) {
1190 return 0;
1191 }
1192 if (failOnError) {
1193 if (mInitialized && mInErrorHandling == false) {
1194 mInErrorHandling = true;
1195 CheckErrorCodes(false, true);
1196 }
1197 throw std::runtime_error("GPU Backend Failure");
1198 }
1199 return 1;
1200}
1201
1203{
1204 std::string f;
1205 f = dir;
1206 f += "settings.dump";
1207 DumpStructToFile(mGRPSettings.get(), f.c_str());
1208 for (uint32_t i = 0; i < mChains.size(); i++) {
1209 mChains[i]->DumpSettings(dir);
1210 }
1211}
1212
1213void GPUReconstruction::UpdateDynamicSettings(const GPUSettingsRecDynamic* d)
1214{
1215 UpdateSettings(nullptr, nullptr, d);
1216}
1217
1218void GPUReconstruction::UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p, const GPUSettingsRecDynamic* d)
1219{
1220 if (g) {
1221 *mGRPSettings = *g;
1222 }
1223 if (p) {
1224 mProcessingSettings->debugLevel = p->debugLevel;
1225 mProcessingSettings->resetTimers = p->resetTimers;
1226 }
1227 GPURecoStepConfiguration* w = nullptr;
1229 w = &mRecoSteps;
1230 }
1231 param().UpdateSettings(g, p, w, d);
1232 if (mInitialized) {
1234 }
1235}
1236
1237int32_t GPUReconstruction::ReadSettings(const char* dir)
1238{
1239 std::string f;
1240 f = dir;
1241 f += "settings.dump";
1242 new (mGRPSettings.get()) GPUSettingsGRP;
1243 if (ReadStructFromFile(f.c_str(), mGRPSettings.get())) {
1244 return 1;
1245 }
1247 for (uint32_t i = 0; i < mChains.size(); i++) {
1248 mChains[i]->ReadSettings(dir);
1249 }
1250 return 0;
1251}
1252
1253void GPUReconstruction::SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow)
1254{
1255#ifdef GPUCA_O2_LIB
1257 config.ReadConfigurableParam(config);
1258 config.configGRP.solenoidBzNominalGPU = solenoidBzNominalGPU;
1259 SetSettings(&config.configGRP, &config.configReconstruction, &config.configProcessing, workflow);
1260#else
1261 GPUSettingsGRP grp;
1262 grp.solenoidBzNominalGPU = solenoidBzNominalGPU;
1263 SetSettings(&grp, nullptr, nullptr, workflow);
1264#endif
1265}
1266
1267void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec, const GPUSettingsProcessing* proc, const GPURecoStepConfiguration* workflow)
1268{
1269 if (mInitialized) {
1270 GPUError("Cannot update settings while initialized");
1271 throw std::runtime_error("Settings updated while initialized");
1272 }
1273 *mGRPSettings = *grp;
1274 if (proc) {
1275 *mProcessingSettings = *proc;
1276 }
1277 if (workflow) {
1278 mRecoSteps.steps = workflow->steps;
1280 mRecoSteps.inputs = workflow->inputs;
1281 mRecoSteps.outputs = workflow->outputs;
1282 }
1283 param().SetDefaults(mGRPSettings.get(), rec, proc, workflow);
1284}
1285
1287{
1288 GPUOutputControl outputControl;
1289 outputControl.set(ptr, size);
1290 SetOutputControl(outputControl);
1291}
1292
1296void GPUReconstruction::SetResetTimers(bool reset) { mProcessingSettings->resetTimers = reset; }
1301
1302ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r)
1303{
1304 mAlloc = [r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); };
1305}
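// ThrustVolatileAllocator adapts AllocateVolatileDeviceMemory() so that temporary buffers of
// Thrust algorithms are served from the volatile region of the device pool and are reclaimed
// together with the rest of the volatile memory; getThrustVolatileDeviceAllocator() (declared in
// the header) hands out such an allocator.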