GPUReconstruction.cxx
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include <cstring>
16#include <cstdio>
17#include <iostream>
18#include <mutex>
19#include <string>
20#include <map>
21#include <queue>
22#include <mutex>
23#include <condition_variable>
24#include <array>
25
26#include "GPUReconstruction.h"
29#include "GPUReconstructionIO.h"
30#include "GPUROOTDumpCore.h"
31#include "GPUConfigDump.h"
32#include "GPUChainTracking.h"
33#include "GPUConstantMem.h"
34#include "GPUCommonHelpers.h"
35#include "GPUSettings.h"
36
37#include "GPUMemoryResource.h"
38#include "GPUChain.h"
40
41#include "GPULogging.h"
42#include "utils/strtag.h"
43
44#ifdef GPUCA_O2_LIB
45#include "GPUO2InterfaceConfiguration.h"
46#endif
47
49
50namespace o2::gpu
51{
52namespace // anonymous
53{
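// One work item for the double-pipeline worker: op selects process (0) or terminate (1),
// chain is the chain to run, and done/retVal are reported back to the submitter under the
// item's own mutex and condition variable.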
54struct GPUReconstructionPipelineQueue {
55 uint32_t op = 0; // For now, 0 = process, 1 = terminate
56 GPUChain* chain = nullptr;
57 std::mutex m;
58 std::condition_variable c;
59 bool done = false;
60 int32_t retVal = 0;
61};
62} // namespace
63
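// Shared state of the double pipeline: EnqueuePipeline() pushes work items into the queue
// under the mutex and signals the condition variable, RunPipelineWorker() pops and executes
// them until a terminate request arrives.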
64struct GPUReconstructionPipelineContext {
65 std::queue<GPUReconstructionPipelineQueue*> queue;
66 std::mutex mutex;
67 std::condition_variable cond;
68 bool terminate = false;
69};
70} // namespace o2::gpu
71
72using namespace o2::gpu;
73
74constexpr const char* const GPUReconstruction::GEOMETRY_TYPE_NAMES[];
75constexpr const char* const GPUReconstruction::IOTYPENAMES[];
77
78static ptrdiff_t ptrDiff(void* a, void* b) { return (char*)a - (char*)b; }
79
80GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHostConstantMem(new GPUConstantMem), mGRPSettings(new GPUSettingsGRP), mDeviceBackendSettings(new GPUSettingsDeviceBackend(cfg)), mProcessingSettings(new GPUSettingsProcessing)
81{
82 if (cfg.master) {
83  if (cfg.master->mDeviceBackendSettings->deviceType != cfg.deviceType) {
84 throw std::invalid_argument("device type of master and slave GPUReconstruction does not match");
85 }
86 if (cfg.master->mMaster) {
87 throw std::invalid_argument("Cannot be slave to a slave");
88 }
89 mMaster = cfg.master;
90 mSlaveId = cfg.master->mSlaves.size();
91 cfg.master->mSlaves.emplace_back(this);
92 }
95 for (uint32_t i = 0; i < NSECTORS; i++) {
96 processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place
98#ifdef GPUCA_HAS_ONNX
99 processors()->tpcNNClusterer[i].mISector = i;
100#endif
101 }
102#ifndef GPUCA_NO_ROOT
103 mROOTDump = GPUROOTDumpCore::getAndCreate();
104#endif
105}
106
107GPUReconstruction::~GPUReconstruction()
108{
109 if (mInitialized) {
110 GPUError("GPU Reconstruction not properly deinitialized!");
111 }
112}
113
114void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame>* timeFrame)
115{
116 if (trackerTraits) {
117 trackerTraits->reset(new o2::its::TrackerTraits);
118 }
119 if (vertexerTraits) {
120 vertexerTraits->reset(new o2::its::VertexerTraits);
121 }
122 if (timeFrame) {
123 timeFrame->reset(new o2::its::TimeFrame);
124 }
125}
126
128{
129 return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
130}
131
132int32_t GPUReconstruction::Init()
133{
134 if (mMaster) {
135 throw std::runtime_error("Must not call init on slave!");
136 }
137 int32_t retVal = InitPhaseBeforeDevice();
138 if (retVal) {
139 return retVal;
140 }
141 for (uint32_t i = 0; i < mSlaves.size(); i++) {
142 retVal = mSlaves[i]->InitPhaseBeforeDevice();
143 if (retVal) {
144 GPUError("Error initializing slave (before device init)");
145 return retVal;
146 }
147 mNStreams = std::max(mNStreams, mSlaves[i]->mNStreams);
150 }
151 if (InitDevice()) {
152 return 1;
153 }
154 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
157 } else {
159 }
161 return 1;
162 }
163 for (uint32_t i = 0; i < mSlaves.size(); i++) {
164 mSlaves[i]->mDeviceMemoryBase = mDeviceMemoryPermanent;
165 mSlaves[i]->mHostMemoryBase = mHostMemoryPermanent;
166 mSlaves[i]->mDeviceMemorySize = mDeviceMemorySize - ptrDiff(mSlaves[i]->mDeviceMemoryBase, mDeviceMemoryBase);
167 mSlaves[i]->mHostMemorySize = mHostMemorySize - ptrDiff(mSlaves[i]->mHostMemoryBase, mHostMemoryBase);
168 mSlaves[i]->mHostMemoryPoolEnd = mHostMemoryPoolEnd;
169 mSlaves[i]->mDeviceMemoryPoolEnd = mDeviceMemoryPoolEnd;
170 if (mSlaves[i]->InitDevice()) {
171 GPUError("Error initializing slave (device init)");
172 return 1;
173 }
174 if (mSlaves[i]->AllocateRegisteredPermanentMemory()) {
175 GPUError("Error initializing slave (permanent memory)");
176 return 1;
177 }
178 mDeviceMemoryPermanent = mSlaves[i]->mDeviceMemoryPermanent;
179 mHostMemoryPermanent = mSlaves[i]->mHostMemoryPermanent;
180 }
182 if (retVal) {
183 return retVal;
184 }
186 for (uint32_t i = 0; i < mSlaves.size(); i++) {
187 mSlaves[i]->mDeviceMemoryPermanent = mDeviceMemoryPermanent;
188 mSlaves[i]->mHostMemoryPermanent = mHostMemoryPermanent;
189 retVal = mSlaves[i]->InitPhaseAfterDevice();
190 if (retVal) {
191 GPUError("Error initializing slave (after device init)");
192 return retVal;
193 }
194 mSlaves[i]->ClearAllocatedMemory();
195 }
196 return 0;
197}
198
199namespace o2::gpu::internal
200{
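// Default host thread count: TBB_NUM_THREADS takes precedence, then OMP_NUM_THREADS,
// otherwise the TBB default concurrency of the machine is used.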
201static uint32_t getDefaultNThreads()
202{
203 const char* tbbEnv = getenv("TBB_NUM_THREADS");
204 uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
205 if (tbbNum) {
206 return tbbNum;
207 }
208 const char* ompEnv = getenv("OMP_NUM_THREADS");
209 uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
210 if (ompNum) {
211 return ompNum;
212 }
213 return tbb::info::default_concurrency();
214}
215} // namespace o2::gpu::internal
216
217int32_t GPUReconstruction::InitPhaseBeforeDevice()
218{
219 if (GetProcessingSettings().printSettings) {
220 if (mSlaves.size() || mMaster) {
221 printf("\nConfig Dump %s\n", mMaster ? "Slave" : "Master");
222 }
223 const GPUChainTracking* chTrk;
224 for (uint32_t i = 0; i < mChains.size(); i++) {
225 if ((chTrk = dynamic_cast<GPUChainTracking*>(mChains[i].get()))) {
226 break;
227 }
228 }
229 GPUConfigDump::dumpConfig(&param().rec, mProcessingSettings.get(), chTrk ? chTrk->GetQAConfig() : nullptr, chTrk ? chTrk->GetEventDisplayConfig() : nullptr, mDeviceBackendSettings.get(), &mRecoSteps);
230 }
233 if (!IsGPU()) {
234 mRecoSteps.stepsGPUMask.set((uint8_t)0);
235 }
236
237 if (GetProcessingSettings().forceMemoryPoolSize >= 1024 || GetProcessingSettings().forceHostMemoryPoolSize >= 1024) {
239 }
240 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_AUTO) {
242 }
243 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
244 mProcessingSettings->forceMemoryPoolSize = mProcessingSettings->forceHostMemoryPoolSize = 0;
245 }
246 if (GetProcessingSettings().debugLevel >= 4) {
247 mProcessingSettings->keepAllMemory = true;
248 }
249 if (GetProcessingSettings().debugLevel >= 5 && GetProcessingSettings().allocDebugLevel < 2) {
250 mProcessingSettings->allocDebugLevel = 2;
251 }
252 if (GetProcessingSettings().keepAllMemory) {
253 mProcessingSettings->keepDisplayMemory = true;
254 }
255 if (GetProcessingSettings().debugLevel < 6) {
256 mProcessingSettings->debugMask = 0;
257 }
258 if (GetProcessingSettings().debugLevel < 1) {
259 mProcessingSettings->deviceTimers = false;
260 }
261 if (GetProcessingSettings().debugLevel > 0) {
262 mProcessingSettings->recoTaskTiming = true;
263 }
264 if (GetProcessingSettings().deterministicGPUReconstruction == -1) {
265 mProcessingSettings->deterministicGPUReconstruction = GetProcessingSettings().debugLevel >= 6;
266 }
267 if (GetProcessingSettings().deterministicGPUReconstruction) {
268#ifndef GPUCA_DETERMINISTIC_MODE
269 GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE to be fully deterministic; without it, only most of the concurrency-related indeterminism is removed, while floating point effects remain!");
270#endif
271 mProcessingSettings->overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
272 param().rec.tpc.nWaysOuter = true;
273 if (param().rec.tpc.looperInterpolationInExtraPass == -1) {
274 param().rec.tpc.looperInterpolationInExtraPass = 0;
275 }
276 if (GetProcessingSettings().createO2Output > 1) {
277 mProcessingSettings->createO2Output = 1;
278 }
279 mProcessingSettings->rtc.deterministic = 1;
280 } else {
281#ifdef GPUCA_DETERMINISTIC_MODE
282 GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
283#endif
284 }
285 if (GetProcessingSettings().deterministicGPUReconstruction && GetProcessingSettings().debugLevel >= 6) {
286 mProcessingSettings->nTPCClustererLanes = 1;
287 }
288 if (GetProcessingSettings().createO2Output > 1 && GetProcessingSettings().runQA && GetProcessingSettings().qcRunFraction == 100.f) {
289 mProcessingSettings->createO2Output = 1;
290 }
291 if (!GetProcessingSettings().createO2Output || !IsGPU()) {
292 mProcessingSettings->clearO2OutputFromGPU = false;
293 }
295 mProcessingSettings->mergerSortTracks = false;
296 }
297
298 if (GetProcessingSettings().debugLevel > 3 || !IsGPU() || GetProcessingSettings().deterministicGPUReconstruction) {
299 mProcessingSettings->delayedOutput = false;
300 }
301
302 if (!GetProcessingSettings().rtc.enable) {
303 mProcessingSettings->rtc.optConstexpr = false;
304 }
305
306 mMemoryScalers->factor = GetProcessingSettings().memoryScalingFactor;
307 mMemoryScalers->conservative = GetProcessingSettings().conservativeMemoryEstimate;
308 mMemoryScalers->returnMaxVal = GetProcessingSettings().forceMaxMemScalers != 0;
309 if (GetProcessingSettings().forceMaxMemScalers > 1) {
310 mMemoryScalers->rescaleMaxMem(GetProcessingSettings().forceMaxMemScalers);
311 }
312
313 if (GetProcessingSettings().nHostThreads != -1 && GetProcessingSettings().ompThreads != -1) {
314 GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
315 } else if (GetProcessingSettings().ompThreads != -1) {
316 mProcessingSettings->nHostThreads = GetProcessingSettings().ompThreads;
317 GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
318 }
319
320 if (GetProcessingSettings().nHostThreads <= 0) {
321 mProcessingSettings->nHostThreads = internal::getDefaultNThreads();
322 } else {
323 mProcessingSettings->autoAdjustHostThreads = false;
324 }
325 mMaxHostThreads = GetProcessingSettings().nHostThreads;
326 if (mMaster == nullptr) {
327 mThreading = std::make_shared<GPUReconstructionThreading>();
328 mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
329 mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
330 mThreading->activeThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
331 } else {
332 mThreading = mMaster->mThreading;
333 }
335 if (IsGPU()) {
336 mNStreams = std::max<int32_t>(GetProcessingSettings().nStreams, 3);
337 }
338
339 if (GetProcessingSettings().nTPCClustererLanes == -1) {
340 mProcessingSettings->nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSECTORS, GetProcessingSettings().inKernelParallel ? (mMaxHostThreads >= 4 ? std::min<int32_t>(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSECTORS : 4) : 1) : mMaxHostThreads));
341 }
342 if (GetProcessingSettings().overrideClusterizerFragmentLen == -1) {
343 mProcessingSettings->overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / GetProcessingSettings().nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
344 }
345 if (GetProcessingSettings().nTPCClustererLanes > GPUCA_NSECTORS) {
346 GPUError("Invalid value for nTPCClustererLanes: %d", GetProcessingSettings().nTPCClustererLanes);
347 mProcessingSettings->nTPCClustererLanes = GPUCA_NSECTORS;
348 }
349
350 if (GetProcessingSettings().doublePipeline && (mChains.size() != 1 || mChains[0]->SupportsDoublePipeline() == false || !IsGPU() || GetProcessingSettings().memoryAllocationStrategy != GPUMemoryResource::ALLOCATION_GLOBAL)) {
351 GPUError("Double pipeline mode requires a GPU backend, global memory allocation, and exactly one chain that supports it");
352 return 1;
353 }
354
355 if (mMaster == nullptr && GetProcessingSettings().doublePipeline) {
357 }
358
360 for (uint32_t i = 0; i < mChains.size(); i++) {
361 if (mChains[i]->EarlyConfigure()) {
362 return 1;
363 }
364 mChains[i]->RegisterPermanentMemoryAndProcessors();
365 size_t memPrimary, memPageLocked;
366 mChains[i]->MemorySize(memPrimary, memPageLocked);
367 if (!IsGPU() || mOutputControl.useInternal()) {
368 memPageLocked = memPrimary;
369 }
370 mDeviceMemorySize += memPrimary;
371 mHostMemorySize += memPageLocked;
372 }
373 if (GetProcessingSettings().forceMemoryPoolSize && GetProcessingSettings().forceMemoryPoolSize <= 2 && CanQueryMaxMemory()) {
374 mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize;
375 } else if (GetProcessingSettings().forceMemoryPoolSize > 2) {
376 mDeviceMemorySize = GetProcessingSettings().forceMemoryPoolSize;
377 if (!IsGPU() || mOutputControl.useInternal()) {
379 }
380 }
381 if (GetProcessingSettings().forceHostMemoryPoolSize) {
382 mHostMemorySize = GetProcessingSettings().forceHostMemoryPoolSize;
383 }
384
385 for (uint32_t i = 0; i < mProcessors.size(); i++) {
386 (mProcessors[i].proc->*(mProcessors[i].RegisterMemoryAllocation))();
387 }
388
389 return 0;
390}
391
392int32_t GPUReconstruction::InitPhasePermanentMemory()
393{
394 if (IsGPU()) {
395 for (uint32_t i = 0; i < mChains.size(); i++) {
396 mChains[i]->RegisterGPUProcessors();
397 }
398 }
400 return 0;
401}
402
403int32_t GPUReconstruction::InitPhaseAfterDevice()
404{
405 if (GetProcessingSettings().forceMaxMemScalers <= 1 && GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
407 }
408 for (uint32_t i = 0; i < mChains.size(); i++) {
409 if (mChains[i]->Init()) {
410 return 1;
411 }
412 }
413 for (uint32_t i = 0; i < mProcessors.size(); i++) {
414 (mProcessors[i].proc->*(mProcessors[i].InitializeProcessor))();
415 }
416
417 WriteConstantParams(); // Initialize with initial values, can optionally be updated later
418
419 mInitialized = true;
420 return 0;
421}
422
423void GPUReconstruction::WriteConstantParams()
424{
425 if (IsGPU()) {
426 const auto threadContext = GetThreadContext();
427 WriteToConstantMemory(ptrDiff(&processors()->param, processors()), &param(), sizeof(param()), -1);
428 }
429}
430
431int32_t GPUReconstruction::Finalize()
432{
433 for (uint32_t i = 0; i < mChains.size(); i++) {
434 mChains[i]->Finalize();
435 }
436 return 0;
437}
438
439int32_t GPUReconstruction::Exit()
440{
441 if (!mInitialized) {
442 return 1;
443 }
444 for (uint32_t i = 0; i < mSlaves.size(); i++) {
445 if (mSlaves[i]->Exit()) {
446 GPUError("Error exiting slave");
447 }
448 }
449
450 mChains.clear(); // Make sure we destroy a possible ITS GPU tracker before we call the destructors
451 mHostConstantMem.reset(); // Reset these explicitly before the destruction of other members unloads the library
452 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
453 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
454 if (mMemoryResources[i].mReuse >= 0) {
455 continue;
456 }
457 operator delete(mMemoryResources[i].mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
458 mMemoryResources[i].mPtr = mMemoryResources[i].mPtrDevice = nullptr;
459 }
460 }
461 mMemoryResources.clear();
462 if (mInitialized) {
463 ExitDevice();
464 }
465 mInitialized = false;
466 return 0;
467}
468
471
472void GPUReconstruction::ComputeReuseMax(GPUProcessor* proc)
473{
474 for (auto it = mMemoryReuse1to1.begin(); it != mMemoryReuse1to1.end(); it++) {
475 auto& re = it->second;
476 if (proc == nullptr || re.proc == proc) {
477 GPUMemoryResource& resMain = mMemoryResources[re.res[0]];
478 resMain.mOverrideSize = 0;
479 for (uint32_t i = 0; i < re.res.size(); i++) {
480 GPUMemoryResource& res = mMemoryResources[re.res[i]];
481 resMain.mOverrideSize = std::max<size_t>(resMain.mOverrideSize, ptrDiff(res.SetPointers((void*)1), (char*)1));
482 }
483 }
484 }
485}
486
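// Registers one memory resource; if a reuse ID is given and memory reuse is enabled, the
// resource is chained to the first resource registered with the same ID, so that all of
// them later share that resource's allocation (its size is grown in ComputeReuseMax()).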
487int16_t GPUReconstruction::RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re)
488{
490 if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !GetProcessingSettings().keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory
492 } else {
494 }
495 }
497 type &= ~GPUMemoryResource::MEMORY_GPU;
498 }
499 mMemoryResources.emplace_back(proc, setPtr, (GPUMemoryResource::MemoryType)type, name);
500 if (mMemoryResources.size() >= 32768) {
501 throw std::bad_alloc();
502 }
503 uint16_t retVal = mMemoryResources.size() - 1;
504 if (re.type != GPUMemoryReuse::NONE && !GetProcessingSettings().disableMemoryReuse) {
505 const auto& it = mMemoryReuse1to1.find(re.id);
506 if (it == mMemoryReuse1to1.end()) {
507 mMemoryReuse1to1[re.id] = {proc, retVal};
508 } else {
509 mMemoryResources[retVal].mReuse = it->second.res[0];
510 it->second.res.emplace_back(retVal);
511 }
512 }
513 return retVal;
514}
515
516size_t GPUReconstruction::AllocateRegisteredMemory(GPUProcessor* proc, bool resetCustom)
517{
518 if (GetProcessingSettings().debugLevel >= 5) {
519 GPUInfo("Allocating memory %p", (void*)proc);
520 }
521 size_t total = 0;
522 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
523 if (proc == nullptr ? !mMemoryResources[i].mProcessor->mAllocateAndInitializeLate : mMemoryResources[i].mProcessor == proc) {
525 total += AllocateRegisteredMemory(i);
526 } else if (resetCustom && (mMemoryResources[i].mPtr || mMemoryResources[i].mPtrDevice)) {
528 }
529 }
530 }
531 if (GetProcessingSettings().debugLevel >= 5) {
532 GPUInfo("Allocating memory done");
533 }
534 return total;
535}
536
537size_t GPUReconstruction::AllocateRegisteredPermanentMemory()
538{
539 if (GetProcessingSettings().debugLevel >= 5) {
540 GPUInfo("Allocating Permanent Memory");
541 }
543 GPUError("Must not allocate permanent memory while volatile chunks are allocated");
544 throw std::bad_alloc();
545 }
546 int32_t total = 0;
547 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
548 if ((mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT) && mMemoryResources[i].mPtr == nullptr) {
549 total += AllocateRegisteredMemory(i);
550 }
551 }
554 if (GetProcessingSettings().debugLevel >= 5) {
555 GPUInfo("Permanent Memory Done");
556 }
557 return total;
558}
559
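// Carves one resource out of a linear memory pool: reused resources only run SetPointers
// on the owning resource's buffer, MEMORY_STACK resources are allocated downwards from the
// pool end, all others are bump-allocated upwards from the pool start. Exhausting the pool
// throws std::bad_alloc.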
560size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res, void*& ptr, void*& memorypool, void* memorybase, size_t memorysize, void* (GPUMemoryResource::*setPtr)(void*), void*& memorypoolend, const char* device)
561{
562 if (res->mReuse >= 0) {
563 ptr = (&ptr == &res->mPtrDevice) ? mMemoryResources[res->mReuse].mPtrDevice : mMemoryResources[res->mReuse].mPtr;
564 if (ptr == nullptr) {
565 GPUError("Invalid reuse ptr (%s)", res->mName);
566 throw std::bad_alloc();
567 }
568 size_t retVal = ptrDiff((res->*setPtr)(ptr), ptr);
569 if (retVal > mMemoryResources[res->mReuse].mSize) {
570 GPUError("Insufficient reuse memory %lu < %lu (%s) (%s)", mMemoryResources[res->mReuse].mSize, retVal, res->mName, device);
571 throw std::bad_alloc();
572 }
573 if (GetProcessingSettings().allocDebugLevel >= 2) {
574 std::cout << "Reused (" << device << ") " << res->mName << ": " << retVal << "\n";
575 }
576 return retVal;
577 }
578 if (memorypool == nullptr) {
579 GPUError("Cannot allocate memory from uninitialized pool");
580 throw std::bad_alloc();
581 }
582 size_t retVal;
583 if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {
584 retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));
585 memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));
586 if (retVal < res->mOverrideSize) {
587 retVal = res->mOverrideSize;
588 }
589 retVal += GPUProcessor::getAlignment<GPUCA_MEMALIGN>(retVal);
590 memorypoolend = (char*)memorypoolend - retVal;
591 ptr = memorypoolend;
592 retVal = std::max<size_t>(ptrDiff((res->*setPtr)(ptr), ptr), res->mOverrideSize);
593 } else {
594 ptr = memorypool;
595 memorypool = (char*)((res->*setPtr)(ptr));
596 retVal = ptrDiff(memorypool, ptr);
597 if (retVal < res->mOverrideSize) {
598 retVal = res->mOverrideSize;
599 memorypool = (char*)ptr + res->mOverrideSize;
600 }
601 memorypool = (void*)((char*)memorypool + GPUProcessor::getAlignment<GPUCA_MEMALIGN>(memorypool));
602 }
603 if (memorypoolend ? (memorypool > memorypoolend) : ((size_t)ptrDiff(memorypool, memorybase) > memorysize)) {
604 std::cerr << "Memory pool size exceeded (" << device << ") (" << res->mName << ": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) << " > " << memorysize << ")\n";
605 throw std::bad_alloc();
606 }
607 if (GetProcessingSettings().allocDebugLevel >= 2) {
608 std::cout << "Allocated (" << device << ") " << res->mName << ": " << retVal << " - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) << "\n";
609 }
610 return retVal;
611}
612
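// Central allocation dispatch: with ALLOCATION_INDIVIDUAL each resource gets its own
// aligned operator new buffer (or the buffer of the resource it reuses), otherwise host
// and device memory are drawn from the global pools via AllocateRegisteredMemoryHelper,
// honoring an external GPUOutputControl allocator if one is provided.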
613void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool)
614{
615 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && (control == nullptr || control->useInternal())) {
616 if (!(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
617 if (res->mPtrDevice && res->mReuse < 0) {
618 operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
619 }
620 res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
621 if (res->mReuse >= 0) {
622 if (res->mSize > mMemoryResources[res->mReuse].mSize) {
623 GPUError("Invalid reuse, insufficient size: %ld < %ld", (int64_t)mMemoryResources[res->mReuse].mSize, (int64_t)res->mSize);
624 throw std::bad_alloc();
625 }
626 res->mPtrDevice = mMemoryResources[res->mReuse].mPtrDevice;
627 } else {
628 res->mPtrDevice = operator new(res->mSize + GPUCA_BUFFER_ALIGNMENT, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
629 }
630 res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(res->mPtrDevice);
631 res->SetPointers(res->mPtr);
632 if (GetProcessingSettings().allocDebugLevel >= 2) {
633 std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << "\n";
634 }
637 }
638 if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
639 GPUError("Got buffer with insufficient alignment");
640 throw std::bad_alloc();
641 }
642 }
643 } else {
644 if (res->mPtr != nullptr) {
645 GPUError("Double allocation! (%s)", res->mName);
646 throw std::bad_alloc();
647 }
648 if (IsGPU() && res->mOverrideSize < GPUCA_BUFFER_ALIGNMENT) {
649 res->mOverrideSize = GPUCA_BUFFER_ALIGNMENT;
650 }
651 if ((!IsGPU() || (res->mType & GPUMemoryResource::MEMORY_HOST) || GetProcessingSettings().keepDisplayMemory) && !(res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) { // keepAllMemory --> keepDisplayMemory
652 if (control && control->useExternal()) {
653 if (control->allocator) {
654 res->mSize = std::max((size_t)res->SetPointers((void*)1) - 1, res->mOverrideSize);
655 res->mPtr = control->allocator(CAMath::nextMultipleOf<GPUCA_BUFFER_ALIGNMENT>(res->mSize));
656 res->mSize = std::max<size_t>(ptrDiff(res->SetPointers(res->mPtr), res->mPtr), res->mOverrideSize);
657 if (GetProcessingSettings().allocDebugLevel >= 2) {
658 std::cout << "Allocated (from callback) " << res->mName << ": " << res->mSize << "\n";
659 }
660 } else {
661 void* dummy = nullptr;
662 res->mSize = AllocateRegisteredMemoryHelper(res, res->mPtr, control->ptrCurrent, control->ptrBase, control->size, &GPUMemoryResource::SetPointers, dummy, "host");
663 }
664 } else {
666 }
667 if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
668 GPUError("Got buffer with insufficient alignment");
669 throw std::bad_alloc();
670 }
671 }
672 if (IsGPU() && (res->mType & GPUMemoryResource::MEMORY_GPU)) {
673 if (res->mProcessor->mLinkedProcessor == nullptr) {
674 GPUError("Device Processor not set (%s)", res->mName);
675 throw std::bad_alloc();
676 }
678 GPUError("Must not allocate non-stacked device memory while volatile chunks are allocated");
679 throw std::bad_alloc();
680 }
682
684 res->mSize = size;
685 } else if (size != res->mSize) {
686 GPUError("Inconsistent device memory allocation (%s: device %lu vs %lu)", res->mName, size, res->mSize);
687 throw std::bad_alloc();
688 }
689 if ((size_t)res->mPtrDevice % GPUCA_BUFFER_ALIGNMENT) {
690 GPUError("Got buffer with insufficient alignment");
691 throw std::bad_alloc();
692 }
693 }
695 }
696}
697
702
704{
706 if ((res->mType & GPUMemoryResource::MEMORY_PERMANENT) && res->mPtr != nullptr) {
708 } else {
710 }
711 return res->mReuse >= 0 ? 0 : res->mSize;
712}
713
714void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
715{
716 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
717 char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
720 } else {
722 }
723 return retVal;
724 }
725
727 throw std::runtime_error("Requested invalid memory typo for direct allocation");
728 }
730 GPUError("Must not allocate direct memory while volatile chunks are allocated");
731 throw std::bad_alloc();
732 }
733
736 char* retVal;
738 poolend = (char*)poolend - size;
739 poolend = (char*)poolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(poolend);
740 retVal = (char*)poolend;
741 } else {
743 }
744 if (pool > poolend) {
745 GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
746 throw std::bad_alloc();
747 }
749 if (GetProcessingSettings().allocDebugLevel >= 2) {
750 std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n";
751 }
752 return retVal;
753}
754
755void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
756{
757 if (mVolatileMemoryStart == nullptr) {
759 }
760 if (size == 0) {
761 return nullptr; // Future GPU memory allocation is volatile
762 }
763 char* retVal;
766 GPUError("Insufficient volatile device memory: missing %ld", ptrDiff(mDeviceMemoryPool, mDeviceMemoryPoolEnd));
767 throw std::bad_alloc();
768 }
770 if (GetProcessingSettings().allocDebugLevel >= 2) {
771 std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
772 }
773 return retVal;
774}
775
776void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
777{
778 if (device) {
780 }
781 char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
782 mVolatileChunks.emplace_back(retVal, alignedDeleter());
783 return retVal;
784}
785
791
793{
797 mVolatileMemoryStart = nullptr;
798 }
799 if (GetProcessingSettings().allocDebugLevel >= 2) {
800 std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
801 }
802}
803
809
810void GPUReconstruction::ResetRegisteredMemoryPointers(GPUProcessor* proc)
811{
812 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
813 if (proc == nullptr || mMemoryResources[i].mProcessor == proc) {
815 }
816 }
817}
818
820{
823 void* basePtr = res->mReuse >= 0 ? mMemoryResources[res->mReuse].mPtr : res->mPtr;
824 size_t size = ptrDiff(res->SetPointers(basePtr), basePtr);
825 if (basePtr && size > std::max(res->mSize, res->mOverrideSize)) {
826 std::cerr << "Updated pointers exceed available memory size: " << size << " > " << std::max(res->mSize, res->mOverrideSize) << " - host - " << res->mName << "\n";
827 throw std::bad_alloc();
828 }
829 }
830 if (IsGPU() && (res->mType & GPUMemoryResource::MEMORY_GPU)) {
831 void* basePtr = res->mReuse >= 0 ? mMemoryResources[res->mReuse].mPtrDevice : res->mPtrDevice;
832 size_t size = ptrDiff(res->SetDevicePointers(basePtr), basePtr);
833 if (basePtr && size > std::max(res->mSize, res->mOverrideSize)) {
834 std::cerr << "Updated pointers exceed available memory size: " << size << " > " << std::max(res->mSize, res->mOverrideSize) << " - GPU - " << res->mName << "\n";
835 throw std::bad_alloc();
836 }
837 }
838}
839
840void GPUReconstruction::FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom, bool freePermanent)
841{
842 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
843 if ((proc == nullptr || mMemoryResources[i].mProcessor == proc) && (freeCustom || !(mMemoryResources[i].mType & GPUMemoryResource::MEMORY_CUSTOM)) && (freePermanent || !(mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT))) {
845 }
846 }
847}
848
853
855{
856 if (GetProcessingSettings().allocDebugLevel >= 2 && (res->mPtr || res->mPtrDevice)) {
857 std::cout << "Freeing " << res->mName << ": size " << res->mSize << " (reused " << res->mReuse << ")\n";
858 }
859 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL && res->mReuse < 0) {
860 operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
861 }
862 res->mPtr = nullptr;
863 res->mPtrDevice = nullptr;
864}
865
870
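// Non-persistent (per reco step) memory is handled as a stack: PushNonPersistentMemory
// saves the current pool-end pointers together with a tag in mNonPersistentMemoryStack,
// and PopNonPersistentMemory below restores them and releases the individual allocations
// made since the matching push.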
871void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
872{
873 if (GetProcessingSettings().keepDisplayMemory || GetProcessingSettings().disableMemoryReuse) {
874 return;
875 }
876 if (mNonPersistentMemoryStack.size() == 0) {
877 GPUFatal("Trying to pop memory state from empty stack");
878 }
879 if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
880 GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
881 }
882 if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
883 printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
885 printf("%76s", "");
887 }
888 mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
890 for (uint32_t i = std::get<2>(mNonPersistentMemoryStack.back()); i < mNonPersistentIndividualAllocations.size(); i++) {
892 if (res->mReuse < 0) {
893 operator delete(res->mPtrDevice, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
894 }
895 res->mPtr = nullptr;
896 res->mPtrDevice = nullptr;
897 }
900 mNonPersistentMemoryStack.pop_back();
901}
902
903void GPUReconstruction::BlockStackedMemory(GPUReconstruction* rec)
904{
906 throw std::runtime_error("temporary memory stack already blocked");
907 }
910}
911
912void GPUReconstruction::UnblockStackedMemory()
913{
914 if (mNonPersistentMemoryStack.size()) {
915 throw std::runtime_error("cannot unblock while there is stacked memory");
916 }
919 mHostMemoryPoolBlocked = nullptr;
920 mDeviceMemoryPoolBlocked = nullptr;
921}
922
923void GPUReconstruction::SetMemoryExternalInput(int16_t res, void* ptr)
924{
925 mMemoryResources[res].mPtr = ptr;
926}
927
928void GPUReconstruction::ClearAllocatedMemory(bool clearOutputs)
929{
930 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
933 }
934 }
937 mDirectMemoryChunks.clear();
939 mVolatileChunks.clear();
940 mVolatileMemoryStart = nullptr;
941 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
942 mHostMemoryPool = GPUProcessor::alignPointer<GPUCA_MEMALIGN>(mHostMemoryPermanent);
943 mDeviceMemoryPool = GPUProcessor::alignPointer<GPUCA_MEMALIGN>(mDeviceMemoryPermanent);
946 } else {
948 }
949}
950
956
958{
959 printf("Maximum Memory Allocation: Host %'zu / Device %'zu\n", mHostMemoryUsedMax, mDeviceMemoryUsedMax);
960}
961
963{
964 if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
965 printf("Memory Allocation: Host %'13zd / %'13zu (Permanent %'13zd, Data %'13zd, Scratch %'13zd), Device %'13zd / %'13zu, (Permanent %'13zd, Data %'13zd, Scratch %'13zd) %zu chunks\n",
968 mMemoryResources.size());
969 }
970}
971
972void GPUReconstruction::PrintMemoryStatistics()
973{
974 std::map<std::string, std::array<size_t, 3>> sizes;
975 for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
976 auto& res = mMemoryResources[i];
977 if (res.mReuse >= 0) {
978 continue;
979 }
980 auto& x = sizes[res.mName];
981 if (res.mPtr) {
982 x[0] += res.mSize;
983 }
984 if (res.mPtrDevice) {
985 x[1] += res.mSize;
986 }
988 x[2] = 1;
989 }
990 }
991 printf("%59s CPU / %9s GPU\n", "", "");
992 for (auto it = sizes.begin(); it != sizes.end(); it++) {
993 printf("Allocation %30s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ? "P" : " ", it->second[0], it->second[1]);
994 }
996 for (uint32_t i = 0; i < mChains.size(); i++) {
997 mChains[i]->PrintMemoryStatistics();
998 }
999}
1000
1001int32_t GPUReconstruction::registerMemoryForGPU(const void* ptr, size_t size)
1002{
1003 if (GetProcessingSettings().noGPUMemoryRegistration) {
1004 return 0;
1005 }
1006 int32_t retVal = registerMemoryForGPU_internal(ptr, size);
1007 if (retVal == 0) {
1008 mRegisteredMemoryPtrs.emplace(ptr);
1009 }
1010 return retVal;
1011}
1012
1013int32_t GPUReconstruction::unregisterMemoryForGPU(const void* ptr)
1014{
1015 if (GetProcessingSettings().noGPUMemoryRegistration) {
1016 return 0;
1017 }
1018 const auto& pos = mRegisteredMemoryPtrs.find(ptr);
1019 if (pos != mRegisteredMemoryPtrs.end()) {
1020 mRegisteredMemoryPtrs.erase(pos);
1021 return unregisterMemoryForGPU_internal(ptr);
1022 }
1023 return 1;
1024}
1025
1026namespace o2::gpu::internal
1027{
1028namespace // anonymous
1029{
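// RecoStep / GeneralStep values are single-bit masks; getStepNum converts such a mask into
// the index of its highest set bit via count-leading-zeros and, if validCheck is set,
// throws for values that do not correspond to a valid step.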
1030template <class T>
1031constexpr static inline int32_t getStepNum(T step, bool validCheck, int32_t N, const char* err = "Invalid step num")
1032{
1033 static_assert(sizeof(step) == sizeof(uint32_t), "Invalid step enum size");
1034 int32_t retVal = 8 * sizeof(uint32_t) - 1 - CAMath::Clz((uint32_t)step);
1035 if ((uint32_t)step == 0 || retVal >= N) {
1036 if (!validCheck) {
1037 return -1;
1038 }
1039 throw std::runtime_error("Invalid General Step");
1040 }
1041 return retVal;
1042}
1043} // anonymous namespace
1044} // namespace o2::gpu::internal
1045
1046int32_t GPUReconstruction::getRecoStepNum(RecoStep step, bool validCheck) { return internal::getStepNum(step, validCheck, GPUDataTypes::N_RECO_STEPS, "Invalid Reco Step"); }
1047int32_t GPUReconstruction::getGeneralStepNum(GeneralStep step, bool validCheck) { return internal::getStepNum(step, validCheck, GPUDataTypes::N_GENERAL_STEPS, "Invalid General Step"); }
1048
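// Worker loop of the double pipeline (executed on the master): waits on the shared queue,
// runs each enqueued chain via RunChain(), signals completion through the item's own
// condition variable, and exits when a terminate item (op == 1) is received.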
1049void GPUReconstruction::RunPipelineWorker()
1050{
1051 if (!mInitialized || !GetProcessingSettings().doublePipeline || mMaster != nullptr || !mSlaves.size()) {
1052 throw std::invalid_argument("Cannot start double pipeline mode");
1053 }
1054 if (GetProcessingSettings().debugLevel >= 3) {
1055 GPUInfo("Pipeline worker started");
1056 }
1057 bool terminate = false;
1058 while (!terminate) {
1059 {
1060 std::unique_lock<std::mutex> lk(mPipelineContext->mutex);
1061 mPipelineContext->cond.wait(lk, [this] { return this->mPipelineContext->queue.size() > 0; });
1062 }
1063 GPUReconstructionPipelineQueue* q;
1064 {
1065 std::lock_guard<std::mutex> lk(mPipelineContext->mutex);
1066 q = mPipelineContext->queue.front();
1067 mPipelineContext->queue.pop();
1068 }
1069 if (q->op == 1) {
1070 terminate = 1;
1071 } else {
1072 q->retVal = q->chain->RunChain();
1073 }
1074 {
1075 std::lock_guard<std::mutex> lk(q->m);
1076 q->done = true;
1077 }
1078 q->c.notify_one();
1079 }
1080 if (GetProcessingSettings().debugLevel >= 3) {
1081 GPUInfo("Pipeline worker ended");
1082 }
1083}
1084
1089
1090int32_t GPUReconstruction::EnqueuePipeline(bool terminate)
1091{
1092 GPUReconstruction* rec = mMaster ? mMaster : this;
1094 std::unique_ptr<GPUReconstructionPipelineQueue> qu(new GPUReconstructionPipelineQueue);
1095 GPUReconstructionPipelineQueue* q = qu.get();
1096 q->chain = terminate ? nullptr : mChains[0].get();
1097 q->op = terminate ? 1 : 0;
1098 std::unique_lock<std::mutex> lkdone(q->m);
1099 {
1100 std::lock_guard<std::mutex> lkpipe(rec->mPipelineContext->mutex);
1101 if (rec->mPipelineContext->terminate) {
1102 throw std::runtime_error("Must not enqueue work after termination request");
1103 }
1104 rec->mPipelineContext->queue.push(q);
1105 rec->mPipelineContext->terminate = terminate;
1106 rec->mPipelineContext->cond.notify_one();
1107 }
1108 q->c.wait(lkdone, [&q]() { return q->done; });
1109 if (q->retVal) {
1110 return q->retVal;
1111 }
1112 if (terminate) {
1113 return 0;
1114 } else {
1115 return mChains[0]->FinalizePipelinedProcessing();
1116 }
1117}
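// Illustrative call sequence (not part of this file): the master instance runs
// RunPipelineWorker() on a dedicated thread, the instances owning the chains submit work
// with EnqueuePipeline(), and a final EnqueuePipeline(true) posts the terminate request
// that ends the worker loop.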
1118
1119GPUChain* GPUReconstruction::GetNextChainInQueue()
1120{
1121 GPUReconstruction* rec = mMaster ? mMaster : this;
1122 std::lock_guard<std::mutex> lk(rec->mPipelineContext->mutex);
1123 return rec->mPipelineContext->queue.size() && rec->mPipelineContext->queue.front()->op == 0 ? rec->mPipelineContext->queue.front()->chain : nullptr;
1124}
1125
1126void GPUReconstruction::PrepareEvent() // TODO: Clean this up, this should not be called from chainTracking but before
1127{
1129 for (uint32_t i = 0; i < mChains.size(); i++) {
1130 mChains[i]->PrepareEvent();
1131 }
1132 for (uint32_t i = 0; i < mProcessors.size(); i++) {
1133 if (mProcessors[i].proc->mAllocateAndInitializeLate) {
1134 continue;
1135 }
1136 (mProcessors[i].proc->*(mProcessors[i].SetMaxData))(mHostConstantMem->ioPtrs);
1137 if (mProcessors[i].proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && mProcessors[i].proc->mLinkedProcessor) {
1138 (mProcessors[i].proc->mLinkedProcessor->*(mProcessors[i].SetMaxData))(mHostConstantMem->ioPtrs);
1139 }
1140 }
1141 ComputeReuseMax(nullptr);
1142 AllocateRegisteredMemory(nullptr);
1143}
1144
1145int32_t GPUReconstruction::CheckErrorCodes(bool cpuOnly, bool forceShowErrors, std::vector<std::array<uint32_t, 4>>* fillErrors)
1146{
1147 int32_t retVal = 0;
1148 for (uint32_t i = 0; i < mChains.size(); i++) {
1149 if (mChains[i]->CheckErrorCodes(cpuOnly, forceShowErrors, fillErrors)) {
1150 retVal++;
1151 }
1152 }
1153 return retVal;
1154}
1155
1156int32_t GPUReconstruction::GPUChkErrA(const int64_t error, const char* file, int32_t line, bool failOnError)
1157{
1158 if (error == 0 || !GPUChkErrInternal(error, file, line)) {
1159 return 0;
1160 }
1161 if (failOnError) {
1162 if (mInitialized && mInErrorHandling == false) {
1163 mInErrorHandling = true;
1164 CheckErrorCodes(false, true);
1165 }
1166 throw std::runtime_error("GPU Backend Failure");
1167 }
1168 return 1;
1169}
1170
1171void GPUReconstruction::DumpSettings(const char* dir)
1172{
1173 std::string f;
1174 f = dir;
1175 f += "settings.dump";
1176 DumpStructToFile(mGRPSettings.get(), f.c_str());
1177 for (uint32_t i = 0; i < mChains.size(); i++) {
1178 mChains[i]->DumpSettings(dir);
1179 }
1180}
1181
1182void GPUReconstruction::UpdateDynamicSettings(const GPUSettingsRecDynamic* d)
1183{
1184 UpdateSettings(nullptr, nullptr, d);
1185}
1186
1187void GPUReconstruction::UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p, const GPUSettingsRecDynamic* d)
1188{
1189 if (g) {
1190 *mGRPSettings = *g;
1191 }
1192 if (p) {
1193 mProcessingSettings->debugLevel = p->debugLevel;
1194 mProcessingSettings->resetTimers = p->resetTimers;
1195 }
1196 GPURecoStepConfiguration* w = nullptr;
1198 w = &mRecoSteps;
1199 }
1200 param().UpdateSettings(g, p, w, d);
1201 if (mInitialized) {
1203 }
1204}
1205
1206int32_t GPUReconstruction::ReadSettings(const char* dir)
1207{
1208 std::string f;
1209 f = dir;
1210 f += "settings.dump";
1211 new (mGRPSettings.get()) GPUSettingsGRP;
1212 if (ReadStructFromFile(f.c_str(), mGRPSettings.get())) {
1213 return 1;
1214 }
1216 for (uint32_t i = 0; i < mChains.size(); i++) {
1217 mChains[i]->ReadSettings(dir);
1218 }
1219 return 0;
1220}
1221
1222void GPUReconstruction::SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow)
1223{
1224#ifdef GPUCA_O2_LIB
1225 GPUO2InterfaceConfiguration config;
1226 config.ReadConfigurableParam(config);
1227 config.configGRP.solenoidBzNominalGPU = solenoidBzNominalGPU;
1228 SetSettings(&config.configGRP, &config.configReconstruction, &config.configProcessing, workflow);
1229#else
1230 GPUSettingsGRP grp;
1231 grp.solenoidBzNominalGPU = solenoidBzNominalGPU;
1232 SetSettings(&grp, nullptr, nullptr, workflow);
1233#endif
1234}
1235
1236void GPUReconstruction::SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec, const GPUSettingsProcessing* proc, const GPURecoStepConfiguration* workflow)
1237{
1238 if (mInitialized) {
1239 GPUError("Cannot update settings while initialized");
1240 throw std::runtime_error("Settings updated while initialized");
1241 }
1242 *mGRPSettings = *grp;
1243 if (proc) {
1244 *mProcessingSettings = *proc;
1245 }
1246 if (workflow) {
1247 mRecoSteps.steps = workflow->steps;
1248 mRecoSteps.stepsGPUMask = workflow->stepsGPUMask;
1249 mRecoSteps.inputs = workflow->inputs;
1250 mRecoSteps.outputs = workflow->outputs;
1251 }
1252 param().SetDefaults(mGRPSettings.get(), rec, proc, workflow);
1253}
1254
1255void GPUReconstruction::SetOutputControl(void* ptr, size_t size)
1256{
1257 GPUOutputControl outputControl;
1258 outputControl.set(ptr, size);
1259 SetOutputControl(outputControl);
1260}
1261
1265void GPUReconstruction::SetResetTimers(bool reset) { mProcessingSettings->resetTimers = reset; }
1270
1271ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r)
1272{
1273 mAlloc = [r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); };
1274}