23#include <condition_variable>
51struct GPUReconstructionPipelineQueue {
55 std::condition_variable c;
62 std::queue<GPUReconstructionPipelineQueue*>
queue;
64 std::condition_variable
cond;
// Byte distance between two raw pointers (a - b).
// Pointer subtraction is only defined on object pointers of the same type,
// so both operands are viewed as char* to get a difference in bytes.
// Note: the result is signed and may be negative when a precedes b.
static ptrdiff_t ptrDiff(void* a, void* b)
{
  return static_cast<char*>(a) - static_cast<char*>(b);
}
81 throw std::invalid_argument(
"device type of master and slave GPUReconstruction does not match");
84 throw std::invalid_argument(
"Cannot be slave to a slave");
98 mROOTDump = GPUROOTDumpCore::getAndCreate();
105 GPUError(
"GPU Reconstruction not properly deinitialized!");
109void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame>* timeFrame)
114 if (vertexerTraits) {
124 return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
130 throw std::runtime_error(
"Must not call init on slave!");
136 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
139 GPUError(
"Error initialization slave (before deviceinit)");
154 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
162 GPUError(
"Error initialization slave (deviceinit)");
166 GPUError(
"Error initialization slave (permanent memory)");
177 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
182 GPUError(
"Error initialization slave (after device init)");
192static uint32_t getDefaultNThreads()
194 const char* tbbEnv = getenv(
"TBB_NUM_THREADS");
195 uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
199 const char* ompEnv = getenv(
"OMP_NUM_THREADS");
200 uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
204 return tbb::info::default_concurrency();
212 printf(
"\nConfig Dump %s\n",
mMaster ?
"Slave" :
"Master");
215 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
259#ifndef GPUCA_NO_FAST_MATH
260 GPUError(
"Warning, deterministicGPUReconstruction needs GPUCA_NO_FAST_MATH for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
264 if (
param().
rec.tpc.looperInterpolationInExtraPass == -1) {
265 param().
rec.tpc.looperInterpolationInExtraPass = 0;
314 GPUFatal(
"Must not use both nHostThreads and ompThreads at the same time!");
317 GPUWarning(
"You are using the deprecated ompThreads option, please switch to nHostThreads!");
327 mThreading = std::make_shared<GPUReconstructionThreading>();
328 mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism,
mMaxHostThreads);
346 GPUError(
"Invalid value for nTPCClustererLanes: %d",
mProcessingSettings.nTPCClustererLanes);
351 GPUError(
"Must use double pipeline mode only with exactly one chain that must support it");
360 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
364 mChains[
i]->RegisterPermanentMemoryAndProcessors();
365 size_t memPrimary, memPageLocked;
366 mChains[
i]->MemorySize(memPrimary, memPageLocked);
368 memPageLocked = memPrimary;
395 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
396 mChains[
i]->RegisterGPUProcessors();
408 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
433 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
444 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
446 GPUError(
"Error exiting slave");
475 auto& re = it->second;
476 if (proc ==
nullptr || re.proc == proc) {
478 resMain.mOverrideSize = 0;
479 for (uint32_t
i = 0;
i < re.res.size();
i++) {
481 resMain.mOverrideSize = std::max<size_t>(resMain.mOverrideSize, ptrDiff(
res.SetPointers((
void*)1), (
char*)1));
490 GPUInfo(
"Allocating memory %p", (
void*)proc);
503 GPUInfo(
"Allocating memory done");
511 GPUInfo(
"Allocating Permanent Memory");
522 GPUInfo(
"Permanent Memory Done");
529 if (
res->mReuse >= 0) {
531 if (
ptr ==
nullptr) {
532 GPUError(
"Invalid reuse ptr (%s)",
res->mName);
533 throw std::bad_alloc();
538 throw std::bad_alloc();
541 std::cout <<
"Reused (" << device <<
") " <<
res->mName <<
": " <<
retVal <<
"\n";
545 if (memorypool ==
nullptr) {
546 GPUError(
"Cannot allocate memory from uninitialized pool");
547 throw std::bad_alloc();
551 retVal = ptrDiff((
res->*setPtr)((
char*)1), (
char*)(1));
552 memorypoolend = (
void*)((
char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));
553 if (retVal < res->mOverrideSize) {
556 retVal += GPUProcessor::getAlignment<GPUCA_MEMALIGN>(
retVal);
557 memorypoolend = (
char*)memorypoolend -
retVal;
562 memorypool = (
char*)((
res->*setPtr)(
ptr));
564 if (retVal < res->mOverrideSize) {
566 memorypool = (
char*)
ptr +
res->mOverrideSize;
568 memorypool = (
void*)((
char*)memorypool + GPUProcessor::getAlignment<GPUCA_MEMALIGN>(memorypool));
570 if (memorypoolend ? (memorypool > memorypoolend) : ((size_t)ptrDiff(memorypool, memorybase) > memorysize)) {
571 std::cerr <<
"Memory pool size exceeded (" << device <<
") (" <<
res->mName <<
": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) <<
" < " << memorysize <<
"\n";
572 throw std::bad_alloc();
575 std::cout <<
"Allocated (" << device <<
") " <<
res->mName <<
": " <<
retVal <<
" - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) <<
"\n";
584 if (
res->mPtrDevice &&
res->mReuse < 0) {
587 res->mSize = std::max((
size_t)
res->SetPointers((
void*)1) - 1,
res->mOverrideSize);
588 if (
res->mReuse >= 0) {
590 GPUError(
"Invalid reuse, insufficient size: %ld < %ld", (int64_t)
mMemoryResources[
res->mReuse].mSize, (int64_t)
res->mSize);
591 throw std::bad_alloc();
597 res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(
res->mPtrDevice);
598 res->SetPointers(
res->mPtr);
600 std::cout << (
res->mReuse >= 0 ?
"Reused " :
"Allocated ") <<
res->mName <<
": " <<
res->mSize <<
"\n";
606 GPUError(
"Got buffer with insufficient alignment");
607 throw std::bad_alloc();
611 if (
res->mPtr !=
nullptr) {
612 GPUError(
"Double allocation! (%s)",
res->mName);
613 throw std::bad_alloc();
621 res->mSize = std::max((
size_t)
res->SetPointers((
void*)1) - 1,
res->mOverrideSize);
622 res->mPtr = control->
allocator(CAMath::nextMultipleOf<GPUCA_BUFFER_ALIGNMENT>(
res->mSize));
623 res->mSize = std::max<size_t>(ptrDiff(
res->SetPointers(
res->mPtr),
res->mPtr),
res->mOverrideSize);
625 std::cout <<
"Allocated (from callback) " <<
res->mName <<
": " <<
res->mSize <<
"\n";
628 void* dummy =
nullptr;
635 GPUError(
"Got buffer with insufficient alignment");
636 throw std::bad_alloc();
640 if (
res->mProcessor->mLinkedProcessor ==
nullptr) {
641 GPUError(
"Device Processor not set (%s)",
res->mName);
642 throw std::bad_alloc();
648 }
else if (
size !=
res->mSize) {
649 GPUError(
"Inconsistent device memory allocation (%s: device %lu vs %lu)",
res->mName,
size,
res->mSize);
650 throw std::bad_alloc();
653 GPUError(
"Got buffer with insufficient alignment");
654 throw std::bad_alloc();
674 return res->mReuse >= 0 ? 0 :
res->mSize;
680 throw std::runtime_error(
"Requested invalid memory typo for unmanaged allocation");
684 return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(
mUnmanagedChunks.back().get());
690 if (pool > poolend) {
691 GPUError(
"Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
692 throw std::bad_alloc();
714 throw std::bad_alloc();
730 return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(
mVolatileChunks.back().get());
747 size_t size = ptrDiff(
res->SetPointers(basePtr), basePtr);
748 if (basePtr &&
size > std::max(
res->mSize,
res->mOverrideSize)) {
749 std::cerr <<
"Updated pointers exceed available memory size: " <<
size <<
" > " << std::max(
res->mSize,
res->mOverrideSize) <<
" - host - " <<
res->mName <<
"\n";
750 throw std::bad_alloc();
755 size_t size = ptrDiff(
res->SetDevicePointers(basePtr), basePtr);
756 if (basePtr &&
size > std::max(
res->mSize,
res->mOverrideSize)) {
757 std::cerr <<
"Updated pointers exceed available memory size: " <<
size <<
" > " << std::max(
res->mSize,
res->mOverrideSize) <<
" - GPU - " <<
res->mName <<
"\n";
758 throw std::bad_alloc();
780 std::cout <<
"Freeing " <<
res->mName <<
": size " <<
res->mSize <<
" (reused " <<
res->mReuse <<
")\n";
786 res->mPtrDevice =
nullptr;
817 GPUFatal(
"Trying to pop memory state from empty stack");
824 printf(
"Allocated Device memory after %30s (%8s): %'13zd (non temporary %'13zd, blocked %'13zd)\n",
GPUDataTypes::RECO_STEP_NAMES[
getRecoStepNum(step,
true)],
qTag2Str(std::get<3>(
mNonPersistentMemoryStack.back())).c_str(), ptrDiff(
mDeviceMemoryPool,
mDeviceMemoryBase) + ptrDiff((
char*)
mDeviceMemoryBase +
mDeviceMemorySize,
mDeviceMemoryPoolEnd), ptrDiff(
mDeviceMemoryPool,
mDeviceMemoryBase),
mDeviceMemoryPoolBlocked ? ptrDiff((
char*)
mDeviceMemoryBase +
mDeviceMemorySize,
mDeviceMemoryPoolBlocked) : 0);
826 printf(
"Allocated Host memory after %30s (%8s): %'13zd (non temporary %'13zd, blocked %'13zd)\n",
GPUDataTypes::RECO_STEP_NAMES[
getRecoStepNum(step,
true)],
qTag2Str(std::get<3>(
mNonPersistentMemoryStack.back())).c_str(), ptrDiff(
mHostMemoryPool,
mHostMemoryBase) + ptrDiff((
char*)
mHostMemoryBase +
mHostMemorySize,
mHostMemoryPoolEnd), ptrDiff(
mHostMemoryPool,
mHostMemoryBase),
mHostMemoryPoolBlocked ? ptrDiff((
char*)
mHostMemoryBase +
mHostMemorySize,
mHostMemoryPoolBlocked) : 0);
834 if (
res->mReuse < 0) {
838 res->mPtrDevice =
nullptr;
847 throw std::runtime_error(
"temporary memory stack already blocked");
856 throw std::runtime_error(
"cannot unblock while there is stacked memory");
900 printf(
"Memory Allocation: Host %'zd / %'zu (Permanent %'zd), Device %'zd / %'zu, (Permanent %'zd) %zu chunks\n",
908 std::map<std::string, std::array<size_t, 3>>
sizes;
911 if (
res.mReuse >= 0) {
918 if (
res.mPtrDevice) {
925 printf(
"%59s CPU / %9s GPU\n",
"",
"");
926 for (
auto it =
sizes.begin(); it !=
sizes.end(); it++) {
927 printf(
"Allocation %30s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ?
"P" :
" ", it->second[0], it->second[1]);
930 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
931 mChains[
i]->PrintMemoryStatistics();
965constexpr static inline int32_t getStepNum(T step,
bool validCheck, int32_t N,
const char* err =
"Invalid step num")
967 static_assert(
sizeof(step) ==
sizeof(uint32_t),
"Invalid step enum size");
968 int32_t
retVal = 8 *
sizeof(uint32_t) - 1 - CAMath::Clz((uint32_t)step);
969 if ((uint32_t)step == 0 ||
retVal >= N) {
973 throw std::runtime_error(
"Invalid General Step");
986 throw std::invalid_argument(
"Cannot start double pipeline mode");
989 GPUInfo(
"Pipeline worker started");
991 bool terminate =
false;
997 GPUReconstructionPipelineQueue* q;
1006 q->retVal = q->chain->RunChain();
1009 std::lock_guard<std::mutex> lk(q->m);
1015 GPUInfo(
"Pipeline worker ended");
1028 std::unique_ptr<GPUReconstructionPipelineQueue> qu(
new GPUReconstructionPipelineQueue);
1029 GPUReconstructionPipelineQueue* q = qu.get();
1030 q->chain = terminate ? nullptr :
mChains[0].get();
1031 q->op = terminate ? 1 : 0;
1032 std::unique_lock<std::mutex> lkdone(q->m);
1036 throw std::runtime_error(
"Must not enqueue work after termination request");
1042 q->c.wait(lkdone, [&q]() {
return q->done; });
1049 return mChains[0]->FinalizePipelinedProcessing();
1063 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1082 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1094 f +=
"settings.dump";
1096 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1129 f +=
"settings.dump";
1135 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1158 GPUError(
"Cannot update settings while initialized");
1159 throw std::runtime_error(
"Settings updated while initialized");
#define GPUCA_OPERATOR_NEW_ALIGNMENT
#define GPUCA_BUFFER_ALIGNMENT
#define GPUCA_GPUReconstructionUpdateDefaults()
bool isSet(const bitfield &v) const
const GPUSettingsDisplay * GetEventDisplayConfig() const
const GPUSettingsQA * GetQAConfig() const
static void dumpConfig(const GPUSettingsRec *rec, const GPUSettingsProcessing *proc, const GPUSettingsQA *qa, const GPUSettingsDisplay *display, const GPUSettingsDeviceBackend *device, const GPURecoStepConfiguration *workflow)
static constexpr const char *const RECO_STEP_NAMES[]
static constexpr int32_t N_RECO_STEPS
static constexpr int32_t N_GENERAL_STEPS
void * SetDevicePointers(void *ptr)
void * SetPointers(void *ptr)
static void computePointerWithAlignment(T *&basePtr, S *&objPtr, size_t nEntries=1)
void InitGPUProcessor(GPUReconstruction *rec, ProcessorType type=PROCESSOR_TYPE_CPU, GPUProcessor *slaveProcessor=nullptr)
ProcessorType mGPUProcessorType
GPURecoStepConfiguration mRecoSteps
int32_t InitPhaseBeforeDevice()
std::unordered_set< const void * > mRegisteredMemoryPtrs
int32_t InitPhasePermanentMemory()
std::vector< std::unique_ptr< GPUChain > > mChains
GPUDataTypes::RecoStep RecoStep
GPUReconstruction * mMaster
void * AllocateVolatileMemory(size_t size, bool device)
std::unique_ptr< GPUMemorySizeScalers > mMemoryScalers
virtual void UpdateAutomaticProcessingSettings()
void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction *rec, GPUOutputControl *control=nullptr)
void SetInputControl(void *ptr, size_t size)
GPUConstantMem * mDeviceConstantMem
void ConstructGPUProcessor(GPUProcessor *proc)
void TerminatePipelineWorker()
std::shared_ptr< GPUROOTDumpCore > mROOTDump
void PopNonPersistentMemory(RecoStep step, uint64_t tag)
size_t AllocateRegisteredMemoryHelper(GPUMemoryResource *res, void *&ptr, void *&memorypool, void *memorybase, size_t memorysize, void *(GPUMemoryResource::*SetPointers)(void *), void *&memorypoolend, const char *device)
GPUConstantMem * processors()
void ReturnVolatileMemory()
void ComputeReuseMax(GPUProcessor *proc)
void SetMemoryExternalInput(int16_t res, void *ptr)
int32_t getGeneralStepNum(GeneralStep step, bool validCheck=true)
static constexpr uint32_t NSECTORS
GPUOutputControl mInputControl
RecoStepField GetRecoStepsGPU() const
void RegisterGPUDeviceProcessor(GPUProcessor *proc, GPUProcessor *slaveProcessor)
std::vector< GPUReconstruction * > mSlaves
std::unique_ptr< T > ReadStructFromFile(const char *file)
virtual void GetITSTraits(std::unique_ptr< o2::its::TrackerTraits > *trackerTraits, std::unique_ptr< o2::its::VertexerTraits > *vertexerTraits, std::unique_ptr< o2::its::TimeFrame > *timeFrame)
std::vector< std::tuple< void *, void *, size_t, uint64_t > > mNonPersistentMemoryStack
void UpdateDynamicSettings(const GPUSettingsRecDynamic *d)
std::vector< GPUMemoryResource > mMemoryResources
std::unique_ptr< GPUReconstructionPipelineContext > mPipelineContext
std::unique_ptr< GPUConstantMem > mHostConstantMem
size_t AllocateRegisteredPermanentMemory()
void ResetRegisteredMemoryPointers(GPUProcessor *proc)
void DumpStructToFile(const T *obj, const char *file)
void AllocateRegisteredMemoryInternal(GPUMemoryResource *res, GPUOutputControl *control, GPUReconstruction *recPool)
virtual int32_t registerMemoryForGPU_internal(const void *ptr, size_t size)=0
virtual size_t WriteToConstantMemory(size_t offset, const void *src, size_t size, int32_t stream=-1, gpu_reconstruction_kernels::deviceEvent *ev=nullptr)=0
std::unordered_map< GPUMemoryReuse::ID, MemoryReuseMeta > mMemoryReuse1to1
size_t mDeviceMemoryUsedMax
std::vector< std::unique_ptr< char[]> > mUnmanagedChunks
std::vector< ProcessorData > mProcessors
void ReturnVolatileDeviceMemory()
void * AllocateVolatileDeviceMemory(size_t size)
virtual int32_t InitDevice()=0
void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration *workflow=nullptr)
virtual ~GPUReconstruction()
int32_t mMaxBackendThreads
virtual std::unique_ptr< gpu_reconstruction_kernels::threadContext > GetThreadContext()=0
void UnblockStackedMemory()
GPUReconstruction(const GPUReconstruction &)=delete
static constexpr GeometryType geometryType
GPUSettingsProcessing mProcessingSettings
GPUDataTypes::GeometryType GeometryType
void WriteConstantParams()
void FreeRegisteredMemory(GPUProcessor *proc, bool freeCustom=false, bool freePermanent=false)
void UpdateMaxMemoryUsed()
std::vector< std::unique_ptr< char[]> > mVolatileChunks
virtual RecoStepField AvailableGPURecoSteps()
static constexpr const char *const IOTYPENAMES[]
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPUSettingsRecDynamic *d=nullptr)
int32_t CheckErrorCodes(bool cpuOnly=false, bool forceShowErrors=false, std::vector< std::array< uint32_t, 4 > > *fillErrors=nullptr)
void ClearAllocatedMemory(bool clearOutputs=true)
static constexpr const char *const GEOMETRY_TYPE_NAMES[]
GPUOutputControl mOutputControl
size_t mHostMemoryUsedMax
GPUSettingsGRP mGRPSettings
void * mDeviceMemoryPoolEnd
virtual int32_t ExitDevice()=0
void PrintMemoryOverview()
virtual bool CanQueryMaxMemory()
void * AllocateUnmanagedMemory(size_t size, int32_t type)
void PrintMemoryStatistics()
void PushNonPersistentMemory(uint64_t tag)
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
virtual int32_t unregisterMemoryForGPU_internal(const void *ptr)=0
int32_t InitPhaseAfterDevice()
static int32_t getHostThreadIndex()
GPUDataTypes::GeneralStep GeneralStep
void * mDeviceMemoryPermanent
void BlockStackedMemory(GPUReconstruction *rec)
void DumpSettings(const char *dir="")
void * mHostMemoryPoolBlocked
int32_t unregisterMemoryForGPU(const void *ptr)
int32_t registerMemoryForGPU(const void *ptr, size_t size)
GPUSettingsDeviceBackend mDeviceBackendSettings
int32_t EnqueuePipeline(bool terminate=false)
std::shared_ptr< GPUReconstructionThreading > mThreading
std::vector< GPUMemoryResource * > mNonPersistentIndividualAllocations
void * mHostMemoryPoolEnd
void * mDeviceMemoryPoolBlocked
void * mVolatileMemoryStart
GPUChain * GetNextChainInQueue()
void * mHostMemoryPermanent
size_t AllocateRegisteredMemory(GPUProcessor *proc, bool resetCustom=false)
int32_t ReadSettings(const char *dir="")
void SetOutputControl(const GPUOutputControl &v)
void SetSector(int32_t iSector)
#define TPC_MAX_FRAGMENT_LEN_GPU
#define TPC_MAX_FRAGMENT_LEN_HOST
GLuint GLsizei const GLuint const GLintptr const GLsizeiptr * sizes
GLboolean GLboolean GLboolean b
GLint GLint GLsizei GLint GLenum GLenum type
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
std::string qTag2Str(const T tag)
GPUTPCTracker tpcTrackers[GPUCA_NSECTORS]
GPUTPCClusterFinder tpcClusterer[GPUCA_NSECTORS]
GPUSettingsProcessing configProcessing
GPUSettingsO2 ReadConfigurableParam()
GPUSettingsRec configReconstruction
void set(void *p, size_t s)
std::function< void *(size_t)> allocator
void SetDefaults(float solenoidBz)
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPURecoStepConfiguration *w=nullptr, const GPUSettingsRecDynamic *d=nullptr)
GPUDataTypes::RecoStepField stepsGPUMask
GPUDataTypes::InOutTypeField outputs
GPUDataTypes::RecoStepField steps
GPUDataTypes::InOutTypeField inputs
std::condition_variable cond
std::queue< GPUReconstructionPipelineQueue * > queue
GPUReconstruction * master
float solenoidBzNominalGPU