23#include <condition_variable>
52struct GPUReconstructionPipelineQueue {
56 std::condition_variable c;
63 std::queue<GPUReconstructionPipelineQueue*>
queue;
65 std::condition_variable
cond;
/// @brief Byte-wise pointer difference helper.
///
/// Computes the signed distance in bytes between two raw pointers,
/// used throughout the memory-pool bookkeeping in this file.
///
/// @param a end pointer (minuend)
/// @param b start pointer (subtrahend)
/// @return (a - b) in bytes; negative when a precedes b
static ptrdiff_t ptrDiff(void* a, void* b)
{
  // static_cast via char* gives well-defined byte arithmetic within the
  // same allocation; preferred over the original C-style casts.
  return static_cast<char*>(a) - static_cast<char*>(b);
}
82 throw std::invalid_argument(
"device type of master and slave GPUReconstruction does not match");
85 throw std::invalid_argument(
"Cannot be slave to a slave");
100 mROOTDump = GPUROOTDumpCore::getAndCreate();
107 GPUError(
"GPU Reconstruction not properly deinitialized!");
111void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* trackerTraits, std::unique_ptr<o2::its::VertexerTraits>* vertexerTraits, std::unique_ptr<o2::its::TimeFrame>* timeFrame)
116 if (vertexerTraits) {
126 return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
132 throw std::runtime_error(
"Must not call init on slave!");
138 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
141 GPUError(
"Error initialization slave (before deviceinit)");
160 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
168 GPUError(
"Error initialization slave (deviceinit)");
172 GPUError(
"Error initialization slave (permanent memory)");
183 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
188 GPUError(
"Error initialization slave (after device init)");
198static uint32_t getDefaultNThreads()
200 const char* tbbEnv = getenv(
"TBB_NUM_THREADS");
201 uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
205 const char* ompEnv = getenv(
"OMP_NUM_THREADS");
206 uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
210 return tbb::info::default_concurrency();
218 printf(
"\nConfig Dump %s\n",
mMaster ?
"Slave" :
"Master");
221 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
265#ifndef GPUCA_DETERMINISTIC_MODE
266 GPUError(
"Warning, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
270 if (
param().
rec.tpc.looperInterpolationInExtraPass == -1) {
271 param().
rec.tpc.looperInterpolationInExtraPass = 0;
309 GPUFatal(
"Must not use both nHostThreads and ompThreads at the same time!");
312 GPUWarning(
"You are using the deprecated ompThreads option, please switch to nHostThreads!");
322 mThreading = std::make_shared<GPUReconstructionThreading>();
323 mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism,
mMaxHostThreads);
341 GPUError(
"Invalid value for nTPCClustererLanes: %d",
mProcessingSettings.nTPCClustererLanes);
346 GPUError(
"Must use double pipeline mode only with exactly one chain that must support it");
355 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
359 mChains[
i]->RegisterPermanentMemoryAndProcessors();
360 size_t memPrimary, memPageLocked;
361 mChains[
i]->MemorySize(memPrimary, memPageLocked);
363 memPageLocked = memPrimary;
390 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
391 mChains[
i]->RegisterGPUProcessors();
403 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
428 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
439 for (uint32_t
i = 0;
i <
mSlaves.size();
i++) {
441 GPUError(
"Error exiting slave");
470 auto& re = it->second;
471 if (proc ==
nullptr || re.proc == proc) {
473 resMain.mOverrideSize = 0;
474 for (uint32_t
i = 0;
i < re.res.size();
i++) {
476 resMain.mOverrideSize = std::max<size_t>(resMain.mOverrideSize, ptrDiff(
res.SetPointers((
void*)1), (
char*)1));
485 GPUInfo(
"Allocating memory %p", (
void*)proc);
498 GPUInfo(
"Allocating memory done");
506 GPUInfo(
"Allocating Permanent Memory");
517 GPUInfo(
"Permanent Memory Done");
524 if (
res->mReuse >= 0) {
526 if (
ptr ==
nullptr) {
527 GPUError(
"Invalid reuse ptr (%s)",
res->mName);
528 throw std::bad_alloc();
533 throw std::bad_alloc();
536 std::cout <<
"Reused (" << device <<
") " <<
res->mName <<
": " <<
retVal <<
"\n";
540 if (memorypool ==
nullptr) {
541 GPUError(
"Cannot allocate memory from uninitialized pool");
542 throw std::bad_alloc();
546 retVal = ptrDiff((
res->*setPtr)((
char*)1), (
char*)(1));
547 memorypoolend = (
void*)((
char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));
548 if (retVal < res->mOverrideSize) {
551 retVal += GPUProcessor::getAlignment<GPUCA_MEMALIGN>(
retVal);
552 memorypoolend = (
char*)memorypoolend -
retVal;
557 memorypool = (
char*)((
res->*setPtr)(
ptr));
559 if (retVal < res->mOverrideSize) {
561 memorypool = (
char*)
ptr +
res->mOverrideSize;
563 memorypool = (
void*)((
char*)memorypool + GPUProcessor::getAlignment<GPUCA_MEMALIGN>(memorypool));
565 if (memorypoolend ? (memorypool > memorypoolend) : ((size_t)ptrDiff(memorypool, memorybase) > memorysize)) {
566 std::cerr <<
"Memory pool size exceeded (" << device <<
") (" <<
res->mName <<
": " << (memorypoolend ? (memorysize + ptrDiff(memorypool, memorypoolend)) : ptrDiff(memorypool, memorybase)) <<
" > " << memorysize <<
"\n";
567 throw std::bad_alloc();
570 std::cout <<
"Allocated (" << device <<
") " <<
res->mName <<
": " <<
retVal <<
" - available: " << (memorypoolend ? ptrDiff(memorypoolend, memorypool) : (memorysize - ptrDiff(memorypool, memorybase))) <<
"\n";
579 if (
res->mPtrDevice &&
res->mReuse < 0) {
582 res->mSize = std::max((
size_t)
res->SetPointers((
void*)1) - 1,
res->mOverrideSize);
583 if (
res->mReuse >= 0) {
585 GPUError(
"Invalid reuse, insufficient size: %ld < %ld", (int64_t)
mMemoryResources[
res->mReuse].mSize, (int64_t)
res->mSize);
586 throw std::bad_alloc();
592 res->mPtr = GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(
res->mPtrDevice);
593 res->SetPointers(
res->mPtr);
595 std::cout << (
res->mReuse >= 0 ?
"Reused " :
"Allocated ") <<
res->mName <<
": " <<
res->mSize <<
"\n";
601 GPUError(
"Got buffer with insufficient alignment");
602 throw std::bad_alloc();
606 if (
res->mPtr !=
nullptr) {
607 GPUError(
"Double allocation! (%s)",
res->mName);
608 throw std::bad_alloc();
616 res->mSize = std::max((
size_t)
res->SetPointers((
void*)1) - 1,
res->mOverrideSize);
617 res->mPtr = control->
allocator(CAMath::nextMultipleOf<GPUCA_BUFFER_ALIGNMENT>(
res->mSize));
618 res->mSize = std::max<size_t>(ptrDiff(
res->SetPointers(
res->mPtr),
res->mPtr),
res->mOverrideSize);
620 std::cout <<
"Allocated (from callback) " <<
res->mName <<
": " <<
res->mSize <<
"\n";
623 void* dummy =
nullptr;
630 GPUError(
"Got buffer with insufficient alignment");
631 throw std::bad_alloc();
635 if (
res->mProcessor->mLinkedProcessor ==
nullptr) {
636 GPUError(
"Device Processor not set (%s)",
res->mName);
637 throw std::bad_alloc();
643 }
else if (
size !=
res->mSize) {
644 GPUError(
"Inconsistent device memory allocation (%s: device %lu vs %lu)",
res->mName,
size,
res->mSize);
645 throw std::bad_alloc();
648 GPUError(
"Got buffer with insufficient alignment");
649 throw std::bad_alloc();
669 return res->mReuse >= 0 ? 0 :
res->mSize;
675 throw std::runtime_error(
"Requested invalid memory typo for unmanaged allocation");
679 return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(
mUnmanagedChunks.back().get());
685 if (pool > poolend) {
686 GPUError(
"Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
687 throw std::bad_alloc();
709 throw std::bad_alloc();
725 return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(
mVolatileChunks.back().get());
742 size_t size = ptrDiff(
res->SetPointers(basePtr), basePtr);
743 if (basePtr &&
size > std::max(
res->mSize,
res->mOverrideSize)) {
744 std::cerr <<
"Updated pointers exceed available memory size: " <<
size <<
" > " << std::max(
res->mSize,
res->mOverrideSize) <<
" - host - " <<
res->mName <<
"\n";
745 throw std::bad_alloc();
750 size_t size = ptrDiff(
res->SetDevicePointers(basePtr), basePtr);
751 if (basePtr &&
size > std::max(
res->mSize,
res->mOverrideSize)) {
752 std::cerr <<
"Updated pointers exceed available memory size: " <<
size <<
" > " << std::max(
res->mSize,
res->mOverrideSize) <<
" - GPU - " <<
res->mName <<
"\n";
753 throw std::bad_alloc();
775 std::cout <<
"Freeing " <<
res->mName <<
": size " <<
res->mSize <<
" (reused " <<
res->mReuse <<
")\n";
781 res->mPtrDevice =
nullptr;
812 GPUFatal(
"Trying to pop memory state from empty stack");
827 if (
res->mReuse < 0) {
831 res->mPtrDevice =
nullptr;
840 throw std::runtime_error(
"temporary memory stack already blocked");
849 throw std::runtime_error(
"cannot unblock while there is stacked memory");
897 printf(
"Memory Allocation: Host %'13zd / %'13zu (Permanent %'13zd, Data %'13zd, Scratch %'13zd), Device %'13zd / %'13zu, (Permanent %'13zd, Data %'13zd, Scratch %'13zd) %zu chunks\n",
898 ptrDiff(
mHostMemoryPool,
mHostMemoryBase) + ptrDiff((
char*)
mHostMemoryBase +
mHostMemorySize,
mHostMemoryPoolEnd),
mHostMemorySize, ptrDiff(
mHostMemoryPermanent,
mHostMemoryBase), ptrDiff(
mHostMemoryPool,
mHostMemoryPermanent), ptrDiff((
char*)
mHostMemoryBase +
mHostMemorySize,
mHostMemoryPoolEnd),
899 ptrDiff(
mDeviceMemoryPool,
mDeviceMemoryBase) + ptrDiff((
char*)
mDeviceMemoryBase +
mDeviceMemorySize,
mDeviceMemoryPoolEnd),
mDeviceMemorySize, ptrDiff(
mDeviceMemoryPermanent,
mDeviceMemoryBase), ptrDiff(
mDeviceMemoryPool,
mDeviceMemoryPermanent), ptrDiff((
char*)
mDeviceMemoryBase +
mDeviceMemorySize,
mDeviceMemoryPoolEnd),
906 std::map<std::string, std::array<size_t, 3>>
sizes;
909 if (
res.mReuse >= 0) {
916 if (
res.mPtrDevice) {
923 printf(
"%59s CPU / %9s GPU\n",
"",
"");
924 for (
auto it =
sizes.begin(); it !=
sizes.end(); it++) {
925 printf(
"Allocation %30s %s: Size %'14zu / %'14zu\n", it->first.c_str(), it->second[2] ?
"P" :
" ", it->second[0], it->second[1]);
928 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
929 mChains[
i]->PrintMemoryStatistics();
963constexpr static inline int32_t getStepNum(T step,
bool validCheck, int32_t N,
const char* err =
"Invalid step num")
965 static_assert(
sizeof(step) ==
sizeof(uint32_t),
"Invalid step enum size");
966 int32_t
retVal = 8 *
sizeof(uint32_t) - 1 - CAMath::Clz((uint32_t)step);
967 if ((uint32_t)step == 0 ||
retVal >= N) {
971 throw std::runtime_error(
"Invalid General Step");
984 throw std::invalid_argument(
"Cannot start double pipeline mode");
987 GPUInfo(
"Pipeline worker started");
989 bool terminate =
false;
995 GPUReconstructionPipelineQueue* q;
1004 q->retVal = q->chain->RunChain();
1007 std::lock_guard<std::mutex> lk(q->m);
1013 GPUInfo(
"Pipeline worker ended");
1026 std::unique_ptr<GPUReconstructionPipelineQueue> qu(
new GPUReconstructionPipelineQueue);
1027 GPUReconstructionPipelineQueue* q = qu.get();
1028 q->chain = terminate ? nullptr :
mChains[0].get();
1029 q->op = terminate ? 1 : 0;
1030 std::unique_lock<std::mutex> lkdone(q->m);
1034 throw std::runtime_error(
"Must not enqueue work after termination request");
1040 q->c.wait(lkdone, [&q]() {
return q->done; });
1047 return mChains[0]->FinalizePipelinedProcessing();
1061 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1080 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1098 throw std::runtime_error(
"GPU Backend Failure");
1107 f +=
"settings.dump";
1109 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1142 f +=
"settings.dump";
1148 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
1171 GPUError(
"Cannot update settings while initialized");
1172 throw std::runtime_error(
"Settings updated while initialized");
1201 mAlloc = [&
r](
size_t n) {
return (
char*)
r->AllocateVolatileDeviceMemory(
n); };
#define GPUCA_OPERATOR_NEW_ALIGNMENT
#define GPUCA_BUFFER_ALIGNMENT
#define GPUCA_GPUReconstructionUpdateDefaults()
bool isSet(const bitfield &v) const
const GPUSettingsDisplay * GetEventDisplayConfig() const
const GPUSettingsQA * GetQAConfig() const
static void dumpConfig(const GPUSettingsRec *rec, const GPUSettingsProcessing *proc, const GPUSettingsQA *qa, const GPUSettingsDisplay *display, const GPUSettingsDeviceBackend *device, const GPURecoStepConfiguration *workflow)
static constexpr const char *const RECO_STEP_NAMES[]
static constexpr int32_t N_RECO_STEPS
static constexpr int32_t N_GENERAL_STEPS
void * SetDevicePointers(void *ptr)
void * SetPointers(void *ptr)
static void computePointerWithAlignment(T *&basePtr, S *&objPtr, size_t nEntries=1)
void InitGPUProcessor(GPUReconstruction *rec, ProcessorType type=PROCESSOR_TYPE_CPU, GPUProcessor *slaveProcessor=nullptr)
ProcessorType mGPUProcessorType
GPURecoStepConfiguration mRecoSteps
int32_t InitPhaseBeforeDevice()
std::unordered_set< const void * > mRegisteredMemoryPtrs
int32_t InitPhasePermanentMemory()
std::vector< std::unique_ptr< GPUChain > > mChains
GPUReconstruction * mMaster
void * AllocateVolatileMemory(size_t size, bool device)
ThrustVolatileAllocator getThrustVolatileDeviceAllocator()
std::unique_ptr< GPUMemorySizeScalers > mMemoryScalers
virtual void UpdateAutomaticProcessingSettings()
void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction *rec, GPUOutputControl *control=nullptr)
void SetInputControl(void *ptr, size_t size)
GPUConstantMem * mDeviceConstantMem
void ConstructGPUProcessor(GPUProcessor *proc)
void TerminatePipelineWorker()
std::shared_ptr< GPUROOTDumpCore > mROOTDump
void PopNonPersistentMemory(RecoStep step, uint64_t tag)
size_t AllocateRegisteredMemoryHelper(GPUMemoryResource *res, void *&ptr, void *&memorypool, void *memorybase, size_t memorysize, void *(GPUMemoryResource::*SetPointers)(void *), void *&memorypoolend, const char *device)
GPUConstantMem * processors()
void ReturnVolatileMemory()
void ComputeReuseMax(GPUProcessor *proc)
void SetMemoryExternalInput(int16_t res, void *ptr)
int32_t getGeneralStepNum(GeneralStep step, bool validCheck=true)
static constexpr uint32_t NSECTORS
GPUOutputControl mInputControl
RecoStepField GetRecoStepsGPU() const
void RegisterGPUDeviceProcessor(GPUProcessor *proc, GPUProcessor *slaveProcessor)
std::vector< GPUReconstruction * > mSlaves
std::unique_ptr< T > ReadStructFromFile(const char *file)
virtual void GetITSTraits(std::unique_ptr< o2::its::TrackerTraits > *trackerTraits, std::unique_ptr< o2::its::VertexerTraits > *vertexerTraits, std::unique_ptr< o2::its::TimeFrame > *timeFrame)
std::vector< std::tuple< void *, void *, size_t, uint64_t > > mNonPersistentMemoryStack
void UpdateDynamicSettings(const GPUSettingsRecDynamic *d)
std::vector< GPUMemoryResource > mMemoryResources
std::unique_ptr< GPUReconstructionPipelineContext > mPipelineContext
std::unique_ptr< GPUConstantMem > mHostConstantMem
size_t AllocateRegisteredPermanentMemory()
void ResetRegisteredMemoryPointers(GPUProcessor *proc)
void DumpStructToFile(const T *obj, const char *file)
void AllocateRegisteredMemoryInternal(GPUMemoryResource *res, GPUOutputControl *control, GPUReconstruction *recPool)
virtual int32_t registerMemoryForGPU_internal(const void *ptr, size_t size)=0
virtual size_t WriteToConstantMemory(size_t offset, const void *src, size_t size, int32_t stream=-1, gpu_reconstruction_kernels::deviceEvent *ev=nullptr)=0
std::unordered_map< GPUMemoryReuse::ID, MemoryReuseMeta > mMemoryReuse1to1
size_t mDeviceMemoryUsedMax
std::vector< std::unique_ptr< char[]> > mUnmanagedChunks
std::vector< ProcessorData > mProcessors
void ReturnVolatileDeviceMemory()
void * AllocateVolatileDeviceMemory(size_t size)
virtual int32_t InitDevice()=0
void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration *workflow=nullptr)
virtual ~GPUReconstruction()
int32_t mMaxBackendThreads
virtual std::unique_ptr< gpu_reconstruction_kernels::threadContext > GetThreadContext()=0
void UnblockStackedMemory()
GPUReconstruction(const GPUReconstruction &)=delete
static constexpr GeometryType geometryType
GPUSettingsProcessing mProcessingSettings
void WriteConstantParams()
void FreeRegisteredMemory(GPUProcessor *proc, bool freeCustom=false, bool freePermanent=false)
void UpdateMaxMemoryUsed()
std::vector< std::unique_ptr< char[]> > mVolatileChunks
virtual RecoStepField AvailableGPURecoSteps()
static constexpr const char *const IOTYPENAMES[]
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPUSettingsRecDynamic *d=nullptr)
int32_t CheckErrorCodes(bool cpuOnly=false, bool forceShowErrors=false, std::vector< std::array< uint32_t, 4 > > *fillErrors=nullptr)
void ClearAllocatedMemory(bool clearOutputs=true)
static constexpr const char *const GEOMETRY_TYPE_NAMES[]
GPUOutputControl mOutputControl
size_t mHostMemoryUsedMax
GPUSettingsGRP mGRPSettings
void * mDeviceMemoryPoolEnd
virtual int32_t ExitDevice()=0
void PrintMemoryOverview()
virtual bool CanQueryMaxMemory()
void * AllocateUnmanagedMemory(size_t size, int32_t type)
void PrintMemoryStatistics()
void PushNonPersistentMemory(uint64_t tag)
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
virtual int32_t unregisterMemoryForGPU_internal(const void *ptr)=0
int32_t InitPhaseAfterDevice()
static int32_t getHostThreadIndex()
void * mDeviceMemoryPermanent
void BlockStackedMemory(GPUReconstruction *rec)
void DumpSettings(const char *dir="")
void * mHostMemoryPoolBlocked
int32_t unregisterMemoryForGPU(const void *ptr)
int32_t registerMemoryForGPU(const void *ptr, size_t size)
GPUSettingsDeviceBackend mDeviceBackendSettings
int32_t EnqueuePipeline(bool terminate=false)
std::shared_ptr< GPUReconstructionThreading > mThreading
std::vector< GPUMemoryResource * > mNonPersistentIndividualAllocations
void * mHostMemoryPoolEnd
void * mDeviceMemoryPoolBlocked
void * mVolatileMemoryStart
virtual int32_t GPUChkErrInternal(const int64_t error, const char *file, int32_t line) const
GPUChain * GetNextChainInQueue()
void * mHostMemoryPermanent
int32_t GPUChkErrA(const int64_t error, const char *file, int32_t line, bool failOnError)
size_t AllocateRegisteredMemory(GPUProcessor *proc, bool resetCustom=false)
int32_t ReadSettings(const char *dir="")
void SetOutputControl(const GPUOutputControl &v)
void SetSector(int32_t iSector)
#define TPC_MAX_FRAGMENT_LEN_GPU
#define TPC_MAX_FRAGMENT_LEN_HOST
GLuint GLsizei const GLuint const GLintptr const GLsizeiptr * sizes
GLboolean GLboolean GLboolean b
GLint GLint GLsizei GLint GLenum GLenum type
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLboolean GLboolean GLboolean GLboolean a
GLubyte GLubyte GLubyte GLubyte w
std::string qTag2Str(const T tag)
GPUTPCTracker tpcTrackers[GPUCA_NSECTORS]
GPUTPCClusterFinder tpcClusterer[GPUCA_NSECTORS]
GPUSettingsProcessing configProcessing
GPUSettingsO2 ReadConfigurableParam()
GPUSettingsRec configReconstruction
void set(void *p, size_t s)
std::function< void *(size_t)> allocator
void SetDefaults(float solenoidBz)
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPURecoStepConfiguration *w=nullptr, const GPUSettingsRecDynamic *d=nullptr)
GPUDataTypes::RecoStepField stepsGPUMask
GPUDataTypes::InOutTypeField outputs
GPUDataTypes::RecoStepField steps
GPUDataTypes::InOutTypeField inputs
std::condition_variable cond
std::queue< GPUReconstructionPipelineQueue * > queue
GPUReconstruction * master
float solenoidBzNominalGPU