19#include "GPUDefParametersRuntime.h"
33#include "GPUReconstructionProcessingKernels.inc"
// NOTE(review): this listing is incomplete — the numeric prefixes are original
// file line numbers, and gaps (56→62, 77→83) mean source lines were elided.
// Do not edit from this view; fetch the full file first.
//
// Host-side kernel launcher template: runs kernel T<I> on the CPU. Visible
// logic: rejects device-only kernels, rejects nThreads != 1, then either runs
// blocks in parallel via TBB or falls back to a serial loop over blocks.
56template <
class T, int32_t I,
typename... Args>
// Device-only kernels cannot execute on the host backend.
62 throw std::runtime_error(
"Cannot run device kernel on host");
// Host execution models one thread per block; more is unsupported.
64 if (
x.nThreads != 1) {
65 throw std::runtime_error(
"Cannot run device kernel on host with nThreads != 1");
70 GPUInfo(
"Running %d Threads",
mThreading->activeThreads->max_concurrency());
// Parallel path: isolate() keeps this work from stealing unrelated tasks;
// blocked_range splits [0, nBlocks) across TBB worker threads.
72 tbb::this_task_arena::isolate([&] {
74 tbb::parallel_for(tbb::blocked_range<uint32_t>(0,
x.nBlocks, 1), [&](
const tbb::blocked_range<uint32_t>&
r) {
// Each range gets its own shared-memory emulation object.
75 typename T::GPUSharedMemory smem;
76 for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
// Invoke the kernel body: (nBlocks, nThreads=1, blockId=iB, threadId=0).
77 T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
// Serial fallback path — presumably taken when parallelization is disabled;
// the selecting condition is in the elided lines (78-82). TODO confirm.
83 for (uint32_t iB = 0; iB <
x.nBlocks; iB++) {
84 typename T::GPUSharedMemory smem;
85 T::template Thread<I>(
x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[
y.index], args...);
// NOTE(review): incomplete listing — original lines 92-94, 99 and 102-104 are
// elided (the actual memory-clearing call is presumably among them — confirm
// against the full file).
//
// Specialization for the GPUMemClean16 kernel on the host backend: splits the
// `size` bytes at `ptr` into per-thread chunks rounded up to 4096-byte
// multiples and processes them with a TBB parallel_for.
91inline void GPUReconstructionCPU::runKernelBackend<GPUMemClean16, 0>(
const krnlSetupTime& _xyz,
void*
const&
ptr, uint64_t
const&
size)
// static_partitioner: one contiguous chunk per worker, no work stealing.
95 tbb::parallel_for(0, nThreads, [&](
int iThread) {
96 size_t threadSize =
size / nThreads;
// Round each thread's chunk up to a 4096-byte (page-size) boundary.
97 if (threadSize % 4096) {
98 threadSize += 4096 - threadSize % 4096;
100 size_t offset = threadSize * iThread;
// Clamp the last chunk so it does not run past the end of the buffer.
101 size_t mySize = std::min<size_t>(threadSize,
size -
offset);
105 }, tbb::static_partitioner());
// NOTE(review): incomplete listing — the function signature and body between
// original lines 111 and 129 are mostly elided.
//
// getKernelProperties<S, I>: looks up the kernel's numeric id; the rest of the
// body (elided) presumably returns its krnlProperties. The GPUCA_KRNL macro
// below explicitly instantiates this template for every kernel listed in
// GPUReconstructionKernelList.h.
111template <
class S,
int32_t I>
117 const auto num = GetKernelNum<S, I>();
129#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
130 template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
131#include "GPUReconstructionKernelList.h"
// NOTE(review): incomplete listing — loop header and most of the body
// (original lines 144-149, 151-152, 154+) are elided.
//
// TransferMemoryResourcesHelper: iterates memory resources (loop elided) and
// skips entries that are unallocated or, when `proc` is given, belong to a
// different processor. Direction/scope controlled by `toGPU` and `all` —
// exact use not visible here.
143size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(
GPUProcessor* proc, int32_t
stream,
bool all,
bool toGPU)
// Skip resources that were never allocated.
150 if (
res.mPtr ==
nullptr) {
// When filtering by processor, skip resources owned by other processors.
153 if (proc &&
res.mProcessor != proc) {
// NOTE(review): incomplete listing — the enclosing function's signature
// (before original line 174) is elided; visibly this returns a numeric id for
// the calling thread.
174#if defined(__APPLE__)
// macOS: no gettid syscall — truncate the thread handle to 32 bits instead.
177 return ((int32_t)(
size_t)GetCurrentThread());
// Linux: kernel thread id via raw syscall (glibc gettid() wrapper is recent).
179 return ((int32_t)syscall(SYS_gettid));
// NOTE(review): incomplete listing — large gaps in the original numbering
// (221→223, 236→244, 267→273, …) mean most of this function body is elided.
// Visibly this is the processing-run driver plus its timing report.
//
// Deterministic RNG (default-seeded) used to fuzz the memory scaling factor —
// presumably a debug/testing option; the enabling condition is elided.
220 static std::mt19937 rng;
221 static std::uniform_int_distribution<uint64_t> dist(0, 1000000);
223 GPUInfo(
"Fuzzing memory scaling factor with %lu", fuzzFactor);
231 GPUInfo(
"Allocated memory when starting processing %34s",
"");
// CPU-time bracket around the per-chain processing loop below.
236 const std::clock_t cpuTimerStart = std::clock();
244 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
255 mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
257 GPUInfo(
"Allocated memory when ending processing %36s",
"");
// ---- Timing report: aggregate kernel timers per reconstruction step ----
262 std::string nEventReport;
266 double kernelTotal = 0;
267 std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);
273 for (uint32_t
i = 0;
i <
mTimers.size();
i++) {
290 kernelStepTimes[stepNum] +=
time;
// One report row per reconstruction step that accumulated any time.
300 for (int32_t
i = 0;
i < gpudatatypes::N_RECO_STEPS;
i++) {
301 if (kernelStepTimes[
i] != 0. ||
mTimersRecoSteps[
i].timerTotal.GetElapsedTime() != 0.) {
302 writer.row(
' ', 0, std::string(gpudatatypes::RECO_STEP_NAMES[
i]) +
" (Tasks)", kernelStepTimes[
i],
mTimersRecoSteps[
i].timerCPU,
mTimersRecoSteps[
i].timerTotal.GetElapsedTime(), 0);
// General (non-reco) steps: only wall time is known, so CPU/total are -1.
320 for (int32_t
i = 0;
i < gpudatatypes::N_GENERAL_STEPS;
i++) {
322 writer.row(
' ', 0, gpudatatypes::GENERAL_STEP_NAMES[
i],
mTimersGeneralSteps[
i].GetElapsedTime(), -1.0, -1.0, 0);
328 GPUInfo(
"Total Wall Time: %10.0f us%s",
mStatWallTime, nEventReport.c_str());
// NOTE(review): incomplete listing — original lines 358-362, 365, 367-370 are
// elided (including the struct members of tmpOccuapncyParam and the actual
// constant-memory update, presumably).
//
// UpdateParamOccupancyMap: validates the host map size, asserts that the
// occupancy fields are laid out consecutively in GPUParam (so they can be
// updated with a single memory write), then bundles the new values.
356 if (mapHost && mapSize != GPUTPCClusterOccupancyMapBin::getNBins(
param())) {
357 throw std::runtime_error(
"Updating occupancy map with object of invalid size");
// Layout check: occupancyMap (a pointer), occupancyTotal (uint32_t) and
// occupancyMapSize must be adjacent with no padding — the code relies on
// writing all three as one contiguous block.
363 if (!((
size_t)&
param().occupancyMapSize - (
size_t)&
param().occupancyMap ==
sizeof(
param().occupancyMap) +
sizeof(
param().occupancyTotal) &&
sizeof(
param().occupancyMap) ==
sizeof(
void*) &&
sizeof(
param().occupancyTotal) ==
sizeof(uint32_t))) {
364 throw std::runtime_error(
"occupancy data not consecutive in GPUParam");
// TODO(review): "Occuapncy" is a typo for "Occupancy" — local type, safe to
// rename once the full file (both uses, lines 366 and 371) is in view.
366 struct tmpOccuapncyParam {
371 tmpOccuapncyParam tmp = {mapGPU, occupancyTotal, mapSize};
#define GPUCA_BUFFER_ALIGNMENT
Online TRD tracker based on extrapolated TPC tracks.
Used for storing the MC labels for the TRD tracklets.
TRD Tracklet word for GPU tracker - 32bit tracklet info + half chamber ID + index.
ProcessorType mGPUProcessorType
uint32_t mMultiprocessorCount
~GPUReconstructionCPU() override
virtual size_t GPUMemCpy(void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
void runKernelBackend(const krnlSetupTime &_xyz, const Args &... args)
virtual size_t GPUMemCpyAlways(bool onGpu, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
static constexpr krnlRunRange krnlRunRangeNone
size_t TransferMemoryResourceToHost(GPUMemoryResource *res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
int32_t InitDevice() override
void UpdateParamOccupancyMap(const uint32_t *mapHost, const uint32_t *mapGPU, uint32_t occupancyTotal, uint32_t mapSize, int32_t stream=-1, deviceEvent *ev=nullptr)
size_t TransferMemoryResourceToGPU(GPUMemoryResource *res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
int32_t RunChains() override
GPUProcessorProcessors mProcShadow
krnlProperties getKernelProperties(int gpu=-1)
void ResetDeviceProcessorTypes()
int32_t ExitDevice() override
virtual int32_t GPUDebug(const char *state="UNKNOWN", int32_t stream=-1, bool force=false)
static constexpr krnlEvent krnlEventNone
size_t WriteToConstantMemory(size_t offset, const void *src, size_t size, int32_t stream=-1, deviceEvent *ev=nullptr) override
virtual size_t TransferMemoryInternal(GPUMemoryResource *res, int32_t stream, deviceEvent *ev, deviceEvent *evList, int32_t nEvents, bool toGPU, const void *src, void *dst)
RecoStepTimerMeta mTimersRecoSteps[gpudatatypes::N_RECO_STEPS]
int32_t mActiveHostKernelThreads
std::vector< std::unique_ptr< timerMeta > > mTimers
GPUDefParameters * mParCPU
HighResTimer mTimersGeneralSteps[gpudatatypes::N_GENERAL_STEPS]
int32_t getNKernelHostThreads(bool splitCores)
virtual std::unique_ptr< threadContext > GetThreadContext() override
GPUDefParameters * mParDevice
std::vector< std::unique_ptr< GPUChain > > mChains
GPUReconstruction * mMaster
std::unique_ptr< GPUMemorySizeScalers > mMemoryScalers
GPUConstantMem * processors()
uint32_t mNEventsProcessed
std::vector< GPUReconstruction * > mSlaves
std::vector< GPUMemoryResource > mMemoryResources
std::vector< ProcessorData > mProcessors
void WriteConstantParams(int32_t stream=-1)
static GPUReconstruction * GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend &cfg)
void ClearAllocatedMemory(bool clearOutputs=true)
void PrintMemoryOverview()
void PrintMemoryStatistics()
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
const GPUSettingsProcessing & GetProcessingSettings() const
int32_t EnqueuePipeline(bool terminate=false)
std::shared_ptr< GPUReconstructionThreading > mThreading
void * mHostMemoryPoolEnd
void * mHostMemoryPermanent
GLuint const GLchar * name
GLint GLint GLsizei GLint GLenum GLenum type
std::string to_string(gsl::span< T, Size > span)
GPUConstantMem * mProcessorsProc
uint32_t occupancyMapSize
const uint32_t * occupancyMap