38#define GPUCA_LOGGING_PRINTF
58template <
class T, int32_t I,
typename... Args>
64 throw std::runtime_error(
"Cannot run device kernel on host");
66 if (
x.nThreads != 1) {
67 throw std::runtime_error(
"Cannot run device kernel on host with nThreads != 1");
69 uint32_t
num =
y.num == 0 ||
y.num == -1 ? 1 :
y.num;
70 for (uint32_t k = 0; k <
num; k++) {
74 printf(
"Running %d Threads\n", nThreads);
76 tbb::this_task_arena::isolate([&] {
78 tbb::parallel_for(tbb::blocked_range<uint32_t>(0,
x.nBlocks, 1), [&](
const tbb::blocked_range<uint32_t>&
r) {
79 typename T::GPUSharedMemory smem;
80 for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
81 T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
87 for (uint32_t iB = 0; iB <
x.nBlocks; iB++) {
88 typename T::GPUSharedMemory smem;
89 T::template Thread<I>(
x.nBlocks, 1, iB, 0, smem, T::Processor(*
mHostConstantMem)[
y.start + k], args...);
97inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(
const krnlSetupTime& _xyz,
void*
const&
ptr, uint64_t
const&
size)
101 tbb::parallel_for(0, nnThreads, [&](
int iThread) {
102 size_t threadSize =
size / nnThreads;
103 if (threadSize % 4096) {
104 threadSize += 4096 - threadSize % 4096;
106 size_t offset = threadSize * iThread;
107 size_t mySize = std::min<size_t>(threadSize,
size -
offset);
111 }, tbb::static_partitioner());
118template <
class T, int32_t I,
typename... Args>
124template <
class T,
int32_t I>
130#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
131 template int32_t GPUReconstructionCPUBackend::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
132 template krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>();
133#include "GPUReconstructionKernelList.h"
145size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(
GPUProcessor* proc, int32_t
stream,
bool all,
bool toGPU)
152 if (
res.mPtr ==
nullptr) {
155 if (proc &&
res.mProcessor != proc) {
176#if defined(__APPLE__)
179 return ((int32_t)(
size_t)GetCurrentThread());
181 return ((int32_t)syscall(SYS_gettid));
225 const std::clock_t cpuTimerStart = std::clock();
235 for (uint32_t
i = 0;
i <
mChains.size();
i++) {
243 mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
246 std::string nEventReport;
250 double kernelTotal = 0;
254 for (uint32_t
i = 0;
i <
mTimers.size();
i++) {
271 kernelStepTimes[stepNum] +=
time;
273 char bandwidth[256] =
"";
277 printf(
"Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n",
type == 0 ?
'K' :
'C',
mTimers[
i]->count,
mTimers[
i]->name.c_str(),
time * 1000000 /
mStatNEvents, bandwidth);
286 if (kernelStepTimes[
i] != 0. ||
mTimersRecoSteps[
i].timerTotal.GetElapsedTime() != 0.) {
287 printf(
"Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n",
"Tasks",
315 printf(
"Execution Time: Total : %50s Time: %'10.0f us%s\n",
"Total Kernel",
mStatKernelTime, nEventReport.c_str());
319 GPUInfo(
"Total Wall Time: %10.0f us%s",
mStatWallTime, nEventReport.c_str());
344 if (!((
size_t)&
param().occupancyTotal - (
size_t)&
param().occupancyMap ==
sizeof(
param().occupancyMap) &&
sizeof(
param().occupancyMap) ==
sizeof(
size_t) &&
sizeof(
param().occupancyTotal) <
sizeof(
size_t))) {
345 throw std::runtime_error(
"occupancy data not consecutive in GPUParam");
348 size_t tmp[2] = {(size_t)mapGPU, 0};
349 memcpy(&tmp[1], &occupancyTotal,
sizeof(occupancyTotal));
#define GPUCA_OPERATOR_NEW_ALIGNMENT
Online TRD tracker based on extrapolated TPC tracks.
Used for storing the MC labels for the TRD tracklets.
TRD Tracklet word for GPU tracker - 32bit tracklet info + half chamber ID + index.
static constexpr const char *const GENERAL_STEP_NAMES[]
static constexpr const char *const RECO_STEP_NAMES[]
static constexpr int32_t N_RECO_STEPS
static constexpr int32_t N_GENERAL_STEPS
ProcessorType mGPUProcessorType
int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime &_xyz, const Args &... args)
int32_t runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs< T, I, Args... > &args)
gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend()
~GPUReconstructionCPU() override
virtual size_t GPUMemCpy(void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
virtual size_t GPUMemCpyAlways(bool onGpu, void *dst, const void *src, size_t size, int32_t stream, int32_t toGPU, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
static constexpr krnlRunRange krnlRunRangeNone
size_t TransferMemoryResourceToHost(GPUMemoryResource *res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
int32_t InitDevice() override
void UpdateParamOccupancyMap(const uint32_t *mapHost, const uint32_t *mapGPU, uint32_t occupancyTotal, int32_t stream=-1)
size_t TransferMemoryResourceToGPU(GPUMemoryResource *res, int32_t stream=-1, deviceEvent *ev=nullptr, deviceEvent *evList=nullptr, int32_t nEvents=1)
int32_t RunChains() override
GPUProcessorProcessors mProcShadow
void ResetDeviceProcessorTypes()
int32_t ExitDevice() override
virtual int32_t GPUDebug(const char *state="UNKNOWN", int32_t stream=-1, bool force=false)
static constexpr krnlEvent krnlEventNone
size_t WriteToConstantMemory(size_t offset, const void *src, size_t size, int32_t stream=-1, deviceEvent *ev=nullptr) override
virtual size_t TransferMemoryInternal(GPUMemoryResource *res, int32_t stream, deviceEvent *ev, deviceEvent *evList, int32_t nEvents, bool toGPU, const void *src, void *dst)
virtual std::unique_ptr< gpu_reconstruction_kernels::threadContext > GetThreadContext() override
RecoStepTimerMeta mTimersRecoSteps[GPUDataTypes::N_RECO_STEPS]
int32_t mActiveHostKernelThreads
std::vector< std::unique_ptr< timerMeta > > mTimers
HighResTimer mTimersGeneralSteps[GPUDataTypes::N_GENERAL_STEPS]
int32_t getNKernelHostThreads(bool splitCores)
std::vector< std::unique_ptr< GPUChain > > mChains
GPUReconstruction * mMaster
std::unique_ptr< GPUMemorySizeScalers > mMemoryScalers
GPUConstantMem * processors()
uint32_t mNEventsProcessed
std::vector< GPUReconstruction * > mSlaves
std::vector< GPUMemoryResource > mMemoryResources
std::unique_ptr< GPUConstantMem > mHostConstantMem
std::vector< ProcessorData > mProcessors
GPUSettingsProcessing mProcessingSettings
void WriteConstantParams()
static GPUReconstruction * GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend &cfg)
void ClearAllocatedMemory(bool clearOutputs=true)
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
const GPUSettingsProcessing & GetProcessingSettings() const
int32_t EnqueuePipeline(bool terminate=false)
std::shared_ptr< GPUReconstructionThreading > mThreading
void * mHostMemoryPoolEnd
void * mHostMemoryPermanent
GLint GLint GLsizei GLint GLenum GLenum type
std::string to_string(gsl::span< T, Size > span)
GPUConstantMem * mProcessorsProc
std::tuple< typename std::conditional<(sizeof(Args) > sizeof(void *)), const Args &, const Args >::type... > v
const uint32_t * occupancyMap