Project
Loading...
Searching...
No Matches
GPUReconstructionProcessing.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#if !defined(GPURECONSTRUCTIONPROCESSING_H)
16#define GPURECONSTRUCTIONPROCESSING_H
17
18#include "GPUReconstruction.h"
19
20#include "utils/timer.h"
21#include <functional>
22#include <atomic>
23
// Forward declaration of ONNX Runtime's session options, so this header does not
// need to include the ONNX Runtime headers.
namespace Ort
{
struct SessionOptions;
} // namespace Ort
28
29namespace o2::gpu
30{
31
32struct GPUDefParameters;
33
namespace gpu_reconstruction_kernels // TODO: Get rid of this namespace
{

// Type-erased GPU event handle. cl_event, cudaEvent_t and hipEvent_t are all
// pointer types, so a single void* can hold any backend's event.
// NOTE(review): the struct header and the getEventList() signature were lost in the
// extraction this file was recovered from and are reconstructed here -- confirm upstream.
struct deviceEvent {
  constexpr deviceEvent() = default;
  constexpr deviceEvent(std::nullptr_t) : v(nullptr) {}
  // Store a backend event handle (T must be a pointer-sized handle type).
  template <class T>
  void set(T val)
  {
    v = reinterpret_cast<void*&>(val);
  }
  // Reinterpret the stored handle as the backend type T.
  template <class T>
  T& get()
  {
    return reinterpret_cast<T&>(v);
  }
  // View this event as the first element of an array of backend handles.
  template <class T>
  T* getEventList()
  {
    return reinterpret_cast<T*>(this);
  }
  bool isSet() const { return v != nullptr; }

 private:
  void* v = nullptr; // We use only pointers anyway, and since cl_event and cudaEvent_t and hipEvent_t are actually pointers, we can cast them to deviceEvent (void*) this way.
};

// Per-thread context handle; the virtual destructor lets backend implementations
// attach their own setup/teardown.
// NOTE(review): class head and constructor lines reconstructed from the extraction --
// confirm against upstream.
class threadContext
{
 public:
  threadContext();
  virtual ~threadContext();
};

} // namespace gpu_reconstruction_kernels
68
// NOTE(review): the class head (Doxygen line 69) was lost in the extraction; given the
// overrides below it presumably reads "class GPUReconstructionProcessing : public
// GPUReconstruction" -- confirm against upstream. The bare numbers below are Doxygen
// line numbers whose content was lost.
70{
71 public:
73
76
87
 // Compile-time carrier for a kernel type T and instance index I.
 // NOTE(review): the struct header (Doxygen line 89) was lost in the extraction.
88 template <class T, int32_t I = 0>
90 using t = T;
91 static constexpr int32_t i = I;
92 };
93
94 struct krnlExec {
95 constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d = GPUReconstruction::krnlDeviceType::Auto) : nBlocks(b), nThreads(t), stream(s), device(d), step(GPUDataTypes::RecoStep::NoRecoStep) {}
96 constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(GPUReconstruction::krnlDeviceType::Auto), step(st) {}
97 constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d, GPUDataTypes::RecoStep st) : nBlocks(b), nThreads(t), stream(s), device(d), step(st) {}
98 uint32_t nBlocks;
99 uint32_t nThreads;
100 int32_t stream;
103 };
105 constexpr krnlRunRange() = default;
106 constexpr krnlRunRange(uint32_t v) : index(v) {}
107 uint32_t index = 0;
108 };
109 struct krnlEvent {
110 constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {}
113 int32_t nEvents;
114 };
115
117 krnlProperties(int32_t t = 0, int32_t b = 1, int32_t b2 = 0) : nThreads(t), minBlocks(b), forceBlocks(b2) {}
118 uint32_t nThreads;
119 uint32_t minBlocks;
120 uint32_t forceBlocks;
121 uint32_t total() { return forceBlocks ? forceBlocks : (nThreads * minBlocks); }
122 };
123
124 struct krnlSetup {
125 krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
129 };
130
131 struct krnlSetupTime : public krnlSetup {
132 double& t;
133 };
134
135 template <class T, int32_t I = 0, typename... Args>
 // NOTE(review): the struct header (Doxygen lines 136-137) was lost in the extraction;
 // name unknown from this view. The member below packs kernel call arguments, storing
 // each argument by const reference when it is larger than a pointer and by const
 // value otherwise.
138 std::tuple<typename std::conditional<(sizeof(Args) > sizeof(void*)), const Args&, const Args>::type...> v;
139 };
140
141 template <class T, class S>
142 class KernelInterface : public S
143 {
144 public:
145 template <typename... Args>
146 KernelInterface(const Args&... args) : S(args...)
147 {
148 }
149
150 protected:
151 virtual void runKernelVirtual(const int num, const void* args);
152 };
153
154 // Threading
155 int32_t getNKernelHostThreads(bool splitCores);
 // NOTE(review): declarations at Doxygen lines 156-157 were lost in the extraction.
158 uint32_t SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max);
159 void runParallelOuterLoop(bool doGPU, uint32_t nThreads, std::function<void(uint32_t)> lambda);
160 void SetNActiveThreads(int32_t n);
161
162 // Interface to query name of a kernel
163 template <class T, int32_t I>
164 static const char* GetKernelName();
165 const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; }
166 template <class T, int32_t I = 0>
167 static uint32_t GetKernelNum();
168
169 // Public queries for timers
 // NOTE(review): declarations at Doxygen lines 170-171 were lost in the extraction.
172
 // Registers a block of sizeof(T)/sizeof(deviceEvent) events in mEvents and returns a
 // pointer to the storage via 'events' (see the definition at the bottom of this header).
173 template <class T>
174 void AddGPUEvents(T*& events);
175
176 virtual std::unique_ptr<threadContext> GetThreadContext() override;
177
 // Selects the device or CPU parameter set depending on doGPU.
178 const GPUDefParameters& getGPUParameters(bool doGPU) const override { return *(doGPU ? mParDevice : mParCPU); }
179
180 protected:
 // NOTE(review): Doxygen line 181 was lost in the extraction.
182
183 static const std::vector<std::string> mKernelNames;
184
185 int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
186 uint32_t mNActiveThreadsOuterLoop = 1; // Number of threads currently running an outer loop
187
188 std::vector<std::vector<deviceEvent>> mEvents;
189
190 // Timer related stuff
191 struct timerMeta {
192 std::unique_ptr<HighResTimer[]> timer;
193 std::string name;
194 int32_t num; // How many parallel instances to sum up (CPU threads / GPU streams)
195 int32_t type; // 0 = kernel, 1 = CPU step, 2 = DMA transfer
196 uint32_t count; // How often was the timer queried
197 RecoStep step; // Which RecoStep is this
198 size_t memSize; // Memory size for memory bandwidth computation
199 };
200
 // NOTE(review): Doxygen lines 201, 204-205 and 211-212 were lost in the extraction.
202
203 std::vector<std::unique_ptr<timerMeta>> mTimers;
 // Returns kernel timer instance 'num' for <T, I>, creating the bookkeeping on first
 // use (defined at the bottom of this header).
206 template <class T, int32_t I = 0>
207 HighResTimer& getKernelTimer(RecoStep step, int32_t num = 0, size_t addMemorySize = 0, bool increment = true);
 // Returns named timer instance 'num' for <T, J>, creating it on first use.
208 template <class T, int32_t J = -1>
209 HighResTimer& getTimer(const char* name, int32_t num = -1);
210
213
214 private:
215 uint32_t getNextTimerId();
216 timerMeta* getTimerById(uint32_t id, bool increment = true);
217 timerMeta* insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step);
218
219 static std::atomic_flag mTimerFlag;
220};
221
222template <class T>
224{
225 mEvents.emplace_back(std::vector<deviceEvent>(sizeof(T) / sizeof(deviceEvent)));
226 events = (T*)mEvents.back().data();
227}
228
229template <class T, int32_t I>
230HighResTimer& GPUReconstructionProcessing::getKernelTimer(RecoStep step, int32_t num, size_t addMemorySize, bool increment)
231{
232 static int32_t id = getNextTimerId();
233 timerMeta* timer = getTimerById(id, increment);
234 if (timer == nullptr) {
235 timer = insertTimer(id, GetKernelName<T, I>(), -1, NSECTORS, 0, step);
236 }
237 if (addMemorySize) {
238 timer->memSize += addMemorySize;
239 }
240 if (num < 0 || num >= timer->num) {
241 throw std::runtime_error("Invalid timer requested");
242 }
243 return timer->timer[num];
244}
245
246template <class T, int32_t J>
// Returns named timer instance 'num', creating the timerMeta on first call.
// NOTE(review): the signature line (Doxygen 247) was lost in the extraction; per the
// in-class declaration it is:
//   HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t num)
248{
 // One timer id per (T, J) instantiation; initialized once, lazily.
 249 static int32_t id = getNextTimerId();
 250 timerMeta* timer = getTimerById(id);
 251 if (timer == nullptr) {
 // Sized for max(host threads, GPU streams); type 1 = CPU step timer.
 252 int32_t max = std::max<int32_t>({mMaxHostThreads, GPUCA_MAX_STREAMS});
 253 timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep);
 254 }
 255 if (num == -1) {
 // NOTE(review): the body of this branch (Doxygen line 256) was lost in the extraction;
 // presumably it selects the calling host thread's timer instance -- confirm upstream.
 257 }
 258 if (num < 0 || num >= timer->num) {
 259 throw std::runtime_error("Invalid timer requested");
 260 }
 261 return timer->timer[num];
262}
263
264} // namespace o2::gpu
265
266#endif
int32_t i
#define GPUCA_MAX_STREAMS
double num
benchmark::State & st
static constexpr int32_t N_RECO_STEPS
static constexpr int32_t N_GENERAL_STEPS
virtual void runKernelVirtual(const int num, const void *args)
HighResTimer & getGeneralStepTimer(GeneralStep step)
RecoStepTimerMeta mTimersRecoSteps[GPUDataTypes::N_RECO_STEPS]
static const char * GetKernelName()
void runParallelOuterLoop(bool doGPU, uint32_t nThreads, std::function< void(uint32_t)> lambda)
std::vector< std::vector< deviceEvent > > mEvents
std::vector< std::unique_ptr< timerMeta > > mTimers
HighResTimer mTimersGeneralSteps[GPUDataTypes::N_GENERAL_STEPS]
HighResTimer & getKernelTimer(RecoStep step, int32_t num=0, size_t addMemorySize=0, bool increment=true)
const std::string & GetKernelName(int32_t i) const
static const std::vector< std::string > mKernelNames
virtual std::unique_ptr< threadContext > GetThreadContext() override
const GPUDefParameters & getGPUParameters(bool doGPU) const override
uint32_t SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
HighResTimer & getTimer(const char *name, int32_t num=-1)
int32_t getGeneralStepNum(GeneralStep step, bool validCheck=true)
static constexpr uint32_t NSECTORS
int32_t getRecoStepNum(RecoStep step, bool validCheck=true)
GLdouble n
Definition glcorearb.h:1982
GLint GLenum GLint x
Definition glcorearb.h:403
const GLdouble * v
Definition glcorearb.h:832
GLuint index
Definition glcorearb.h:781
GLuint const GLchar * name
Definition glcorearb.h:781
GLdouble f
Definition glcorearb.h:310
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLint GLint GLsizei GLint GLenum GLenum type
Definition glcorearb.h:275
GLuint GLfloat * val
Definition glcorearb.h:1582
GLuint GLuint stream
Definition glcorearb.h:1806
GLdouble GLdouble GLdouble z
Definition glcorearb.h:843
constexpr krnlEvent(deviceEvent *e=nullptr, deviceEvent *el=nullptr, int32_t n=1)
constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUDataTypes::RecoStep st)
constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d=GPUReconstruction::krnlDeviceType::Auto)
constexpr krnlExec(uint32_t b, uint32_t t, int32_t s, GPUReconstruction::krnlDeviceType d, GPUDataTypes::RecoStep st)
krnlSetup(const krnlExec &xx, const krnlRunRange &yy={0}, const krnlEvent &zz={nullptr, nullptr, 0})
constexpr size_t max