16#ifndef ITSTRACKINGGPU_UTILS_H_
17#define ITSTRACKINGGPU_UTILS_H_
32#include <thrust/device_ptr.h>
34#define THRUST_NAMESPACE thrust::cuda
36#define THRUST_NAMESPACE thrust::hip
43 LOGP(info, __VA_ARGS__); \
44 GPUChkErrS(cudaDeviceSynchronize()); \
57template <
typename T1,
typename T2>
71 GPUd()
ref operator[](
unsigned int idx)
const {
return _data[idx]; }
93 GPUd()
ref operator[](
unsigned int idx)
const {
return _data[idx]; }
110#if defined(__HIPCC__)
111 using Handle = hipStream_t;
113 static constexpr unsigned int DefaultFlag = hipStreamNonBlocking;
114 using Event = hipEvent_t;
115#elif defined(__CUDACC__)
116 using Handle = cudaStream_t;
118 static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking;
119 using Event = cudaEvent_t;
129#if defined(__HIPCC__)
131 GPUChkErrS(hipEventCreateWithFlags(&mEvent, hipEventDisableTiming));
132#elif defined(__CUDACC__)
134 GPUChkErrS(cudaEventCreateWithFlags(&mEvent, cudaEventDisableTiming));
142#if defined(__HIPCC__)
145#elif defined(__CUDACC__)
158#if defined(__HIPCC__)
160#elif defined(__CUDACC__)
166#if defined(__HIPCC__)
168#elif defined(__CUDACC__)
175 Event mEvent{
nullptr};
182 size_t size() const noexcept {
return mStreams.size(); }
190#if defined(__HIPCC__)
192#elif defined(__CUDACC__)
196 for (
auto& s : mStreams) {
203#if defined(__HIPCC__)
204 GPUChkErrS(hipStreamWaitEvent(mStreams[iStream].
get(), mStreams[iEvent].getEvent()));
205#elif defined(__CUDACC__)
206 GPUChkErrS(cudaStreamWaitEvent(mStreams[iStream].
get(), mStreams[iEvent].getEvent()));
211 std::vector<Stream> mStreams;
214#ifdef ITS_MEASURE_GPU_TIME
218 GPUTimer(
const std::string&
name)
224 GPUTimer(Streams& streams,
const std::string&
name)
227 for (
size_t i{0};
i < streams.size(); ++
i) {
228 mStreams.push_back(streams[
i].
get());
235 for (
size_t sta{
start}; sta <
end; ++sta) {
236 mStreams.push_back(streams[sta].
get());
243 mStreams.push_back(
stream.get());
249 for (
size_t i{0};
i < mStreams.size(); ++
i) {
251#if defined(__HIPCC__)
254 GPUChkErrS(hipEventElapsedTime(&ms, mStarts[
i], mStops[
i]));
257#elif defined(__CUDACC__)
260 GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[
i], mStops[
i]));
264 LOGP(info,
"Elapsed time for {}:{} {} ms", mName,
i, ms);
270 mStarts.resize(mStreams.size());
271 mStops.resize(mStreams.size());
272 for (
size_t i{0};
i < mStreams.size(); ++
i) {
273#if defined(__HIPCC__)
277#elif defined(__CUDACC__)
287 std::vector<Stream::Event> mStarts, mStops;
288 std::vector<Stream::Handle> mStreams;
294 template <
typename... Args>
303struct TypedAllocator {
304 using value_type = T;
305 using pointer = thrust::device_ptr<T>;
306 using const_pointer = thrust::device_ptr<const T>;
307 using size_type = std::size_t;
308 using difference_type = std::ptrdiff_t;
310 TypedAllocator() noexcept : mInternalAllocator(
nullptr) {}
311 explicit TypedAllocator(ExternalAllocator*
a) noexcept : mInternalAllocator(
a) {}
313 template <
typename U>
314 TypedAllocator(
const TypedAllocator<U>& o) noexcept : mInternalAllocator(o.mInternalAllocator)
320 void*
raw = mInternalAllocator->allocateStack(
n *
sizeof(T));
321 return thrust::device_pointer_cast(
static_cast<T*
>(
raw));
324 void deallocate(
pointer p, size_type
n)
noexcept
329 void*
raw = thrust::raw_pointer_cast(p);
330 mInternalAllocator->deallocate(
static_cast<char*
>(
raw),
n *
sizeof(T));
333 bool operator==(TypedAllocator
const& o)
const noexcept
335 return mInternalAllocator == o.mInternalAllocator;
337 bool operator!=(TypedAllocator
const& o)
const noexcept
339 return !(*
this == o);
343 ExternalAllocator* mInternalAllocator;
346GPUdii() gpuSpan<const
Vertex> getPrimaryVertices(const
int rof,
347 const
int* roframesPV,
352 const int start_pv_id = roframesPV[rof];
353 const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1;
354 size_t delta =
mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0;
355 return gpuSpan<const Vertex>(&vertices[start_pv_id], delta);
358GPUdii() gpuSpan<const
Vertex> getPrimaryVertices(const
int romin,
360 const
int* roframesPV,
364 const int start_pv_id = roframesPV[romin];
365 const int stop_rof = romax >= nROF - 1 ? nROF : romax + 1;
366 return gpuSpan<const Vertex>(&vertices[start_pv_id], roframesPV[stop_rof] - roframesPV[romin]);
369GPUdii() gpuSpan<const
Cluster> getClustersOnLayer(const
int rof,
372 const
int** roframesClus,
375 if (rof < 0 || rof >= totROFs) {
376 return gpuSpan<const Cluster>();
378 const int start_clus_id{roframesClus[
layer][rof]};
379 const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
380 const unsigned int delta = roframesClus[
layer][stop_rof] - start_clus_id;
381 return gpuSpan<const Cluster>(&(
clusters[
layer][start_clus_id]), delta);
384GPUdii() gpuSpan<const Tracklet> getTrackletsPerCluster(const
int rof,
387 const
int** roframesClus,
390 if (rof < 0 || rof >= totROFs) {
391 return gpuSpan<const Tracklet>();
393 const int start_clus_id{roframesClus[1][rof]};
394 const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
395 const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
396 return gpuSpan<const Tracklet>(&(
tracklets[
mode][start_clus_id]), delta);
399GPUdii() gpuSpan<
int> getNTrackletsPerCluster(const
int rof,
402 const
int** roframesClus,
405 if (rof < 0 || rof >= totROFs) {
406 return gpuSpan<int>();
408 const int start_clus_id{roframesClus[1][rof]};
409 const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
410 const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
411 return gpuSpan<int>(&(ntracklets[
mode][start_clus_id]), delta);
414GPUdii() gpuSpan<const
int> getNTrackletsPerCluster(const
int rof,
417 const
int** roframesClus,
418 const
int** ntracklets)
420 if (rof < 0 || rof >= totROFs) {
421 return gpuSpan<const int>();
423 const int start_clus_id{roframesClus[1][rof]};
424 const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
425 const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
426 return gpuSpan<const int>(&(ntracklets[
mode][start_clus_id]), delta);
429GPUdii() gpuSpan<
int> getNLinesPerCluster(const
int rof,
431 const
int** roframesClus,
434 if (rof < 0 || rof >= totROFs) {
435 return gpuSpan<int>();
437 const int start_clus_id{roframesClus[1][rof]};
438 const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
439 const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
440 return gpuSpan<int>(&(nlines[start_clus_id]), delta);
443GPUdii() gpuSpan<const
int> getNLinesPerCluster(const
int rof,
445 const
int** roframesClus,
448 if (rof < 0 || rof >= totROFs) {
449 return gpuSpan<const int>();
451 const int start_clus_id{roframesClus[1][rof]};
452 const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
453 const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id;
454 return gpuSpan<const int>(&(nlines[start_clus_id]), delta);
o2::raw::RawFileWriter * raw
Class for time synchronization of RawReader instances.
HMPID cluster implementation.
const Handle & getStream()
static constexpr Handle DefaultStream
static constexpr unsigned int DefaultFlag
Stream(unsigned int flags=DefaultFlag)
void push_back(const Stream &stream)
void waitEvent(size_t iStream, size_t iEvent)
void sync(bool device=true)
auto & operator[](size_t i)
size_t size() const noexcept
GLuint const GLchar * name
GLenum GLuint GLint GLint layer
GLboolean GLboolean GLboolean GLboolean a
std::pair< T1, T2 > gpuPair
o2::dataformats::Vertex< o2::its::TimeEstBC > Vertex
bool operator!=(const DsChannelId &a, const DsChannelId &b)
std::string to_string(gsl::span< T, Size > span)
GPUd() gpuSpan(const gpuSpan< T > &other)
GPUd() bool empty() const
GPUd() ref operator[](unsigned int idx) const
GPUd() unsigned int size() const
GPUd() bool empty() const
GPUd() ref operator[](unsigned int idx) const
GPUd() unsigned int size() const
VectorOfTObjectPtrs other
std::vector< Cluster > clusters
std::vector< Tracklet64 > tracklets