GPUTPCNNClusterizerHost.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

#include "GPUTPCNNClusterizerHost.h"

#include "GPUTPCNNClusterizer.h"
#include "GPUSettings.h"
#include "GPUReconstruction.h"
#include "GPUTPCGeometry.h"
#include "CommonUtils/StringUtils.h" // for o2::utils::Str::tokenize
#include "clusterFinderDefs.h"

#ifdef GPUCA_HAS_ONNX
#include <onnxruntime_cxx_api.h>
#endif

using namespace o2::gpu;

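// init() resolves the classification/regression model paths (optionally from the
// local CCDB download folder, depending on nnEvalMode) and fills mOrtOptions, the
// key-value option map handed to the ONNX Runtime model wrappers.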
void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode)
{
  std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
  std::vector<std::string> reg_model_paths_local;
  std::vector<std::string> evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':');

  if (settings.nnLoadFromCCDB) {
    reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Must match the path used in NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place
    if (evalMode[0] == "c1") {
      class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx";
    } else if (evalMode[0] == "c2") {
      class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx";
    }

    if (evalMode[1] == "r2") {
      reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx";
    }
  }
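  // The same option map is reused for each network: it is first filled for the
  // classification model ("c1") and later re-pointed at the regression model(s)
  // ("r1"/"r2") by overwriting "model-path" and "onnx-environment-name" below.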
  mOrtOptions = {
    {"model-path", class_model_path},
    {"device-type", settings.nnInferenceDevice},
    {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
    {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
    {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
    {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
    {"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025)
    {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
    {"profiling-output-path", settings.nnInferenceOrtProfilingPath},
    {"logging-level", std::to_string(settings.nnInferenceVerbosity)},
    {"onnx-environment-name", "c1"}};

  mModelsUsed[0] = true;

  reg_model_paths_local = o2::utils::Str::tokenize(reg_model_path, ':');

  if (!settings.nnClusterizerUseCfRegression) {
    if (reg_model_paths_local.size() == 1) {
      mOrtOptions["model-path"] = reg_model_paths_local[0];
      mOrtOptions["onnx-environment-name"] = "r1";
      mModelsUsed[1] = true;
    } else {
      mOrtOptions["model-path"] = reg_model_paths_local[0];
      mOrtOptions["onnx-environment-name"] = "r1";
      mModelsUsed[1] = true;
      mOrtOptions["model-path"] = reg_model_paths_local[1];
      mOrtOptions["onnx-environment-name"] = "r2";
      mModelsUsed[2] = true;
    }
  }
}
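
// Rough usage sketch (not part of this file; variable names are illustrative only):
//   GPUTPCNNClusterizerHost nnHost;
//   nnHost.init(settingsNN);                          // build ORT options, resolve model paths
//   nnHost.initClusterizer(settingsNN, clustererNN);  // copy settings into the device-side clusterizer
// where settingsNN stands for the GPUSettingsProcessingNNclusterizer instance and
// clustererNN for the GPUTPCNNClusterizer object used by the clusterizer kernels.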
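// initClusterizer() copies the NN clusterizer settings into the (device-visible)
// GPUTPCNNClusterizer object and derives the input-window sizes: each dimension
// spans 2*N+1 entries around the central digit (row, pad, time).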
void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN, int32_t maxFragmentLen, int32_t maxAllowedTimebin)
{
  clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression;
  clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
  clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
  clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
  clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
  clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
  clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
  clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
  clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
  // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
  // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
  // clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
  // clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
  // clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
  clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
  clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
  clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
  clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
  clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
  clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
  clustererNN.maxFragmentLen = maxFragmentLen == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxFragmentLen;
  clustererNN.maxAllowedTimebin = maxAllowedTimebin == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxAllowedTimebin;
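  // If the sigmoid transformation is enabled, the class threshold is mapped into
  // logit space, log(p / (1 - p)), i.e. the inverse sigmoid, so it can be compared
  // against the raw network output without applying a sigmoid per candidate.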
  if (clustererNN.mNnSigmoidTrafoClassThreshold) {
    clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
  } else {
    clustererNN.mNnClassThreshold = settings.nnClassThreshold;
  }
  if (settings.nnClusterizerVerbosity < 0) {
    clustererNN.mNnClusterizerVerbosity = settings.nnInferenceVerbosity;
  } else {
    clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity;
  }
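  // The I/O dtype settings are matched as strings: a "32" substring sets the flag
  // (32-bit float); otherwise the flag stays false (presumably the 16-bit path).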
  clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos;
  clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos;

  if (!settings.nnClusterizerUseCfRegression) {
    if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) {
    } else {
    }
  }
}

// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
// {
//   // Call after init of the clustererNN elements
//   for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
//     int8_t skipCheckInRow = 0;
//     for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
//       int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
//       clustererNN.mIsBoundary[i] = 1;
//       if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
//         if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
//         } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
//         }
//         skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
//       }
//     }
//   }
// }

// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
// {
//   for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
//     int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
//     int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
//     int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
//     int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
//     clustererNN.mIndexLookup[3 * i] = r;
//     clustererNN.mIndexLookup[3 * i + 1] = p;
//     clustererNN.mIndexLookup[3 * i + 2] = t;
//   }
// }

// MockedOrtAllocator: custom ONNX Runtime allocator that serves requests from GPUReconstruction's volatile (stack) device memory
struct MockedOrtAllocator : public OrtAllocator {
  MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr);
  ~MockedOrtAllocator();

  void* Alloc(size_t size);
  void Free(void* p);
  const OrtMemoryInfo* Info() const;
  void* Reserve(size_t size);
  size_t NumAllocations() const;
  size_t NumReserveAllocations() const;

  void LeakCheck();

 private:
  MockedOrtAllocator(const MockedOrtAllocator&) = delete;
  MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete;

  std::atomic<size_t> memory_inuse{0};
  std::atomic<size_t> num_allocations{0};
  std::atomic<size_t> num_reserve_allocations{0};
  OrtMemoryInfo* mMemoryInfoInternal;
  GPUReconstruction* mRecInternal;
};
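// The constructor wires the C-style OrtAllocator function pointers to the C++
// member functions, so ONNX Runtime can call back into this allocator.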
MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
{
  OrtAllocator::version = ORT_API_VERSION;
  OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Alloc(size); };
  OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
  OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
  OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
  mRecInternal = r;
  mMemoryInfoInternal = info;
}

MockedOrtAllocator::~MockedOrtAllocator()
{
  // Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
  (void)0; // Suppress warning for empty destructor
}
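// Alloc() and Reserve() both hand the request to GPUReconstruction, which returns
// GPU stack memory (MEMORY_GPU | MEMORY_STACK). Free() is left empty, consistent
// with stack-style memory that is released in bulk by the reconstruction framework.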
void* MockedOrtAllocator::Alloc(size_t size)
{
  LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void* MockedOrtAllocator::Reserve(size_t size)
{
  LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void MockedOrtAllocator::Free(void* p)
{
  // LOG(info) << "(ORT) Freeing volatile memory " << p;
}

const OrtMemoryInfo* MockedOrtAllocator::Info() const
{
  return mMemoryInfoInternal;
}

size_t MockedOrtAllocator::NumAllocations() const
{
  return num_allocations.load();
}

size_t MockedOrtAllocator::NumReserveAllocations() const
{
  return num_reserve_allocations.load();
}

void MockedOrtAllocator::LeakCheck()
{
  if (memory_inuse.load()) {
    LOG(warning) << "(ORT) MockedOrtAllocator: potential memory leak detected";
  }
}
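// directOrtAllocator() registers the mocked allocator with the ONNX Runtime
// environment (unregistering a previously registered one if recreate is set), so
// that ORT device allocations for this memory info go through GPUReconstruction.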
void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
{
  mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
  if (recreate) {
    Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo)));
  }
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mMockedAlloc.get()));
  memInfo = (Ort::MemoryInfo*)mMockedAlloc->Info();
}

{
  return mMockedAlloc->Info();
}