// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUTPCNNClusterizerHost.cxx

#include "GPUTPCNNClusterizerHost.h"
#include "GPUTPCNNClusterizer.h"
#include "GPUSettings.h"
#include "GPUReconstruction.h"
#include "GPUTPCGeometry.h"
#include "CommonUtils/StringUtils.h"

#include <cmath>

#ifdef GPUCA_HAS_ONNX
#include <onnxruntime_cxx_api.h>
#endif

using namespace o2::gpu;

void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode)
{
  std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
  std::vector<std::string> reg_model_paths_local;
  std::vector<std::string> evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':');

  if (settings.nnLoadFromCCDB) {
    reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Must match the path used in NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place
    if (evalMode[0] == "c1") {
      class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx";
    } else if (evalMode[0] == "c2") {
      class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx";
    }

    if (evalMode[1] == "r2") {
      reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx";
    }
  }

  mOrtOptions = {
    {"model-path", class_model_path},
    {"device-type", settings.nnInferenceDevice},
    {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
    {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
    {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
    {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
    {"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025)
    {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
    {"profiling-output-path", settings.nnInferenceOrtProfilingPath},
    {"logging-level", std::to_string(settings.nnInferenceVerbosity)},
    {"onnx-environment-name", "c1"}};

  mModelClass.initOptions(mOrtOptions);
  mModelsUsed[0] = true;

  reg_model_paths_local = o2::utils::Str::tokenize(reg_model_path, ':');

  if (!settings.nnClusterizerUseCfRegression) {
    if (reg_model_paths_local.size() == 1) {
      mOrtOptions["model-path"] = reg_model_paths_local[0];
      mOrtOptions["onnx-environment-name"] = "r1";
      mModelReg1.initOptions(mOrtOptions);
      mModelsUsed[1] = true;
    } else {
      mOrtOptions["model-path"] = reg_model_paths_local[0];
      mOrtOptions["onnx-environment-name"] = "r1";
      mModelReg1.initOptions(mOrtOptions);
      mModelsUsed[1] = true;
      mOrtOptions["model-path"] = reg_model_paths_local[1];
      mOrtOptions["onnx-environment-name"] = "r2";
      mModelReg2.initOptions(mOrtOptions);
      mModelsUsed[2] = true;
    }
  }
}

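// Illustrative example (comment only; the values are hypothetical): with
// nnLoadFromCCDB = true, nnEvalMode = "c2:r2" and nnLocalFolder = "/tmp/nn",
// init() resolves
//   class_model_path = "/tmp/nn/net_classification_c2.onnx"
//   reg_model_path   = "/tmp/nn/net_regression_c1.onnx:/tmp/nn/net_regression_c2.onnx"
// and the regression string is then split again on ':' so that the r1/r2
// models are initialized separately.
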
void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN)
{
  clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression;
  clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
  clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
  clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
  clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
  clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
  clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
  clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize;
  clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
  clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
  clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
  // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
  // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
  // clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
  // clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
  // clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
  clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
  clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
  clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
  clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
  clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
  clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
  if (clustererNN.mNnSigmoidTrafoClassThreshold) {
    clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
  } else {
    clustererNN.mNnClassThreshold = settings.nnClassThreshold;
  }
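  // Illustrative numeric check (comment only): with the sigmoid transformation
  // enabled, the stored cut is the logit of the configured probability, so the
  // raw network output can be thresholded without evaluating a sigmoid per
  // candidate: nnClassThreshold = 0.5 -> log(0.5 / 0.5) = 0, and
  // nnClassThreshold = 0.9 -> log(0.9 / 0.1) ~= 2.197.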
  if (settings.nnClusterizerVerbosity < 0) {
    clustererNN.mNnClusterizerVerbosity = settings.nnInferenceVerbosity;
  } else {
    clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity;
  }
  clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos;
  clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos;

  if (!settings.nnClusterizerUseCfRegression) {
    if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) {
      clustererNN.mNnClusterizerModelReg1NumOutputNodes = mModelReg1.getNumOutputNodes()[0][1];
    } else {
      clustererNN.mNnClusterizerModelReg1NumOutputNodes = mModelReg1.getNumOutputNodes()[0][1];
      clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1];
    }
  }
}
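
// Worked size example (comment only; the window sizes are hypothetical, not
// necessarily the O2 defaults): for nnClusterizerSizeInputRow/Pad/Time = 3 the
// input window is 7 x 7 x 7 = 343 charge values, and with
// nnClusterizerAddIndexData = true three extra entries of index data
// (presumably sector/row/pad) are appended, giving an element size of 346
// values per cluster candidate.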

// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
// {
//   // Call after init of the clustererNN elements
//   for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
//     int8_t skipCheckInRow = 0;
//     for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
//       int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
//       clustererNN.mIsBoundary[i] = 1;
//       if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
//         if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
//         } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
//         }
//         skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
//       }
//     }
//   }
// }

// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
// {
//   for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
//     int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
//     int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
//     int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
//     int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
//     clustererNN.mIndexLookup[3 * i] = r;
//     clustererNN.mIndexLookup[3 * i + 1] = p;
//     clustererNN.mIndexLookup[3 * i + 2] = t;
//   }
// }
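
// Illustrative decomposition example (comment only): for
// sizeInputRow = sizeInputPad = sizeInputTime = 1 the charge array holds
// 3 * 3 * 3 = 27 samples and the flat index factorizes as
//   i = (r + 1) * 9 + (p + 1) * 3 + (t + 1),
// so i = 13 maps back to the window center (r, p, t) = (0, 0, 0) and
// i = 26 to the corner (1, 1, 1).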

// MockedOrtAllocator implementation, needed to be able to use volatile memory assignment via GPUReconstruction
struct MockedOrtAllocator : public OrtAllocator {
  MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr);
  ~MockedOrtAllocator();

  void* Alloc(size_t size);
  void Free(void* p);
  const OrtMemoryInfo* Info() const;
  void* Reserve(size_t size);
  size_t NumAllocations() const;
  size_t NumReserveAllocations() const;

  void LeakCheck();

 private:
  MockedOrtAllocator(const MockedOrtAllocator&) = delete;
  MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete;

  std::atomic<size_t> memory_inuse{0};
  std::atomic<size_t> num_allocations{0};
  std::atomic<size_t> num_reserve_allocations{0};
  OrtMemoryInfo* mMemoryInfoInternal;
  GPUReconstruction* mRecInternal;
};

MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
{
  // Wire the C-style OrtAllocator callbacks to the member functions of this class
  OrtAllocator::version = ORT_API_VERSION;
  OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Alloc(size); };
  OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
  OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
  OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
  mRecInternal = r;
  mMemoryInfoInternal = info;
}

MockedOrtAllocator::~MockedOrtAllocator()
{
  // Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
  (void)0; // Suppress warning for empty destructor
}

void* MockedOrtAllocator::Alloc(size_t size)
{
  LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void* MockedOrtAllocator::Reserve(size_t size)
{
  LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void MockedOrtAllocator::Free(void* p)
{
  // LOG(info) << "(ORT) Freeing volatile memory " << p;
}

const OrtMemoryInfo* MockedOrtAllocator::Info() const
{
  return mMemoryInfoInternal;
}

size_t MockedOrtAllocator::NumAllocations() const
{
  return num_allocations.load();
}

size_t MockedOrtAllocator::NumReserveAllocations() const
{
  return num_reserve_allocations.load();
}

void MockedOrtAllocator::LeakCheck()
{
  if (memory_inuse.load()) {
    LOG(warning) << "(ORT) Memory leak detected: " << memory_inuse.load() << " bytes still in use";
  }
}

void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
{
  mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
  if (recreate) {
    Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo)));
  }
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mMockedAlloc.get()));
  memInfo = (Ort::MemoryInfo*)mMockedAlloc->Info(); // note: this only updates the local copy of the memInfo pointer
}
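
// Usage sketch (comment only; not part of this file's call chain, and the
// device string and variable names are assumptions): the allocator is
// registered once per Ort::Env so that ONNX Runtime allocations on the
// matching OrtMemoryInfo are served by GPUReconstruction::AllocateDirectMemory:
//   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "c1");
//   Ort::MemoryInfo memInfo("Hip", OrtDeviceAllocator, 0, OrtMemTypeDefault);
//   clusterizerHost.directOrtAllocator(&env, &memInfo, rec, false);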

const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo()
{
  return mMockedAlloc->Info();
}