GPUTPCNNClusterizerHost.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

#include "GPUTPCNNClusterizerHost.h"

#include "GPUTPCNNClusterizer.h"
#include "GPUSettings.h"
#include "GPUReconstruction.h"
#include "GPUTPCGeometry.h"
#include "clusterFinderDefs.h"

#ifdef GPUCA_HAS_ONNX
#include <onnxruntime_cxx_api.h>
#endif

using namespace o2::gpu;

void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode)
{
  std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
  std::vector<std::string> reg_model_paths_local;
  std::vector<std::string> evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':');

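  // nnEvalMode is expected to have the form "<classification>:<regression>", e.g. "c1:r1" or "c2:r2",
  // selecting which classification network and which regression network(s) are loaded below.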
  if (settings.nnLoadFromCCDB) {
    reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Must match NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place
    if (evalMode[0] == "c1") {
      class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx";
    } else if (evalMode[0] == "c2") {
      class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx";
    }

    if (evalMode[1] == "r2") {
      reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx";
    }
  }

  mOrtOptions = {
    {"model-path", class_model_path},
    {"device-type", settings.nnInferenceDevice},
    {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
    {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
    {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
    {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
    {"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025)
    {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
    {"profiling-output-path", settings.nnInferenceOrtProfilingPath},
    {"logging-level", std::to_string(settings.nnInferenceVerbosity)},
    {"onnx-environment-name", "c1"}};

  mModelsUsed[0] = true;

  reg_model_paths_local = o2::utils::Str::tokenize(reg_model_path, ':');

  if (!settings.nnClusterizerUseCfRegression) {
    if (reg_model_paths_local.size() == 1) {
      mOrtOptions["model-path"] = reg_model_paths_local[0];
      mOrtOptions["onnx-environment-name"] = "r1";
      mModelsUsed[1] = true;
    } else {
      mOrtOptions["model-path"] = reg_model_paths_local[0];
      mOrtOptions["onnx-environment-name"] = "r1";
      mModelsUsed[1] = true;
      mOrtOptions["model-path"] = reg_model_paths_local[1];
      mOrtOptions["onnx-environment-name"] = "r2";
      mModelsUsed[2] = true;
    }
  }
}
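
// A minimal sketch (not part of this file's interface; the Ort objects and names below are
// assumptions) of how the key-value pairs collected in mOrtOptions above typically map onto
// ONNX Runtime session options; the model wrapper consuming mOrtOptions is assumed to do the
// equivalent internally:
//
//   Ort::SessionOptions opts;
//   opts.SetIntraOpNumThreads(std::stoi(mOrtOptions["intra-op-num-threads"]));
//   opts.SetInterOpNumThreads(std::stoi(mOrtOptions["inter-op-num-threads"]));
//   if (std::stoi(mOrtOptions["enable-optimizations"])) {
//     opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
//   }
//   if (std::stoi(mOrtOptions["enable-profiling"])) {
//     opts.EnableProfiling(mOrtOptions["profiling-output-path"].c_str());
//   }
//   Ort::Session session(env, mOrtOptions["model-path"].c_str(), opts);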

void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN, int32_t maxFragmentLen, int32_t maxAllowedTimebin)
{
  clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression;
  clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
  clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
  clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
  clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
  clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
  clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
  clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
  clustererNN.mNnClusterizerRowTimeSizeThreads = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 1 : 0);
  clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
  // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
  // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
  // clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
  // clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
  // clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
  clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
  clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
  clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
  clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
  clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
  clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
  clustererNN.maxFragmentLen = maxFragmentLen == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxFragmentLen;
  clustererNN.maxAllowedTimebin = maxAllowedTimebin == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxAllowedTimebin;
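  // If the sigmoid transformation is enabled, the user threshold t (a probability) is mapped
  // through the inverse sigmoid (logit), log(t / (1 - t)), so the cut can be applied directly
  // to the raw network output before any sigmoid is evaluated.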
  if (clustererNN.mNnSigmoidTrafoClassThreshold) {
    clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
  } else {
    clustererNN.mNnClassThreshold = settings.nnClassThreshold;
  }
  if (settings.nnClusterizerVerbosity < 0) {
    clustererNN.mNnClusterizerVerbosity = settings.nnInferenceVerbosity;
  } else {
    clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity;
  }
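  // The DType flags are true when the configured dtype string contains "32" (32-bit float tensors);
  // other values presumably select a 16-bit type.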
  clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos;
  clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos;
  if (!settings.nnClusterizerUseCfRegression) {
    if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) {
    } else {
    }
  }
}
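
// A minimal usage sketch (caller-side names such as nnHost, settings and clustererNN are
// assumptions, not part of this file): the host object is configured first, then the
// per-fragment clusterizer is filled from the same settings.
//
//   GPUTPCNNClusterizerHost nnHost;
//   nnHost.init(settings);                         // builds mOrtOptions and resolves model paths
//   nnHost.initClusterizer(settings, clustererNN); // maxFragmentLen / maxAllowedTimebin default to -1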

// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
// {
//   // Call after init of the clustererNN elements
//   for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
//     int8_t skipCheckInRow = 0;
//     for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
//       int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
//       clustererNN.mIsBoundary[i] = 1;
//       if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
//         if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
//         } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
//         }
//         skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
//       }
//     }
//   }
// }

// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
// {
//   for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
//     int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
//     int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
//     int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
//     int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
//     clustererNN.mIndexLookup[3 * i] = r;
//     clustererNN.mIndexLookup[3 * i + 1] = p;
//     clustererNN.mIndexLookup[3 * i + 2] = t;
//   }
// }

// MockedOrtAllocator implementation, used to route ONNX Runtime allocations to GPUReconstruction's direct (volatile) memory
struct MockedOrtAllocator : public OrtAllocator {
  MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr);
  ~MockedOrtAllocator();

  void* Alloc(size_t size);
  void Free(void* p);
  const OrtMemoryInfo* Info() const;
  void* Reserve(size_t size);
  size_t NumAllocations() const;
  size_t NumReserveAllocations() const;

  void LeakCheck();

 private:
  MockedOrtAllocator(const MockedOrtAllocator&) = delete;
  MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete;

  std::atomic<size_t> memory_inuse{0};
  std::atomic<size_t> num_allocations{0};
  std::atomic<size_t> num_reserve_allocations{0};
  OrtMemoryInfo* mMemoryInfoInternal;
  GPUReconstruction* mRecInternal;
};

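// The constructor below wires the C-style OrtAllocator function pointers to the C++ member
// functions: each lambda receives the base OrtAllocator* and downcasts it back to the
// MockedOrtAllocator, so ONNX Runtime can drive this allocator through its plain C interface.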
MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
{
  OrtAllocator::version = ORT_API_VERSION;
  OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Alloc(size); };
  OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
  OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
  OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
  mRecInternal = r;
  mMemoryInfoInternal = info;
}

MockedOrtAllocator::~MockedOrtAllocator()
{
  // Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
  (void)0; // Suppress warning for empty destructor
}

void* MockedOrtAllocator::Alloc(size_t size)
{
  LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void* MockedOrtAllocator::Reserve(size_t size)
{
  LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void MockedOrtAllocator::Free(void* p)
{
  // LOG(info) << "(ORT) Freeing volatile memory " << p;
}

const OrtMemoryInfo* MockedOrtAllocator::Info() const
{
  return mMemoryInfoInternal;
}

size_t MockedOrtAllocator::NumAllocations() const
{
  return num_allocations.load();
}

size_t MockedOrtAllocator::NumReserveAllocations() const
{
  return num_reserve_allocations.load();
}

void MockedOrtAllocator::LeakCheck()
{
  if (memory_inuse.load()) {
    LOG(warning) << "(ORT) Memory leak detected: " << memory_inuse.load() << " bytes still in use";
  }
}

void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
{
  mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
  if (recreate) {
    Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo)));
  }
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mMockedAlloc.get()));
  memInfo = (Ort::MemoryInfo*)mMockedAlloc->Info();
}
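
// A minimal sketch (assuming env and a model path are set up elsewhere) of how a session picks
// up the allocator registered above; ONNX Runtime only consults environment-registered
// allocators when the corresponding session config entry is enabled:
//
//   Ort::SessionOptions opts;
//   opts.AddConfigEntry("session.use_env_allocators", "1");
//   Ort::Session session(*env, modelPath, opts); // Alloc()/Reserve() above now serve ORT requests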

{
  return mMockedAlloc->Info();
}