Project
Loading...
Searching...
No Matches
GPUReconstructionCUDAGenRTC.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#define GPUCA_GPUCODE_HOSTONLY
16
18#include "GPUParamRTC.h"
19#include "GPUDefParametersLoad.inc"
20#include <unistd.h>
21#include "Framework/SHA1.h"
22#include <sys/stat.h>
23#include <fcntl.h>
24#include <filesystem>
25
26#include <oneapi/tbb.h>
27using namespace o2::gpu;
28
30QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_src);
31QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command);
32QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch);
33QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math);
34
35#include "GPUNoFastMathKernels.h"
36
37int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
38{
39 std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") +
40 std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
41 GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr);
42 if (filename == "") {
43 filename = "/tmp/o2cagpu_rtc_";
44 }
45 filename += std::to_string(getpid());
46 filename += "_";
47 filename += std::to_string(rand());
48
49 std::vector<std::string> kernels;
50 getRTCKernelCalls(kernels);
51 std::string kernelsall;
52 for (uint32_t i = 0; i < kernels.size(); i++) {
53 kernelsall += kernels[i] + "\n";
54 }
55
56 std::string baseCommand = (mProcessingSettings.rtctech.prependCommand != "" ? (mProcessingSettings.rtctech.prependCommand + " ") : "");
57 baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
58 baseCommand += std::string(" ") + (mProcessingSettings.rtctech.overrideArchitecture != "" ? mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
59
60 if (mProcessingSettings.rtctech.loadLaunchBoundsFromFile.size()) {
61 FILE* fp = fopen(mProcessingSettings.rtctech.loadLaunchBoundsFromFile.c_str(), "rb");
62 if (fp == nullptr) {
63 throw std::runtime_error("Cannot open launch bounds parameter module file");
64 }
65 fseek(fp, 0, SEEK_END);
66 size_t size = ftell(fp);
67 if (size != sizeof(*mParDevice)) {
68 throw std::runtime_error("launch bounds parameter file has incorrect size");
69 }
70 fseek(fp, 0, SEEK_SET);
71 if (fread(mParDevice, 1, size, fp) != size) {
72 throw std::runtime_error("Error reading launch bounds parameter file");
73 }
74 fclose(fp);
75 }
76 const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
77 if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) {
78 GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
79 }
80
81 char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
82 if (mProcessingSettings.rtc.cacheOutput) {
83 o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len);
84 o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), rtcparam.size());
85 o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size());
86 o2::framework::internal::SHA1(shakernels, kernelsall.c_str(), kernelsall.size());
87 o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size());
88 }
89
90 nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1;
91 bool cacheLoaded = false;
92 int32_t fd = 0;
93 if (mProcessingSettings.rtc.cacheOutput) {
94 if (mProcessingSettings.rtctech.cacheFolder != ".") {
95 std::filesystem::create_directories(mProcessingSettings.rtctech.cacheFolder);
96 }
97 if (mProcessingSettings.rtctech.cacheMutex) {
98 mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
99 fd = open((mProcessingSettings.rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
100 if (fd == -1) {
101 throw std::runtime_error("Error opening rtc cache mutex lock file");
102 }
103 fchmod(fd, mask);
104 if (lockf(fd, F_LOCK, 0)) {
105 throw std::runtime_error("Error locking rtc cache mutex file");
106 }
107 }
108
109 FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb");
110 char sharead[20];
111 if (fp) {
112 size_t len;
113 while (true) {
114 auto checkSHA = [&](const char* shacmp, const char* name) {
115 if (fread(sharead, 1, 20, fp) != 20) {
116 throw std::runtime_error("Cache file corrupt");
117 }
118 if (mProcessingSettings.debugLevel >= 3) {
119 char shaprint1[41], shaprint2[41];
120 for (uint32_t i = 0; i < 20; i++) {
121 sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]);
122 sprintf(shaprint2 + 2 * i, "%02X ", sharead[i]);
123 }
124 GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
125 }
126 if (!mProcessingSettings.rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
127 GPUInfo("Cache file content outdated (%s)", name);
128 return 1;
129 }
130 return 0;
131 };
132 if (checkSHA(shasource, "source") ||
133 checkSHA(shaparam, "param") ||
134 checkSHA(shacmd, "command line") ||
135 checkSHA(shakernels, "kernel definitions") ||
136 checkSHA(shabounds, "launch bounds")) {
137 break;
138 }
139 GPUSettingsProcessingRTC cachedSettings;
140 static_assert(std::is_trivially_copyable_v<GPUSettingsProcessingRTC> == true, "GPUSettingsProcessingRTC must be POD");
141 if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) {
142 throw std::runtime_error("Cache file corrupt");
143 }
144 if (!mProcessingSettings.rtctech.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
145 GPUInfo("Cache file content outdated (rtc parameters)");
146 break;
147 }
148 std::vector<char> buffer;
149 for (uint32_t i = 0; i < nCompile; i++) {
150 if (fread(&len, sizeof(len), 1, fp) != 1) {
151 throw std::runtime_error("Cache file corrupt");
152 }
153 buffer.resize(len);
154 if (fread(buffer.data(), 1, len, fp) != len) {
155 throw std::runtime_error("Cache file corrupt");
156 }
157 FILE* fp2 = fopen((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str(), "w+b");
158 if (fp2 == nullptr) {
159 throw std::runtime_error("Cannot open tmp file");
160 }
161 if (fwrite(buffer.data(), 1, len, fp2) != len) {
162 throw std::runtime_error("Error writing file");
163 }
164 fclose(fp2);
165 }
166 GPUInfo("Using RTC cache file");
167 cacheLoaded = true;
168 break;
169 };
170 fclose(fp);
171 }
172 }
173 if (!cacheLoaded) {
174 if (mProcessingSettings.debugLevel >= 0) {
175 GPUInfo("Starting CUDA RTC Compilation");
176 }
177 HighResTimer rtcTimer;
178 rtcTimer.ResetStart();
179 tbb::parallel_for<uint32_t>(0, nCompile, [&](auto i) {
180 if (mProcessingSettings.debugLevel >= 3) {
181 printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str());
182 }
183 FILE* fp = fopen((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str(), "w+b");
184 if (fp == nullptr) {
185 throw std::runtime_error("Error opening file");
186 }
187
188 std::string kernel = "extern \"C\" {";
189 kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
190 kernel += "}";
191
192 bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end());
193 const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
194
195 if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
196 fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
197 fwrite(launchBounds.c_str(), 1, launchBounds.size(), fp) != launchBounds.size() ||
198 fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
199 fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
200 throw std::runtime_error("Error writing file");
201 }
202 fclose(fp);
203 std::string command = baseCommand;
204 if (deterministic) {
205 command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len);
206 }
207 command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension;
208 if (mProcessingSettings.debugLevel < 0) {
209 command += " &> /dev/null";
210 } else if (mProcessingSettings.debugLevel < 2) {
211 command += " > /dev/null";
212 }
213 if (mProcessingSettings.debugLevel >= 3) {
214 printf("Running command %s\n", command.c_str());
215 }
216 if (system(command.c_str())) {
217 if (mProcessingSettings.debugLevel >= 3) {
218 printf("Source code file: %s", filename.c_str());
219 }
220 throw std::runtime_error("Error during CUDA compilation");
221 } // clang-format off
222 }, tbb::simple_partitioner()); // clang-format on
223 if (mProcessingSettings.debugLevel >= 0) {
224 GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime());
225 }
226 if (mProcessingSettings.rtc.cacheOutput) {
227 FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
228 if (fp == nullptr) {
229 throw std::runtime_error("Cannot open cache file for writing");
230 }
231 GPUInfo("Storing RTC compilation result in cache file");
232
233 if (fwrite(shasource, 1, 20, fp) != 20 ||
234 fwrite(shaparam, 1, 20, fp) != 20 ||
235 fwrite(shacmd, 1, 20, fp) != 20 ||
236 fwrite(shakernels, 1, 20, fp) != 20 ||
237 fwrite(shabounds, 1, 20, fp) != 20 ||
238 fwrite(&mProcessingSettings.rtc, sizeof(mProcessingSettings.rtc), 1, fp) != 1) {
239 throw std::runtime_error("Error writing cache file");
240 }
241
242 std::vector<char> buffer;
243 for (uint32_t i = 0; i < nCompile; i++) {
244 FILE* fp2 = fopen((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str(), "rb");
245 if (fp2 == nullptr) {
246 throw std::runtime_error("Cannot open cuda module file");
247 }
248 fseek(fp2, 0, SEEK_END);
249 size_t size = ftell(fp2);
250 buffer.resize(size);
251 fseek(fp2, 0, SEEK_SET);
252 if (fread(buffer.data(), 1, size, fp2) != size) {
253 throw std::runtime_error("Error reading cuda module file");
254 }
255 fclose(fp2);
256
257 if (fwrite(&size, sizeof(size), 1, fp) != 1 ||
258 fwrite(buffer.data(), 1, size, fp) != size) {
259 throw std::runtime_error("Error writing cache file");
260 }
261 }
262 fclose(fp);
263 }
264 }
265 if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtctech.cacheMutex) {
266 if (lockf(fd, F_ULOCK, 0)) {
267 throw std::runtime_error("Error unlocking RTC cache mutex file");
268 }
269 close(fd);
270 }
271
272 return 0;
273}
int32_t i
double GetCurrentElapsedTime(bool reset=false)
Definition timer.cxx:110
void ResetStart()
Definition timer.cxx:63
void getRTCKernelCalls(std::vector< std::string > &kernels)
static const char * GetKernelName()
GPUSettingsProcessing mProcessingSettings
GLsizei const GLchar *const * string
Definition glcorearb.h:809
GLuint buffer
Definition glcorearb.h:655
GLsizeiptr size
Definition glcorearb.h:659
GLuint const GLchar * name
Definition glcorearb.h:781
GLenum GLenum GLsizei len
Definition glcorearb.h:4232
GLenum GLfloat param
Definition glcorearb.h:271
GLint GLuint mask
Definition glcorearb.h:291
Polygon< T > close(Polygon< T > polygon)
Definition Polygon.h:126
Defining DataPointCompositeObject explicitly as copiable.
std::string to_string(gsl::span< T, Size > span)
Definition common.h:52
std::string filename()
#define QGET_LD_BINARY_SYMBOLS(filename)