Project
Loading...
Searching...
No Matches
GPUReconstructionCUDAGenRTC.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#define GPUCA_GPUCODE_HOSTONLY
16
18#include "GPUParamRTC.h"
19#include "GPUDefParametersLoad.inc"
20#include "GPUKernelsWith1Warp.inc"
21#include <unistd.h>
22#include "Framework/SHA1.h"
23#include <sys/stat.h>
24#include <fcntl.h>
25#include <filesystem>
26#include <algorithm>
27
28#include <oneapi/tbb.h>
29using namespace o2::gpu;
30
32QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_src);
33QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command);
34QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch);
35QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math);
36
37#include "GPUNoFastMathKernels.h"
38
39int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
40{
41 std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") +
42 std::string(GetProcessingSettings().rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
43 std::string(GetProcessingSettings().rtc.optConstexpr ? "#define GPUCA_RTC_CONSTEXPR constexpr\n" : "#define GPUCA_RTC_CONSTEXPR\n") +
44 GPUParamRTC::generateRTCCode(param(), GetProcessingSettings().rtc.optConstexpr);
45 if (filename == "") {
46 filename = "/tmp/o2cagpu_rtc_";
47 }
48 filename += std::to_string(getpid());
49 filename += "_";
50 filename += std::to_string(rand());
51
52 std::vector<std::string> kernels;
53 getRTCKernelCalls(kernels);
54 std::string kernelsall;
55 for (uint32_t i = 0; i < kernels.size(); i++) {
56 kernelsall += kernels[i] + "\n";
57 }
58
59 std::string baseCommand = (GetProcessingSettings().rtctech.prependCommand != "" ? (GetProcessingSettings().rtctech.prependCommand + " ") : "");
60 baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
61 baseCommand += std::string(" ") + (GetProcessingSettings().rtctech.overrideArchitecture != "" ? GetProcessingSettings().rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
62
63 if (GetProcessingSettings().rtctech.loadLaunchBoundsFromFile.size()) {
64 FILE* fp = fopen(GetProcessingSettings().rtctech.loadLaunchBoundsFromFile.c_str(), "rb");
65 if (fp == nullptr) {
66 throw std::runtime_error("Cannot open launch bounds parameter module file");
67 }
68 fseek(fp, 0, SEEK_END);
69 size_t size = ftell(fp);
70 if (size != sizeof(*mParDevice)) {
71 throw std::runtime_error("launch bounds parameter file has incorrect size");
72 }
73 fseek(fp, 0, SEEK_SET);
74 if (fread(mParDevice, 1, size, fp) != size) {
75 throw std::runtime_error("Error reading launch bounds parameter file");
76 }
77 fclose(fp);
78 }
79 if constexpr (std::string_view("CUDA") == "HIP") { // Check if we are RTC-compiling for HIP
80 if (GetProcessingSettings().hipOverrideAMDEUSperCU > 0) {
81 mParDevice->par_AMD_EUS_PER_CU = GetProcessingSettings().hipOverrideAMDEUSperCU;
82 } else if (mParDevice->par_AMD_EUS_PER_CU <= 0) {
83 GPUFatal("AMD_EUS_PER_CU not set in the parameters provided for the AMD GPU, you can override this via --PROChipOverrideAMDEUSperCU [n]");
84 }
85 }
86 for (uint32_t i = 0; i < GetNKernels(); i++) {
87 if (std::find(gpuKernelsWith1Warp.begin(), gpuKernelsWith1Warp.end(), GetKernelName(i)) != gpuKernelsWith1Warp.end()) {
89 }
90 }
91 const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true, mParDevice->par_AMD_EUS_PER_CU ? (mParDevice->par_AMD_EUS_PER_CU * mWarpSize) : 0) +
92 "#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n";
93 if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) {
94 GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
95 if (GetProcessingSettings().rtctech.printLaunchBounds >= 2) {
96 return 1;
97 }
98 }
99
100 const std::string compilerVersions = getBackendVersions();
101
102 char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21], shaversion[21];
103 if (GetProcessingSettings().rtc.cacheOutput) {
104 o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len);
105 o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), rtcparam.size());
106 o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size());
107 o2::framework::internal::SHA1(shakernels, kernelsall.c_str(), kernelsall.size());
108 o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size());
109 o2::framework::internal::SHA1(shaversion, compilerVersions.c_str(), compilerVersions.size());
110 }
111
112 nCompile = GetProcessingSettings().rtc.compilePerKernel ? kernels.size() : 1;
113 bool cacheLoaded = false;
114 int32_t fd = 0;
115 if (GetProcessingSettings().rtc.cacheOutput) {
116 if (GetProcessingSettings().rtctech.cacheFolder != ".") {
117 std::filesystem::create_directories(GetProcessingSettings().rtctech.cacheFolder);
118 }
119 if (GetProcessingSettings().rtctech.cacheMutex) {
120 mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
121 fd = open((GetProcessingSettings().rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
122 if (fd == -1) {
123 throw std::runtime_error("Error opening rtc cache mutex lock file");
124 }
125 fchmod(fd, mask);
126 if (lockf(fd, F_LOCK, 0)) {
127 throw std::runtime_error("Error locking rtc cache mutex file");
128 }
129 }
130
131 FILE* fp = fopen((GetProcessingSettings().rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb");
132 char sharead[20];
133 if (fp) {
134 size_t len;
135 while (true) {
136 auto checkSHA = [&](const char* shacmp, const char* name) {
137 if (fread(sharead, 1, 20, fp) != 20) {
138 throw std::runtime_error("Cache file corrupt");
139 }
140 if (GetProcessingSettings().debugLevel >= 3) {
141 char shaprint1[41], shaprint2[41];
142 for (uint32_t i = 0; i < 20; i++) {
143 sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]);
144 sprintf(shaprint2 + 2 * i, "%02X ", sharead[i]);
145 }
146 GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
147 }
148 if (!GetProcessingSettings().rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
149 GPUInfo("Cache file content outdated (%s)", name);
150 return 1;
151 }
152 return 0;
153 };
154 if (checkSHA(shasource, "source") ||
155 checkSHA(shaparam, "param") ||
156 checkSHA(shacmd, "command line") ||
157 checkSHA(shakernels, "kernel definitions") ||
158 checkSHA(shabounds, "launch bounds") ||
159 checkSHA(shaversion, "compiler versions")) {
160 break;
161 }
162 GPUSettingsProcessingRTC cachedSettings;
163 static_assert(std::is_trivially_copyable_v<GPUSettingsProcessingRTC> == true, "GPUSettingsProcessingRTC must be POD");
164 if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) {
165 throw std::runtime_error("Cache file corrupt");
166 }
167 if (!GetProcessingSettings().rtctech.ignoreCacheValid && !(cachedSettings == GetProcessingSettings().rtc)) {
168 GPUInfo("Cache file content outdated (rtc parameters)");
169 break;
170 }
171 std::vector<char> buffer;
172 for (uint32_t i = 0; i < nCompile; i++) {
173 if (fread(&len, sizeof(len), 1, fp) != 1) {
174 throw std::runtime_error("Cache file corrupt");
175 }
176 buffer.resize(len);
177 if (fread(buffer.data(), 1, len, fp) != len) {
178 throw std::runtime_error("Cache file corrupt");
179 }
180 FILE* fp2 = fopen((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str(), "w+b");
181 if (fp2 == nullptr) {
182 throw std::runtime_error("Cannot open tmp file");
183 }
184 if (fwrite(buffer.data(), 1, len, fp2) != len) {
185 throw std::runtime_error("Error writing file");
186 }
187 fclose(fp2);
188 }
189 GPUInfo("Using RTC cache file");
190 cacheLoaded = true;
191 break;
192 };
193 fclose(fp);
194 }
195 }
196 if (!cacheLoaded) {
197 if (GetProcessingSettings().debugLevel >= 0) {
198 GPUInfo("Starting CUDA RTC Compilation");
199 }
200 HighResTimer rtcTimer;
201 rtcTimer.ResetStart();
202 tbb::parallel_for<uint32_t>(0, nCompile, [&](auto i) {
203 if (GetProcessingSettings().debugLevel >= 3) {
204 printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str());
205 }
206 FILE* fp = fopen((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str(), "w+b");
207 if (fp == nullptr) {
208 throw std::runtime_error("Error opening file");
209 }
210
211 std::string kernel = "extern \"C\" {";
212 kernel += GetProcessingSettings().rtc.compilePerKernel ? kernels[i] : kernelsall;
213 kernel += "}";
214
215 bool deterministic = GetProcessingSettings().rtc.deterministic || (GetProcessingSettings().rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end());
216 const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
217
218 if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
219 fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
220 fwrite(launchBounds.c_str(), 1, launchBounds.size(), fp) != launchBounds.size() ||
221 fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
222 fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
223 throw std::runtime_error("Error writing file");
224 }
225 fclose(fp);
226 std::string command = baseCommand;
227 if (deterministic) {
228 command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len);
229 }
230 command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension;
231 if (GetProcessingSettings().debugLevel < 0) {
232 command += " &> /dev/null";
233 } else if (GetProcessingSettings().debugLevel < 2) {
234 command += " > /dev/null";
235 }
236 if (GetProcessingSettings().debugLevel >= 3) {
237 printf("Running command %s\n", command.c_str());
238 }
239 if (system(command.c_str())) {
240 if (GetProcessingSettings().debugLevel >= 3) {
241 printf("Source code file: %s", filename.c_str());
242 }
243 throw std::runtime_error("Error during CUDA compilation");
244 } // clang-format off
245 }, tbb::simple_partitioner()); // clang-format on
246 if (GetProcessingSettings().debugLevel >= 0) {
247 GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime());
248 }
249 if (GetProcessingSettings().rtc.cacheOutput) {
250 FILE* fp = fopen((GetProcessingSettings().rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
251 if (fp == nullptr) {
252 throw std::runtime_error("Cannot open cache file for writing");
253 }
254 GPUInfo("Storing RTC compilation result in cache file");
255
256 if (fwrite(shasource, 1, 20, fp) != 20 ||
257 fwrite(shaparam, 1, 20, fp) != 20 ||
258 fwrite(shacmd, 1, 20, fp) != 20 ||
259 fwrite(shakernels, 1, 20, fp) != 20 ||
260 fwrite(shabounds, 1, 20, fp) != 20 ||
261 fwrite(shaversion, 1, 20, fp) != 20 ||
262 fwrite(&GetProcessingSettings().rtc, sizeof(GetProcessingSettings().rtc), 1, fp) != 1) {
263 throw std::runtime_error("Error writing cache file");
264 }
265
266 std::vector<char> buffer;
267 for (uint32_t i = 0; i < nCompile; i++) {
268 FILE* fp2 = fopen((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str(), "rb");
269 if (fp2 == nullptr) {
270 throw std::runtime_error("Cannot open cuda module file");
271 }
272 fseek(fp2, 0, SEEK_END);
273 size_t size = ftell(fp2);
274 buffer.resize(size);
275 fseek(fp2, 0, SEEK_SET);
276 if (fread(buffer.data(), 1, size, fp2) != size) {
277 throw std::runtime_error("Error reading cuda module file");
278 }
279 fclose(fp2);
280
281 if (fwrite(&size, sizeof(size), 1, fp) != 1 ||
282 fwrite(buffer.data(), 1, size, fp) != size) {
283 throw std::runtime_error("Error writing cache file");
284 }
285 }
286 fclose(fp);
287 }
288 }
289 if (GetProcessingSettings().rtc.cacheOutput && GetProcessingSettings().rtctech.cacheMutex) {
290 if (lockf(fd, F_ULOCK, 0)) {
291 throw std::runtime_error("Error unlocking RTC cache mutex file");
292 }
293 close(fd);
294 }
295
296 return 0;
297}
int32_t i
double GetCurrentElapsedTime(bool reset=false)
Definition timer.cxx:117
void ResetStart()
Definition timer.cxx:70
static const char * GetKernelName()
const GPUSettingsProcessing & GetProcessingSettings() const
GLsizei const GLchar *const * string
Definition glcorearb.h:809
GLuint buffer
Definition glcorearb.h:655
GLsizeiptr size
Definition glcorearb.h:659
GLuint const GLchar * name
Definition glcorearb.h:781
GLenum GLenum GLsizei len
Definition glcorearb.h:4232
GLenum GLfloat param
Definition glcorearb.h:271
GLint GLuint mask
Definition glcorearb.h:291
Polygon< T > close(Polygon< T > polygon)
Definition Polygon.h:126
std::string to_string(gsl::span< T, Size > span)
Definition common.h:52
std::string filename()
#define QGET_LD_BINARY_SYMBOLS(filename)
int32_t par_LB_maxThreads[$< LIST:LENGTH, $< TARGET_PROPERTY:O2_GPU_KERNELS, O2_GPU_KERNEL_NAMES > >]