15#define GPUCA_GPUCODE_HOSTONLY
19#include "GPUDefParametersLoad.inc"
26#include <oneapi/tbb.h>
35#include "GPUNoFastMathKernels.h"
37int32_t GPUReconstructionCUDA::genRTC(std::string&
filename, uint32_t& nCompile)
39 std::string rtcparam = std::string(
"#define GPUCA_RTC_CODE\n") +
40 std::string(
mProcessingSettings.rtc.optSpecialCode ?
"#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" :
"#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
49 std::vector<std::string> kernels;
51 std::string kernelsall;
52 for (uint32_t
i = 0;
i < kernels.size();
i++) {
53 kernelsall += kernels[
i] +
"\n";
57 baseCommand += (getenv(
"O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv(
"O2_GPU_RTC_OVERRIDE_CMD")) :
std::
string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
58 baseCommand += std::string(
" ") + (
mProcessingSettings.rtctech.overrideArchitecture !=
"" ?
mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
63 throw std::runtime_error(
"Cannot open launch bounds parameter module file");
65 fseek(fp, 0, SEEK_END);
66 size_t size = ftell(fp);
68 throw std::runtime_error(
"launch bounds parameter file has incorrect size");
70 fseek(fp, 0, SEEK_SET);
72 throw std::runtime_error(
"Error reading launch bounds parameter file");
76 const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*
mParDevice,
true);
78 GPUInfo(
"RTC Launch Bounds:\n%s", launchBounds.c_str());
81 char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
83 o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len);
84 o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), rtcparam.size());
85 o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size());
86 o2::framework::internal::SHA1(shakernels, kernelsall.c_str(), kernelsall.size());
87 o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size());
91 bool cacheLoaded =
false;
98 mode_t
mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
99 fd = open((
mProcessingSettings.rtctech.cacheFolder +
"/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC,
mask);
101 throw std::runtime_error(
"Error opening rtc cache mutex lock file");
104 if (lockf(fd, F_LOCK, 0)) {
105 throw std::runtime_error(
"Error locking rtc cache mutex file");
109 FILE* fp = fopen((
mProcessingSettings.rtctech.cacheFolder +
"/rtc.cuda.cache").c_str(),
"rb");
114 auto checkSHA = [&](
const char* shacmp,
const char*
name) {
115 if (fread(sharead, 1, 20, fp) != 20) {
116 throw std::runtime_error(
"Cache file corrupt");
119 char shaprint1[41], shaprint2[41];
120 for (uint32_t
i = 0;
i < 20;
i++) {
121 sprintf(shaprint1 + 2 *
i,
"%02X ", shacmp[
i]);
122 sprintf(shaprint2 + 2 *
i,
"%02X ", sharead[
i]);
124 GPUInfo(
"SHA for %s: expected %s, read %s",
name, shaprint1, shaprint2);
127 GPUInfo(
"Cache file content outdated (%s)",
name);
132 if (checkSHA(shasource,
"source") ||
133 checkSHA(shaparam,
"param") ||
134 checkSHA(shacmd,
"command line") ||
135 checkSHA(shakernels,
"kernel definitions") ||
136 checkSHA(shabounds,
"launch bounds")) {
139 GPUSettingsProcessingRTC cachedSettings;
140 static_assert(std::is_trivially_copyable_v<GPUSettingsProcessingRTC> ==
true,
"GPUSettingsProcessingRTC must be POD");
141 if (fread(&cachedSettings,
sizeof(cachedSettings), 1, fp) != 1) {
142 throw std::runtime_error(
"Cache file corrupt");
145 GPUInfo(
"Cache file content outdated (rtc parameters)");
149 for (uint32_t
i = 0;
i < nCompile;
i++) {
150 if (fread(&
len,
sizeof(
len), 1, fp) != 1) {
151 throw std::runtime_error(
"Cache file corrupt");
155 throw std::runtime_error(
"Cache file corrupt");
158 if (fp2 ==
nullptr) {
159 throw std::runtime_error(
"Cannot open tmp file");
162 throw std::runtime_error(
"Error writing file");
166 GPUInfo(
"Using RTC cache file");
175 GPUInfo(
"Starting CUDA RTC Compilation");
179 tbb::parallel_for<uint32_t>(0, nCompile, [&](
auto i) {
181 printf(
"Compiling %s\n", (filename +
"_" + std::to_string(i) + mRtcSrcExtension).c_str());
185 throw std::runtime_error(
"Error opening file");
188 std::string kernel =
"extern \"C\" {";
193 const std::string deterministicStr = std::string(deterministic ?
"#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" :
"#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
195 if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
196 fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
197 fwrite(launchBounds.c_str(), 1, launchBounds.size(), fp) != launchBounds.size() ||
198 fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
199 fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
200 throw std::runtime_error(
"Error writing file");
203 std::string command = baseCommand;
205 command += std::string(
" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len);
209 command +=
" &> /dev/null";
211 command +=
" > /dev/null";
214 printf(
"Running command %s\n", command.c_str());
216 if (system(command.c_str())) {
218 printf(
"Source code file: %s",
filename.c_str());
220 throw std::runtime_error(
"Error during CUDA compilation");
222 }, tbb::simple_partitioner());
227 FILE* fp = fopen((
mProcessingSettings.rtctech.cacheFolder +
"/rtc.cuda.cache").c_str(),
"w+b");
229 throw std::runtime_error(
"Cannot open cache file for writing");
231 GPUInfo(
"Storing RTC compilation result in cache file");
233 if (fwrite(shasource, 1, 20, fp) != 20 ||
234 fwrite(shaparam, 1, 20, fp) != 20 ||
235 fwrite(shacmd, 1, 20, fp) != 20 ||
236 fwrite(shakernels, 1, 20, fp) != 20 ||
237 fwrite(shabounds, 1, 20, fp) != 20 ||
239 throw std::runtime_error(
"Error writing cache file");
243 for (uint32_t
i = 0;
i < nCompile;
i++) {
245 if (fp2 ==
nullptr) {
246 throw std::runtime_error(
"Cannot open cuda module file");
248 fseek(fp2, 0, SEEK_END);
249 size_t size = ftell(fp2);
251 fseek(fp2, 0, SEEK_SET);
253 throw std::runtime_error(
"Error reading cuda module file");
257 if (fwrite(&
size,
sizeof(
size), 1, fp) != 1 ||
259 throw std::runtime_error(
"Error writing cache file");
266 if (lockf(fd, F_ULOCK, 0)) {
267 throw std::runtime_error(
"Error unlocking RTC cache mutex file");
double GetCurrentElapsedTime(bool reset=false)
void getRTCKernelCalls(std::vector< std::string > &kernels)
static const char * GetKernelName()
GPUDefParameters * mParDevice
GPUSettingsProcessing mProcessingSettings
GLsizei const GLchar *const * string
GLuint const GLchar * name
GLenum GLenum GLsizei len
Polygon< T > close(Polygon< T > polygon)
Defining DataPointCompositeObject explicitly as copiable.
std::string to_string(gsl::span< T, Size > span)
#define QGET_LD_BINARY_SYMBOLS(filename)