Project
Loading...
Searching...
No Matches
GPUReconstructionDebug.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
14
15#include "GPUReconstruction.h"
17#include "GPULogging.h"
18#include "GPUSettings.h"
19
20#include <csignal>
21#include <functional>
22#include <unordered_map>
23#include <mutex>
24#include <filesystem>
25#include <chrono>
26#include <format>
27#include <iostream>
28#include <string>
29
30using namespace o2::gpu;
31
33 std::function<void(int32_t, siginfo_t*, void*)> signalCallback;
34 std::function<void()> debugCallback = nullptr;
35 std::function<void()> reinstallCallback = nullptr;
36 std::unordered_map<int32_t, struct sigaction> oldActions;
37 size_t debugCount = 0;
38 static void globalCallback(int32_t signal, siginfo_t* info, void* ucontext)
39 {
40 GPUReconstruction::mDebugData->signalCallback(signal, info, ucontext);
41 }
42};
43
44std::unique_ptr<GPUReconstruction::debugInternal> GPUReconstruction::mDebugData;
45
47{
48 if (GetProcessingSettings().debugOnFailure) {
49 static std::mutex initMutex;
50 {
51 std::lock_guard<std::mutex> guard(initMutex);
52 if (mDebugData) {
53 GPUFatal("Error handlers for debug dumps already set, cannot set them again");
54 }
55 mDebugData = std::make_unique<debugInternal>();
56 }
57 mDebugEnabled = true;
58 if ((GetProcessingSettings().debugOnFailure & 1) || (GetProcessingSettings().debugOnFailure & 2)) {
59 struct sigaction sa, oldsa;
60 memset(&sa, 0, sizeof(sa));
62 sa.sa_flags = SA_SIGINFO;
63 uint32_t mask = GetProcessingSettings().debugOnFailureSignalMask == (uint32_t)-1 ? ((1 << SIGINT) | (1 << SIGABRT) | (1 << SIGBUS) | (1 << SIGTERM) | (1 << SIGSEGV)) : GetProcessingSettings().debugOnFailureSignalMask;
64 if (mask) {
65 for (uint32_t i = 0; i < sizeof(mask) * 8; i++) {
66 if (mask & (1 << i)) {
67 if (sigaction(i, &sa, &oldsa)) {
68 GPUFatal("Error installing signal handler for error dump on signal %d", i);
69 }
70 mDebugData->oldActions.emplace(i, oldsa);
71 }
72 }
73 }
74
75 mDebugData->signalCallback = [this, &oldActions = mDebugData->oldActions, myAction = std::move(sa)](int32_t signal, siginfo_t* info, void* ucontext) {
76 static std::mutex callbackMutex;
77 std::lock_guard<std::mutex> guard(callbackMutex);
78 if (mDebugData->debugCallback) {
79 GPUInfo("Running debug callback for signal %d", signal);
80 mDebugData->debugCallback();
81 mDebugData->debugCount++;
82 }
83 mDebugData->debugCallback = nullptr;
84 if (!GetProcessingSettings().debugOnFailureNoForwardSignal) {
85 sigaction(signal, &oldActions[signal], nullptr);
86 raise(signal);
87 mDebugData->reinstallCallback = [signal, myAction]() { sigaction(signal, &myAction, nullptr); };
88 }
89 };
90 }
91 }
92}
93
95{
96 if (!mDebugEnabled) {
97 return;
98 }
99 if (mDebugData) {
100 for (auto& it : mDebugData->oldActions) {
101 if (sigaction(it.first, &it.second, nullptr)) {
102 GPUFatal("Error restoring signal handler for signal %d", it.first);
103 }
104 }
105 }
106 mDebugEnabled = false;
107}
108
109void GPUReconstruction::setDebugDumpCallback(std::function<void()>&& callback)
110{
111 if (mMaster) {
112 if (mDebugData->reinstallCallback) {
113 mDebugData->reinstallCallback();
114 mDebugData->reinstallCallback = nullptr;
115 }
116 mMaster->setDebugDumpCallback(std::move(callback));
117 } else if (mDebugEnabled && mDebugData) {
118 mDebugData->debugCallback = callback;
119 }
120}
121
122std::string GPUReconstruction::getDebugFolder(const std::string& prefix)
123{
124 const std::filesystem::path target_dir = GetProcessingSettings().debugOnFailureDirectory;
125
126 std::size_t total_size = 0;
127 std::size_t subfolder_count = 0;
128
129 if (!std::filesystem::exists(target_dir) || !std::filesystem::is_directory(target_dir)) {
130 GPUError("Invalid debugOnFailureDirectory %s", GetProcessingSettings().debugOnFailureDirectory.c_str());
131 return "";
132 }
133
134 for (const auto& entry : std::filesystem::directory_iterator(target_dir)) {
135 if (entry.is_directory()) {
136 subfolder_count++;
137
138 for (const auto& subentry : std::filesystem::directory_iterator(entry.path())) {
139 if (subentry.is_regular_file()) {
140 std::error_code ec;
141 auto size = std::filesystem::file_size(subentry.path(), ec);
142 if (!ec) {
143 total_size += size;
144 }
145 }
146 }
147 }
148 }
149
150 if ((GetProcessingSettings().debugOnFailureMaxFiles && subfolder_count >= GetProcessingSettings().debugOnFailureMaxFiles) || (GetProcessingSettings().debugOnFailureMaxSize && (total_size >> 30) >= GetProcessingSettings().debugOnFailureMaxSize)) {
151 GPUError("Cannot store debug dump files, target storage exceeded: %zu dumps, %zu bytes", subfolder_count, total_size);
152 return "";
153 }
154
155 auto currentTime = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
156 std::ostringstream dateTime;
157 dateTime << std::put_time(std::localtime(&currentTime), "%Y-%m-%d_%H-%M-%S");
158
159 int32_t attempt = 0;
160 std::string outname;
161 while (true) {
162 if (attempt++ >= 512) {
163 GPUError("Error creating debug dump folder");
164 return "";
165 }
166
167 outname = GetProcessingSettings().debugOnFailureDirectory + "/debug_" + prefix + (prefix == "" ? "" : "_") + dateTime.str() + "_" + std::to_string(attempt);
168 std::error_code ec;
169 bool created = std::filesystem::create_directory(outname, ec);
170 if (!ec && created) {
171 break;
172 }
173 }
174
175 GPUInfo("Debug dump to %s", outname.c_str());
176 return outname;
177}
178
180{
181 if (mMaster) {
182 return mMaster->triggerDebugDump();
183 } else if (mDebugEnabled && mDebugData && mDebugData->debugCallback) {
184 GPUInfo("Running triggered debug callback");
185 mDebugData->debugCallback();
186 mDebugData->debugCount++;
187 mDebugData->debugCallback = nullptr;
188 return true;
189 }
190 return false;
191}
192
193GPUReconstructionCPU::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents}
194{
195 if (!filenameCSV.empty()) {
196 streamCSV.open(filenameCSV, std::ios::out | std::ios::app);
197 }
198}
199
200void GPUReconstructionCPU::debugWriter::header()
201{
202 if (streamCSV.is_open() && !streamCSV.tellp()) {
203 streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n";
204 }
205
206 if (mMarkdown) {
207 std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n";
208 std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n";
209 }
210}
211
/// Write one timing row to the enabled outputs (CSV file, markdown table or plain text).
///
/// Times are multiplied by 1000000 / mStatNEvents before printing; a time of -1.0 is
/// treated as "not available" and leaves the corresponding CSV / markdown cell empty.
/// Memory statistics (GB/s, bytes, bytes per call) are derived from memSize, gpu_time,
/// mStatNEvents and count. They are only emitted when memSize, count and mStatNEvents
/// are all non-zero, so that no integer division by zero can occur.
///
/// @param type         single character printed in the first column.
/// @param count        number of calls; 0 leaves the count cell empty.
/// @param name         row name; in plain-text mode the prefixes "GPU" / "TPC" and the
///                     names "Prepare" / "Wall" select the line layout.
/// @param gpu_time     GPU time, -1.0 if not available.
/// @param cpu_time     CPU time, -1.0 if not available.
/// @param total_time   total time, -1.0 if not available.
/// @param memSize      number of bytes, 0 if not applicable.
/// @param nEventReport text appended to the "Wall" lines in plain-text mode.
void GPUReconstructionCPU::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport)
{
  const double scale = 1000000.0 / mStatNEvents;

  // Derived memory statistics, computed once. The integer divisions are guarded
  // because mStatNEvents and count may be 0; the bandwidth is floating point only.
  const double bandwidth = memSize / gpu_time * 1e-9;
  const std::size_t bytesPerEvent = mStatNEvents != 0 ? memSize / mStatNEvents : 0;
  const std::size_t bytesPerCall = count != 0 ? bytesPerEvent / count : 0;
  const bool hasMemStats = memSize != 0 && count != 0 && mStatNEvents != 0;

  if (streamCSV.is_open()) {
    streamCSV << type << ",";
    if (count != 0) {
      streamCSV << count;
    }
    streamCSV << "," << name << ",";
    if (gpu_time != -1.0) {
      streamCSV << std::format("{:.0f}", gpu_time * scale);
    }
    streamCSV << ",";
    if (cpu_time != -1.0) {
      streamCSV << std::format("{:.0f}", cpu_time * scale);
    }
    streamCSV << ",";
    if (cpu_time != -1.0 && total_time != -1.0) {
      streamCSV << std::format("{:.2f}", cpu_time / total_time);
    }
    streamCSV << ",";
    if (total_time != -1.0) {
      streamCSV << std::format("{:.0f}", total_time * scale);
    }
    streamCSV << ",";
    if (hasMemStats) {
      streamCSV << std::format("{:.3f},{},{}", bandwidth, bytesPerEvent, bytesPerCall);
    } else {
      streamCSV << ",,";
    }
    streamCSV << std::endl;
  }

  if (mMarkdown) {
    std::cout << "| " << type << " | ";
    if (count != 0) {
      std::cout << std::format("{:6} |", count);
    } else {
      std::cout << " |";
    }
    std::cout << std::format(" {:42}|", name);
    if (gpu_time != -1.0) {
      std::cout << std::format("{:10.0f} |", gpu_time * scale);
    } else {
      std::cout << " |";
    }
    if (cpu_time != -1.0) {
      std::cout << std::format("{:10.0f} |", cpu_time * scale);
    } else {
      std::cout << " |";
    }
    if (cpu_time != -1.0 && total_time != -1.0) {
      std::cout << std::format("{:8.2f} |", cpu_time / total_time);
    } else {
      std::cout << " |";
    }
    if (total_time != -1.0) {
      std::cout << std::format("{:10.0f} |", total_time * scale);
    } else {
      std::cout << " |";
    }
    if (hasMemStats) {
      std::cout << std::format("{:10.3f} |{:14} |{:14} |", bandwidth, bytesPerEvent, bytesPerCall);
    } else {
      std::cout << " | | |";
    }
    std::cout << std::endl;
  } else {
    if (name.compare(0, 3, "GPU") == 0) {
      char bandwidthText[256] = "";
      if (hasMemStats && gpu_time != 0.0) {
        snprintf(bandwidthText, sizeof(bandwidthText), " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", bandwidth, bytesPerEvent, bytesPerCall);
      }
      printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidthText);
    } else if (name.compare(0, 3, "TPC") == 0) {
      // Names have the form "<basename> (<postfix>)"; without a '(' the whole name is
      // used as basename and the postfix stays empty.
      const std::size_t n = name.find('(');
      std::string basename = name;
      std::string postfix;
      if (n != std::string::npos && n > 0) {
        basename = name.substr(0, n - 1);
        postfix = name.substr(n + 1, name.size() - n - 2);
      }
      if (total_time != -1.0) {
        printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(),
               basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time);
      } else {
        printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale,
               bandwidth, bytesPerEvent, bytesPerCall);
      }
    } else if (name == "Prepare") {
      printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale);
    } else if (name == "Wall") {
      if (gpu_time != -1.0) {
        printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str());
      }
      printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str());
    }
  }
}
int32_t i
std::string getDebugFolder(const std::string &prefix="")
void setDebugDumpCallback(std::function< void()> &&callback=std::function< void()>(nullptr))
static std::unique_ptr< debugInternal > mDebugData
const GPUSettingsProcessing & GetProcessingSettings() const
GLdouble n
Definition glcorearb.h:1982
GLint GLsizei count
Definition glcorearb.h:399
GLuint entry
Definition glcorearb.h:5735
GLsizeiptr size
Definition glcorearb.h:659
GLuint const GLchar * name
Definition glcorearb.h:781
GLint GLint GLsizei GLint GLenum GLenum type
Definition glcorearb.h:275
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
GLint GLuint mask
Definition glcorearb.h:291
std::string to_string(gsl::span< T, Size > span)
Definition common.h:52
std::function< void(int32_t, siginfo_t *, void *)> signalCallback
static void globalCallback(int32_t signal, siginfo_t *info, void *ucontext)
std::unordered_map< int32_t, struct sigaction > oldActions