GPUChainTrackingDebugAndProfiling.cxx
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUChainTrackingDebugAndProfiling.cxx
/// \author David Rohr
#include "GPUChainTracking.h"
#include <map>
#include <memory>
#include <string>
#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
#include "bitmapfile.h"
#endif

#include "GPUTPCClusterFilter.h"

#define PROFILE_MAX_SIZE (100 * 1024 * 1024)

using namespace o2::gpu;
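// Pack 8-bit red / green / blue values into the 32-bit little-endian pixel word written into the BMP below.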
static inline uint32_t RGB(uint8_t r, uint8_t g, uint8_t b) { return (uint32_t)r | ((uint32_t)g << 8) | ((uint32_t)b << 16); }
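// Set up tracklet-constructor profiling: attach a PROFILE_MAX_SIZE scratch buffer to tracker 0,
// in which each GPU thread records the stage it has reached at every synchronization point,
// and zero it on the device.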
int32_t GPUChainTracking::PrepareProfile()
{
#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
  char* tmpMem = (char*)mRec->AllocateUnmanagedMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
  processorsShadow()->tpcTrackers[0].mStageAtSync = tmpMem;
  runKernel<GPUMemClean16>({{BlockCount(), ThreadCount(), -1}}, tmpMem, PROFILE_MAX_SIZE);
#endif
  return 0;
}
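// Read the stage-at-sync buffer back from the device and dump it both as profile.txt and as a BMP
// visualization: one pixel per thread and sync point, color-coded by stage (red / green / blue /
// yellow for stages 1-4), with a white separator column inserted every 32 threads to mark warps.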
int32_t GPUChainTracking::DoProfile()
{
#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
  std::unique_ptr<char[]> stageAtSync{new char[PROFILE_MAX_SIZE]};
  mRec->GPUMemCpy(stageAtSync.get(), processorsShadow()->tpcTrackers[0].mStageAtSync, PROFILE_MAX_SIZE, -1, false);

  FILE* fp = fopen("profile.txt", "w+");
  FILE* fp2 = fopen("profile.bmp", "w+b");

  const int32_t bmpheight = 8192;
  BITMAPFILEHEADER bmpFH;
  BITMAPINFOHEADER bmpIH;
  memset(&bmpFH, 0, sizeof(bmpFH));
  memset(&bmpIH, 0, sizeof(bmpIH));

  bmpFH.bfType = 19778; // "BM"
  bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (ConstructorBlockCount() * ConstructorThreadCount() / 32 * 33 - 1) * bmpheight;
  bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);

  bmpIH.biSize = sizeof(bmpIH);
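  // Row width in pixels: each 32-thread warp contributes 32 stage pixels plus one white separator
  // column, minus the trailing separator (hence blocks * threads / 32 * 33 - 1).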
  bmpIH.biWidth = ConstructorBlockCount() * ConstructorThreadCount() / 32 * 33 - 1;
  bmpIH.biHeight = bmpheight;
  bmpIH.biPlanes = 1;
  bmpIH.biBitCount = 32;

  fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
  fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);

  int32_t nEmptySync = 0;
  for (uint32_t i = 0; i < bmpheight * ConstructorBlockCount() * ConstructorThreadCount(); i += ConstructorBlockCount() * ConstructorThreadCount()) {
    int32_t fEmpty = 1;
    for (uint32_t j = 0; j < ConstructorBlockCount() * ConstructorThreadCount(); j++) {
      fprintf(fp, "%d\t", stageAtSync[i + j]);
      int32_t color = 0;
      if (stageAtSync[i + j] == 1) {
        color = RGB(255, 0, 0);
      }
      if (stageAtSync[i + j] == 2) {
        color = RGB(0, 255, 0);
      }
      if (stageAtSync[i + j] == 3) {
        color = RGB(0, 0, 255);
      }
      if (stageAtSync[i + j] == 4) {
        color = RGB(255, 255, 0);
      }
      fwrite(&color, 1, sizeof(int32_t), fp2);
      if (j > 0 && j % 32 == 0) {
        color = RGB(255, 255, 255);
        fwrite(&color, 1, 4, fp2);
      }
      if (stageAtSync[i + j]) {
        fEmpty = 0;
      }
    }
    fprintf(fp, "\n");
    if (fEmpty) {
      nEmptySync++;
    } else {
      nEmptySync = 0;
    }
    (void)nEmptySync;
    // if (nEmptySync == GPUCA_SCHED_ROW_STEP + 2) break;
  }

  fclose(fp);
  fclose(fp2);
#endif
  return 0;
}
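// Helpers for PrintMemoryStatistics: accumulate, per named resource, the actual fill counts
// against the allocated bounds across all instances (e.g. per-sector trackers).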
namespace
{
struct GPUChainTrackingMemUsage {
  void add(size_t n, size_t bound)
  {
    nMax = std::max(nMax, n);
    maxUse = std::max(n / std::max<double>(bound, 1.), maxUse);
    nSum += n;
    nBoundSum += bound;
    count++;
  }
  size_t nMax = 0;
  size_t nSum = 0;
  size_t nBoundSum = 0;
  double maxUse = 0.;
  uint32_t count = 0;
};

void addToMap(std::string name, std::map<std::string, GPUChainTrackingMemUsage>& map, uint64_t n, uint64_t bound)
{
  GPUChainTrackingMemUsage& obj = map.insert({name, {}}).first->second;
  obj.add(n, bound);
}
} // namespace
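// Print, for each resource type: summed usage vs. summed bound, the overall utilization, the worst
// single-instance utilization, the number of instances, and the maximum single usage.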
void GPUChainTracking::PrintMemoryStatistics()
{
  std::map<std::string, GPUChainTrackingMemUsage> usageMap;
  for (int32_t i = 0; i < NSECTORS; i++) {
#ifdef GPUCA_TPC_GEOMETRY_O2
    addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks);
    addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters);
#endif
    addToMap("TPC Sector Start Hits", usageMap, *processors()->tpcTrackers[i].NStartHits(), processors()->tpcTrackers[i].NMaxStartHits());
    addToMap("TPC Sector Tracklets", usageMap, *processors()->tpcTrackers[i].NTracklets(), processors()->tpcTrackers[i].NMaxTracklets());
    addToMap("TPC Sector TrackletHits", usageMap, *processors()->tpcTrackers[i].NRowHits(), processors()->tpcTrackers[i].NMaxRowHits());
    addToMap("TPC Sector Tracks", usageMap, *processors()->tpcTrackers[i].NTracks(), processors()->tpcTrackers[i].NMaxTracks());
    addToMap("TPC Sector TrackHits", usageMap, *processors()->tpcTrackers[i].NTrackHits(), processors()->tpcTrackers[i].NMaxTrackHits());
  }
  addToMap("TPC Clusterer Clusters", usageMap, mRec->MemoryScalers()->nTPCHits, mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits));
  addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NOutputTracks(), processors()->tpcMerger.NMaxTracks());
  addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NOutputTrackClusters(), processors()->tpcMerger.NMaxOutputTrackClusters());

  if (mRec->GetProcessingSettings().createO2Output) {
    addToMap("TPC O2 Tracks", usageMap, processors()->tpcMerger.NOutputTracksTPCO2(), processors()->tpcMerger.NOutputTracksTPCO2());
    addToMap("TPC O2 ClusRefs", usageMap, processors()->tpcMerger.NOutputClusRefsTPCO2(), processors()->tpcMerger.NOutputClusRefsTPCO2());
  }

#ifdef GPUCA_TPC_GEOMETRY_O2
  addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters);
  addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache);
  addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks);
#endif

  for (auto& elem : usageMap) {
    printf("Mem Usage %-30s : %'14zu / %'14zu (%3.0f%% / %3.0f%% / count %3u / max %'14zu)\n", elem.first.c_str(), elem.second.nSum, elem.second.nBoundSum, 100. * elem.second.nSum / std::max<size_t>(1, elem.second.nBoundSum), 100. * elem.second.maxUse, elem.second.count, elem.second.nMax);
  }
}
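// Log MEMREL lines relating input cluster counts to the number of produced objects, per sector
// and for the merger; this is the raw input for deriving memory-scaler relations.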
void GPUChainTracking::PrintMemoryRelations()
{
  for (int32_t i = 0; i < NSECTORS; i++) {
    GPUInfo("MEMREL StartHits NCl %d NTrkl %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NStartHits());
    GPUInfo("MEMREL Tracklets NCl %d NTrkl %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracklets());
    GPUInfo("MEMREL TrackletHits NCl %d NTrkl %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NRowHits());
    GPUInfo("MEMREL SectorTracks NCl %d NTrk %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracks());
    GPUInfo("MEMREL SectorTrackHits NCl %d NTrkH %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTrackHits());
  }
  GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NOutputTracks());
  GPUInfo("MEMREL TrackHits NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NOutputTrackClusters());
}
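// Prepare the kernel-debugger output buffer: register it with the GPU processor, zero the host
// copy, and clear the device copy via GPUMemClean16 so per-kernel debug prints start clean.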
void GPUChainTracking::PrepareDebugOutput()
{
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
  const auto& threadContext = GetThreadContext();
  if (mRec->IsGPU()) {
    SetupGPUProcessor(&processors()->debugOutput, false);
    WriteToConstantMemory(RecoStep::NoRecoStep, (char*)&processors()->debugOutput - (char*)processors(), &processorsShadow()->debugOutput, sizeof(processors()->debugOutput), -1);
    memset(processors()->debugOutput.memory(), 0, processors()->debugOutput.memorySize() * sizeof(processors()->debugOutput.memory()[0]));
  }
  runKernel<GPUMemClean16>({{BlockCount(), ThreadCount(), 0, RecoStep::TPCSectorTracking}}, (mRec->IsGPU() ? processorsShadow() : processors())->debugOutput.memory(), processorsShadow()->debugOutput.memorySize() * sizeof(processors()->debugOutput.memory()[0]));
#endif
}
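// Copy the kernel-debugger output buffer back to the host and print its contents.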
void GPUChainTracking::PrintDebugOutput()
{
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
  const auto& threadContext = GetThreadContext();
  TransferMemoryResourcesToHost(RecoStep::NoRecoStep, &processors()->debugOutput, -1);
  processors()->debugOutput.Print();
#endif
}
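// Summarize the reconstruction output: track and cluster counts (fitted / attached / adjacent /
// total) and, if TRD tracking ran, the TRD track and tracklet counts.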
void GPUChainTracking::PrintOutputStat()
{
  int32_t nTracks = 0, nAttachedClusters = 0, nAttachedClustersFitted = 0, nAdjacentClusters = 0;
  uint32_t nCls = GetProcessingSettings().doublePipeline ? mIOPtrs.clustersNative->nClustersTotal : GetTPCMerger().NMaxClusters();
  if (GetProcessingSettings().createO2Output > 1) {
    nTracks = mIOPtrs.nOutputTracksTPCO2;
    nAttachedClusters = mIOPtrs.nMergedTrackHits;
  } else {
    for (uint32_t k = 0; k < mIOPtrs.nMergedTracks; k++) {
      if (mIOPtrs.mergedTracks[k].OK()) {
        nTracks++;
        nAttachedClusters += mIOPtrs.mergedTracks[k].NClusters();
        nAttachedClustersFitted += mIOPtrs.mergedTracks[k].NClustersFitted();
      }
    }
    for (uint32_t k = 0; k < nCls; k++) {
      int32_t attach = mIOPtrs.mergedTrackHitAttachment[k];
      if (attach & gputpcgmmergertypes::attachFlagMask) {
        nAdjacentClusters++;
      }
    }
  }

  char trdText[1024] = "";
  if (GetRecoSteps() & GPUDataTypes::RecoStep::TRDTracking) {
    int32_t nTRDTracks = 0;
    int32_t nTRDTracklets = 0;
    for (uint32_t k = 0; k < mIOPtrs.nTRDTracks; k++) {
      if (mIOPtrs.trdTracksO2) {
        auto& trk = mIOPtrs.trdTracksO2[k];
        nTRDTracklets += trk.getNtracklets();
        nTRDTracks += trk.getNtracklets() != 0;
      } else {
        auto& trk = mIOPtrs.trdTracks[k];
        nTRDTracklets += trk.getNtracklets();
        nTRDTracks += trk.getNtracklets() != 0;
      }
    }
    snprintf(trdText, 1024, " - TRD Tracker reconstructed %d tracks (%d tracklets)", nTRDTracks, nTRDTracklets);
  }
  GPUInfo("Output Tracks: %d (%d / %d / %d / %d clusters (fitted / attached / adjacent / total) - %s format)%s", nTracks, nAttachedClustersFitted, nAttachedClusters, nAdjacentClusters, nCls, GetProcessingSettings().createO2Output > 1 ? "O2" : "GPU", trdText);
}
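// Validate the O2-format output: every track's cluster reference must stay within the ClusRefs
// buffer, and every referenced sector / row / cluster index must exist in the native cluster map.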
void GPUChainTracking::SanityCheck()
{
  size_t nErrors = 0;

  for (uint32_t i = 0; i < mIOPtrs.nOutputTracksTPCO2; i++) {
    const auto& trk = mIOPtrs.outputTracksTPCO2[i];
    const auto& ref = trk.getClusterRef();
    if (ref.getFirstEntry() > mIOPtrs.nOutputClusRefsTPCO2) {
      if (nErrors++ < 1000) {
        GPUError("Invalid getFirstEntry() in cluster reference: %u > %u", ref.getFirstEntry(), mIOPtrs.nOutputClusRefsTPCO2);
        continue;
      }
    }
    if (ref.getFirstEntry() + (ref.getEntries() * 3 + 1) / 2 > mIOPtrs.nOutputClusRefsTPCO2) {
      if (nErrors++ < 1000) {
        GPUError("Invalid getEntries() in cluster reference: %u > %u", ref.getFirstEntry() + (ref.getEntries() * 3 + 1) / 2, mIOPtrs.nOutputClusRefsTPCO2);
        continue;
      }
    }
    for (int32_t j = 0; j < trk.getNClusters(); j++) {
      uint8_t sector, row;
      uint32_t cl;
      trk.getClusterReference(mIOPtrs.outputClusRefsTPCO2, j, sector, row, cl);
      if (sector >= GPUCA_NSECTORS || row >= GPUCA_ROW_COUNT) {
        if (nErrors++ < 1000) {
          GPUError("Invalid sector / row %d / %d", (int32_t)sector, (int32_t)row);
          continue;
        }
      }
      if (cl >= mIOPtrs.clustersNative->nClusters[sector][row]) {
        if (nErrors++ < 1000) {
          GPUError("Invalid cluster index %u >= %u", cl, mIOPtrs.clustersNative->nClusters[sector][row]);
        }
      }
    }
  }

  if (nErrors == 0) {
    GPUInfo("Sanity check passed");
  } else {
    GPUError("Sanity check found %zu errors", nErrors);
  }
}
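// Two-phase cluster filter: phase 0 only counts the clusters surviving the charge / time-bin /
// debug-filter cuts and sizes the output buffer via the allocator callback; phase 1 copies the
// surviving clusters, updates the per-row counts, and repoints the ClusterNativeAccess structure.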
void GPUChainTracking::RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts)
{
  GPUTPCClusterFilter clusterFilter(*clusters);
  o2::tpc::ClusterNative* outputBuffer = nullptr;
  for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
    uint32_t countTotal = 0;
    for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) {
      for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) {
        uint32_t count = 0;
        for (uint32_t k = 0; k < clusters->nClusters[iSector][iRow]; k++) {
          o2::tpc::ClusterNative cl = clusters->clusters[iSector][iRow][k];
          bool keep = true;
          if (applyClusterCuts) {
            keep = keep && cl.qTot > param().rec.tpc.cfQTotCutoff && cl.qMax > param().rec.tpc.cfQMaxCutoff;
            keep = keep && (!(cl.getFlags() & o2::tpc::ClusterNative::flagSingle) || ((cl.sigmaPadPacked || cl.qMax > param().rec.tpc.cfQMaxCutoffSinglePad) && (cl.sigmaTimePacked || cl.qMax > param().rec.tpc.cfQMaxCutoffSingleTime)));
          }
          if (param().tpcCutTimeBin > 0) {
            keep = keep && cl.getTime() < param().tpcCutTimeBin;
          }
          keep = keep && (!GetProcessingSettings().tpcApplyDebugClusterFilter || clusterFilter.filter(iSector, iRow, cl));
          if (iPhase && keep) {
            outputBuffer[countTotal] = cl;
          }
          count += keep;
          countTotal += keep;
        }
        if (iPhase) {
          clusters->nClusters[iSector][iRow] = count;
        }
      }
    }
    if (iPhase) {
      clusters->clustersLinear = outputBuffer;
      clusters->setOffsetPtrs();
    } else {
      outputBuffer = allocator(countTotal);
    }
  }
}