#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
#define PROFILE_MAX_SIZE (100 * 1024 * 1024)
#endif

static inline uint32_t RGB(uint8_t r, uint8_t g, uint8_t b) { return (uint32_t)r | ((uint32_t)g << 8) | ((uint32_t)b << 16); }
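// Example (for orientation): RGB(255, 0, 0) == 0x0000FF. The first argument
// occupies the low byte, so on a little-endian host the four bytes written per
// pixel below are r, g, b, 0.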
// Profile preparation (enclosing function elided in this excerpt):
#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
// ... (allocate the PROFILE_MAX_SIZE buffer that stageAtSync is written into)
#endif
// Profile dump (enclosing function elided in this excerpt): writes the stage of
// every GPU thread at every sync point of the tracklet constructor, as text and
// as a BMP image.
#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
FILE* fp = fopen("profile.txt", "w+");
FILE* fp2 = fopen("profile.bmp", "w+b");

const int32_t bmpheight = 8192;
BITMAPFILEHEADER bmpFH; // reconstructed declarations; implied by the memsets below
BITMAPINFOHEADER bmpIH;
memset(&bmpFH, 0, sizeof(bmpFH));
memset(&bmpIH, 0, sizeof(bmpIH));

bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (ConstructorBlockCount() * ConstructorThreadCount() / 32 * 33 - 1) * bmpheight;
bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);

bmpIH.biSize = sizeof(bmpIH);
bmpIH.biWidth = ConstructorBlockCount() * ConstructorThreadCount() / 32 * 33 - 1;
// ... (remaining header fields elided)

fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);

int32_t nEmptySync = 0; // usage elided in this excerpt
for (uint32_t i = 0; i < bmpheight * ConstructorBlockCount() * ConstructorThreadCount(); i += ConstructorBlockCount() * ConstructorThreadCount()) {
  // ...
  for (uint32_t j = 0; j < ConstructorBlockCount() * ConstructorThreadCount(); j++) {
    fprintf(fp, "%d\t", stageAtSync[i + j]);
    int32_t color = RGB(0, 0, 0); // reconstructed declaration; stage 0 stays black
    if (stageAtSync[i + j] == 1) {
      color = RGB(255, 0, 0);
    }
    if (stageAtSync[i + j] == 2) {
      color = RGB(0, 255, 0);
    }
    if (stageAtSync[i + j] == 3) {
      color = RGB(0, 0, 255);
    }
    if (stageAtSync[i + j] == 4) {
      color = RGB(255, 255, 0);
    }
    fwrite(&color, 1, sizeof(int32_t), fp2);
    if (j > 0 && j % 32 == 0) {
      color = RGB(255, 255, 255); // white separator pixel once per 32 threads
      fwrite(&color, 1, 4, fp2);
    }
    if (stageAtSync[i + j]) {
      // ... (mark this sync point as non-empty)
    }
  }
  // ...
}

fclose(fp); // reconstructed cleanup
fclose(fp2);
#endif
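// Layout of profile.bmp (follows from the code above): each BMP row corresponds
// to one sync point (bmpheight of them). A row holds one pixel per GPU thread
// (T = ConstructorBlockCount() * ConstructorThreadCount()), colored by that
// thread's stage, plus T/32 - 1 white separator pixels marking warp boundaries.
// Hence the image width T + T/32 - 1 = T/32 * 33 - 1 used for biWidth above.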
struct GPUChainTrackingMemUsage {
  void add(size_t n, size_t bound)
  {
    nMax = std::max(nMax, n);
    maxUse = std::max(n / std::max<double>(bound, 1.), maxUse);
    nSum += n;          // reconstructed together with the members below;
    nBoundSum += bound; // all of them are consumed by the printout further down
    count++;
  }

  size_t nMax = 0;
  size_t nSum = 0;
  size_t nBoundSum = 0;
  double maxUse = 0.;
  uint32_t count = 0;
};
void addToMap(std::string name, std::map<std::string, GPUChainTrackingMemUsage>& map, uint64_t n, uint64_t bound)
{
  GPUChainTrackingMemUsage& obj = map.insert({name, {}}).first->second; // insert() keeps an existing entry
  obj.add(n, bound); // reconstructed call; add() is the only mutator
}
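// Usage sketch (illustration only, not part of the original file): how the two
// helpers above aggregate repeated samples under one label.
#if 0
#include <cassert>
static void memUsageExample()
{
  std::map<std::string, GPUChainTrackingMemUsage> m;
  addToMap("TPC Sector Start Hits", m, 1000, 4096); // e.g. sector 0
  addToMap("TPC Sector Start Hits", m, 3000, 4096); // e.g. sector 1
  assert(m["TPC Sector Start Hits"].nSum == 4000);      // sums the samples
  assert(m["TPC Sector Start Hits"].nBoundSum == 8192); // sums the bounds
  assert(m["TPC Sector Start Hits"].nMax == 3000);      // tracks the largest sample
  assert(m["TPC Sector Start Hits"].count == 2);
}
#endif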
void GPUChainTracking::PrintMemoryStatistics()
{
  std::map<std::string, GPUChainTrackingMemUsage> usageMap;
  for (uint32_t i = 0; i < NSECTORS; i++) { // per-sector loop (reconstructed; 'i' indexes the sector)
#ifdef GPUCA_TPC_GEOMETRY_O2
    addToMap("TPC Clusterer Sector Peaks", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nPeaks, processors()->tpcClusterer[i].mNMaxPeaks);
    addToMap("TPC Clusterer Sector Clusters", usageMap, processors()->tpcClusterer[i].mPmemory->counters.nClusters, processors()->tpcClusterer[i].mNMaxClusters);
#endif
    addToMap("TPC Sector Start Hits", usageMap, *processors()->tpcTrackers[i].NStartHits(), processors()->tpcTrackers[i].NMaxStartHits());
    addToMap("TPC Sector Tracklets", usageMap, *processors()->tpcTrackers[i].NTracklets(), processors()->tpcTrackers[i].NMaxTracklets());
    addToMap("TPC Sector TrackletHits", usageMap, *processors()->tpcTrackers[i].NRowHits(), processors()->tpcTrackers[i].NMaxRowHits());
    addToMap("TPC Sector Tracks", usageMap, *processors()->tpcTrackers[i].NTracks(), processors()->tpcTrackers[i].NMaxTracks());
    addToMap("TPC Sector TrackHits", usageMap, *processors()->tpcTrackers[i].NTrackHits(), processors()->tpcTrackers[i].NMaxTrackHits());
  }

  addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks());
  addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NMergedTrackClusters(), processors()->tpcMerger.NMaxMergedTrackClusters());

  // Note: n == bound for the O2 output buffers, which are sized exactly.
  addToMap("TPC O2 Tracks", usageMap, processors()->tpcMerger.NOutputTracksTPCO2(), processors()->tpcMerger.NOutputTracksTPCO2());
  addToMap("TPC O2 ClusRefs", usageMap, processors()->tpcMerger.NOutputClusRefsTPCO2(), processors()->tpcMerger.NOutputClusRefsTPCO2());

#ifdef GPUCA_TPC_GEOMETRY_O2
  addToMap("TPC ComprCache HitsAttached", usageMap, processors()->tpcCompressor.mOutput->nAttachedClusters, processors()->tpcCompressor.mMaxTrackClusters);
  addToMap("TPC ComprCache HitsUnattached", usageMap, processors()->tpcCompressor.mOutput->nUnattachedClusters, processors()->tpcCompressor.mMaxClustersInCache);
  addToMap("TPC ComprCache Tracks", usageMap, processors()->tpcCompressor.mOutput->nTracks, processors()->tpcCompressor.mMaxTracks);
#endif

  for (auto& elem : usageMap) {
    printf("Mem Usage %-30s : %'14zu / %'14zu (%3.0f%% / %3.0f%% / count %3u / max %'14zu)\n", elem.first.c_str(), elem.second.nSum, elem.second.nBoundSum, 100. * elem.second.nSum / std::max<size_t>(1, elem.second.nBoundSum), 100. * elem.second.maxUse, elem.second.count, elem.second.nMax);
  }
}
void GPUChainTracking::PrintMemoryRelations()
{
  for (uint32_t i = 0; i < NSECTORS; i++) { // per-sector loop (reconstructed)
    GPUInfo("MEMREL StartHits NCl %d NTrkl %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NStartHits());
    GPUInfo("MEMREL Tracklets NCl %d NTrkl %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracklets());
    GPUInfo("MEMREL TrackletHits NCl %d NTrkl %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NRowHits());
    GPUInfo("MEMREL SectorTracks NCl %d NTrk %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracks());
    GPUInfo("MEMREL SectorTrackHits NCl %d NTrkH %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTrackHits());
  }
  GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks());
  GPUInfo("MEMREL TrackHits NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTrackClusters());
}
void GPUChainTracking::PrepareKernelDebugOutput()
{
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
  // ... (body elided in this excerpt)
#endif
}

void GPUChainTracking::PrintKernelDebugOutput()
{
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
  // ... (body elided in this excerpt)
#endif
}
// Output statistics (enclosing function elided in this excerpt; nCls is the
// total number of TPC clusters, set earlier):
int32_t nTracks = 0, nAttachedClusters = 0, nAttachedClustersFitted = 0, nAdjacentClusters = 0;
// ... (loop over the merged tracks, accumulating the counters above)
for (uint32_t k = 0; k < nCls; k++) {
  // ... (per-cluster classification: attached / adjacent)
}

char trdText[1024] = "";
// ... (only filled when TRD tracking ran)
int32_t nTRDTracks = 0;
int32_t nTRDTracklets = 0;
// ... (branch over O2-format TRD tracks 'trk')
nTRDTracklets += trk.getNtracklets();
nTRDTracks += trk.getNtracklets() != 0; // a track counts if it has at least one tracklet
// ... (equivalent branch over GPU-format TRD tracks 'trk')
nTRDTracklets += trk.getNtracklets();
nTRDTracks += trk.getNtracklets() != 0;
// ...
snprintf(trdText, 1024, " - TRD Tracker reconstructed %d tracks (%d tracklets)", nTRDTracks, nTRDTracklets);
// ...
GPUInfo("Output Tracks: %d (%d / %d / %d / %d clusters (fitted / attached / adjacent / total) - %s format)%s", nTracks, nAttachedClustersFitted, nAttachedClusters, nAdjacentClusters, nCls, GetProcessingSettings().createO2Output > 1 ? "O2" : "GPU", trdText);
void GPUChainTracking::SanityCheck()
{
  unsigned long nErrors = 0; // reconstructed declaration; type chosen to match the %lu below
  // ... (loop over the merged output tracks 'trk')
  const auto& ref = trk.getClusterRef();
  // ... (check that 'ref' points inside the cluster-reference buffer)
  if (nErrors++ < 1000) { // log at most 1000 individual errors, but keep counting
    // ...
  }
  // ...
  if (nErrors++ < 1000) {
    // ...
  }
  for (int32_t j = 0; j < trk.getNClusters(); j++) {
    // ... (validate sector / row of every attached cluster)
    if (nErrors++ < 1000) {
      GPUError("Invalid sector / row %d / %d", (int32_t)sector, (int32_t)row);
    }
    // ...
    if (nErrors++ < 1000) {
      // ...
    }
  }
  // ...
  if (nErrors == 0) {
    GPUInfo("Sanity check passed");
  } else {
    GPUError("Sanity check found %lu errors", nErrors);
  }
}
// Two-phase TPC cluster filter (enclosing function and the sector/row loops over
// iSector / iRow are elided in this excerpt):
for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
  uint32_t countTotal = 0;
  // ...
  for (uint32_t k = 0; k < clusters->nClusters[iSector][iRow]; k++) {
    const auto& cl = clusters->clusters[iSector][iRow][k]; // reconstructed; 'cl' is the current cluster
    bool keep = true;                                      // reconstructed declaration
    if (applyClusterCuts) {
      // ... (apply the cluster cuts to 'keep')
    }
    keep = keep && (!filterType || clusterFilter.filter(iSector, iRow, cl));
    if (iPhase && keep) {
      outputBuffer[countTotal] = cl;
      // ...
    }
    // ... (countTotal is advanced for every kept cluster)
  }
  // ...
  if (iPhase) {
    clusters->clustersLinear = outputBuffer; // phase 1: publish the filtered buffer
    // ...
  } else {
    outputBuffer = allocator(countTotal); // phase 0: allocate exactly countTotal clusters
  }
}
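// Hedged sketch of the two-phase pattern above (illustration only): phase 0
// merely counts the survivors so the output buffer can be allocated exactly,
// phase 1 fills it. Names and the trivial cut are invented for the example.
#if 0
#include <cstdint>
#include <vector>
static std::vector<int32_t> filterTwoPhase(const std::vector<int32_t>& in)
{
  std::vector<int32_t> out;
  for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
    uint32_t countTotal = 0;
    for (size_t k = 0; k < in.size(); k++) {
      bool keep = in[k] >= 0; // stand-in for the cluster cuts
      if (keep) {
        if (iPhase) {
          out[countTotal] = in[k];
        }
        countTotal++;
      }
    }
    if (!iPhase) {
      out.resize(countTotal); // plays the role of 'allocator(countTotal)'
    }
  }
  return out;
}
#endif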
void GPUChainTracking::DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters)
{
  out << "\nTPC Clusters:\n";
  for (uint32_t iSec = 0; iSec < NSECTORS; iSec++) { // sector loop (reconstructed)
    out << "TPCClusters - Sector " << iSec << "\n";
    for (uint32_t i = 0; i < GPUCA_ROW_COUNT; i++) { // row loop (reconstructed bound)
      out << " Row: " << i << ": " << clusters->nClusters[iSec][i] << " clusters:\n";
      for (uint32_t j = 0; j < clusters->nClusters[iSec][i]; j++) {
        const auto& cl = clusters->clusters[iSec][i][j];
        // ... (print the fields of 'cl')
      }
    }
  }
}
void GPUChainTracking::DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls)
{
  auto& c = *cls; // reconstructed alias; the fragments below all refer to 'c'
  std::vector<uint32_t> sorted(c.nTracks), offsets(c.nTracks);
  std::iota(sorted.begin(), sorted.end(), 0);

  // Order the track indices by the key of each track's first attached cluster.
  auto sorter = [&c](const auto a, const auto b) {
    return std::tie(c.sliceA[a], c.rowA[a], c.timeA[a], c.padA[a], c.qPtA[a]) <
           std::tie(c.sliceA[b], c.rowA[b], c.timeA[b], c.padA[b], c.qPtA[b]);
  };
  std::sort(sorted.begin(), sorted.end(), sorter);

  // Prefix sums over the original track order: offsets[i] = first cluster of track i.
  uint32_t offset = 0;
  for (uint32_t i = 0; i < c.nTracks; i++) {
    offsets[i] = offset;           // loop body reconstructed; only the loop
    offset += c.nTrackClusters[i]; // header survives in the excerpt
  }

  // Permute one array into the new track order, using a temporary copy as source.
  auto sortArray = [&c, &sorted, &offsets](auto* src, size_t totalSize, auto getOffset, auto getSize) {
    auto buf = std::make_unique<std::remove_reference_t<decltype(src[0])>[]>(totalSize);
    memcpy(buf.get(), src, totalSize * sizeof(*src));
    uint32_t targetOffset = 0;
    for (uint32_t i = 0; i < c.nTracks; i++) {
      const uint32_t j = sorted[i];
      memcpy(src + targetOffset, buf.get() + getOffset(offsets[j], j), getSize(j) * sizeof(*src)); // reconstructed copy step
      targetOffset += getSize(j);
    }
  };

  // Apply sortArray to every array in a pack that shares the same layout.
  auto sortMultiple = [&sortArray](size_t totalSize, auto getOffset, auto getSize, auto&&... arrays) {
    (sortArray(arrays, totalSize, getOffset, getSize), ...); // fold expression (reconstructed)
  };

  auto getFullOffset = [](uint32_t off, uint32_t ind) { return off; };
  auto getReducedOffset = [](uint32_t off, uint32_t ind) { return off - ind; };
  auto getIndex = [](uint32_t off, uint32_t ind) { return ind; };
  auto getN = [&c](uint32_t j) { return c.nTrackClusters[j]; };
  auto getN1 = [&c](uint32_t j) { return c.nTrackClusters[j] - 1; };
  auto get1 = [](uint32_t j) { return 1; };

  // Per-cluster arrays: one entry per attached cluster.
  sortMultiple(c.nAttachedClusters, getFullOffset, getN, c.qTotA, c.qMaxA, c.flagsA, c.sigmaPadA, c.sigmaTimeA);
  // Residual arrays: one entry per attached cluster except the first of each track.
  sortMultiple(c.nAttachedClustersReduced, getReducedOffset, getN1, c.rowDiffA, c.sliceLegDiffA, c.padResA, c.timeResA);
  // Per-track arrays: one entry per track.
  sortMultiple(c.nTracks, getIndex, get1, c.qPtA, c.rowA, c.sliceA, c.timeA, c.padA, c.nTrackClusters);
}
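// Hedged, self-contained sketch of the permutation technique used above
// (illustration only): sort an index array by a key, then reorder the payload
// arrays through the sorted indices instead of sorting each array directly.
#if 0
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>
static void permuteByKey()
{
  std::vector<int32_t> key = {3, 1, 2};
  std::vector<float> payload = {30.f, 10.f, 20.f}; // parallel to 'key'
  std::vector<uint32_t> sorted(key.size());
  std::iota(sorted.begin(), sorted.end(), 0u);
  std::sort(sorted.begin(), sorted.end(), [&](uint32_t a, uint32_t b) { return key[a] < key[b]; });
  std::vector<float> buf = payload; // copy, then write back in the new order
  for (uint32_t i = 0; i < sorted.size(); i++) {
    payload[i] = buf[sorted[i]];
  }
  // payload is now {10.f, 20.f, 30.f}
}
#endif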
// Debug raw dump (enclosing function elided in this excerpt; dirName presumably
// comes from getDebugFolder()):
GPUInfo("Doing debug raw dump");
// ...
DumpData((dirName + "/event.0.dump").c_str(), &ioPtrs);