20#include "GPUChainTrackingGetters.inc"
61#include <sys/select.h>
86std::unique_ptr<GPUReconstructionTimeframe>
tf;
98 printf(
"Error parsing command line parameters\n");
103 printf(
"Config Dump before ReadConfiguration\n");
110 setlocale(LC_ALL,
"en_US.utf-8");
111 setlocale(LC_NUMERIC,
"en_US.utf-8");
117 printf(
"Setting affinitiy to restrict on CPU core %d\n",
configStandalone.cpuAffinity);
118 if (0 != sched_setaffinity(0,
sizeof(
mask), &
mask)) {
119 printf(
"Error setting CPU affinity\n");
124 printf(
"Setting FIFO scheduler\n");
126 sched_getparam(0, &
param);
127 param.sched_priority = 1;
128 if (0 != sched_setscheduler(0, SCHED_FIFO, &
param)) {
129 printf(
"Error setting scheduler\n");
138 feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
140 bool detMode =
false, noFTZMode =
false;
141#ifdef GPUCA_DETERMINISTIC_MODE
144#ifdef GPUCA_DETERMINISTIC_NO_FTZ
153 printf(
"Affinity setting not supported on Windows\n");
157 printf(
"FIFO Scheduler setting not supported on Windows\n");
161 printf(
"FPE not supported on Windows\n");
166#warning Why was configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0 needed?
173#ifndef GPUCA_BUILD_QA
175 printf(
"QA not enabled in build\n");
180 printf(
"Cannot run asynchronous processing with double pipeline\n");
184 printf(
"Double pipeline mode needs at least 4 runs per event and external output. To cycle though multiple events, use --preloadEvents and --runs n for n iterations round-robin\n");
188 printf(
"Cannot run --MERGE and --SIMBUNCHES togeterh\n");
205 printf(
"Can only produce QA pdf output when input files are specified!\n");
231 printf(
"setO2Settings requires the usage of --inputMemory and --outputMemory as in O2\n");
249 bool forceEmptyMemory = getenv(
"LD_PRELOAD") && strstr(getenv(
"LD_PRELOAD"),
"valgrind") !=
nullptr;
251 if (forceEmptyMemory) {
252 printf(
"Valgrind detected, emptying GPU output memory to avoid false positive undefined reads");
257 if (forceEmptyMemory) {
279 printf(
"No GPU backend / device found, running on CPU is disabled due to runGPUforce\n");
291 printf(
"Config Dump after ReadConfiguration\n");
305 auto tmpTRDGeometry = std::make_unique<o2::trd::GeometryFlat>();
307 auto tmpTRDRecoParam = std::make_unique<GPUTRDRecoParam>();
309 auto tmpdEdxCalibContainer = std::make_unique<o2::tpc::CalibdEdxContainer>();
310 tmpdEdxCalibContainer->setDefaultZeroSupresssionThreshold();
311 tmpdEdxCalibContainer->setDefaultPolTopologyCorrection();
313 auto tmpTPCPadGainCalib = std::make_unique<TPCPadGainCalib>();
315 auto tmpTPCZSLinkMapping = std::make_unique<TPCZSLinkMapping>();
320 printf(
"Wrote trivial calibration objects to current folder\n");
330 printf(
"Error reading event config file\n");
333 const char* tmptext =
configStandalone.noEvents ?
"Using default event settings, no event dir loaded" :
"Read event settings from dir ";
352 GPUSettingsProcessing procSet;
365 printf(
"ERROR: requested to overlay continuous data - not supported\n");
369 printf(
"Continuous mode forced\n");
386 printf(
"Standalone Test Framework for CA Tracker - Using CPU\n");
388 printf(
"Standalone Test Framework for CA Tracker - Using GPU\n");
395 throw std::runtime_error(
"Requested display not available");
397 printf(
"Enabling event display (%s backend)\n",
eventDisplay->frontendName());
400 procSet.runMC =
true;
405 procSet.runMC =
true;
408 steps.
steps = gpudatatypes::RecoStep::AllRecoSteps;
412 steps.
steps.
setBits(gpudatatypes::RecoStep::TRDTracking,
false);
422 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCMerging,
false);
423 steps.
steps.
setBits(gpudatatypes::RecoStep::TRDTracking,
false);
424 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCdEdx,
false);
425 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCCompression,
false);
426 steps.
steps.
setBits(gpudatatypes::RecoStep::Refit,
false);
430 steps.
steps.
setBits(gpudatatypes::RecoStep::TRDTracking,
false);
432 steps.
inputs.
set(gpudatatypes::InOutType::TPCClusters, gpudatatypes::InOutType::TRDTracklets);
433 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCDecompression,
false);
434 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCCompressedClusters,
false);
436 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCCompressedClusters,
true);
437 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCClusters,
false);
438 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCCompression,
false);
439 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCClusterFinding,
false);
440 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCDecompression,
true);
441 steps.
outputs.
setBits(gpudatatypes::InOutType::TPCCompressedClusters,
false);
444 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCClusters,
false);
446 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCClusterFinding,
false);
451 int32_t runAsyncQA = procSet.runQA && !
configStandalone.testSyncAsyncQcInSync ? procSet.runQA : 0;
453 procSet.eventDisplay =
nullptr;
455 procSet.runQA =
false;
469 steps.
outputs.
setBits(gpudatatypes::InOutType::TPCMergedTracks, steps.
steps.
isSet(gpudatatypes::RecoStep::TPCMerging));
470 steps.
outputs.
setBits(gpudatatypes::InOutType::TPCCompressedClusters, steps.
steps.
isSet(gpudatatypes::RecoStep::TPCCompression));
472 steps.
outputs.
setBits(gpudatatypes::InOutType::TPCClusters, steps.
steps.
isSet(gpudatatypes::RecoStep::TPCClusterFinding));
474 if (steps.
steps.
isSet(gpudatatypes::RecoStep::TRDTracking)) {
475 if (procSet.createO2Output && !procSet.trdTrackModelO2) {
476 procSet.createO2Output = 1;
486 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCDecompression,
true);
487 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCdEdx,
true);
488 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCCompression,
false);
489 steps.
steps.
setBits(gpudatatypes::RecoStep::TPCClusterFinding,
false);
490 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCRaw,
false);
491 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCClusters,
false);
492 steps.
inputs.
setBits(gpudatatypes::InOutType::TPCCompressedClusters,
true);
493 steps.
outputs.
setBits(gpudatatypes::InOutType::TPCCompressedClusters,
false);
494 procSet.runMC =
false;
495 procSet.runQA = runAsyncQA;
497 procSet.runCompressionStatistics = 0;
499 procSet.tpcInputWithClusterRejection = 1;
501 recSet.tpc.disableRefitAttachment = 0xFF;
502 recSet.maxTrackQPtB5 = CAMath::Min(recSet.maxTrackQPtB5, recSet.tpc.rejectQPtB5);
526 procSet.o2PropagatorUseGPUField =
true;
529 printf(
"Error initializing GPUReconstruction!\n");
534 printf(
"ERROR registering memory for the GPU!!!\n");
540 printf(
"ERROR registering input memory for the GPU!!!\n");
559#if !defined(GPUCA_RUN2) && defined(GPUCA_BUILD_QA) && defined(GPUCA_STANDALONE)
565 throw std::runtime_error(
"Error reading O2 MC dump");
571 printf(
"Converting Native to Legacy ClusterData for overlaying - WARNING: No raw clusters produced - Compression etc will not run!!!\n");
581 if (
tf->LoadCreateTimeFrame(iEvent)) {
585 if (
tf->LoadMergedEvents(iEvent)) {
595 if (encodeZS || zsFilter) {
597 printf(
"Need digit input to run ZS\n");
613 printf(
"Converting Legacy Raw Cluster to Native\n");
628 printf(
"Need cluster native data for on-the-fly TPC transform\n");
650 if (nTracksTotal && nClustersTotal) {
651 *nTracksTotal += nTracks;
658 int32_t iRun = 0, iteration = 0;
659 while ((iteration =
nIteration.fetch_add(1)) < runs) {
661 printf(
"Run %d (thread %d)\n", iteration + 1, threadId);
671 printf(
"Running synchronous phase\n");
674 chainTrackingUse->
mIOPtrs = ioPtrs;
677 timerPipeline->Start();
687 timerPipeline->Stop();
694 if (tmpRetVal == 0 || tmpRetVal == 2) {
695 OutputStat(chainTrackingUse, iRun == 0 ? nTracksTotal :
nullptr, iRun == 0 ? nClustersTotal :
nullptr);
727 if (tmpRetVal == 0 || tmpRetVal == 2) {
736 if (tmpRetVal == 2) {
741 printf(
"GPU Standalone Benchmark: Non-FATAL GPU error occured, ignoring\n");
743 if (tmpRetVal != 2) {
744 printf(
"GPU Standalone Benchmark: Error occured\n");
757int32_t
main(
int argc,
char** argv)
759 std::unique_ptr<GPUReconstruction> recUnique, recUniqueAsync, recUniquePipeline;
769 deviceSet.
master =
nullptr;
771 rec = recUnique.get();
782 printf(
"Error initializing GPUReconstruction\n");
816 std::unique_ptr<std::thread> pipelineThread;
822 std::random_device
rd;
862 printf(
"No event data found in event folder\n");
872 printf(
"Preloading events%s",
configStandalone.proc.debugLevel >= 2 ?
"\n" :
"");
882 for (int32_t iRunOuter = 0; iRunOuter <
configStandalone.runs2; iRunOuter++) {
888 printf(
"\nRUN2: %d\n", iRunOuter);
892 int32_t nEventsProcessed = 0;
912 snprintf(fname, 1024,
"event.%d.dump", nEventsProcessed);
914 if (nEventsProcessed == 0) {
922 printf(
"Cannot override max time bin for non-continuous data!\n");
940 double pipelineWalltime = 1.;
942 printf(
"No processing, no events loaded\n");
946 if (
configStandalone.proc.debugLevel < 2 && (
RunBenchmark(
rec,
chainTracking, 1, iEvent, &nTracksTotal, &nClustersTotal) ||
RunBenchmark(
recPipeline,
chainTrackingPipeline, 2, iEvent, &nTracksTotal, &nClustersTotal))) {
951 if (pipeline1.get() || pipeline2.get()) {
954 pipelineWalltime = timerPipeline.GetElapsedTime() / (
configStandalone.runs - 2);
955 printf(
"Pipeline wall time: %f, %d iterations, %f per event\n", timerPipeline.GetElapsedTime(),
configStandalone.runs - 2, pipelineWalltime);
957 printf(
"Processing Event %d\n", iEvent);
967 const int32_t nOrbits = 32;
968 const double colRate = 50000;
969 const double orbitRate = 11245;
970 const double nClsPerTF = 755851. * nOrbits * colRate / orbitRate;
972 const double nGPUsReq = timePerTF * orbitRate / nOrbits;
974 snprintf(stat, 1024,
"Sync phase: %.2f sec per %d orbit TF, %.1f GPUs required", timePerTF, nOrbits, nGPUsReq);
977 snprintf(stat + strlen(stat), 1024 - strlen(stat),
" - Async phase: %f sec per TF", timePerTF);
979 printf(
"%s (Measured %s time - Extrapolated from %d clusters to %d)\n", stat,
configStandalone.proc.debugLevel ?
"kernel" :
"wall", (int32_t)
nClusters, (int32_t)nClsPerTF);
992 if (nEventsProcessed > 1) {
993 printf(
"Total: %ld clusters, %ld tracks\n", nClustersTotal, nTracksTotal);
1004 fedisableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
1010 pipelineThread->join();
1019 printf(
"Error unregistering memory\n");
1025 printf(
"Press a key to exit!\n");
Definition of container class for dE/dx corrections.
Container to store compressed TPC cluster data.
#define GPUCA_EVDUMP_FILE
double GetCurrentElapsedTime(bool reset=false)
bitfield & setBits(const bitfield v, bool w)
bool isSet(const bitfield &v) const
GPUd() value_type estimateLTFast(o2 static GPUd() float estimateLTIncrement(const o2 PropagatorImpl * Instance(bool uninitialized=false)
static void ApplySyncSettings(GPUSettingsProcessing &proc, GPUSettingsRec &rec, gpudatatypes::RecoStepField &steps, bool syncMode, int32_t dEdxMode=-2)
void SetQAFromForeignChain(GPUChainTracking *chain)
void SetCalibObjects(const GPUCalibObjectsConst &obj)
void ConvertNativeToClusterDataLegacy()
void ConvertZSFilter(bool zs12bit)
void SetO2Propagator(const o2::base::Propagator *prop)
const GPUTRDGeometry * GetTRDGeometry() const
const TPCFastTransformPOD * GetTPCTransform() const
const o2::base::MatLayerCylSet * GetMatLUT() const
void ConvertRun2RawToNative()
const GPUSettingsDisplay * mConfigDisplay
void DumpData(const char *filename, const GPUTrackingInOutPointers *ioPtrs=nullptr)
const GPUQA * GetQA() const
void ConvertZSEncoder(int32_t version)
GPUTrackingInOutPointers & mIOPtrs
struct o2::gpu::GPUChainTracking::InOutMemory mIOMem
int32_t ReadData(const char *filename)
const GPUSettingsQA * mConfigQA
const GPUSettingsProcessing & GetProcessingSettings() const
static constexpr int32_t NSECTORS
const GPUConstantMem * GetProcessors() const
static GPUDisplayFrontendInterface * getFrontend(const char *type)
int32_t ReadO2MCData(const char *filename)
void UpdateChain(GPUChainTracking *chain)
static int32_t GetMaxTimeBin(const o2::tpc::ClusterNativeAccess &native)
static constexpr int32_t TPCZ
static constexpr int32_t DRIFT_TIME
static DeviceType GetDeviceType(const char *type)
void SetInputControl(void *ptr, size_t size)
void TerminatePipelineWorker()
virtual void startGPUProfiling()
void SetResetTimers(bool reset)
static bool CheckInstanceAvailable(DeviceType type, bool verbose)
void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration *workflow=nullptr)
T * AddChain(Args... args)
static GPUReconstruction * CreateInstance(const GPUSettingsDeviceBackend &cfg)
void UpdateSettings(const GPUSettingsGRP *g, const GPUSettingsProcessing *p=nullptr, const GPUSettingsRecDynamic *d=nullptr)
virtual int32_t RunChains()=0
const GPUParam & GetParam() const
void ClearAllocatedMemory(bool clearOutputs=true)
double GetStatKernelTime()
const GPUSettingsProcessing & GetProcessingSettings() const
void DumpSettings(const char *dir="")
int32_t unregisterMemoryForGPU(const void *ptr)
int32_t registerMemoryForGPU(const void *ptr, size_t size)
const GPUSettingsGRP & GetGRPSettings() const
void SetDebugLevelTmp(int32_t level)
virtual void PrintKernelOccupancies()
virtual void endGPUProfiling()
int32_t ReadSettings(const char *dir="")
void SetOutputControl(const GPUOutputControl &v)
static void RunEventGenerator(GPUChainTracking *rec, const std::string &dir)
GLenum GLenum GLsizei len
DeviceType GetDeviceType(const char *type)
GPUSettingsStandalone configStandalone
std::string to_string(gsl::span< T, Size > span)
int32_t qConfigParse(int argc, const char **argv, const char *filename)
std::unique_ptr< GPUReconstructionTimeframe > tf
std::unique_ptr< char, GPUReconstruction::alignedDefaultBufferDeleter > outputmemoryPipeline(nullptr, GPUReconstruction::alignedDefaultBufferDeleter())
std::unique_ptr< char, GPUReconstruction::alignedDefaultBufferDeleter > inputmemory(nullptr, GPUReconstruction::alignedDefaultBufferDeleter())
int32_t RunBenchmark(GPUReconstruction *recUse, GPUChainTracking *chainTrackingUse, int32_t runs, int32_t iEvent, int64_t *nTracksTotal, int64_t *nClustersTotal, int32_t threadId=0, HighResTimer *timerPipeline=nullptr)
int32_t SetupReconstruction()
std::atomic< uint32_t > nIteration
GPUReconstruction * recPipeline
int32_t nEventsInDirectory
uint32_t syncAsyncDecodedClusters
int32_t ReadConfiguration(int argc, char **argv)
std::unique_ptr< char, GPUReconstruction::alignedDefaultBufferDeleter > outputmemory(nullptr, GPUReconstruction::alignedDefaultBufferDeleter())
int32_t LoadEvent(int32_t iEvent, int32_t x)
std::vector< GPUTrackingInOutPointers > ioPtrEvents
GPUChainITS * chainITSPipeline
int32_t ReadEvent(int32_t n)
std::vector< GPUChainTracking::InOutMemory > ioMemEvents
std::unique_ptr< GPUDisplayFrontendInterface > eventDisplay
GPUChainTracking * chainTrackingAsync
GPUChainITS * chainITSAsync
GPUChainTracking * chainTrackingPipeline
GPUReconstruction * recAsync
void OutputStat(GPUChainTracking *t, int64_t *nTracksTotal=nullptr, int64_t *nClustersTotal=nullptr)
std::atomic< uint32_t > nIterationEnd
GPUChainTracking * chainTracking
void CreateTrivialCalibObjects()
S< o2::trd::GeometryFlat >::type * trdGeometry
S< o2::tpc::CalibdEdxContainer >::type * dEdxCalibContainer
S< TPCZSLinkMapping >::type * tpcZSLinkMapping
S< TPCFastTransformPOD >::type * fastTransform
S< GPUTRDRecoParam >::type * trdRecoParam
S< TPCPadGainCalib >::type * tpcPadGain
gpudatatypes::RecoStepField steps
gpudatatypes::InOutTypeField inputs
gpudatatypes::RecoStepField stepsGPUMask
gpudatatypes::InOutTypeField outputs
GPUReconstruction * master
int32_t doCompClusterDecode
int32_t grpContinuousMaxTimeBin
float solenoidBzNominalGPU
const o2::tpc::ClusterNativeAccess * clustersNative
const GPUTPCMCInfo * mcInfosTPC
uint32_t nClusterData[NSECTORS]
uint32_t nRawClusters[NSECTORS]
const o2::tpc::CompressedClustersFlat * tpcCompressedClusters
const AliHLTTPCClusterMCLabel * mcLabelsTPC
const GPUTrackingInOutZS * tpcZS
const AliHLTTPCRawCluster * rawClusters[NSECTORS]
const GPUTPCClusterData * clusterData[NSECTORS]
uint32_t nOutputTracksTPCO2
uint32_t nMergedTrackHits
const GPUTrackingInOutDigits * tpcPackedDigits
const GPUTPCMCInfoCol * mcInfosTPCCol
const GPUTPCGMMergedTrack * mergedTracks
GPUTPCGMPolynomialField polynomialField
unsigned int nClustersTotal
unsigned int nUnattachedClusters
unsigned int nAttachedClusters
typename std::vector< T, vecpod_allocator< T > > vecpod