d2/d51/GPU_2GPUbenchmark_2Shared_2Utils_8h_source.html

// Copyright 2019-2020 CERN and copyright holders of ALICE O2.

// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.

// All rights not expressly granted are reserved.

//

// This software is distributed under the terms of the GNU General Public

// License v3 (GPL Version 3), copied verbatim in the file "COPYING".

//

// In applying this license CERN does not waive the privileges and immunities

// granted to it by virtue of its status as an Intergovernmental Organization

// or submit itself to any jurisdiction.


#ifndef GPU_BENCHMARK_UTILS_H

#define GPU_BENCHMARK_UTILS_H


#if defined(__HIPCC__)

#include "hip/hip_runtime.h"

#endif


#include <iostream>

#include <sstream>

#include <iomanip>

#include <typeinfo>

#include <boost/program_options.hpp>

#include <vector>

#include <string>

#include <cmath>


// ANSI SGR escape sequences for colouring terminal output.
// KNRM resets all attributes; the others select foreground colours 31-37.
#define KNRM "\x1B[0m"
#define KRED "\x1B[31m"
#define KGRN "\x1B[32m"
#define KYEL "\x1B[33m"
#define KBLU "\x1B[34m"
// Previous spelling of the colour-34 macro; kept as an alias of KBLU so that
// any code still using the old name keeps compiling.
#define configLU KBLU
#define KMAG "\x1B[35m"
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"


// Number of bytes in one gibibyte (2^30). The expression has type int.
#define GB (1024 * 1024 * 1024)


// Reports a fatal error and terminates the process.
//
// Prints "error: " followed by the printf-style message given as arguments
// (in the KRED colour), then "error: TEST FAILED", restores the KNRM colour
// and calls exit(EXIT_FAILURE).
//
// The body is wrapped in do { ... } while (0) so that the expansion is one
// statement: with the bare statement list, `if (cond) failed("x");` guarded
// only the first printf and exit() was always reached. Call sites must end
// the invocation with a semicolon.
#define failed(...)                         \
  do {                                      \
    printf("%serror: ", KRED);              \
    printf(__VA_ARGS__);                    \
    printf("\n");                           \
    printf("error: TEST FAILED\n%s", KNRM); \
    exit(EXIT_FAILURE);                     \
  } while (0)


// NOTE(review): this #endif pairs with `#ifndef GPU_BENCHMARK_UTILS_H` above, so
// the declarations that follow it are outside the include guard.
#endif


/// Accepts any value by const reference and deliberately does nothing with it.
/// Presumably used to silence unused-result warnings -- confirm against callers.
template <class T>
void discardResult(const T& /*ignored*/)
{
  // Intentionally empty.
}


/// Kinds of benchmark test: read, write and copy, plus their "random" variants.
enum class Test {
  Read,
  Write,
  Copy,
  RandomRead,
  RandomWrite,
  RandomCopy
};

/// Writes the lowercase label of `test` to `os` and returns `os`.
/// A value that matches no enumerator writes nothing.
inline std::ostream& operator<<(std::ostream& os, Test test)
{
  const char* label = nullptr;
  switch (test) {
    case Test::Read:
      label = "read";
      break;
    case Test::Write:
      label = "write";
      break;
    case Test::Copy:
      label = "copy";
      break;
    case Test::RandomRead:
      label = "random read";
      break;
    case Test::RandomWrite:
      label = "random write";
      break;
    case Test::RandomCopy:
      label = "random copy";
      break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}


/// Execution modes of a benchmark: sequential, concurrent or distributed.
enum class Mode {
  Sequential,
  Concurrent,
  Distributed
};

/// Writes the lowercase label of `mode` to `os` and returns `os`.
/// A value that matches no enumerator writes nothing.
inline std::ostream& operator<<(std::ostream& os, Mode mode)
{
  if (mode == Mode::Sequential) {
    return os << "sequential";
  }
  if (mode == Mode::Concurrent) {
    return os << "concurrent";
  }
  if (mode == Mode::Distributed) {
    return os << "distributed";
  }
  return os;
}


/// Kernel launch configurations: single, multi, all or manual.
enum class KernelConfig {
  Single,
  Multi,
  All,
  Manual
};

/// Writes the lowercase label of `config` to `os` and returns `os`.
/// Note that Multi is printed as "multiple". A value that matches no
/// enumerator writes nothing.
inline std::ostream& operator<<(std::ostream& os, KernelConfig config)
{
  switch (config) {
    case KernelConfig::Single:
      return os << "single";
    case KernelConfig::Multi:
      return os << "multiple";
    case KernelConfig::All:
      return os << "all";
    case KernelConfig::Manual:
      return os << "manual";
  }
  return os;
}


/// Returns a printable name for the type T.
///
/// Recognised types are int8_t ("int8_t"), size_t (reported as "uint64_t"),
/// int32_t ("int32_t") and int4 ("int4"); anything else yields "unknown".
///
/// Types are compared with std::type_info::operator== instead of comparing
/// the pointers returned by name(): name() yields a C string, and comparing
/// two such pointers tests addresses, which the standard does not guarantee
/// to be equal for the same type.
template <class T>
inline std::string getType()
{
  const std::type_info& requested = typeid(T);
  if (requested == typeid(int8_t)) {
    return std::string{"int8_t"};
  }
  if (requested == typeid(size_t)) {
    return std::string{"uint64_t"};
  }
  if (requested == typeid(int32_t)) {
    return std::string{"int32_t"};
  }
  if (requested == typeid(int4)) {
    return std::string{"int4"};
  }
  return std::string{"unknown"};
}


/// Builds a short test identifier of the form "<mode>_<test>_<blocks>".
///
/// - mode:   "seq" for Mode::Sequential, "conc" for every other mode.
/// - test:   "read" for Test::Read, "write" for Test::Write, "copy" for
///           every other test.
/// - blocks: "SB" for KernelConfig::Single, "MB" for every other config.
inline std::string getTestName(Mode mode, Test test, KernelConfig blocks)
{
  const char* modeTag = "conc_";
  if (mode == Mode::Sequential) {
    modeTag = "seq_";
  }

  const char* testTag = "copy_";
  if (test == Test::Read) {
    testTag = "read_";
  } else if (test == Test::Write) {
    testTag = "write_";
  }

  const char* blockTag = "MB";
  if (blocks == KernelConfig::Single) {
    blockTag = "SB";
  }

  std::string label{modeTag};
  label += testTag;
  label += blockTag;
  return label;
}


/// Returns `scratchPtr` advanced by `startGB` gibibytes.
/// The byte offset is truncated to an integer and its low 12 bits are
/// cleared, i.e. it is rounded down to a multiple of 4096 bytes.
template <class chunk_t>
inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB)
{
  const size_t requestedOffset = static_cast<size_t>(GB * startGB);
  const size_t alignedOffset = requestedOffset & 0xFFFFFFFFFFFFF000;
  char* const base = reinterpret_cast<char*>(scratchPtr);
  return reinterpret_cast<chunk_t*>(base + alignedOffset);
}


/// Returns 1e3 * chunkSizeGB * ntests / result, evaluated in double precision
/// and narrowed to float.
/// `test` is currently not used in the computation.
/// NOTE(review): the 1e3 factor suggests `result` is a time in milliseconds,
/// giving GB/s -- confirm against the caller.
inline float computeThroughput(Test test, float result, float chunkSizeGB, int32_t ntests)
{
  // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html
  // Eff_bandwidth (GB/s) = (B_r + B_w) / (~1e9 * Time (s))
  const double scaledVolume = 1e3 * chunkSizeGB * static_cast<float>(ntests);
  return scaledVolume / result;
}


/// Returns the number of chunk_t elements that fit in `chunkSizeGB` gibibytes.
///
/// The byte size is rounded down to a multiple of 4096 before being divided
/// by sizeof(chunk_t). When `prime` is non-zero and the capacity is an exact
/// multiple of it, the capacity is reduced by 0x1000 elements.
///
/// The reduction is skipped when the capacity is smaller than 0x1000:
/// subtracting from it would wrap the unsigned result around to a huge
/// value (this happened, for instance, for a capacity of 0, which is a
/// multiple of every prime).
template <class chunk_t>
inline size_t getBufferCapacity(float chunkSizeGB, int32_t prime)
{
  constexpr size_t kAlignMask = 0xFFFFFFFFFFFFF000; // clears the low 12 bits
  constexpr size_t kReduction = 0x1000;

  const size_t chunkCapacity = (static_cast<size_t>(GB * chunkSizeGB) & kAlignMask) / sizeof(chunk_t);
  if (!prime) {
    return chunkCapacity;
  }
  if (chunkCapacity % prime != 0) {
    return chunkCapacity;
  }
  // Capacity is a multiple of prime: back off, unless that would underflow.
  return (chunkCapacity >= kReduction) ? (chunkCapacity - kReduction) : chunkCapacity;
}


/// Returns true if `n` is a prime number.
///
/// Every value below 2 (including all negative numbers) is reported as not
/// prime. Trial division tests odd candidates only, and the bound is checked
/// with a 64-bit product so that it cannot overflow for n close to INT32_MAX.
inline bool is_prime(const int32_t n)
{
  if (n < 2) {
    return false;
  }
  if (n < 4) {
    return true; // 2 and 3
  }
  if (n % 2 == 0) {
    return false;
  }
  for (long long divisor = 3; divisor * divisor <= n; divisor += 2) {
    if (n % divisor == 0) {
      return false;
    }
  }
  return true;
}


namespace o2

{

namespace benchmark

{


// Collection of benchmark settings; every member carries the default value
// given by its initializer.
// NOTE(review): member meanings hedged below are inferred from names only --
// confirm against the code that fills this struct.
struct benchmarkOpts {

  benchmarkOpts() = default;


  int32_t deviceId = 0; // presumably the index of the GPU to benchmark -- TODO confirm

  std::vector<Test> tests = {Test::Read, Test::Write, Test::Copy}; // default: read, write and copy (no random variants)

  std::vector<Mode> modes = {Mode::Sequential, Mode::Concurrent}; // default: sequential and concurrent (not distributed)

  std::vector<KernelConfig> pools = {KernelConfig::Single, KernelConfig::Multi}; // default: single and multi

  std::vector<std::string> dtypes = {"int8_t", "int32_t", "uint64_t"}; // type names; these match strings returned by getType()

  std::vector<std::pair<float, float>> testChunks; // empty by default; same element type as gpuState::testChunks

  float chunkReservedGB = 1.f; // presumably the size of each partition in GB, as in gpuState -- TODO confirm

  float threadPoolFraction = 1.f;

  float freeMemoryFractionToAllocate = 0.95f;

  int32_t numThreads = -1; // -1 by default; presumably means "not set" -- TODO confirm

  int32_t numBlocks = -1; // -1 by default; presumably means "not set" -- TODO confirm

  int32_t kernelLaunches = 1;

  int32_t nTests = 1;

  bool raw = false;

  int32_t streams = 8;

  int32_t prime = 0; // 0 by default; presumably forwarded to getBufferCapacity() -- TODO confirm

  std::string outFileName = "benchmark_result";

  bool dumpChunks = false;

};


/// Benchmark state for a scratch buffer of chunk_t elements.
///
/// All scalar and pointer members are zero-initialised so that a
/// default-constructed object never exposes indeterminate values; the
/// accessors are const so they can be called on a const object.
template <class chunk_t>
struct gpuState {

  /// Number of whole partitions of `chunkReservedGB` gibibytes that fit in
  /// `scratchSize` bytes (the quotient is truncated towards zero).
  /// Returns 0 when `chunkReservedGB` is not a positive number, instead of
  /// dividing by zero or producing a negative count.
  int32_t getMaxChunks() const
  {
    if (!(chunkReservedGB > 0.f)) {
      return 0;
    }
    return static_cast<int32_t>(static_cast<double>(scratchSize) / (chunkReservedGB * GB));
  }

  /// Returns `iterations`.
  int32_t getNKernelLaunches() const { return iterations; }
  /// Returns `streams`.
  int32_t getStreamsPoolSize() const { return streams; }

  // Configuration
  size_t nMaxThreadsPerDimension = 0;
  int32_t iterations = 0;
  int32_t streams = 0;

  float chunkReservedGB = 0.f; // Size of each partition (GB)

  // General containers and state
  chunk_t* scratchPtr = nullptr;                   // Pointer to scratch buffer
  size_t scratchSize = 0;                          // Size of scratch area (B)
  std::vector<chunk_t*> partAddrOnHost;            // Pointers to scratch partitions on host vector
  std::vector<std::pair<float, float>> testChunks; // Vector of definitions for arbitrary chunks

  // Static info
  size_t totalMemory = 0;
  size_t nMultiprocessors = 0;
  size_t nMaxThreadsPerBlock = 0;
};


} // namespace benchmark

} // namespace o2

i
int32_t i
Definition GPUCommonAlgorithm.h:436

operator<<
std::ostream & operator<<(std::ostream &os, Test test)
Definition Utils.h:64

getCustomPtr
chunk_t * getCustomPtr(chunk_t *scratchPtr, float startGB)
Definition Utils.h:167

GB
#define GB
Definition Utils.h:40

Mode
Mode
Definition Utils.h:89

Mode::Concurrent
@ Concurrent

Mode::Distributed
@ Distributed

Mode::Sequential
@ Sequential

computeThroughput
float computeThroughput(Test test, float result, float chunkSizeGB, int32_t ntests)
Definition Utils.h:172

getType
std::string getType()
Definition Utils.h:138

getBufferCapacity
size_t getBufferCapacity(float chunkSizeGB, int32_t prime)
Definition Utils.h:181

is_prime
bool is_prime(const int32_t n)
Definition Utils.h:191

Test
Test
Definition Utils.h:55

Test::Write
@ Write

Test::RandomCopy
@ RandomCopy

Test::Copy
@ Copy

Test::Read
@ Read

Test::RandomRead
@ RandomRead

Test::RandomWrite
@ RandomWrite

getTestName
std::string getTestName(Mode mode, Test test, KernelConfig blocks)
Definition Utils.h:155

KernelConfig
KernelConfig
Definition Utils.h:111

KernelConfig::Single
@ Single

KernelConfig::All
@ All

KernelConfig::Multi
@ Multi

KernelConfig::Manual
@ Manual

discardResult
void discardResult(const T &)
Definition Utils.h:51

n
GLdouble n
Definition glcorearb.h:1982

mode
GLenum mode
Definition glcorearb.h:266

result
GLuint64EXT * result
Definition glcorearb.h:5662

name
GLuint const GLchar * name
Definition glcorearb.h:781

o2
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
Definition BitstreamReader.h:24

test
FIXME: do not use data model tables.
Definition benchmark_ASoAHelpers.cxx:26

int4
Definition GPUCommonDefAPI.h:70

o2::benchmark::benchmarkOpts
Definition Utils.h:212

o2::benchmark::benchmarkOpts::numBlocks
int32_t numBlocks
Definition Utils.h:225

o2::benchmark::benchmarkOpts::kernelLaunches
int32_t kernelLaunches
Definition Utils.h:226

o2::benchmark::benchmarkOpts::dtypes
std::vector< std::string > dtypes
Definition Utils.h:219

o2::benchmark::benchmarkOpts::testChunks
std::vector< std::pair< float, float > > testChunks
Definition Utils.h:220

o2::benchmark::benchmarkOpts::raw
bool raw
Definition Utils.h:228

o2::benchmark::benchmarkOpts::streams
int32_t streams
Definition Utils.h:229

o2::benchmark::benchmarkOpts::pools
std::vector< KernelConfig > pools
Definition Utils.h:218

o2::benchmark::benchmarkOpts::benchmarkOpts
benchmarkOpts()=default

o2::benchmark::benchmarkOpts::outFileName
std::string outFileName
Definition Utils.h:231

o2::benchmark::benchmarkOpts::deviceId
int32_t deviceId
Definition Utils.h:215

o2::benchmark::benchmarkOpts::numThreads
int32_t numThreads
Definition Utils.h:224

o2::benchmark::benchmarkOpts::chunkReservedGB
float chunkReservedGB
Definition Utils.h:221

o2::benchmark::benchmarkOpts::tests
std::vector< Test > tests
Definition Utils.h:216

o2::benchmark::benchmarkOpts::threadPoolFraction
float threadPoolFraction
Definition Utils.h:222

o2::benchmark::benchmarkOpts::modes
std::vector< Mode > modes
Definition Utils.h:217

o2::benchmark::benchmarkOpts::nTests
int32_t nTests
Definition Utils.h:227

o2::benchmark::benchmarkOpts::freeMemoryFractionToAllocate
float freeMemoryFractionToAllocate
Definition Utils.h:223

o2::benchmark::benchmarkOpts::dumpChunks
bool dumpChunks
Definition Utils.h:232

o2::benchmark::benchmarkOpts::prime
int32_t prime
Definition Utils.h:230

o2::benchmark::gpuState
Definition Utils.h:236

o2::benchmark::gpuState::nMultiprocessors
size_t nMultiprocessors
Definition Utils.h:260

o2::benchmark::gpuState::streams
int32_t streams
Definition Utils.h:248

o2::benchmark::gpuState::iterations
int32_t iterations
Definition Utils.h:247

o2::benchmark::gpuState::scratchSize
size_t scratchSize
Definition Utils.h:254

o2::benchmark::gpuState::totalMemory
size_t totalMemory
Definition Utils.h:259

o2::benchmark::gpuState::scratchPtr
chunk_t * scratchPtr
Definition Utils.h:253

o2::benchmark::gpuState::nMaxThreadsPerDimension
size_t nMaxThreadsPerDimension
Definition Utils.h:246

o2::benchmark::gpuState::testChunks
std::vector< std::pair< float, float > > testChunks
Definition Utils.h:256

o2::benchmark::gpuState::getNKernelLaunches
int32_t getNKernelLaunches()
Definition Utils.h:242

o2::benchmark::gpuState::getStreamsPoolSize
int32_t getStreamsPoolSize()
Definition Utils.h:243

o2::benchmark::gpuState::chunkReservedGB
float chunkReservedGB
Definition Utils.h:250

o2::benchmark::gpuState::nMaxThreadsPerBlock
size_t nMaxThreadsPerBlock
Definition Utils.h:261

o2::benchmark::gpuState::getMaxChunks
int32_t getMaxChunks()
Definition Utils.h:237

o2::benchmark::gpuState::partAddrOnHost
std::vector< chunk_t * > partAddrOnHost
Definition Utils.h:255