22#ifdef RANS_PARALLEL_STL
27#include <benchmark/benchmark.h>
37#ifdef ENABLE_VTUNE_PROFILER
53template <
typename source_T>
60 const size_t draws = std::min(1ul << 20,
static_cast<size_t>(std::numeric_limits<source_T>::max()));
61 const double probability = 0.5;
62 std::binomial_distribution<source_T> dist(draws, probability);
63 const size_t sourceSize = messageSize /
sizeof(
source_T);
64 mSourceMessage.resize(sourceSize);
65#ifdef RANS_PARALLEL_STL
66 std::generate(std::execution::par_unseq, mSourceMessage.begin(), mSourceMessage.end(), [&dist, &mt]() { return dist(mt); });
68 std::generate(mSourceMessage.begin(), mSourceMessage.end(), [&dist, &mt]() { return dist(mt); });
75 double_t expectationValue = std::accumulate(mRenormedHistogram.begin(), mRenormedHistogram.end(), 0.0, [
this](
const double_t&
a,
const count_t&
b) {
76 double_t prb = static_cast<double_t>(b) / static_cast<double_t>(mRenormedHistogram.getNumSamples());
80 mState = ((
LowerBound >> mRenormedHistogram.getRenormingBits()) <<
StreamBits) * expectationValue;
89 std::vector<source_T> mSourceMessage{};
101 if constexpr (std::is_same_v<uint8_t, T>) {
103 }
else if constexpr (std::is_same_v<uint16_t, T>) {
110template <
typename source_T>
111struct Fixture :
public benchmark::Fixture {
114 void SetUp(const ::benchmark::State& state)
final
116 const auto&
sourceMessage = getData<source_T>().getSourceMessage();
117 const auto& renormedHistogram = getData<source_T>().getRenormedHistogram();
124 void TearDown(const ::benchmark::State& state)
final
131 size_t mRenormingBits = getData<source_T>().getRenormedHistogram().getRenormingBits();
135template <
typename source_T, simd::SIMDW
idth w
idth_V>
136struct SIMDFixture :
public benchmark::Fixture {
141 SetUp(const ::benchmark::State&
state)
final
143 mState[0] = simd::setAll<width_V>(getData<source_T>().getState());
144 mState[1] = simd::setAll<width_V>(getData<source_T>().getState());
146 const auto&
sourceMessage = getData<source_T>().getSourceMessage();
147 const auto& renormedHistogram = getData<source_T>().getRenormedHistogram();
150 if constexpr (width_V == simd::SIMDWidth::SSE) {
160 if constexpr (width_V == simd::SIMDWidth::AVX) {
173 void TearDown(const ::benchmark::State&
state)
final
175 mFrequencies.clear();
178 static constexpr size_t nElems = simd::getElementCount<ransState_t>(width_V);
179 std::vector<std::array<simd::epi32_t<simd::SIMDWidth::SSE>, 2>> mFrequencies{};
180 simd::simdI_t<width_V> mState[2];
181 uint8_t mRenormingBits = getData<source_T>().getRenormedHistogram().getRenormingBits();
185template <
typename stream_IT>
186inline std::tuple<ransState_t, stream_IT>
renorm(
ransState_t state, stream_IT outputIter,
count_t frequency,
size_t symbolTablePrecision)
189 if (
state >= maxState) {
194 return std::make_tuple(
state, outputIter);
197template <
class fixture_T>
198static void ransRenormingBenchmark(benchmark::State&
st, fixture_T& fixture)
200 std::vector<stream_t> out(fixture.mFrequencies.size() * 4);
203 auto outIter = out.data();
205 for (
size_t i = 0;
i < fixture.mFrequencies.size(); ++
i) {
206 std::tie(newState, outIter) =
renorm(fixture.mState, outIter, fixture.mFrequencies[
i], fixture.mRenormingBits);
208 benchmark::ClobberMemory();
211 st.SetItemsProcessed(int64_t(
st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size());
212 st.SetBytesProcessed(int64_t(
st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size() *
sizeof(
typename fixture_T::source_t));
216template <
class fixture_T>
217static void ransRenormingBenchmarkSIMD(benchmark::State&
st, fixture_T& fixture)
219 std::vector<stream_t> out(fixture.mFrequencies.size() * 4);
221#ifdef ENABLE_VTUNE_PROFILER
225 simd::simdIsse_t frequencies[2];
226 auto outIter = out.data();
227 auto newState = fixture.mState;
228 for (
size_t i = 0;
i < fixture.mFrequencies.size(); ++
i) {
229 frequencies[0] = load(fixture.mFrequencies[
i][0]);
230 frequencies[1] = load(fixture.mFrequencies[
i][1]);
231 outIter = simd::ransRenorm<
decltype(outIter),
235 fixture.mRenormingBits,
239 benchmark::ClobberMemory();
241#ifdef ENABLE_VTUNE_PROFILER
245 st.SetItemsProcessed(int64_t(
st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size());
246 st.SetBytesProcessed(int64_t(
st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size() *
sizeof(
typename fixture_T::source_t));
250BENCHMARK_TEMPLATE_DEFINE_F(
Fixture, renorm_8, uint8_t)
251(benchmark::State&
st)
253 ransRenormingBenchmark(
st, *
this);
256BENCHMARK_TEMPLATE_DEFINE_F(
Fixture, renorm_16, uint16_t)
257(benchmark::State&
st)
259 ransRenormingBenchmark(
st, *
this);
262BENCHMARK_TEMPLATE_DEFINE_F(
Fixture, renorm_32, uint32_t)
263(benchmark::State&
st)
265 ransRenormingBenchmark(
st, *
this);
269BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormSSE_8, uint8_t, simd::SIMDWidth::SSE)
270(benchmark::State&
st)
272 ransRenormingBenchmarkSIMD(
st, *
this);
275BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormSSE_16, uint16_t, simd::SIMDWidth::SSE)
276(benchmark::State&
st)
278 ransRenormingBenchmarkSIMD(
st, *
this);
281BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormSSE_32, uint32_t, simd::SIMDWidth::SSE)
282(benchmark::State&
st)
284 ransRenormingBenchmarkSIMD(
st, *
this);
289BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormAVX_8, uint8_t, simd::SIMDWidth::AVX)
290(benchmark::State&
st)
292 ransRenormingBenchmarkSIMD(
st, *
this);
295BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormAVX_16, uint16_t, simd::SIMDWidth::AVX)
296(benchmark::State&
st)
298 ransRenormingBenchmarkSIMD(
st, *
this);
301BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormAVX_32, uint32_t, simd::SIMDWidth::AVX)
302(benchmark::State&
st)
304 ransRenormingBenchmarkSIMD(
st, *
this);
common helper classes and functions
SourceMessageUniform< uint32_t > sourceMessage
std::tuple< ransState_t, stream_IT > renorm(ransState_t state, stream_IT outputIter, count_t frequency, size_t symbolTablePrecision)
constexpr size_t MessageSize
BENCHMARK_REGISTER_F(Fixture, renorm_8)
constexpr size_t StreamBits
constexpr size_t LowerBound
const RenormingData< uint16_t > Data16(MessageSize)
const RenormingData< uint32_t > Data32(MessageSize)
const RenormingData< uint8_t > Data8(MessageSize)
const auto & getRenormedHistogram() const
RenormingData(size_t messageSize)
ransState_t getState() const
const auto & getSourceMessage() const
preprocessor defines to enable features based on CPU architecture
static factory classes for building histograms, encoders and decoders.
GLsizei GLenum const void GLuint GLsizei GLfloat * metrics
GLboolean GLboolean GLboolean b
GLboolean GLboolean GLboolean GLboolean a
public interface for building and renorming histograms from source data.
uint8_t itsSharedClusterMap uint8_t
wrapper around basic SIMD operations
basic SIMD datatypes and traits
std::vector< count_t > mFrequencies
void SetUp(const ::benchmark::State &state) final
void TearDown(const ::benchmark::State &state) final
static decltype(auto) fromSamples(source_IT begin, source_IT end, typename std::iterator_traits< source_IT >::value_type min, typename std::iterator_traits< source_IT >::value_type max)