Project
Loading...
Searching...
No Matches
bench_ransStreaming.cxx
Go to the documentation of this file.
1// Copyright 2019-2023 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
15
17
18#include <vector>
19#include <cstring>
20#include <random>
21#include <algorithm>
22#ifdef RANS_PARALLEL_STL
23#include <execution>
24#endif
25#include <iterator>
26
27#include <benchmark/benchmark.h>
28
29#include "rANS/factory.h"
30#include "rANS/histogram.h"
31
36
37#ifdef ENABLE_VTUNE_PROFILER
38#include <ittnotify.h>
39#endif
40
41using count_t = uint32_t;
42using ransState_t = uint64_t;
43using stream_t = uint32_t;
44
45using namespace o2::rans;
46using namespace o2::rans::internal;
47using namespace o2::rans::utils;
48
49inline constexpr size_t MessageSize = 1ull << 22;
50inline constexpr size_t LowerBound = 1ul << 20;
51inline constexpr size_t StreamBits = toBits<stream_t>();
52
53template <typename source_T>
55{
56 public:
57 explicit RenormingData(size_t messageSize)
58 {
59 std::mt19937 mt(0); // same seed we want always the same distrubution of random numbers;
60 const size_t draws = std::min(1ul << 20, static_cast<size_t>(std::numeric_limits<source_T>::max()));
61 const double probability = 0.5;
62 std::binomial_distribution<source_T> dist(draws, probability);
63 const size_t sourceSize = messageSize / sizeof(source_T);
64 mSourceMessage.resize(sourceSize);
65#ifdef RANS_PARALLEL_STL
66 std::generate(std::execution::par_unseq, mSourceMessage.begin(), mSourceMessage.end(), [&dist, &mt]() { return dist(mt); });
67#else
68 std::generate(mSourceMessage.begin(), mSourceMessage.end(), [&dist, &mt]() { return dist(mt); });
69#endif // RANS_PARALLEL_STL
70
71 const auto histogram = makeDenseHistogram::fromSamples(gsl::span<const source_T>(mSourceMessage));
72 Metrics<source_T> metrics{histogram};
73 mRenormedHistogram = renorm(histogram, metrics);
74
75 double_t expectationValue = std::accumulate(mRenormedHistogram.begin(), mRenormedHistogram.end(), 0.0, [this](const double_t& a, const count_t& b) {
76 double_t prb = static_cast<double_t>(b) / static_cast<double_t>(mRenormedHistogram.getNumSamples());
77 return a + b * prb;
78 });
79
80 mState = ((LowerBound >> mRenormedHistogram.getRenormingBits()) << StreamBits) * expectationValue;
81 };
82
83 const auto& getSourceMessage() const { return mSourceMessage; };
84 const auto& getRenormedHistogram() const { return mRenormedHistogram; };
85
86 ransState_t getState() const { return mState; };
87
88 private:
89 std::vector<source_T> mSourceMessage{};
90 RenormedDenseHistogram<source_T> mRenormedHistogram{};
91 ransState_t mState{};
92};
93
97
98template <typename T>
99const auto& getData()
100{
101 if constexpr (std::is_same_v<uint8_t, T>) {
102 return Data8;
103 } else if constexpr (std::is_same_v<uint16_t, T>) {
104 return Data16;
105 } else {
106 return Data32;
107 }
108};
109
110template <typename source_T>
111struct Fixture : public benchmark::Fixture {
113
114 void SetUp(const ::benchmark::State& state) final
115 {
116 const auto& sourceMessage = getData<source_T>().getSourceMessage();
117 const auto& renormedHistogram = getData<source_T>().getRenormedHistogram();
118
119 for (auto& symbol : sourceMessage) {
120 mFrequencies.push_back(renormedHistogram[symbol]);
121 }
122 }
123
124 void TearDown(const ::benchmark::State& state) final
125 {
126 mFrequencies.clear();
127 }
128
129 std::vector<count_t> mFrequencies{};
130 ransState_t mState = getData<source_T>().getState();
131 size_t mRenormingBits = getData<source_T>().getRenormedHistogram().getRenormingBits();
132};
133
134#ifdef RANS_SIMD
135template <typename source_T, simd::SIMDWidth width_V>
136struct SIMDFixture : public benchmark::Fixture {
137
138 using source_t = source_T;
139
140 void
141 SetUp(const ::benchmark::State& state) final
142 {
143 mState[0] = simd::setAll<width_V>(getData<source_T>().getState());
144 mState[1] = simd::setAll<width_V>(getData<source_T>().getState());
145
146 const auto& sourceMessage = getData<source_T>().getSourceMessage();
147 const auto& renormedHistogram = getData<source_T>().getRenormedHistogram();
148
149 for (size_t i = 0; i < sourceMessage.size(); i += 2 * nElems) {
150 if constexpr (width_V == simd::SIMDWidth::SSE) {
151 mFrequencies.push_back({{simd::epi32_t<simd::SIMDWidth::SSE>{renormedHistogram[sourceMessage[i + 0]],
152 renormedHistogram[sourceMessage[i + 1]],
153 0x0u,
154 0x0u},
156 renormedHistogram[sourceMessage[i + 3]],
157 0x0u,
158 0x0u}}});
159 }
160 if constexpr (width_V == simd::SIMDWidth::AVX) {
161 mFrequencies.push_back({{simd::epi32_t<simd::SIMDWidth::SSE>{renormedHistogram[sourceMessage[i + 0]],
162 renormedHistogram[sourceMessage[i + 1]],
163 renormedHistogram[sourceMessage[i + 2]],
164 renormedHistogram[sourceMessage[i + 3]]},
166 renormedHistogram[sourceMessage[i + 5]],
167 renormedHistogram[sourceMessage[i + 6]],
168 renormedHistogram[sourceMessage[i + 7]]}}});
169 }
170 }
171 }
172
173 void TearDown(const ::benchmark::State& state) final
174 {
175 mFrequencies.clear();
176 }
177
178 static constexpr size_t nElems = simd::getElementCount<ransState_t>(width_V);
179 std::vector<std::array<simd::epi32_t<simd::SIMDWidth::SSE>, 2>> mFrequencies{};
180 simd::simdI_t<width_V> mState[2];
181 uint8_t mRenormingBits = getData<source_T>().getRenormedHistogram().getRenormingBits();
182};
183#endif /* RANS_SIMD */
184
185template <typename stream_IT>
186inline std::tuple<ransState_t, stream_IT> renorm(ransState_t state, stream_IT outputIter, count_t frequency, size_t symbolTablePrecision)
187{
188 ransState_t maxState = ((LowerBound >> symbolTablePrecision) << StreamBits) * frequency; // this turns into a shift.
189 if (state >= maxState) {
190 *(++outputIter) = static_cast<stream_t>(state);
191 state >>= StreamBits;
192 }
193
194 return std::make_tuple(state, outputIter);
195};
196
197template <class fixture_T>
198static void ransRenormingBenchmark(benchmark::State& st, fixture_T& fixture)
199{
200 std::vector<stream_t> out(fixture.mFrequencies.size() * 4);
201
202 for (auto _ : st) {
203 auto outIter = out.data();
204 ransState_t newState = fixture.mState;
205 for (size_t i = 0; i < fixture.mFrequencies.size(); ++i) {
206 std::tie(newState, outIter) = renorm(fixture.mState, outIter, fixture.mFrequencies[i], fixture.mRenormingBits);
207 }
208 benchmark::ClobberMemory();
209 };
210
211 st.SetItemsProcessed(int64_t(st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size());
212 st.SetBytesProcessed(int64_t(st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size() * sizeof(typename fixture_T::source_t));
213};
214
215#ifdef RANS_SIMD
216template <class fixture_T>
217static void ransRenormingBenchmarkSIMD(benchmark::State& st, fixture_T& fixture)
218{
219 std::vector<stream_t> out(fixture.mFrequencies.size() * 4);
220
221#ifdef ENABLE_VTUNE_PROFILER
222 __itt_resume();
223#endif
224 for (auto _ : st) {
225 simd::simdIsse_t frequencies[2];
226 auto outIter = out.data();
227 auto newState = fixture.mState;
228 for (size_t i = 0; i < fixture.mFrequencies.size(); ++i) {
229 frequencies[0] = load(fixture.mFrequencies[i][0]);
230 frequencies[1] = load(fixture.mFrequencies[i][1]);
231 outIter = simd::ransRenorm<decltype(outIter),
233 StreamBits>(fixture.mState,
234 frequencies,
235 fixture.mRenormingBits,
236 outIter,
237 newState);
238 }
239 benchmark::ClobberMemory();
240 };
241#ifdef ENABLE_VTUNE_PROFILER
242 __itt_pause();
243#endif
244
245 st.SetItemsProcessed(int64_t(st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size());
246 st.SetBytesProcessed(int64_t(st.iterations()) * getData<typename fixture_T::source_t>().getSourceMessage().size() * sizeof(typename fixture_T::source_t));
247};
248#endif /* RANS_SIMD */
249
250BENCHMARK_TEMPLATE_DEFINE_F(Fixture, renorm_8, uint8_t)
251(benchmark::State& st)
252{
253 ransRenormingBenchmark(st, *this);
254};
255
256BENCHMARK_TEMPLATE_DEFINE_F(Fixture, renorm_16, uint16_t)
257(benchmark::State& st)
258{
259 ransRenormingBenchmark(st, *this);
260};
261
262BENCHMARK_TEMPLATE_DEFINE_F(Fixture, renorm_32, uint32_t)
263(benchmark::State& st)
264{
265 ransRenormingBenchmark(st, *this);
266};
267
#ifdef RANS_SSE
// SSE renorming benchmark with uint8_t source symbols.
BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormSSE_8, uint8_t, simd::SIMDWidth::SSE)
(benchmark::State& st)
{
  ransRenormingBenchmarkSIMD(st, *this);
}

// SSE renorming benchmark with uint16_t source symbols.
BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormSSE_16, uint16_t, simd::SIMDWidth::SSE)
(benchmark::State& st)
{
  ransRenormingBenchmarkSIMD(st, *this);
}

// SSE renorming benchmark with uint32_t source symbols.
BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormSSE_32, uint32_t, simd::SIMDWidth::SSE)
(benchmark::State& st)
{
  ransRenormingBenchmarkSIMD(st, *this);
}
#endif /* RANS_SSE */
287
#ifdef RANS_AVX2
// AVX renorming benchmark with uint8_t source symbols.
BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormAVX_8, uint8_t, simd::SIMDWidth::AVX)
(benchmark::State& st)
{
  ransRenormingBenchmarkSIMD(st, *this);
}

// AVX renorming benchmark with uint16_t source symbols.
BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormAVX_16, uint16_t, simd::SIMDWidth::AVX)
(benchmark::State& st)
{
  ransRenormingBenchmarkSIMD(st, *this);
}

// AVX renorming benchmark with uint32_t source symbols.
BENCHMARK_TEMPLATE_DEFINE_F(SIMDFixture, renormAVX_32, uint32_t, simd::SIMDWidth::AVX)
(benchmark::State& st)
{
  ransRenormingBenchmarkSIMD(st, *this);
}
#endif /* RANS_AVX2 */
307
311
// Register the SSE benchmarks defined above; only built when RANS_SSE is defined.
#ifdef RANS_SSE
BENCHMARK_REGISTER_F(SIMDFixture, renormSSE_8);
BENCHMARK_REGISTER_F(SIMDFixture, renormSSE_16);
BENCHMARK_REGISTER_F(SIMDFixture, renormSSE_32);
#endif /* RANS_SSE */

// Register the AVX benchmarks defined above; only built when RANS_AVX2 is defined.
#ifdef RANS_AVX2
BENCHMARK_REGISTER_F(SIMDFixture, renormAVX_8);
BENCHMARK_REGISTER_F(SIMDFixture, renormAVX_16);
BENCHMARK_REGISTER_F(SIMDFixture, renormAVX_32);
#endif /* RANS_AVX2 */
323
benchmark::State & state
int32_t i
common helper classes and functions
SourceMessageUniform< uint32_t > sourceMessage
uint32_t stream_t
uint32_t count_t
uint64_t ransState_t
std::tuple< ransState_t, stream_IT > renorm(ransState_t state, stream_IT outputIter, count_t frequency, size_t symbolTablePrecision)
constexpr size_t MessageSize
BENCHMARK_REGISTER_F(Fixture, renorm_8)
BENCHMARK_MAIN()
constexpr size_t StreamBits
constexpr size_t LowerBound
const RenormingData< uint16_t > Data16(MessageSize)
const RenormingData< uint32_t > Data32(MessageSize)
const auto & getData()
benchmark::State & st
uint32_t count_t
const RenormingData< uint8_t > Data8(MessageSize)
uint64_t ransState_t
const auto & getRenormedHistogram() const
RenormingData(size_t messageSize)
ransState_t getState() const
const auto & getSourceMessage() const
preprocessor defines to enable features based on CPU architecture
static factory classes for building histograms, encoders and decoders.
GLsizei GLenum const void GLuint GLsizei GLfloat * metrics
Definition glcorearb.h:5500
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
public interface for building and renorming histograms from source data.
uint8_t itsSharedClusterMap uint8_t
wrapper around basic SIMD operations
basic SIMD datatypes and traits
std::vector< count_t > mFrequencies
ransState_t mState
void SetUp(const ::benchmark::State &state) final
void TearDown(const ::benchmark::State &state) final
static decltype(auto) fromSamples(source_IT begin, source_IT end, typename std::iterator_traits< source_IT >::value_type min, typename std::iterator_traits< source_IT >::value_type max)
Definition factory.h:144