// Fragment (non-contiguous lines of one test body): dictionary size estimate
// driven by a histogram that is built from an empty frequency table.
51 using namespace internal;
52 using namespace utils;
// Empty input: begin() == end(), so the histogram is constructed from an empty range.
// NOTE(review): the trailing 0 is presumably the index offset of the first entry - confirm.
54 std::vector<uint32_t> frequencies{};
55 histogram_T histogram{frequencies.begin(), frequencies.end(), 0};
// The estimate is constructed from the histogram's sample count.
59 DictSizeEstimate estimate{histogram.getNumSamples()};
// The lambda is invoked with an index and a frequency; trimmedBegin / trimmedEnd
// are defined on lines that are not part of this excerpt.
62 forEachIndexValue(histogram, trimmedBegin, trimmedEnd, [&](
const source_type&
index,
const uint32_t& frequency) {
// `delta + (delta == 0)` yields 1 when delta is 0 and delta otherwise, so the
// value passed on is never 0. delta itself is computed on a line not shown here.
66 estimate.updateIndexSize(delta + (delta == 0));
68 estimate.updateFreqSize(frequency);
// Fragment (non-contiguous lines of one test body): dictionary size estimate
// driven by a histogram with 17 entries, of which every second one is zero.
81 using namespace internal;
82 using namespace utils;
// Non-zero frequencies 9..1 sit on the even positions; the odd positions are 0.
// NOTE(review): the trailing 0 is presumably the index offset of the first entry - confirm.
84 std::vector<uint32_t> frequencies{9, 0, 8, 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1};
85 histogram_T histogram{frequencies.begin(), frequencies.end(), 0};
// The estimate is constructed from the histogram's sample count.
90 DictSizeEstimate estimate{histogram.getNumSamples()};
// The lambda is invoked with an index and a frequency; trimmedBegin / trimmedEnd
// are defined on lines that are not part of this excerpt.
93 forEachIndexValue(histogram, trimmedBegin, trimmedEnd, [&](
const source_type&
index,
const uint32_t& frequency) {
// `delta + (delta == 0)` yields 1 when delta is 0 and delta otherwise, so the
// value passed on is never 0. delta itself is computed on a line not shown here.
97 estimate.updateIndexSize(delta + (delta == 0));
99 estimate.updateFreqSize(frequency);
// Fragment (non-contiguous lines of one test body): incompressible-symbol count
// for a dataset with zero samples and a renorming precision of 0.
//
// Both distributions are value-initialized to all zeros. They are copied into the
// dataset properties and handed to the function under test below, so leaving them
// default-initialized would mean reading indeterminate values.
125 std::array<uint32_t, 32> symbolLengthDistribution{{}};
126 std::array<uint32_t, 32> weightedSymbolLengthDistribution{{}};
127 const size_t nSamples = 0;
128 const uint32_t renormingPrecision = 0;
// `tester` is declared on a line that is not part of this excerpt.
132 tester.getDatasetProperties().weightedSymbolLengthDistribution = weightedSymbolLengthDistribution;
133 tester.getDatasetProperties().numSamples = nSamples;
// With precision 0 the test expects a count of exactly 1 for both distributions.
136 BOOST_CHECK_EQUAL(tester.testComputeIncompressibleCount(symbolLengthDistribution, renormingPrecision), 1);
137 BOOST_CHECK_EQUAL(tester.testComputeIncompressibleCount(weightedSymbolLengthDistribution, renormingPrecision), 1);
// Fragment (non-contiguous lines of one test body): incompressible-symbol count
// when only the last bin (index 31) of each distribution is populated.
142 std::array<uint32_t, 32> symbolLengthDistribution{{}};
143 std::array<uint32_t, 32> weightedSymbolLengthDistribution{{}};
// Bin 31 holds 42 in the unweighted and 44 in the weighted distribution;
// the weighted value equals nSamples below.
144 weightedSymbolLengthDistribution[31] = 44;
145 symbolLengthDistribution[31] = 42;
146 const size_t nSamples = 44;
// `tester` and `renormingPrecision` are declared on lines that are not part of this excerpt.
151 tester.getDatasetProperties().weightedSymbolLengthDistribution = weightedSymbolLengthDistribution;
152 tester.getDatasetProperties().numSamples = nSamples;
// Expected counts are the full content of bin 31: 42 unweighted, nSamples (44) weighted.
155 BOOST_CHECK_EQUAL(tester.testComputeIncompressibleCount(symbolLengthDistribution, renormingPrecision), 42);
156 BOOST_CHECK_EQUAL(tester.testComputeIncompressibleCount(weightedSymbolLengthDistribution, renormingPrecision), nSamples);
// Fragment (non-contiguous lines of one test body): incompressible-symbol count
// with a renorming precision of 17 and populated bins 1, 5, 9, 12 and 15.
161 std::array<uint32_t, 32> symbolLengthDistribution{{}};
162 std::array<uint32_t, 32> weightedSymbolLengthDistribution{{}};
// Weighted entries sum to 20 + 20 + 40 + 10 + 10 = 100, which equals nSamples below.
163 weightedSymbolLengthDistribution[1] = 20;
164 weightedSymbolLengthDistribution[5] = 20;
165 weightedSymbolLengthDistribution[9] = 40;
166 weightedSymbolLengthDistribution[12] = 10;
167 weightedSymbolLengthDistribution[15] = 10;
// Unweighted entries use the same bins; each weighted entry is 10x its unweighted one.
169 symbolLengthDistribution[1] = 2;
170 symbolLengthDistribution[5] = 2;
171 symbolLengthDistribution[9] = 4;
172 symbolLengthDistribution[12] = 1;
173 symbolLengthDistribution[15] = 1;
175 const size_t nSamples = 100;
// The precision (17) is larger than the highest populated bin index (15).
176 const uint32_t renormingPrecision = 17;
// `tester` is declared on a line that is not part of this excerpt.
180 tester.getDatasetProperties().weightedSymbolLengthDistribution = weightedSymbolLengthDistribution;
181 tester.getDatasetProperties().numSamples = nSamples;
// Expected count is 0 for both distributions.
// NOTE(review): presumably because no populated bin reaches the precision - confirm
// against the implementation of the function under test.
184 BOOST_CHECK_EQUAL(tester.testComputeIncompressibleCount(symbolLengthDistribution, renormingPrecision), 0);
185 BOOST_CHECK_EQUAL(tester.testComputeIncompressibleCount(weightedSymbolLengthDistribution, renormingPrecision), 0);
// Fragment (non-contiguous lines of one test body): metrics computed for a
// histogram that is built from an empty frequency table.
192 std::vector<uint32_t> frequencies{};
193 histogram_T histogram{frequencies.begin(), frequencies.end(), 0};
// Absolute bound used by the BOOST_CHECK_SMALL entropy check below.
194 const float eps = 1e-2;
195 const size_t nUsedAlphabetSymbols = 0;
// `metrics` and `dataProperies` are declared on lines that are not part of this excerpt.
200 const auto& coderProperties =
metrics.getCoderProperties();
// Entropy must be within eps of zero.
207 BOOST_CHECK_SMALL(dataProperies.entropy, eps);
// Expected distributions: all 32 bins are zero.
209 std::array<uint32_t, 32> symbolLengthDistribution{{}};
210 std::array<uint32_t, 32> weightedSymbolLengthDistribution{{}};
// Running totals of the measured distributions, accumulated in the loop below.
212 uint32_t sumUnweighted = 0;
213 uint32_t sumWeighted = 0;
// Walk all 32 bins: compare expected vs. measured weighted entries and accumulate
// both measured distributions. The loop's closing brace is outside this excerpt.
214 for (
size_t i = 0;
i < 32; ++
i) {
217 BOOST_CHECK_EQUAL(weightedSymbolLengthDistribution[
i], dataProperies.weightedSymbolLengthDistribution[
i]);
219 sumUnweighted += dataProperies.symbolLengthDistribution[
i];
220 sumWeighted += dataProperies.weightedSymbolLengthDistribution[
i];
// Alias for the dictionary size estimate stored in the coder properties.
228 const auto& estimate = coderProperties.dictSizeEstimate;
// Fragment (non-contiguous lines of one test body): metrics computed for a
// histogram that holds a single entry with frequency 5.
// NOTE(review): the trailing 2 is presumably the index offset of that entry - confirm.
238 std::vector<uint32_t> frequencies{5};
239 histogram_T histogram{frequencies.begin(), frequencies.end(), 2};
// `metrics` and `dataProperies` are declared on lines that are not part of this excerpt.
245 const auto& coderProperties =
metrics.getCoderProperties();
// Entropy must be within 1e-5 of zero.
252 BOOST_CHECK_SMALL(dataProperies.entropy, 1e-5f);
254 std::array<uint32_t, 32> symbolLengthDistribution{{}};
255 std::array<uint32_t, 32> weightedSymbolLengthDistribution{{}};
// Expected distributions: everything lands in bin 0 - one entry unweighted,
// 5 (the single frequency) weighted.
257 symbolLengthDistribution[0] = 1;
258 weightedSymbolLengthDistribution[0] = 5;
// Running totals of the measured distributions, accumulated in the loop below.
260 uint32_t sumUnweighted = 0;
261 uint32_t sumWeighted = 0;
// Walk all 32 bins: compare expected vs. measured weighted entries and accumulate
// both measured distributions. The loop's closing brace is outside this excerpt.
262 for (
size_t i = 0;
i < 32; ++
i) {
265 BOOST_CHECK_EQUAL(weightedSymbolLengthDistribution[
i], dataProperies.weightedSymbolLengthDistribution[
i]);
267 sumUnweighted += dataProperies.symbolLengthDistribution[
i];
268 sumWeighted += dataProperies.weightedSymbolLengthDistribution[
i];
// Fragment (non-contiguous lines of one test body): metrics computed for a
// histogram with 17 entries, of which every second one is zero.
// The nine non-zero frequencies 9..1 sum to 45.
279 std::vector<uint32_t> frequencies{9, 0, 8, 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1};
280 histogram_T histogram{frequencies.begin(), frequencies.end(), 0};
// Tolerance handed to BOOST_CHECK_CLOSE below, which interprets it as a percentage.
282 const float eps = 1e-2;
// `metrics` and `dataProperies` are declared on lines that are not part of this excerpt.
287 const auto& coderProperties =
metrics.getCoderProperties();
// Reference entropy value for the frequency table above.
294 BOOST_CHECK_CLOSE(dataProperies.entropy, 2.957295041922758, eps);
296 std::array<uint32_t, 32> symbolLengthDistribution{{}};
297 std::array<uint32_t, 32> weightedSymbolLengthDistribution{{}};
// Expected weighted distribution: 30 + 12 + 2 + 1 = 45, the sum of all frequencies.
299 weightedSymbolLengthDistribution[2] = 30;
300 weightedSymbolLengthDistribution[3] = 12;
301 weightedSymbolLengthDistribution[4] = 2;
302 weightedSymbolLengthDistribution[5] = 1;
// Expected unweighted distribution: 4 + 3 + 1 + 1 = 9, the number of non-zero frequencies.
304 symbolLengthDistribution[2] = 4;
305 symbolLengthDistribution[3] = 3;
306 symbolLengthDistribution[4] = 1;
307 symbolLengthDistribution[5] = 1;
// Running totals of the measured distributions, accumulated in the loop below.
309 uint32_t sumUnweighted = 0;
310 uint32_t sumWeighted = 0;
// Walk all 32 bins: log the bin under test, compare expected vs. measured weighted
// entries and accumulate both measured distributions. The loop's closing brace is
// outside this excerpt.
311 for (
size_t i = 0;
i < 32; ++
i) {
312 BOOST_TEST_MESSAGE(fmt::format(
"checking length: {}",
i));
314 BOOST_CHECK_EQUAL(weightedSymbolLengthDistribution[
i], dataProperies.weightedSymbolLengthDistribution[
i]);
316 sumUnweighted += dataProperies.symbolLengthDistribution[
i];
317 sumWeighted += dataProperies.weightedSymbolLengthDistribution[
i];
// Alias for the dictionary size estimate stored in the coder properties.
325 const auto& estimate = coderProperties.dictSizeEstimate;
// Expected size for this table is 21. nUsedAlphabetSymbols is declared on a line
// that is not part of this excerpt; renormingPrecisionBits is dereferenced, so it
// must hold a value at this point.
330 BOOST_CHECK_EQUAL(estimate.getSizeB(nUsedAlphabetSymbols, *coderProperties.renormingPrecisionBits), 21);
// Fragment (single line of a test body whose remainder is not part of this
// excerpt): a histogram created with empty braces, i.e. without any frequency data.
337 histogram_T histogram{};
// Fragment (non-contiguous lines of one test body): compressed dataset size
// estimate for the 17-entry frequency table.
// Reference value that is fed into addEncoderOverheadEstimateB<>() in the check below.
350 constexpr size_t entropySizeB = 17;
352 std::vector<uint32_t> frequencies{9, 0, 8, 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1};
353 histogram_T histogram{frequencies.begin(), frequencies.end(), 0};
// `estimate` is declared on a line that is not part of this excerpt. The compressed
// dataset size requested with argument 1.0 must equal the reference entropy size
// plus the encoder overhead estimate.
357 BOOST_CHECK_EQUAL(estimate.getCompressedDatasetSize<>(1.0), addEncoderOverheadEstimateB<>(entropySizeB));
// Signature only; the template header and the body are not part of this excerpt.
// Takes a container, a [begin, end) pair of its const iterators and an optional
// zeroElem (value-initialized by default), and returns a pair of the container's
// source_type.
// NOTE(review): judging by the name, the pair is presumably (min, max) of the indices
// whose element differs from zeroElem - confirm against the definition.
auto getMinMax(const container_T &container, typename container_T::const_iterator begin, typename container_T::const_iterator end, typename container_T::const_reference zeroElem={}) -> std::pair< typename container_T::source_type, typename container_T::source_type >