Project
Loading...
Searching...
No Matches
Metrics.h
Go to the documentation of this file.
1// Copyright 2019-2023 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
15
16#ifndef RANS_INTERNAL_METRICS_METRICS_H_
17#define RANS_INTERNAL_METRICS_METRICS_H_
18
19#include <cstdint>
20
21#include <fairlogger/Logger.h>
22#include <gsl/span>
23
31
32namespace o2::rans
33{
34
// Metrics: gathers dataset properties and coder properties for a histogram of source symbols.
// NOTE(review): this listing does not show the class head nor the declarations of the
// members used below (mDatasetProperties, mCoderProperties, source_type) — confirm against the full header.
35template <typename source_T>
37{
 // Cutoff precision used by every constructor when the caller does not pass one.
38 inline static constexpr float_t defaultCutoffPrecision = 0.999;
39
40 public:
42
 // Default construction does not run init().
43 Metrics() = default;
 // Construct from a histogram only: init() receives empty optionals for min and max.
44 inline Metrics(const DenseHistogram<source_type>& histogram, float_t cutoffPrecision = defaultCutoffPrecision) { init(histogram, {}, {}, cutoffPrecision); };
45 inline Metrics(const AdaptiveHistogram<source_type>& histogram, float_t cutoffPrecision = defaultCutoffPrecision) { init(histogram, {}, {}, cutoffPrecision); };
46 inline Metrics(const SparseHistogram<source_type>& histogram, float_t cutoffPrecision = defaultCutoffPrecision) { init(histogram, {}, {}, cutoffPrecision); };
47
 // Construct from a histogram plus caller-supplied bounds: min and max are forwarded to init() as engaged optionals.
48 inline Metrics(const DenseHistogram<source_type>& histogram, source_type min, source_type max, float_t cutoffPrecision = defaultCutoffPrecision) { init(histogram, {min}, {max}, cutoffPrecision); };
49 inline Metrics(const AdaptiveHistogram<source_type>& histogram, source_type min, source_type max, float_t cutoffPrecision = defaultCutoffPrecision) { init(histogram, {min}, {max}, cutoffPrecision); };
50 inline Metrics(const SparseHistogram<source_type>& histogram, source_type min, source_type max, float_t cutoffPrecision = defaultCutoffPrecision) { init(histogram, {min}, {max}, cutoffPrecision); };
51
 // Read-only access to the stored property objects.
52 [[nodiscard]] inline const DatasetProperties<source_type>& getDatasetProperties() const noexcept { return mDatasetProperties; };
53 [[nodiscard]] inline const CoderProperties<source_type>& getCoderProperties() const noexcept { return mCoderProperties; };
54
 // Mutable access to the stored property objects; callers can modify them in place.
55 [[nodiscard]] inline DatasetProperties<source_type>& getDatasetProperties() noexcept { return mDatasetProperties; };
56 [[nodiscard]] inline CoderProperties<source_type>& getCoderProperties() noexcept { return mCoderProperties; };
 // Returns a SizeEstimate constructed from this object, by value.
57 [[nodiscard]] inline SizeEstimate getSizeEstimate() const noexcept { return SizeEstimate(*this); };
58
59 protected:
 // Shared entry point of all non-default constructors: runs computeMetrics() and then
 // fills the renorming precision and the two incompressible counts of the coder properties.
60 template <typename histogram_T>
61 void init(const histogram_T& histogram, std::optional<source_type> min, std::optional<source_type> max, float_t cutoffPrecision);
62
 // Fills the dataset properties and the dictionary size estimate from the histogram.
63 template <typename histogram_T>
64 void computeMetrics(const histogram_T& histogram, std::optional<source_type> min, std::optional<source_type> max);
 // Derives the number of renorming bits from the weighted symbol length distribution and the cutoff precision.
65 size_t computeRenormingPrecision(float_t cutoffPrecision) noexcept;
 // Sums the distribution entries from index renormingPrecision onwards; yields 1 for a precision of 0.
66 size_t computeIncompressibleCount(gsl::span<uint32_t> distribution, uint32_t renormingPrecision) noexcept;
67
70};
71
72template <typename source_T>
73template <typename histogram_T>
74inline void Metrics<source_T>::init(const histogram_T& histogram, std::optional<source_type> min, std::optional<source_type> max, float_t cutoffPrecision)
75{
76 computeMetrics(histogram, min, max);
77 mCoderProperties.renormingPrecisionBits = computeRenormingPrecision(cutoffPrecision);
78 mCoderProperties.nIncompressibleSymbols = computeIncompressibleCount(mDatasetProperties.symbolLengthDistribution, *mCoderProperties.renormingPrecisionBits);
79 mCoderProperties.nIncompressibleSamples = computeIncompressibleCount(mDatasetProperties.weightedSymbolLengthDistribution, *mCoderProperties.renormingPrecisionBits);
80}
81
82template <typename source_T>
83template <typename histogram_T>
84void Metrics<source_T>::computeMetrics(const histogram_T& histogram, std::optional<source_type> min, std::optional<source_type> max)
85{
86 using namespace internal;
87 using namespace utils;
88 using source_type = typename histogram_T::source_type;
89 using value_type = typename histogram_T::value_type;
90 static_assert(std::is_same_v<source_type, source_T>);
91
92 mCoderProperties.dictSizeEstimate = DictSizeEstimate{histogram.getNumSamples()};
93 if (histogram.getNumSamples() > 0) {
94 const auto [trimmedBegin, trimmedEnd] = trim(histogram);
95 if (min.has_value()) {
96 mDatasetProperties.min = *min;
97 mDatasetProperties.max = *max;
98 } else {
99 std::tie(mDatasetProperties.min, mDatasetProperties.max) = getMinMax(histogram, trimmedBegin, trimmedEnd);
100 }
101 assert(mDatasetProperties.max >= mDatasetProperties.min);
102 mDatasetProperties.numSamples = histogram.getNumSamples();
103 mDatasetProperties.alphabetRangeBits = getRangeBits(mDatasetProperties.min, mDatasetProperties.max);
104
105 const double_t reciprocalNumSamples = 1.0 / static_cast<double_t>(histogram.getNumSamples());
106
107 source_type lastIndex = mDatasetProperties.min;
108
109 forEachIndexValue(histogram, trimmedBegin, trimmedEnd, [&, this](const source_type& index, const uint32_t& frequency) {
110 if (frequency) {
111 assert(lastIndex <= index);
112 source_type delta = index - lastIndex;
113 mCoderProperties.dictSizeEstimate.updateIndexSize(delta + (delta == 0));
114 lastIndex = index;
115 mCoderProperties.dictSizeEstimate.updateFreqSize(frequency);
116 ++mDatasetProperties.nUsedAlphabetSymbols;
117
118 const double_t probability = static_cast<double_t>(frequency) * reciprocalNumSamples;
119 const float_t fractionalBitLength = -fastlog2(probability);
120 const uint32_t bitLength = std::ceil(fractionalBitLength);
121
122 assert(bitLength > 0);
123 const uint32_t symbolDistributionBucket = bitLength - 1;
124 mDatasetProperties.entropy += probability * fractionalBitLength;
125 ++mDatasetProperties.symbolLengthDistribution[symbolDistributionBucket];
126 mDatasetProperties.weightedSymbolLengthDistribution[symbolDistributionBucket] += frequency;
127 }
128 });
129 }
130};
131
132template <typename source_T>
133inline size_t Metrics<source_T>::computeIncompressibleCount(gsl::span<uint32_t> distribution, uint32_t renormingPrecision) noexcept
134{
135 assert(utils::isValidRenormingPrecision(renormingPrecision));
136 size_t incompressibleCount = 0;
137 if (renormingPrecision > 0) {
138 incompressibleCount = std::accumulate(utils::advanceIter(distribution.data(), renormingPrecision), distribution.data() + distribution.size(), incompressibleCount);
139 } else {
140 // In case of an empty source message we allocate a precision of 0 Bits => 2**0 = 1
141 // This 1 entry is marked as the incompressible symbol, to ensure we somewhat can handle nasty surprises.
142 incompressibleCount = 1;
143 };
144 return incompressibleCount;
145};
146
147template <typename source_T>
148inline size_t Metrics<source_T>::computeRenormingPrecision(float_t cutoffPrecision) noexcept
149{
150
151 const auto& dp = this->mDatasetProperties;
152
153 constexpr size_t SafetyMargin = 1;
154 const size_t cutoffSamples = std::ceil(static_cast<double_t>(cutoffPrecision) *
155 static_cast<double_t>(dp.numSamples));
156 size_t cumulatedSamples = 0;
157
158 size_t renormingBits = std::count_if(dp.weightedSymbolLengthDistribution.begin(),
159 dp.weightedSymbolLengthDistribution.end(),
160 [&cumulatedSamples, cutoffSamples](const uint32_t& frequency) {
161 if (cumulatedSamples < cutoffSamples) {
162 cumulatedSamples += frequency;
163 return true;
164 } else {
165 return false;
166 }
167 });
168
169 if (cumulatedSamples == 0) {
170 // if the message is empty, cumulated precision will be 0. The algorithm will be unable to meet the cutoff precision.
171 // We therefore set renorming Bits to 0, which will result in 2**0 = 1 entry, which will be assigned to the incompressible symbol.
172 renormingBits = 0;
173 } else {
174 // ensure renorming is in interval [MinThreshold, MaxThreshold]
175 renormingBits = utils::sanitizeRenormingBitRange(renormingBits + SafetyMargin);
176 }
177 assert(utils::isValidRenormingPrecision(renormingBits));
178 return renormingBits;
179};
180
181} // namespace o2::rans
182
183#endif /* RANS_INTERNAL_METRICS_METRICS_H_ */
Histogram for source symbols used to estimate symbol probabilities for entropy coding.
Estimate sizes of different rANS Buffers and decide if packing should be preferred over compression.
common helper classes and functions
utility functions related to calculating different dataset metrics
helper functionalities useful for packing operations
void computeMetrics(const histogram_T &histogram, std::optional< source_type > min, std::optional< source_type > max)
Definition Metrics.h:84
CoderProperties< source_type > mCoderProperties
Definition Metrics.h:69
DatasetProperties< source_type > & getDatasetProperties() noexcept
Definition Metrics.h:55
Metrics(const SparseHistogram< source_type > &histogram, float_t cutoffPrecision=defaultCutoffPrecision)
Definition Metrics.h:46
void init(const histogram_T &histogram, std::optional< source_type > min, std::optional< source_type > max, float_t cutoffPrecision)
Definition Metrics.h:74
SizeEstimate getSizeEstimate() const noexcept
Definition Metrics.h:57
Metrics(const AdaptiveHistogram< source_type > &histogram, float_t cutoffPrecision=defaultCutoffPrecision)
Definition Metrics.h:45
Metrics(const AdaptiveHistogram< source_type > &histogram, source_type min, source_type max, float_t cutoffPrecision=defaultCutoffPrecision)
Definition Metrics.h:49
Metrics(const DenseHistogram< source_type > &histogram, source_type min, source_type max, float_t cutoffPrecision=defaultCutoffPrecision)
Definition Metrics.h:48
const DatasetProperties< source_type > & getDatasetProperties() const noexcept
Definition Metrics.h:52
CoderProperties< source_type > & getCoderProperties() noexcept
Definition Metrics.h:56
Metrics(const DenseHistogram< source_type > &histogram, float_t cutoffPrecision=defaultCutoffPrecision)
Definition Metrics.h:44
Metrics(const SparseHistogram< source_type > &histogram, source_type min, source_type max, float_t cutoffPrecision=defaultCutoffPrecision)
Definition Metrics.h:50
DatasetProperties< source_type > mDatasetProperties
Definition Metrics.h:68
const CoderProperties< source_type > & getCoderProperties() const noexcept
Definition Metrics.h:53
size_t computeIncompressibleCount(gsl::span< uint32_t > distribution, uint32_t renormingPrecision) noexcept
Definition Metrics.h:133
size_t computeRenormingPrecision(float_t cutoffPrecision) noexcept
Definition Metrics.h:148
GLuint index
Definition glcorearb.h:781
constexpr bool isValidRenormingPrecision(size_t renormPrecision)
Definition utils.h:236
size_t sanitizeRenormingBitRange(size_t renormPrecision)
Definition utils.h:212
Freq_IT advanceIter(Freq_IT iter, std::ptrdiff_t distance)
Definition utils.h:191
HistogramView< Hist_IT > trim(const HistogramView< Hist_IT > &buffer)
std::pair< source_T, source_T > getMinMax(const AdaptiveSymbolTable< source_T, symbol_T > &symbolTable)
Common utility functions.
Essential properties of the dataset used for building dictionaries and coders.
constexpr size_t min
constexpr size_t max