Project
Loading...
Searching...
No Matches
InplaceEntropyCoder.h
Go to the documentation of this file.
1// Copyright 2019-2023 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
15
16#ifndef ALICEO2_INPLACEENTROPYCODER_H_
17#define ALICEO2_INPLACEENTROPYCODER_H_
18
19#include <optional>
20#include <variant>
21#include <type_traits>
22
24
25#include "rANS/encode.h"
26#include "rANS/factory.h"
27#include "rANS/histogram.h"
28#include "rANS/metrics.h"
29#include "rANS/serialize.h"
30
31namespace o2::ctf::internal
32{
33
// Entropy coder that keeps all state needed to compress one sequence of source symbols:
// the symbol histogram, the metrics derived from it, the encoder built from the renormed
// histogram, an optional pre-serialized dictionary and the buffer of incompressible symbols.
// NOTE(review): the class-head and the aliases source_type, metrics_type, packer_type and
// dense/adaptive/sparse_histogram_type are not visible in this listing - confirm against the header.
34template <typename source_T>
36{
40
 // encoder flavours for the source symbol type, one per symbol table layout
41 using dense_encoder_type = rans::denseEncoder_type<source_T>;
42 using adaptive_encoder_type = rans::adaptiveEncoder_type<source_T>;
43 using sparse_encoder_type = rans::sparseEncoder_type<source_T>;
44
 // byte buffer holding a dictionary that was serialized ahead of writeDictionary()
45 using dict_buffer_type = std::vector<uint8_t>;
46
47 public:
 // exactly one histogram / encoder alternative is active at a time
51 using histogram_type = std::variant<dense_histogram_type, adaptive_histogram_type, sparse_histogram_type>;
52 using encoder_type = std::variant<dense_encoder_type, adaptive_encoder_type, sparse_encoder_type>;
 // storage for symbols that the encoder emits as literals instead of encoding them
53 using incompressible_buffer_type = std::vector<source_type>;
54
56
 // Builds histogram and metrics from the samples in [srcBegin, srcEnd).
 // The value_type of the iterator has to be source_T (enforced by a static_assert in the definition).
57 template <typename source_IT>
58 InplaceEntropyCoder(source_IT srcBegin, source_IT srcEnd);
59
 // Same as above, but the caller supplies the value range [min, max] of the samples.
60 template <typename source_IT>
61 InplaceEntropyCoder(source_IT srcBegin, source_IT srcEnd, source_type min, source_type max);
62
 // Renorms the stored histogram and creates the encoder from it; fills mEncoder.
63 void makeEncoder();
64
65 // getters
66
 // Metrics computed from the histogram during construction.
67 [[nodiscard]] inline const metrics_type& getMetrics() const noexcept { return mMetrics; };
68
 // Number of symbols currently stored in the incompressible buffer.
69 [[nodiscard]] inline size_t getNIncompressibleSamples() const noexcept { return mIncompressibleBuffer.size(); };
70
 // Both getters below dereference mEncoder, which is only assigned by makeEncoder().
71 [[nodiscard]] size_t getNStreams() const;
72
73 [[nodiscard]] size_t getSymbolTablePrecision() const;
74
 // Buffer size, as reported by the packer for elements of type dst_T, for the current number of incompressible symbols.
75 template <typename dst_T = uint8_t>
76 [[nodiscard]] size_t getPackedIncompressibleSize() const noexcept;
77
78 // operations
 // Encodes [srcBegin, srcEnd) into the destination starting at dstBegin and returns the end of the written message.
79 template <typename src_IT, typename dst_IT>
80 [[nodiscard]] dst_IT encode(src_IT srcBegin, src_IT srcEnd, dst_IT dstBegin, dst_IT dstEnd);
81
 // Writes the dictionary to the destination and returns the position behind it; dst_IT has to be a pointer type.
82 template <typename dst_IT>
83 [[nodiscard]] dst_IT writeDictionary(dst_IT dstBegin, dst_IT dstEnd);
84
 // Packs the content of the incompressible buffer into [dstBegin, dstEnd) using the packer.
85 template <typename dst_T>
86 [[nodiscard]] dst_T* writeIncompressible(dst_T* dstBegin, dst_T* dstEnd);
87
88 private:
 // init() overloads are selected via SFINAE on the size of the iterator's value_type:
 // one flavour for symbols narrower than 4 bytes, one for symbols of exactly 4 bytes,
 // each with and without a caller-provided [min, max] range.
89 template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) < 4), bool> = true>
90 void init(source_IT srcBegin, source_IT srcEnd, source_type min, source_type max);
91
92 template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) == 4), bool> = true>
93 void init(source_IT srcBegin, source_IT srcEnd, source_type min, source_type max);
94
95 template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) < 4), bool> = true>
96 void init(source_IT srcBegin, source_IT srcEnd);
97
98 template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) == 4), bool> = true>
99 void init(source_IT srcBegin, source_IT srcEnd);
100
 // Serializes a renormed histogram into mDictBuffer.
101 template <typename container_T>
102 void serializeDictionary(const container_T&);
103
 // histogram of the source symbols; set by init()
104 std::optional<histogram_type> mHistogram{};
 // metrics derived from the histogram; set by init()
105 metrics_type mMetrics{};
 // encoder; empty until makeEncoder() has been called
106 std::optional<encoder_type> mEncoder{};
 // symbols collected by encode() through a back-inserter
107 incompressible_buffer_type mIncompressibleBuffer{};
 // serialized dictionary; stays empty unless serializeDictionary() was called
108 dict_buffer_type mDictBuffer{};
 // packer for the incompressible symbols; constructed from mMetrics in the constructors
109 packer_type mIncompressiblePacker{};
110};
111
// Constructor taking only the sample range: builds histogram and metrics via init()
// and then sets up the packer for incompressible symbols from the metrics.
// NOTE(review): the line carrying the constructor signature is not visible in this listing;
// the body uses srcBegin / srcEnd, matching the two-argument constructor declared in the class.
112template <typename source_T>
113template <typename src_IT>
115{
 // the iterator must yield exactly the symbol type this coder was instantiated for
116 static_assert(std::is_same_v<source_T, typename std::iterator_traits<src_IT>::value_type>);
117
118 const size_t nSamples = std::distance(srcBegin, srcEnd);
 // only raw pointers can be wrapped into a gsl::span for the min/max scan
119 if constexpr (std::is_pointer_v<src_IT>) {
 // symbols wider than 2 bytes: determine the value range first and use the range-aware init();
 // an empty input skips the scan and falls back to the range-less init()
120 if (sizeof(source_type) > 2 && nSamples > 0) {
121 const auto [min, max] = rans::internal::minmax(gsl::span<const source_type>(srcBegin, srcEnd));
122 init(srcBegin, srcEnd, min, max);
123 } else {
124 init(srcBegin, srcEnd);
125 }
126 } else {
127 init(srcBegin, srcEnd);
128 }
129
 // the packer is configured from the metrics, so it has to be created after init()
130 mIncompressiblePacker = Packer(mMetrics);
131};
132
// Constructor for callers that already know the value range [min, max] of the samples:
// forwards everything to the range-aware init() and then sets up the packer from the metrics.
// NOTE(review): the line carrying the constructor signature is not visible in this listing;
// the body uses srcBegin / srcEnd / min / max, matching the four-argument constructor declared in the class.
133template <typename source_T>
134template <typename source_IT>
136{
 // the iterator must yield exactly the symbol type this coder was instantiated for
137 static_assert(std::is_same_v<source_T, typename std::iterator_traits<source_IT>::value_type>);
138 init(srcBegin, srcEnd, min, max);
 // the packer is configured from the metrics, so it has to be created after init()
139 mIncompressiblePacker = Packer(mMetrics);
140};
141
142template <typename source_T>
143[[nodiscard]] inline size_t InplaceEntropyCoder<source_T>::getNStreams() const
144{
145 size_t nStreams{};
146 std::visit([&, this](auto&& encoder) { nStreams = encoder.getNStreams(); }, *mEncoder);
147 return nStreams;
148}
149
// Returns the precision reported by the symbol table of the currently active encoder.
// mEncoder is dereferenced without a check; in the visible code it is only assigned by makeEncoder().
// NOTE(review): the line carrying the function signature is not visible in this listing.
150template <typename source_T>
152{
153 size_t precision{};
 // dispatch on whichever encoder alternative is stored and read the precision of its symbol table
154 std::visit([&, this](auto&& encoder) { precision = encoder.getSymbolTable().getPrecision(); }, *mEncoder);
155 return precision;
156}
157
// Renorms the stored histogram according to the metrics and builds the encoder from it.
// The encoder flavour (dense / sparse / adaptive) is picked from the metrics, independently
// of which histogram flavour was used to collect the samples.
// mHistogram is dereferenced without a check, so init() must have filled it beforehand.
// NOTE(review): the line carrying the function signature is not visible in this listing.
158template <typename source_T>
160{
161 std::visit([this](auto&& histogram) {
 // the histogram is handed over via std::move; treat *mHistogram as consumed afterwards
162 auto renormed = rans::renorm(std::move(histogram), mMetrics);
163
 // only for sparse histograms the dictionary is serialized right away into mDictBuffer;
 // writeDictionary() later copies that buffer instead of serializing from the encoder
164 if (std::holds_alternative<sparse_histogram_type>(*mHistogram)) {
165 serializeDictionary(renormed);
166 }
167
 // min / max are dereferenced without a check - presumably always set once metrics were computed; TODO confirm
168 const size_t rangeBits = rans::utils::getRangeBits(*mMetrics.getCoderProperties().min, *mMetrics.getCoderProperties().max);
169 const size_t nUsedAlphabetSymbols = mMetrics.getDatasetProperties().nUsedAlphabetSymbols;
170
171 if (rangeBits <= 18) {
172 // dense symbol table when the value range needs at most 18 bits (intended to still fit into cache)
173 mEncoder = encoder_type{std::in_place_type<dense_encoder_type>, renormed};
174 } else if (nUsedAlphabetSymbols < rans::utils::pow2(14)) {
175 // sparse symbol table makes sense if it fits into L3 Cache
176 mEncoder = encoder_type{std::in_place_type<sparse_encoder_type>, renormed};
177 } else {
178 // adaptive symbol table otherwise
179 mEncoder = encoder_type{std::in_place_type<adaptive_encoder_type>, renormed};
180 }
181 },
182 *mHistogram);
183};
184
185template <typename source_T>
186template <typename src_IT, typename dst_IT>
187[[nodiscard]] dst_IT InplaceEntropyCoder<source_T>::encode(src_IT srcBegin, src_IT srcEnd, dst_IT dstBegin, dst_IT dstEnd)
188{
189 static_assert(std::is_same_v<source_T, typename std::iterator_traits<src_IT>::value_type>);
190
191 dst_IT messageEnd = dstBegin;
192
193 std::visit([&, this](auto&& encoder) {
194 if (encoder.getSymbolTable().hasEscapeSymbol()) {
195 mIncompressibleBuffer.reserve(*mMetrics.getCoderProperties().nIncompressibleSamples);
196 auto [encodedMessageEnd, literalsEnd] = encoder.process(srcBegin, srcEnd, dstBegin, std::back_inserter(mIncompressibleBuffer));
197 messageEnd = encodedMessageEnd;
198 } else {
199 messageEnd = encoder.process(srcBegin, srcEnd, dstBegin);
200 }
201 rans::utils::checkBounds(messageEnd, dstEnd);
202 },
203 *mEncoder);
204
205 return messageEnd;
206};
207
208template <typename source_T>
209template <typename dst_IT>
210[[nodiscard]] inline dst_IT InplaceEntropyCoder<source_T>::writeDictionary(dst_IT dstBegin, dst_IT dstEnd)
211{
212 static_assert(std::is_pointer_v<dst_IT>);
213
214 using dst_type = std::remove_pointer_t<dst_IT>;
215
216 dst_IT ret{};
217 if (mDictBuffer.empty()) {
218 std::visit([&, this](auto&& encoder) { ret = rans::compressRenormedDictionary(encoder.getSymbolTable(), dstBegin); }, *mEncoder);
219 } else {
220 // copy
221 std::memcpy(dstBegin, mDictBuffer.data(), mDictBuffer.size());
222
223 // determine location of end
224 auto end = reinterpret_cast<uint8_t*>(dstBegin) + mDictBuffer.size();
225 // realign pointer
226 constexpr size_t alignment = std::alignment_of_v<dst_type>;
227 end += (alignment - reinterpret_cast<uintptr_t>(end) % alignment) % alignment;
228 // and convert it back to ret
229 ret = reinterpret_cast<dst_IT>(end);
230 }
231
232 rans::utils::checkBounds(ret, dstEnd);
233 return ret;
234};
235
236template <typename source_T>
237template <typename dst_T>
238inline dst_T* InplaceEntropyCoder<source_T>::writeIncompressible(dst_T* dstBegin, dst_T* dstEnd)
239{
240 return mIncompressiblePacker.pack(mIncompressibleBuffer.data(), mIncompressibleBuffer.size(), dstBegin, dstEnd);
241};
242
243template <typename source_T>
244template <typename dst_T>
245[[nodiscard]] inline size_t InplaceEntropyCoder<source_T>::getPackedIncompressibleSize() const noexcept
246{
247 return mIncompressiblePacker.template getPackingBufferSize<dst_T>(getNIncompressibleSamples());
248}
249
250template <typename source_T>
251template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) < 4), bool>>
252void InplaceEntropyCoder<source_T>::init(source_IT srcBegin, source_IT srcEnd, source_type min, source_type max)
253{
254 mHistogram.emplace(histogram_type{rans::makeDenseHistogram::fromSamples(srcBegin, srcEnd)});
255 mMetrics = metrics_type{std::get<dense_histogram_type>(*mHistogram), min, max};
256};
257
258template <typename source_T>
259template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) == 4), bool>>
260void InplaceEntropyCoder<source_T>::init(source_IT srcBegin, source_IT srcEnd, source_type min, source_type max)
261{
262 const size_t nSamples = std::distance(srcBegin, srcEnd);
263 const size_t rangeBits = rans::utils::getRangeBits(min, max);
264
265 if ((rangeBits <= 18) || ((nSamples / rans::utils::pow2(rangeBits)) >= 0.80)) {
266 // either the range of source symbols is distrubuted such that it fits into L3 Cache
267 // Or it is possible for the data to cover a very significant fraction of the total [min,max] range
268 mHistogram = histogram_type{std::in_place_type<dense_histogram_type>, rans::makeDenseHistogram::fromSamples(srcBegin, srcEnd, min, max)};
269 mMetrics = metrics_type{std::get<dense_histogram_type>(*mHistogram), min, max};
270 } else if (nSamples / rans::utils::pow2(rangeBits) <= 0.3) {
271 // or the range of source symbols is spread very thinly accross a large range
272 mHistogram = histogram_type{std::in_place_type<sparse_histogram_type>, rans::makeSparseHistogram::fromSamples(srcBegin, srcEnd)};
273 mMetrics = metrics_type{std::get<sparse_histogram_type>(*mHistogram), min, max};
274 } else {
275 // no strong evidence of either extreme case
276 mHistogram = histogram_type{std::in_place_type<adaptive_histogram_type>, rans::makeAdaptiveHistogram::fromSamples(srcBegin, srcEnd)};
277 mMetrics = metrics_type{std::get<adaptive_histogram_type>(*mHistogram), min, max};
278 }
279};
280
281template <typename source_T>
282template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) < 4), bool>>
283void InplaceEntropyCoder<source_T>::init(source_IT srcBegin, source_IT srcEnd)
284{
285 mHistogram = histogram_type{std::in_place_type<dense_histogram_type>, rans::makeDenseHistogram::fromSamples(srcBegin, srcEnd)};
286 mMetrics = metrics_type{std::get<dense_histogram_type>(*mHistogram)};
287};
288
289template <typename source_T>
290template <typename source_IT, std::enable_if_t<(sizeof(typename std::iterator_traits<source_IT>::value_type) == 4), bool>>
291void InplaceEntropyCoder<source_T>::init(source_IT srcBegin, source_IT srcEnd)
292{
293 mHistogram = histogram_type{std::in_place_type<sparse_histogram_type>, rans::makeSparseHistogram::fromSamples(srcBegin, srcEnd)};
294 mMetrics = metrics_type{std::get<sparse_histogram_type>(*mHistogram)};
295};
296
297template <typename source_T>
298template <typename container_T>
299void InplaceEntropyCoder<source_T>::serializeDictionary(const container_T& renormedHistogram)
300{
301
302 mDictBuffer.resize(mMetrics.getSizeEstimate().getCompressedDictionarySize(), 0);
303 auto end = rans::compressRenormedDictionary(renormedHistogram, mDictBuffer.data());
304 rans::utils::checkBounds(end, mDictBuffer.data() + mDictBuffer.size());
305 mDictBuffer.resize(std::distance(mDictBuffer.data(), end));
306
307 assert(mDictBuffer.size() > 0);
308};
309
310} // namespace o2::ctf::internal
311
312#endif /* ALICEO2_INPLACEENTROPYCODER_H_ */
Interfaces for BitPacking using librans.
uint32_t source_type
const metrics_type & getMetrics() const noexcept
dst_IT encode(src_IT srcBegin, src_IT srcEnd, dst_IT dstBegin, dst_IT dstEnd)
dst_T * writeIncompressible(dst_T *dstBegin, dst_T *dstEnd)
std::variant< dense_encoder_type, adaptive_encoder_type, sparse_encoder_type > encoder_type
std::vector< source_type > incompressible_buffer_type
std::variant< dense_histogram_type, adaptive_histogram_type, sparse_histogram_type > histogram_type
size_t getPackedIncompressibleSize() const noexcept
size_t getNIncompressibleSamples() const noexcept
dst_IT writeDictionary(dst_IT dstBegin, dst_IT dstEnd)
rans::Metrics< source_type > metrics_type
InplaceEntropyCoder(source_IT srcBegin, source_IT srcEnd)
public interface for encoding.
static factory classes for building histograms, encoders and decoders.
GLuint GLuint end
Definition glcorearb.h:469
GLenum GLint GLint * precision
Definition glcorearb.h:1899
public interface for building and renorming histograms from source data.
std::pair< source_T, source_T > minmax(gsl::span< const source_T > range)
constexpr uint32_t getRangeBits(T min, T max) noexcept
Definition utils.h:200
constexpr size_t pow2(size_t n) noexcept
Definition utils.h:165
void checkBounds(IT iteratorPosition, IT upperBound)
Definition utils.h:244
decltype(makeDenseEncoder<>::fromRenormed(RenormedDenseHistogram< source_T >{})) denseEncoder_type
Definition factory.h:229
decltype(makeSparseEncoder<>::fromRenormed(RenormedSparseHistogram< source_T >{})) sparseEncoder_type
Definition factory.h:235
dest_IT compressRenormedDictionary(const container_T &container, dest_IT dstBufferBegin)
Definition serialize.h:142
decltype(auto) renorm(histogram_T histogram, size_t newPrecision, RenormingPolicy renormingPolicy=RenormingPolicy::Auto, size_t lowProbabilityCutoffBits=0)
Definition renorm.h:203
decltype(makeAdaptiveEncoder<>::fromRenormed(RenormedAdaptiveHistogram< source_T >{})) adaptiveEncoder_type
Definition factory.h:232
Defining DataPointCompositeObject explicitly as copiable.
public interface for serializing histograms (dictionaries) to JSON or compressed binary.
static decltype(auto) fromSamples(source_IT begin, source_IT end)
Definition factory.h:65
static decltype(auto) fromSamples(source_IT begin, source_IT end, typename std::iterator_traits< source_IT >::value_type min, typename std::iterator_traits< source_IT >::value_type max)
Definition factory.h:144
constexpr size_t min
constexpr size_t max