Project
Loading...
Searching...
No Matches
HuffmanCodec.h
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
11
12/* Local Variables: */
13/* mode: c++ */
14/* End: */
15
16#ifndef HUFFMANCODEC_H
17#define HUFFMANCODEC_H
18
23
24#include <cstdint>
25#include <cerrno>
26#include <set>
27#include <map>
28#include <vector>
29#include <memory>
30#include <exception>
31#include <stdexcept>
32#include <iostream>
33#include <iomanip>
34#include <sstream> // stringstream in configuration parsing
36
37namespace o2
38{
39namespace data_compression
40{
41
50template <typename _CodeType>
52{
53 public:
56 using shared_pointer = std::shared_ptr<self_type>;
57 using CodeType = _CodeType;
58
59 HuffmanNode() : mLeft(), mRight(), mWeight(0.), mIndex(~uint16_t(0)), mCode(), mCodeLen(0) {}
60 HuffmanNode(const HuffmanNode& other) = default;
62 ~HuffmanNode() = default;
63
64 HuffmanNode(double weight, uint16_t index = ~uint16_t(0))
65 : mLeft(), mRight(), mWeight(weight), mIndex(index), mCode(), mCodeLen(0)
66 {
67 }
68
70 : mLeft(left),
71 mRight(right),
72 mWeight((mLeft ? mLeft->mWeight : 0.) + (mRight ? mRight->mWeight : 0.)),
73 mIndex(~uint16_t(0)),
74 mCode(),
75 mCodeLen(0)
76 {
77 }
78
79 bool operator<(const HuffmanNode& other) const { return mWeight < other.mWeight; }
80
81 CodeType getBinaryCode() const { return mCode; }
82 uint16_t getBinaryCodeLength() const { return mCodeLen; }
83 uint16_t getIndex() const { return mIndex; }
84
85 // TODO: can be combined to one function with templated index
86 pointer getLeftChild() const { return mLeft.get(); }
87 pointer getRightChild() const { return mRight.get(); }
88 void setBinaryCode(uint16_t codeLen, CodeType code)
89 {
90 mCode = code;
91 mCodeLen = codeLen;
92 }
93
94 // insert bit at LSB
96 {
97 mCode <<= 1;
98 if (bit) {
99 mCode.set(0);
100 } else {
101 mCode.reset(0);
102 }
103 mCodeLen += 1;
104 return *this;
105 }
106
107 // insert bit at MSB
109 {
110 if (bit) {
111 mCode.set(mCodeLen);
112 } else {
113 mCode.reset(mCodeLen);
114 }
115 mCodeLen += 1;
116 return *this;
117 }
118
119 void print(std::ostream& stream = std::cout) const
120 {
121 static int level = 1;
122 stream << "node weight: " << std::setw(9) << mWeight;
123 if (!mLeft) {
124 stream << " leave ";
125 } else {
126 stream << " tree ";
127 }
128 stream << " code length: " << mCodeLen;
129 stream << " code " << mCode;
130 stream << std::endl;
131 level++;
132 if (mLeft) {
133 stream << std::setw(level) << level << ": left: ";
134 mLeft->print(stream);
135 }
136 if (mRight) {
137 stream << std::setw(level) << level << ": right: ";
138 mRight->print(stream);
139 }
140 level--;
141 }
142
143 private:
144 shared_pointer mLeft;
145 shared_pointer mRight;
146 double mWeight;
147 uint16_t mIndex;
148 CodeType mCode;
149 uint16_t mCodeLen;
150};
151
163template <typename _CodingModel>
165{
166 public:
167 using model_type = _CodingModel;
168 using value_type = typename model_type::value_type;
169 using code_type = typename model_type::code_type;
170
171 HuffmanCodec() = delete; // forbidden
172 HuffmanCodec(const model_type& model) : mCodingModel(model) {}
173 ~HuffmanCodec() = default;
174
179 template <typename ValueType>
180 const bool Encode(ValueType v, code_type& code, uint16_t& codeLength) const
181 {
182 code = mCodingModel.Encode(v, codeLength);
183 return true;
184 }
185
190 template <typename ReturnType>
191 bool Decode(ReturnType& v, code_type code, uint16_t& codeLength) const
192 {
193 v = mCodingModel.Decode(code, codeLength);
194 return true;
195 }
196
199 {
200 return mCodingModel;
201 }
202
206 int loadConfiguration(const char* filename, std::string method = "zlib")
207 {
208 return mCodingModel.read(filename, method);
209 }
210
214 int writeConfiguration(const char* filename, std::string method = "zlib") const
215 {
216 return mCodingModel.write(filename, method);
217 }
218
219 private:
220 model_type mCodingModel;
221};
222
239template <typename _BASE, typename Rep, bool orderMSB = true>
240class HuffmanModel : public _BASE
241{
242 public:
243 HuffmanModel() : mAlphabet(), mLeaveNodes(), mTreeNodes() {}
244 ~HuffmanModel() = default;
245
247 using code_type = Rep;
249 using value_type = typename _BASE::value_type;
250 static constexpr bool OrderMSB = orderMSB;
251
252 int init(double v = 1.) { return _BASE::initWeight(mAlphabet, v); }
253
261 code_type Encode(typename _BASE::value_type symbol, uint16_t& codeLength) const
262 {
263 codeLength = 0;
264 auto nodeIndex = _BASE::alphabet_type::getIndex(symbol);
265 if (nodeIndex < mLeaveNodes.size()) {
266 // valid symbol/value
267 codeLength = mLeaveNodes[nodeIndex]->getBinaryCodeLength();
268 return mLeaveNodes[nodeIndex]->getBinaryCode();
269 } else {
270 std::string msg = "symbol ";
271 msg += symbol;
272 msg += " not found in alphapet ";
273 msg += _BASE::getName();
274 throw std::range_error(msg);
275 }
276
277 static const code_type dummy = 0;
278 return dummy;
279 }
280
290 value_type Decode(code_type code, uint16_t& codeLength) const
291 {
292 // TODO: need to check if there is a loaded tree, but don't
293 // want to check this every time when calling. Maybe its enough
294 // to let the dereferencing below throw an exception
295 codeLength = 0;
296 typename _BASE::value_type v = 0;
297 // dereference the iterator and get raw pointer from shared pointer
298 // TODO: work on shared pointers here as well
299 // the top node is the only element in the multiset after using the
300 // weighted sort algorithm to build the tree, all nodes are referenced
301 // from their parents in the tree.
302 const node_type* node = (*mTreeNodes.begin()).get();
303 uint16_t codeMSB = code.size() - 1;
304 while (node) {
305 // N.B.: nodes have either both child nodes or none of them
306 if (node->getLeftChild() == nullptr) {
307 // this is a leave node, retrieve value for corresponding index
308 // TODO: validity check for index, this can be done once after
309 // initializing the Huffman configuration, either after training
310 // or loading the configuration
311 return _BASE::alphabet_type::getSymbol(node->getIndex());
312 }
313 if (codeLength > codeMSB) {
314 // the size of the code type is shorter than the Huffman tree length
315 throw std::range_error("code type length insufficient for Huffman tree length");
316 break;
317 }
318 bool bit = false;
319 if (OrderMSB) {
320 bit = code.test(codeMSB - codeLength);
321 } else {
322 bit = code.test(codeLength);
323 }
324 ++codeLength;
325 if (bit) {
326 node = node->getLeftChild();
327 } else {
328 node = node->getRightChild();
329 }
330 }
331 return v;
332 }
333
338 template <typename T>
339 class isless
340 {
341 public:
342 bool operator()(const T a, const T b) const { return a < b; }
343 };
345 template <typename T>
346 class isless<T*>
347 {
348 public:
349 bool operator()(const T* a, const T* b) const
350 {
351 if (a == nullptr || b == nullptr) {
352 return false;
353 }
354 return *a < *b;
355 }
356 };
358 template <typename T>
359 class isless<std::shared_ptr<T>>
360 {
361 public:
362 bool operator()(const std::shared_ptr<T>& a, const std::shared_ptr<T>& b) const
363 {
364 if (!a || !b) {
365 return false;
366 }
367 return *a < *b;
368 }
369 };
370
377 {
378 mLeaveNodes.clear();
379
380 // probability model provides map of {symbol, weight}-pairs
381 _BASE& model = *this;
382 for (auto i : model) {
383 // create nodes knowing about their index and the symbol weight
384 mLeaveNodes.push_back(std::make_shared<node_type>(i.second, _BASE::alphabet_type::getIndex(i.first)));
385 }
386
387 // insert pointer to nodes into ordered structure to build tree
388 // since the type is a pointer, a specific 'less' functor needs to
389 // be provided to dereference before applying operator<
390 for (auto& i : mLeaveNodes) {
391 mTreeNodes.insert(i);
392 }
393 while (mTreeNodes.size() > 1) {
394 // create new node combining the two with lowest probability
395 std::shared_ptr<node_type> combinedNode = std::make_shared<node_type>(*mTreeNodes.begin(), *++mTreeNodes.begin());
396 // remove those two nodes from the list
397 mTreeNodes.erase(mTreeNodes.begin());
398 mTreeNodes.erase(mTreeNodes.begin());
399 // insert the new node according to the less functor
400 mTreeNodes.insert(combinedNode);
401 }
402 // assign value, method works on pointer
403 // dereference iterator and shared_ptr to get the raw pointer
404 // TODO: change method to work on shared instead of raw pointers
405 assignCode((*mTreeNodes.begin()).get());
406 return true;
407 }
408
424 {
425 if (node == nullptr) {
426 return 0;
427 }
428 int codelen = node->getBinaryCodeLength();
429 int retcodelen = codelen;
430 if (node->getLeftChild()) { // bit '1' branch
431 code_type c = node->getBinaryCode();
432 if (OrderMSB) { // note: this is a compile time switch
433 c <<= 1;
434 c.set(0);
435 } else {
436 c.set(codelen);
437 }
438 node->getLeftChild()->setBinaryCode(codelen + 1, c);
439 int branchlen = assignCode(node->getLeftChild());
440 if (retcodelen < branchlen) {
441 retcodelen = branchlen;
442 }
443 }
444 if (node->getRightChild()) { // bit '0' branch
445 code_type c = node->getBinaryCode();
446 if (OrderMSB) {
447 c <<= 1;
448 c.reset(0);
449 } else {
450 c.reset(codelen);
451 }
452 node->getRightChild()->setBinaryCode(codelen + 1, c);
453 int branchlen = assignCode(node->getRightChild());
454 if (retcodelen < branchlen) {
455 retcodelen = branchlen;
456 }
457 }
458 return retcodelen;
459 }
460
469 int write(const char* filename, std::string method = "zlib") const
470 {
471 o2::io::ocomp_stream out(filename, method);
472 return write(out);
473 }
474
478 int write(std::ostream& out) const
479 {
480 if (mTreeNodes.size() == 0) {
481 return 0;
482 }
483 return write(out, (*mTreeNodes.begin()).get());
484 }
485
492 int read(const char* filename, std::string method = "zlib")
493 {
494 o2::io::icomp_stream in(filename, method);
495 return read(in);
496 }
497
508 int read(std::istream& in)
509 {
510 mLeaveNodes.clear();
511 mTreeNodes.clear();
512 int lineNo = -1;
513 std::string node, left, right, parameters;
514 std::set<int> nodeIndices;
515 struct treeNodeConfiguration {
516 treeNodeConfiguration(int _index, int _left, int _right) : index(_index), left(_left), right(_right) {}
517 int index, left, right;
518 bool operator<(const treeNodeConfiguration& other) const { return index < other.index; }
519 };
520 std::set<treeNodeConfiguration> treeNodeConfigurations;
521 char firstChar = 0;
522 std::map<int, int> leaveNodeMap;
523 while ((in.get(firstChar)) && firstChar != '\n' && (in.putback(firstChar)) && (in >> node) && (in >> left) &&
524 (in >> right) && ++lineNo >= 0) {
525 std::getline(in, parameters);
526 if (lineNo != std::stoi(node)) {
527 std::cerr << "format error: expected node no " << lineNo << ", but got " << node << " (" << left << " " << right
528 << " " << parameters << ")" << std::endl;
529 std::cerr << "Note: Huffman table dump has to be terminated by blank line or eof" << std::endl;
530 break;
531 }
532 if (parameters.empty()) {
533 // std::cout << "tree node " << lineNo << " left=" << left << " right=" << right << std::endl;
534 int leftIndex = std::stoi(left);
535 int rightIndex = std::stoi(right);
536 auto it = nodeIndices.find(leftIndex);
537 if (it == nodeIndices.end()) {
538 std::cerr << "Format error: can not find left child node with index " << leftIndex << std::endl;
539 return -1;
540 }
541 nodeIndices.erase(it);
542 it = nodeIndices.find(rightIndex);
543 if (it == nodeIndices.end()) {
544 std::cerr << "Format error: can not find right child node with index " << rightIndex << std::endl;
545 return -1;
546 }
547 nodeIndices.erase(it);
548 treeNodeConfigurations.insert(treeNodeConfiguration(lineNo, leftIndex, rightIndex));
549 } else {
550 std::stringstream vs(left), ws(right);
551 typename _BASE::alphabet_type::value_type symbol;
552 vs >> symbol;
553 typename _BASE::weight_type weight;
554 ws >> weight;
555 int symbolIndex = _BASE::alphabet_type::getIndex(symbol);
556 // grow the vector as operator[] always expects index within range
557 if (mLeaveNodes.size() < symbolIndex + 1) {
558 mLeaveNodes.resize(symbolIndex + 1);
559 }
560 mLeaveNodes[symbolIndex] = std::make_shared<node_type>(weight, symbolIndex);
561 std::stringstream ps(parameters);
562 uint16_t codeLen = 0;
563 ps >> codeLen;
564 code_type code = 0;
565 ps >> code;
566 mLeaveNodes[symbolIndex]->setBinaryCode(codeLen, code);
567 leaveNodeMap[lineNo] = symbolIndex;
568 _BASE::addWeight(symbol, weight);
569 // std::cout << "leave node " << lineNo << " " << " value=" << value << " weight=" << weight << " " << codeLen
570 // << " " << code << std::endl;
571 }
572 nodeIndices.insert(lineNo);
573 }
574 std::map<int, std::shared_ptr<node_type>> treeNodes;
575 for (auto conf : treeNodeConfigurations) {
576 std::shared_ptr<node_type> left_local;
577 auto ln = leaveNodeMap.find(conf.left);
578 if (ln != leaveNodeMap.end()) {
579 left_local = mLeaveNodes[ln->second];
580 leaveNodeMap.erase(ln);
581 } else {
582 auto tn = treeNodes.find(conf.left);
583 if (tn == treeNodes.end()) {
584 std::cerr << "Internal error: can not find left child node with index " << conf.left << std::endl;
585 return -1;
586 }
587 left_local = tn->second;
588 treeNodes.erase(tn);
589 }
590 std::shared_ptr<node_type> right_local;
591 auto rn = leaveNodeMap.find(conf.right);
592 if (rn != leaveNodeMap.end()) {
593 right_local = mLeaveNodes[rn->second];
594 leaveNodeMap.erase(rn);
595 } else {
596 auto tn = treeNodes.find(conf.right);
597 if (tn == treeNodes.end()) {
598 std::cerr << "Internal error: can not find right child node with index " << conf.right << std::endl;
599 return -1;
600 }
601 right_local = tn->second;
602 treeNodes.erase(tn);
603 }
604 // make combined node shared ptr and add to map
605 treeNodes[conf.index] = std::make_shared<node_type>(left_local, right_local);
606 }
607 if (leaveNodeMap.size() != 0 || treeNodes.size() != 1) {
608 std::cerr << "error: " << leaveNodeMap.size() << " unhandled leave node(s)"
609 << "; " << treeNodes.size() << " tree nodes(s), expected 1" << std::endl;
610 }
611 mTreeNodes.insert(treeNodes.begin()->second);
612 return 0;
613 }
614
615 void print() const
616 {
617 if (mTreeNodes.size() > 0) {
618 node_type* topNode = (*mTreeNodes.begin()).get();
619 if (topNode) {
620 topNode->print();
621 } else {
622 // TODO: handle this error condition
623 }
624 }
625 };
626
627 private:
634 template <typename NodeType>
635 int write(std::ostream& out, NodeType* node, int nodeIndex = 0) const
636 {
637 if (!node) {
638 return nodeIndex;
639 }
640 const _BASE& model = *this;
641 NodeType* left = node->getLeftChild();
642 NodeType* right = node->getRightChild();
643 if (left == nullptr) {
644 typename _BASE::value_type value = _BASE::alphabet_type::getSymbol(node->getIndex());
645 out << nodeIndex << " " << value << " " << model[value] << " " << node->getBinaryCodeLength() << " "
646 << node->getBinaryCode() << std::endl;
647 return nodeIndex;
648 }
649 int leftIndex = write(out, left, nodeIndex);
650 int rightIndex = write(out, right, leftIndex + 1);
651 out << rightIndex + 1 << " " << leftIndex << " " << rightIndex << std::endl;
652 return rightIndex + 1;
653 }
654
655 // the alphabet, determined by template parameter
656 typename _BASE::alphabet_type mAlphabet;
657 // Huffman leave nodes containing symbol index to code mapping
658 std::vector<std::shared_ptr<node_type>> mLeaveNodes;
659 // multiset, order determined by less functor working on pointers
660 std::multiset<std::shared_ptr<node_type>, isless<std::shared_ptr<node_type>>> mTreeNodes;
661};
662
663} // namespace data_compression
664} // namespace o2
665
666#endif
Implementation of iostreams with compression filter.
int32_t i
uint32_t c
Definition RawData.h:2
Main class of the Huffman codec implementation.
typename model_type::value_type value_type
int writeConfiguration(const char *filename, std::string method="zlib") const
int loadConfiguration(const char *filename, std::string method="zlib")
model_type const & getCodingModel() const
Get the underlying coding model.
const bool Encode(ValueType v, code_type &code, uint16_t &codeLength) const
bool Decode(ReturnType &v, code_type code, uint16_t &codeLength) const
HuffmanCodec(const model_type &model)
typename model_type::code_type code_type
bool operator()(const T *a, const T *b) const
bool operator()(const std::shared_ptr< T > &a, const std::shared_ptr< T > &b) const
bool operator()(const T a, const T b) const
int read(std::istream &in)
Read configuration from stream The previous configuration is discarded.
typename _BASE::value_type value_type
int read(const char *filename, std::string method="zlib")
Read configuration from file.
value_type Decode(code_type code, uint16_t &codeLength) const
int write(std::ostream &out) const
Write Huffman table in self-consistent format to an output stream.
int write(const char *filename, std::string method="zlib") const
Write Huffman table to file.
code_type Encode(typename _BASE::value_type symbol, uint16_t &codeLength) const
Container holding information to build Huffman tree.
HuffmanNode(double weight, uint16_t index=~uint16_t(0))
HuffmanNode & operator=(const HuffmanNode &other)=default
void setBinaryCode(uint16_t codeLen, CodeType code)
HuffmanNode(const HuffmanNode &other)=default
self_type & operator<<=(bool bit)
self_type & operator>>=(bool bit)
std::shared_ptr< self_type > shared_pointer
void print(std::ostream &stream=std::cout) const
HuffmanNode(shared_pointer left, shared_pointer right)
bool operator<(const HuffmanNode &other) const
const GLdouble * v
Definition glcorearb.h:832
GLuint index
Definition glcorearb.h:781
GLdouble GLdouble right
Definition glcorearb.h:4077
GLuint GLuint GLfloat weight
Definition glcorearb.h:5477
GLboolean GLboolean GLboolean b
Definition glcorearb.h:1233
GLsizei const GLfloat * value
Definition glcorearb.h:819
GLint left
Definition glcorearb.h:1979
GLboolean GLboolean GLboolean GLboolean a
Definition glcorearb.h:1233
GLuint GLuint stream
Definition glcorearb.h:1806
a couple of static helper functions to create timestamp values for CCDB queries or override obsolete ...
Defining DataPointCompositeObject explicitly as copiable.
std::string filename()
VectorOfTObjectPtrs other
uint64_t const void const *restrict const msg
Definition x9.h:153