Project
Loading...
Searching...
No Matches
TableTreeHelpers.cxx
Go to the documentation of this file.
1// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3// All rights not expressly granted are reserved.
4//
5// This software is distributed under the terms of the GNU General Public
6// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7//
8// In applying this license CERN does not waive the privileges and immunities
9// granted to it by virtue of its status as an Intergovernmental Organization
10// or submit itself to any jurisdiction.
12#include "Framework/Logger.h"
13#include "Framework/Signpost.h"
14
15#include <arrow/dataset/file_base.h>
16#include <arrow/record_batch.h>
17#include <arrow/type.h>
18#include <arrow/util/key_value_metadata.h>
19#include <TBufferFile.h>
20
21#include <memory>
22#include <utility>
23
24O2_DECLARE_DYNAMIC_LOG(tabletree_helpers);
25
27{
// Suffix appended to a branch name to obtain the name of the companion branch
// that stores, per entry, the number of elements of a variable-length (LIST)
// column; it also appears as the array dimension in that column's leaf list.
28static constexpr char const* sizeBranchSuffix = "_size";
29} // namespace TableTreeHelpers
30
31namespace o2::framework
32{
33auto arrowTypeFromROOT(EDataType type, int size)
34{
35 auto typeGenerator = [](std::shared_ptr<arrow::DataType> const& type, int size) -> std::shared_ptr<arrow::DataType> {
36 switch (size) {
37 case -1:
38 return arrow::list(type);
39 case 1:
40 return std::move(type);
41 default:
42 return arrow::fixed_size_list(type, size);
43 }
44 };
45
46 switch (type) {
47 case EDataType::kBool_t:
48 return typeGenerator(arrow::boolean(), size);
49 case EDataType::kUChar_t:
50 return typeGenerator(arrow::uint8(), size);
51 case EDataType::kUShort_t:
52 return typeGenerator(arrow::uint16(), size);
53 case EDataType::kUInt_t:
54 return typeGenerator(arrow::uint32(), size);
55 case EDataType::kULong64_t:
56 return typeGenerator(arrow::uint64(), size);
57 case EDataType::kChar_t:
58 return typeGenerator(arrow::int8(), size);
59 case EDataType::kShort_t:
60 return typeGenerator(arrow::int16(), size);
61 case EDataType::kInt_t:
62 return typeGenerator(arrow::int32(), size);
63 case EDataType::kLong64_t:
64 return typeGenerator(arrow::int64(), size);
65 case EDataType::kFloat_t:
66 return typeGenerator(arrow::float32(), size);
67 case EDataType::kDouble_t:
68 return typeGenerator(arrow::float64(), size);
69 default:
70 throw runtime_error_f("Unsupported branch type: %d", static_cast<int>(type));
71 }
72}
73
74auto basicROOTTypeFromArrow(arrow::Type::type id)
75{
76 switch (id) {
77 case arrow::Type::BOOL:
78 return ROOTTypeInfo{EDataType::kBool_t, "/O", TDataType::GetDataType(EDataType::kBool_t)->Size()};
79 case arrow::Type::UINT8:
80 return ROOTTypeInfo{EDataType::kUChar_t, "/b", TDataType::GetDataType(EDataType::kUChar_t)->Size()};
81 case arrow::Type::UINT16:
82 return ROOTTypeInfo{EDataType::kUShort_t, "/s", TDataType::GetDataType(EDataType::kUShort_t)->Size()};
83 case arrow::Type::UINT32:
84 return ROOTTypeInfo{EDataType::kUInt_t, "/i", TDataType::GetDataType(EDataType::kUInt_t)->Size()};
85 case arrow::Type::UINT64:
86 return ROOTTypeInfo{EDataType::kULong64_t, "/l", TDataType::GetDataType(EDataType::kULong64_t)->Size()};
87 case arrow::Type::INT8:
88 return ROOTTypeInfo{EDataType::kChar_t, "/B", TDataType::GetDataType(EDataType::kChar_t)->Size()};
89 case arrow::Type::INT16:
90 return ROOTTypeInfo{EDataType::kShort_t, "/S", TDataType::GetDataType(EDataType::kShort_t)->Size()};
91 case arrow::Type::INT32:
92 return ROOTTypeInfo{EDataType::kInt_t, "/I", TDataType::GetDataType(EDataType::kInt_t)->Size()};
93 case arrow::Type::INT64:
94 return ROOTTypeInfo{EDataType::kLong64_t, "/L", TDataType::GetDataType(EDataType::kLong64_t)->Size()};
95 case arrow::Type::FLOAT:
96 return ROOTTypeInfo{EDataType::kFloat_t, "/F", TDataType::GetDataType(EDataType::kFloat_t)->Size()};
97 case arrow::Type::DOUBLE:
98 return ROOTTypeInfo{EDataType::kDouble_t, "/D", TDataType::GetDataType(EDataType::kDouble_t)->Size()};
99 default:
100 throw runtime_error("Unsupported arrow column type");
101 }
102}
103
104ColumnToBranch::ColumnToBranch(TTree* tree, std::shared_ptr<arrow::ChunkedArray> const& column, std::shared_ptr<arrow::Field> const& field)
105 : mBranchName{field->name()},
106 mColumn{column.get()},
107 mFieldSize{field->type()->byte_width()}
108{
109 std::string leafList;
110 std::string sizeLeafList;
111 auto arrowType = field->type();
112 mFieldType = arrowType->id();
113 switch (mFieldType) {
114 case arrow::Type::FIXED_SIZE_LIST:
115 mListSize = std::static_pointer_cast<arrow::FixedSizeListType>(arrowType)->list_size();
116 arrowType = arrowType->field(0)->type();
117 mElementType = basicROOTTypeFromArrow(arrowType->id());
118 leafList = mBranchName + "[" + std::to_string(mListSize) + "]" + mElementType.suffix;
119 mFieldSize = arrowType->byte_width() * mListSize;
120 break;
121 case arrow::Type::LIST:
122 arrowType = arrowType->field(0)->type();
123 mElementType = basicROOTTypeFromArrow(arrowType->id());
124 leafList = mBranchName + "[" + mBranchName + TableTreeHelpers::sizeBranchSuffix + "]" + mElementType.suffix;
125 sizeLeafList = mBranchName + TableTreeHelpers::sizeBranchSuffix + "/I";
126 // Notice that this could be replaced by a better guess of the
127 // average size of the list elements, but this is not trivial.
128 mFieldSize = arrowType->byte_width();
129 break;
130 default:
131 mElementType = basicROOTTypeFromArrow(arrowType->id());
132 leafList = mBranchName + mElementType.suffix;
133 break;
134 }
135 if (!sizeLeafList.empty()) {
136 mSizeBranch = tree->GetBranch((mBranchName + TableTreeHelpers::sizeBranchSuffix).c_str());
137 if (mSizeBranch == nullptr) {
138 mSizeBranch = tree->Branch((mBranchName + TableTreeHelpers::sizeBranchSuffix).c_str(), (char*)nullptr, sizeLeafList.c_str());
139 }
140 }
141 mBranch = tree->GetBranch(mBranchName.c_str());
142 if (mBranch == nullptr) {
143 mBranch = tree->Branch(mBranchName.c_str(), (char*)nullptr, leafList.c_str());
144 }
145 if (mElementType.type == EDataType::kBool_t) {
146 cache.resize(mListSize);
147 }
148 accessChunk();
149}
150
152{
 // Points the branch (and, for LIST columns, the size branch) at the data
 // belonging to entry *pos; nothing is copied except for booleans.
 // Move on to the following chunk once *pos lies past the rows of the
 // current one. Only one chunk step is taken per call.
153 if (O2_BUILTIN_UNLIKELY(*pos - mFirstIndex >= mChunkLength)) {
154 nextChunk();
155 }
 // Booleans are read value by value through BooleanArray::Value() into
 // `cache`, and the branch is pointed at that staging buffer instead of the
 // arrow buffer (presumably because arrow packs booleans as bits — confirm).
156 if (mElementType.type == EDataType::kBool_t) {
157 auto boolArray = std::static_pointer_cast<arrow::BooleanArray>(mCurrentArray);
158 for (auto i = 0; i < mListSize; ++i) {
159 cache[i] = boolArray->Value((*pos - mFirstIndex) * mListSize + i);
160 }
161 mBranch->SetAddress((void*)(cache.data()));
162 return;
163 }
164 uint8_t const* buffer;
165 switch (mFieldType) {
 // Variable-length list: mListSize becomes the length of this entry's list
 // and is exposed through the size branch; the data pointer is the start of
 // the child values buffer advanced by the entry's value offset in bytes.
 // NOTE(review): mCurrentArray->offset() is added to a byte pointer without
 // being multiplied by mElementType.size, unlike the value_offset() term —
 // confirm this is intended for arrays with a non-zero offset.
166 case arrow::Type::LIST: {
167 auto list = std::static_pointer_cast<arrow::ListArray>(mCurrentArray);
168 mListSize = list->value_length((*pos - mFirstIndex));
169 buffer = std::static_pointer_cast<arrow::PrimitiveArray>(list->values())->values()->data() + mCurrentArray->offset() + list->value_offset((*pos - mFirstIndex)) * mElementType.size;
170 mBranch->SetAddress((void*)buffer);
171 mSizeBranch->SetAddress(&mListSize);
172 };
173 break;
 // Fixed-size lists share the default path: the entry starts
 // (row in chunk) * mListSize elements into the values buffer.
 // NOTE(review): same unscaled offset() term as in the LIST case — confirm.
174 case arrow::Type::FIXED_SIZE_LIST:
175 default: {
176 buffer = std::static_pointer_cast<arrow::PrimitiveArray>(mCurrentArray)->values()->data() + mCurrentArray->offset() + (*pos - mFirstIndex) * mListSize * mElementType.size;
177 mBranch->SetAddress((void*)buffer);
178 };
179 }
180}
181
182void ColumnToBranch::accessChunk()
183{
184 auto array = mColumn->chunk(mCurrentChunk);
185 switch (mFieldType) {
186 case arrow::Type::FIXED_SIZE_LIST: {
187 auto list = std::static_pointer_cast<arrow::FixedSizeListArray>(array);
188 mChunkLength = list->length();
189 mCurrentArray = list->values();
190 };
191 break;
192 case arrow::Type::LIST: {
193 auto list = std::static_pointer_cast<arrow::ListArray>(array);
194 mChunkLength = list->length();
195 mCurrentArray = list;
196 };
197 break;
198 default:
199 mCurrentArray = array;
200 mChunkLength = mCurrentArray->length();
201 }
202}
203
204void ColumnToBranch::nextChunk()
205{
206 mFirstIndex += mChunkLength;
207 ++mCurrentChunk;
208 accessChunk();
209}
210
211TableToTree::TableToTree(std::shared_ptr<arrow::Table> const& table, TFile* file, const char* treename)
212{
213 mTable = table.get();
214 mTree.reset(static_cast<TTree*>(file->Get(treename)));
215 if (mTree) {
216 return;
217 }
218 std::string treeName(treename);
219 auto pos = treeName.find_first_of('/');
220 if (pos != std::string::npos) {
221 file->cd(treeName.substr(0, pos).c_str());
222 treeName = treeName.substr(pos + 1, std::string::npos);
223 }
224 mTree = std::make_shared<TTree>(treeName.c_str(), treeName.c_str());
225}
226
228{
 // Takes the row count from the table, then registers every column together
 // with the schema field found at the same index.
229 mRows = mTable->num_rows();
230 auto columns = mTable->columns();
231 auto fields = mTable->schema()->fields();
 // Columns and fields are paired by position, so both lists must be equally long.
232 assert(columns.size() == fields.size());
233 for (auto i = 0u; i < columns.size(); ++i) {
234 addBranch(columns[i], fields[i]);
235 }
236}
237
238void TableToTree::addBranch(std::shared_ptr<arrow::ChunkedArray> const& column, std::shared_ptr<arrow::Field> const& field)
239{
240 if (mRows == 0) {
241 mRows = column->length();
242 } else if (mRows != column->length()) {
243 throw runtime_error_f("Adding incompatible column with size %d (num rows = %d)", column->length(), mRows);
244 }
245 mColumnReaders.emplace_back(new ColumnToBranch{mTree.get(), column, field});
246}
247
248std::shared_ptr<TTree> TableToTree::process()
249{
250 int64_t row = 0;
251 if (mTree->GetNbranches() == 0 || mRows == 0) {
252 mTree->Write("", TObject::kOverwrite);
253 mTree->SetDirectory(nullptr);
254 return mTree;
255 }
256
257 for (auto& reader : mColumnReaders) {
258 int idealBasketSize = 1024 + reader->fieldSize() * reader->columnEntries(); // minimal additional size needed, otherwise we get 2 baskets
259 int basketSize = std::max(32000, idealBasketSize); // keep a minimum value
260 // std::cout << "Setting baskets size for " << reader->branchName() << " to " << basketSize << " = 1024 + "
261 // << reader->fieldSize() << " * " << reader->columnEntries() << ". mRows was " << mRows << std::endl;
262 mTree->SetBasketSize(reader->branchName(), basketSize);
263 // If it starts with fIndexArray, also set the size branch basket size
264 if (strncmp(reader->branchName(), "fIndexArray", strlen("fIndexArray")) == 0) {
265 std::string sizeBranch = reader->branchName();
266 sizeBranch += "_size";
267 // std::cout << "Setting baskets size for " << sizeBranch << " to " << basketSize << " = 1024 + "
268 // << reader->fieldSize() << " * " << reader->columnEntries() << ". mRows was " << mRows << std::endl;
269 // One int per array to keep track of the size
270 int idealBasketSize = 4 * mRows + 1024 + reader->fieldSize() * reader->columnEntries(); // minimal additional size needed, otherwise we get 2 baskets
271 int basketSize = std::max(32000, idealBasketSize); // keep a minimum value
272 mTree->SetBasketSize(sizeBranch.c_str(), basketSize);
273 mTree->SetBasketSize(reader->branchName(), basketSize);
274 }
275 }
276
277 while (row < mRows) {
278 for (auto& reader : mColumnReaders) {
279 reader->at(&row);
280 }
281 mTree->Fill();
282 ++row;
283 }
284 mTree->Write("", TObject::kOverwrite);
285 mTree->SetDirectory(nullptr);
286 return mTree;
287}
288
289namespace
290{
291struct BranchInfo {
292 std::string name;
293 TBranch* ptr;
294 bool mVLA;
295};
296} // namespace
297
298} // namespace o2::framework
#define O2_BUILTIN_UNLIKELY(x)
std::vector< std::shared_ptr< arrow::Field > > fields
int32_t i
uint16_t pos
Definition RawData.h:3
#define O2_DECLARE_DYNAMIC_LOG(name)
Definition Signpost.h:489
TBranch * ptr
bool mVLA
ColumnToBranch(TTree *tree, std::shared_ptr< arrow::ChunkedArray > const &column, std::shared_ptr< arrow::Field > const &field)
void at(const int64_t *pos)
void addBranch(std::shared_ptr< arrow::ChunkedArray > const &column, std::shared_ptr< arrow::Field > const &field)
TableToTree(std::shared_ptr< arrow::Table > const &table, TFile *file, const char *treename)
std::shared_ptr< TTree > process()
GLuint buffer
Definition glcorearb.h:655
GLsizeiptr size
Definition glcorearb.h:659
GLenum array
Definition glcorearb.h:4274
GLuint const GLchar * name
Definition glcorearb.h:781
GLint GLint GLsizei GLint GLenum GLenum type
Definition glcorearb.h:275
Defining ITS Vertex explicitly as messageable.
Definition Cartesian.h:288
RuntimeErrorRef runtime_error(const char *)
auto basicROOTTypeFromArrow(arrow::Type::type id)
auto arrowTypeFromROOT(EDataType type, int size)
RuntimeErrorRef runtime_error_f(const char *,...)
std::string to_string(gsl::span< T, Size > span)
Definition common.h:52
std::unique_ptr< TTree > tree((TTree *) flIn.Get(std::string(o2::base::NameConf::CTFTREENAME).c_str()))
std::vector< int > row