15#include <arrow/dataset/file_base.h>
16#include <arrow/record_batch.h>
17#include <arrow/type.h>
18#include <arrow/util/key_value_metadata.h>
19#include <TBufferFile.h>
// Suffix appended to a branch name to form the name of its companion size
// branch / size leaf, which is used for variable-length list columns.
28static constexpr char const* sizeBranchSuffix =
"_size";
// Helper that turns an element type into the arrow type of a column: either a
// variable-length list of it, the element type itself, or a fixed-size list of
// `size` elements.
// NOTE(review): the conditions that select between the three returns below are
// not visible in this excerpt - confirm against the full file.
35 auto typeGenerator = [](std::shared_ptr<arrow::DataType>
const&
type,
int size) -> std::shared_ptr<arrow::DataType> {
38 return arrow::list(
type);
// NOTE(review): `type` is a const reference, so std::move cannot actually move
// from it here; the returned shared_ptr is a copy.
40 return std::move(
type);
42 return arrow::fixed_size_list(
type,
size);
// Map each supported ROOT EDataType onto the matching arrow primitive type and
// let typeGenerator shape it according to `size`.
47 case EDataType::kBool_t:
48 return typeGenerator(arrow::boolean(),
size);
49 case EDataType::kUChar_t:
50 return typeGenerator(arrow::uint8(),
size);
51 case EDataType::kUShort_t:
52 return typeGenerator(arrow::uint16(),
size);
53 case EDataType::kUInt_t:
54 return typeGenerator(arrow::uint32(),
size);
55 case EDataType::kULong64_t:
56 return typeGenerator(arrow::uint64(),
size);
57 case EDataType::kChar_t:
58 return typeGenerator(arrow::int8(),
size);
59 case EDataType::kShort_t:
60 return typeGenerator(arrow::int16(),
size);
61 case EDataType::kInt_t:
62 return typeGenerator(arrow::int32(),
size);
63 case EDataType::kLong64_t:
64 return typeGenerator(arrow::int64(),
size);
65 case EDataType::kFloat_t:
66 return typeGenerator(arrow::float32(),
size);
67 case EDataType::kDouble_t:
68 return typeGenerator(arrow::float64(),
size);
// Map an arrow type id onto a ROOTTypeInfo built from three values: the ROOT
// EDataType, the type suffix string for a leaf list (e.g. "/I" for INT32), and
// the value returned by TDataType::Size() for that ROOT type.
// NOTE(review): handling of arrow types not listed here is not visible in this
// excerpt.
77 case arrow::Type::BOOL:
78 return ROOTTypeInfo{EDataType::kBool_t,
"/O", TDataType::GetDataType(EDataType::kBool_t)->Size()};
79 case arrow::Type::UINT8:
80 return ROOTTypeInfo{EDataType::kUChar_t,
"/b", TDataType::GetDataType(EDataType::kUChar_t)->Size()};
81 case arrow::Type::UINT16:
82 return ROOTTypeInfo{EDataType::kUShort_t,
"/s", TDataType::GetDataType(EDataType::kUShort_t)->Size()};
83 case arrow::Type::UINT32:
84 return ROOTTypeInfo{EDataType::kUInt_t,
"/i", TDataType::GetDataType(EDataType::kUInt_t)->Size()};
85 case arrow::Type::UINT64:
86 return ROOTTypeInfo{EDataType::kULong64_t,
"/l", TDataType::GetDataType(EDataType::kULong64_t)->Size()};
87 case arrow::Type::INT8:
88 return ROOTTypeInfo{EDataType::kChar_t,
"/B", TDataType::GetDataType(EDataType::kChar_t)->Size()};
89 case arrow::Type::INT16:
90 return ROOTTypeInfo{EDataType::kShort_t,
"/S", TDataType::GetDataType(EDataType::kShort_t)->Size()};
91 case arrow::Type::INT32:
92 return ROOTTypeInfo{EDataType::kInt_t,
"/I", TDataType::GetDataType(EDataType::kInt_t)->Size()};
93 case arrow::Type::INT64:
94 return ROOTTypeInfo{EDataType::kLong64_t,
"/L", TDataType::GetDataType(EDataType::kLong64_t)->Size()};
95 case arrow::Type::FLOAT:
96 return ROOTTypeInfo{EDataType::kFloat_t,
"/F", TDataType::GetDataType(EDataType::kFloat_t)->Size()};
97 case arrow::Type::DOUBLE:
98 return ROOTTypeInfo{EDataType::kDouble_t,
"/D", TDataType::GetDataType(EDataType::kDouble_t)->Size()};
// ColumnToBranch constructor (its signature line is not visible in this
// excerpt). Records the field name, a raw pointer to the column and the
// field's byte width, derives the ROOT leaf list(s) from the arrow field type,
// and fetches or creates the matching branch(es) on `tree`.
105 : mBranchName{field->
name()},
106 mColumn{column.get()},
107 mFieldSize{field->
type()->byte_width()}
109 std::string leafList;
110 std::string sizeLeafList;
111 auto arrowType = field->type();
112 mFieldType = arrowType->id();
// NOTE(review): mElementType is read below but the statements assigning it are
// not visible in this excerpt.
113 switch (mFieldType) {
// Fixed-size list: take the list length from the type, then switch to the
// element type (child field 0). The per-entry size becomes element width times
// list length.
114 case arrow::Type::FIXED_SIZE_LIST:
115 mListSize = std::static_pointer_cast<arrow::FixedSizeListType>(arrowType)->list_size();
116 arrowType = arrowType->field(0)->type();
119 mFieldSize = arrowType->byte_width() * mListSize;
// Variable-length list: the data leaf is dimensioned by the companion size
// leaf ("<name>[<name>_size]<suffix>"), and the size leaf itself is declared
// with the "/I" suffix.
121 case arrow::Type::LIST:
122 arrowType = arrowType->field(0)->type();
124 leafList = mBranchName +
"[" + mBranchName + TableTreeHelpers::sizeBranchSuffix +
"]" + mElementType.
suffix;
125 sizeLeafList = mBranchName + TableTreeHelpers::sizeBranchSuffix +
"/I";
128 mFieldSize = arrowType->byte_width();
// Leaf list made of the branch name followed by the element type suffix
// (the case label guarding this statement is not visible in this excerpt).
132 leafList = mBranchName + mElementType.
suffix;
// A size leaf list is only filled in for LIST columns: reuse the size branch
// if the tree already has it, otherwise create it with a null buffer address.
135 if (!sizeLeafList.empty()) {
136 mSizeBranch =
tree->GetBranch((mBranchName + TableTreeHelpers::sizeBranchSuffix).c_str());
137 if (mSizeBranch ==
nullptr) {
138 mSizeBranch =
tree->Branch((mBranchName + TableTreeHelpers::sizeBranchSuffix).c_str(), (
char*)
nullptr, sizeLeafList.c_str());
// Same get-or-create logic for the data branch; the buffer address is set
// later, per row.
141 mBranch =
tree->GetBranch(mBranchName.c_str());
142 if (mBranch ==
nullptr) {
143 mBranch =
tree->Branch(mBranchName.c_str(), (
char*)
nullptr, leafList.c_str());
// Boolean columns are served from `cache` (filled element by element in at()),
// so size it to hold one entry's worth of values.
145 if (mElementType.
type == EDataType::kBool_t) {
146 cache.resize(mListSize);
// Body of at(const int64_t* pos) (signature line not visible in this excerpt):
// points the branch buffer(s) at the data of row *pos, expressed relative to
// the first row of the current chunk (mFirstIndex).
// Booleans: copy the mListSize values of this row one by one into `cache` and
// hand the cache to the branch.
156 if (mElementType.
type == EDataType::kBool_t) {
157 auto boolArray = std::static_pointer_cast<arrow::BooleanArray>(mCurrentArray);
158 for (
auto i = 0;
i < mListSize; ++
i) {
159 cache[
i] = boolArray->Value((*
pos - mFirstIndex) * mListSize +
i);
161 mBranch->SetAddress((
void*)(cache.data()));
165 switch (mFieldType) {
// Variable-length list: the entry length comes from the list array, the data
// pointer is the start of the values buffer advanced to this row's first
// element, and the size branch reads mListSize.
// NOTE(review): mCurrentArray->offset() is added without being multiplied by
// mElementType.size while value_offset() is scaled - if data() is a byte
// pointer (as in Arrow's Buffer API) confirm that this is intended.
166 case arrow::Type::LIST: {
167 auto list = std::static_pointer_cast<arrow::ListArray>(mCurrentArray);
168 mListSize = list->value_length((*
pos - mFirstIndex));
169 buffer = std::static_pointer_cast<arrow::PrimitiveArray>(list->values())->values()->data() + mCurrentArray->offset() + list->value_offset((*
pos - mFirstIndex)) * mElementType.
size;
170 mBranch->SetAddress((
void*)
buffer);
171 mSizeBranch->SetAddress(&mListSize);
// Fixed-size list: the row starts mListSize * element size further into the
// values buffer for every row since the start of the chunk. The same
// unscaled-offset remark as above applies.
174 case arrow::Type::FIXED_SIZE_LIST:
176 buffer = std::static_pointer_cast<arrow::PrimitiveArray>(mCurrentArray)->values()->data() + mCurrentArray->offset() + (*
pos - mFirstIndex) * mListSize * mElementType.
size;
177 mBranch->SetAddress((
void*)
buffer);
// Load chunk number mCurrentChunk of the column and cache both its length and
// the array that at() will read from.
182void ColumnToBranch::accessChunk()
184 auto array = mColumn->chunk(mCurrentChunk);
185 switch (mFieldType) {
// Fixed-size list: keep the flattened values array; the chunk length is the
// number of list entries.
186 case arrow::Type::FIXED_SIZE_LIST: {
187 auto list = std::static_pointer_cast<arrow::FixedSizeListArray>(
array);
188 mChunkLength = list->length();
189 mCurrentArray = list->values();
// Variable-length list: keep the list array itself, since at() needs its
// per-row offsets and lengths.
192 case arrow::Type::LIST: {
193 auto list = std::static_pointer_cast<arrow::ListArray>(
array);
194 mChunkLength = list->length();
195 mCurrentArray = list;
// Remaining types: use the chunk as is (the case label for this branch is not
// visible in this excerpt).
199 mCurrentArray =
array;
200 mChunkLength = mCurrentArray->length();
// Move on to the following chunk: the global index of the first row advances
// by the length of the chunk just processed.
// NOTE(review): the remaining statements of this function are not visible in
// this excerpt.
204void ColumnToBranch::nextChunk()
206 mFirstIndex += mChunkLength;
// TableToTree constructor body (signature line not visible in this excerpt):
// keep a raw pointer to the table and try to fetch an existing tree called
// `treename` from the file.
213 mTable = table.get();
214 mTree.reset(
static_cast<TTree*
>(file->Get(treename)));
// Tree creation path (presumably taken when the lookup above found nothing -
// the guarding condition is not visible here): if `treename` contains a '/',
// cd into the part before the first '/' and keep only the remainder, which is
// passed as both constructor arguments of the new TTree.
218 std::string treeName(treename);
219 auto pos = treeName.find_first_of(
'/');
220 if (
pos != std::string::npos) {
221 file->cd(treeName.substr(0,
pos).c_str());
222 treeName = treeName.substr(
pos + 1, std::string::npos);
224 mTree = std::make_shared<TTree>(treeName.c_str(), treeName.c_str());
// Take the row count from the table, then walk its columns and schema fields
// in parallel (they must have the same count).
// NOTE(review): the enclosing function's signature and the loop body are not
// visible in this excerpt.
229 mRows = mTable->num_rows();
230 auto columns = mTable->columns();
231 auto fields = mTable->schema()->fields();
232 assert(columns.size() ==
fields.size());
233 for (
auto i = 0u;
i < columns.size(); ++
i) {
// Register one column to be written as a branch of the tree.
// The column length either becomes the row count or must match the row count
// already recorded; a mismatch raises a runtime error. A ColumnToBranch reader
// bound to the tree is then appended to mColumnReaders.
238void TableToTree::addBranch(std::shared_ptr<arrow::ChunkedArray>
const& column, std::shared_ptr<arrow::Field>
const& field)
// NOTE(review): the condition guarding this assignment is not visible in this
// excerpt.
241 mRows = column->length();
242 }
else if (mRows != column->length()) {
// NOTE(review): if column->length() is a 64-bit integer (as
// arrow::ChunkedArray::length is in the Arrow API), "%d" is the wrong
// conversion for it - confirm the types of both arguments.
243 throw runtime_error_f(
"Adding incompatible column with size %d (num rows = %d)", column->length(), mRows);
245 mColumnReaders.emplace_back(
new ColumnToBranch{mTree.get(), column, field});
// Body of process() (signature line not visible in this excerpt): size the
// baskets of every registered branch, fill the tree row by row, then write it.
// Nothing to fill when the tree has no branches or there are no rows: write
// the tree as is and detach it from its directory.
251 if (mTree->GetNbranches() == 0 || mRows == 0) {
252 mTree->Write(
"", TObject::kOverwrite);
253 mTree->SetDirectory(
nullptr);
// Per-branch basket size: 1024 plus field size times column entries, but never
// below 32000.
// NOTE(review): the arithmetic is done in `int`; confirm it cannot overflow
// for large columns.
257 for (
auto& reader : mColumnReaders) {
258 int idealBasketSize = 1024 + reader->fieldSize() * reader->columnEntries();
259 int basketSize = std::max(32000, idealBasketSize);
262 mTree->SetBasketSize(reader->branchName(), basketSize);
// Branches whose name starts with "fIndexArray" also get their companion size
// branch resized, with an extra 4 * mRows added to the estimate.
// NOTE(review): the "_size" literal duplicates sizeBranchSuffix.
264 if (strncmp(reader->branchName(),
"fIndexArray", strlen(
"fIndexArray")) == 0) {
265 std::string sizeBranch = reader->branchName();
266 sizeBranch +=
"_size";
270 int idealBasketSize = 4 * mRows + 1024 + reader->fieldSize() * reader->columnEntries();
271 int basketSize = std::max(32000, idealBasketSize);
272 mTree->SetBasketSize(sizeBranch.c_str(), basketSize);
273 mTree->SetBasketSize(reader->branchName(), basketSize);
// Main loop: visit every reader for each row (the per-reader work and the row
// increment are not visible in this excerpt).
277 while (
row < mRows) {
278 for (
auto& reader : mColumnReaders) {
// Persist the filled tree, overwriting any previous cycle, and detach it from
// its directory.
284 mTree->Write(
"", TObject::kOverwrite);
285 mTree->SetDirectory(
nullptr);
#define O2_BUILTIN_UNLIKELY(x)
std::vector< std::shared_ptr< arrow::Field > > fields
#define O2_DECLARE_DYNAMIC_LOG(name)
ColumnToBranch(TTree *tree, std::shared_ptr< arrow::ChunkedArray > const &column, std::shared_ptr< arrow::Field > const &field)
void at(const int64_t *pos)
void addBranch(std::shared_ptr< arrow::ChunkedArray > const &column, std::shared_ptr< arrow::Field > const &field)
TableToTree(std::shared_ptr< arrow::Table > const &table, TFile *file, const char *treename)
std::shared_ptr< TTree > process()
GLuint const GLchar * name
GLint GLint GLsizei GLint GLenum GLenum type
Defining ITS Vertex explicitly as messageable.
RuntimeErrorRef runtime_error(const char *)
auto basicROOTTypeFromArrow(arrow::Type::type id)
auto arrowTypeFromROOT(EDataType type, int size)
RuntimeErrorRef runtime_error_f(const char *,...)
std::string to_string(gsl::span< T, Size > span)
std::unique_ptr< TTree > tree((TTree *) flIn.Get(std::string(o2::base::NameConf::CTFTREENAME).c_str()))