34int main(
int argc,
char* argv[])
36 std::string inputCollection(
"input.txt");
37 std::string outputFileName(
"AO2D.root");
38 long maxDirSize = 100000000;
39 bool skipNonExistingFiles =
false;
40 bool skipParentFilesList =
false;
43 int compression = 505;
46 static struct option long_options[] = {
47 {
"input", required_argument,
nullptr, 0},
48 {
"output", required_argument,
nullptr, 1},
49 {
"max-size", required_argument,
nullptr, 2},
50 {
"skip-non-existing-files", no_argument,
nullptr, 3},
51 {
"skip-parent-files-list", no_argument,
nullptr, 4},
52 {
"compression", required_argument,
nullptr, 5},
53 {
"verbosity", required_argument,
nullptr,
'v'},
54 {
"help", no_argument,
nullptr,
'h'},
55 {
nullptr, 0,
nullptr, 0}};
58 int c = getopt_long(argc, argv,
"", long_options, &option_index);
62 inputCollection = optarg;
64 outputFileName = optarg;
66 maxDirSize = atol(optarg);
68 skipNonExistingFiles =
true;
70 skipParentFilesList =
true;
72 compression = atoi(optarg);
73 }
else if (
c ==
'v') {
75 }
else if (
c ==
'h') {
76 printf(
"AO2D merging tool. Options: \n");
77 printf(
" --input <inputfile.txt> Contains path to files to be merged. Default: %s\n", inputCollection.c_str());
78 printf(
" --output <outputfile.root> Target output ROOT file. Default: %s\n", outputFileName.c_str());
79 printf(
" --max-size <size in Bytes> Target directory size. Default: %ld. Set to 0 if file is not self-contained.\n", maxDirSize);
80 printf(
" --skip-non-existing-files Flag to allow skipping of non-existing files in the input list.\n");
81 printf(
" --skip-parent-files-list Flag to allow skipping the merging of the parent files list.\n");
82 printf(
" --compression <root compression id> Compression algorithm / level to use (default: %d)\n", compression);
83 printf(
" --verbosity <flag> Verbosity of output (default: %d).\n",
verbosity);
90 printf(
"AOD merger started with:\n");
91 printf(
" Input file: %s\n", inputCollection.c_str());
92 printf(
" Output file name: %s\n", outputFileName.c_str());
93 printf(
" Maximal folder size (uncompressed): %ld\n", maxDirSize);
94 if (skipNonExistingFiles) {
95 printf(
" WARNING: Skipping non-existing files.\n");
98 std::map<std::string, TTree*> trees;
99 std::map<std::string, uint64_t> sizeCompressed;
100 std::map<std::string, uint64_t> sizeUncompressed;
101 std::map<std::string, int>
offsets;
102 std::map<std::string, int> unassignedIndexOffset;
104 auto outputFile = TFile::Open(outputFileName.c_str(),
"RECREATE",
"", compression);
105 TDirectory* outputDir =
nullptr;
106 long currentDirSize = 0;
109 in.open(inputCollection);
111 TMap* metaData =
nullptr;
112 TMap* parentFiles =
nullptr;
113 int totalMergedDFs = 0;
115 while (in.good() && exitCode == 0) {
118 if (line.Length() == 0) {
122 if (line.BeginsWith(
"alien:") && !gGrid) {
123 printf(
"Connecting to AliEn...");
124 TGrid::Connect(
"alien:");
127 printf(
"Processing input file: %s\n", line.Data());
129 auto inputFile = TFile::Open(line);
130 if (!inputFile || inputFile->IsZombie()) {
131 printf(
"Error: %s input file %s.\n", !inputFile ?
"Could not open" :
"Zombie", line.Data());
132 if (skipNonExistingFiles) {
135 printf(
"Aborting merge!\n");
141 TList* keyList = inputFile->GetListOfKeys();
144 for (
auto key1 : *keyList) {
145 if (((TObjString*)key1)->GetString().EqualTo(
"metaData")) {
146 auto metaDataCurrentFile = (TMap*)inputFile->Get(
"metaData");
147 if (metaData ==
nullptr) {
148 metaData = metaDataCurrentFile;
150 metaData->Write(
"metaData", TObject::kSingleKey);
152 for (
auto metaDataPair : *metaData) {
153 auto metaDataKey = ((TPair*)metaDataPair)->Key();
154 if (metaDataCurrentFile->Contains(((TObjString*)metaDataKey)->GetString())) {
155 auto value = (TObjString*)metaData->GetValue(((TObjString*)metaDataKey)->GetString());
156 auto valueCurrentFile = (TObjString*)metaDataCurrentFile->GetValue(((TObjString*)metaDataKey)->GetString());
157 if (!
value->GetString().EqualTo(valueCurrentFile->GetString())) {
158 printf(
"WARNING: Metadata differs between input files. Key %s : %s vs. %s\n", ((TObjString*)metaDataKey)->GetString().Data(),
159 value->GetString().Data(), valueCurrentFile->GetString().Data());
162 printf(
"WARNING: Metadata differs between input files. Key %s is not present in current file\n", ((TObjString*)metaDataKey)->GetString().Data());
168 if (((TObjString*)key1)->GetString().EqualTo(
"parentFiles") && !skipParentFilesList) {
169 auto parentFilesCurrentFile = (TMap*)inputFile->Get(
"parentFiles");
170 if (parentFiles ==
nullptr) {
171 parentFiles =
new TMap;
173 for (
auto pair : *parentFilesCurrentFile) {
174 parentFiles->Add(((TPair*)pair)->Key(), ((TPair*)pair)->Value());
176 delete parentFilesCurrentFile;
179 if (!((TObjString*)key1)->GetString().BeginsWith(
"DF_")) {
183 auto dfName = ((TObjString*)key1)->GetString().Data();
186 printf(
" Processing folder %s\n", dfName);
190 auto folder = (TDirectoryFile*)inputFile->Get(dfName);
191 auto treeList = folder->GetListOfKeys();
196 for (
auto i = 0;
i < treeList->GetEntries(); ++
i) {
197 TKey* ki = (TKey*)treeList->At(
i);
198 for (
int j =
i + 1;
j < treeList->GetEntries(); ++
j) {
199 TKey* kj = (TKey*)treeList->At(
j);
200 if (std::strcmp(ki->GetName(), kj->GetName()) == 0 && std::strcmp(ki->GetTitle(), kj->GetTitle()) == 0) {
201 if (ki->GetCycle() < kj->GetCycle()) {
202 printf(
" *** FATAL *** we had ordered the keys, first cycle should be higher, please check");
206 treeList->Remove(kj);
216 std::list<std::string> foundTrees;
218 for (
auto key2 : *treeList) {
219 auto treeName = ((TObjString*)key2)->GetString().Data();
220 bool found = (std::find(foundTrees.begin(), foundTrees.end(), treeName) != foundTrees.end());
222 printf(
" ***WARNING*** Tree %s was already merged (even if we purged duplicated trees before, so this should not happen), skipping\n", treeName);
225 foundTrees.push_back(treeName);
227 auto inputTree = (TTree*)inputFile->Get(Form(
"%s/%s", dfName, treeName));
228 bool fastCopy = (inputTree->GetTotBytes() > 10000000);
230 printf(
" Processing tree %s with %lld entries with total size %lld (fast copy: %d)\n", treeName, inputTree->GetEntries(), inputTree->GetTotBytes(), fastCopy);
233 bool alreadyCopied =
false;
234 if (trees.count(treeName) == 0) {
236 printf(
" *** FATAL ***: The tree %s was not in the previous dataframe(s)\n", treeName);
243 outputDir = outputFile->mkdir(dfName);
246 printf(
"Writing to output folder %s\n", dfName);
250 auto outputTree = inputTree->CloneTree(-1, (fastCopy) ?
"fast" :
"");
251 currentDirSize += inputTree->GetTotBytes();
252 alreadyCopied =
true;
253 outputTree->SetAutoFlush(0);
254 trees[treeName] = outputTree;
257 trees[treeName]->CopyAddresses(inputTree);
260 auto outputTree = trees[treeName];
262 std::vector<std::pair<int*, int>> indexList;
263 std::vector<char*> vlaPointers;
264 std::vector<int*> indexPointers;
265 TObjArray* branches = inputTree->GetListOfBranches();
266 for (
int i = 0;
i < branches->GetEntriesFast(); ++
i) {
267 TBranch* br = (TBranch*)branches->UncheckedAt(
i);
268 TString branchName(br->GetName());
271 if (((TLeaf*)br->GetListOfLeaves()->First())->GetLeafCount() !=
nullptr) {
272 int maximum = ((TLeaf*)br->GetListOfLeaves()->First())->GetLeafCount()->GetMaximum();
277 br->GetExpectedType(cls,
type);
278 auto typeSize = TDataType::GetDataType(
type)->Size();
280 char*
buffer =
new char[maximum * typeSize];
281 memset(
buffer, 0, maximum * typeSize);
282 vlaPointers.push_back(
buffer);
284 printf(
" Allocated VLA buffer of length %d with %d bytes each for branch name %s\n", maximum, typeSize, br->GetName());
286 inputTree->SetBranchAddress(br->GetName(),
buffer);
287 outputTree->SetBranchAddress(br->GetName(),
buffer);
289 if (branchName.BeginsWith(
"fIndexArray")) {
290 for (
int i = 0;
i < maximum;
i++) {
294 }
else if (branchName.BeginsWith(
"fIndexSlice")) {
297 vlaPointers.push_back(
reinterpret_cast<char*
>(
buffer));
299 inputTree->SetBranchAddress(br->GetName(),
buffer);
300 outputTree->SetBranchAddress(br->GetName(),
buffer);
304 }
else if (branchName.BeginsWith(
"fIndex") && !branchName.EndsWith(
"_size")) {
307 indexPointers.push_back(
buffer);
309 inputTree->SetBranchAddress(br->GetName(),
buffer);
310 outputTree->SetBranchAddress(br->GetName(),
buffer);
316 if (indexList.size() > 0) {
317 auto entries = inputTree->GetEntries();
318 int minIndexOffset = unassignedIndexOffset[treeName];
319 auto newMinIndexOffset = minIndexOffset;
320 for (
int i = 0;
i < entries;
i++) {
321 for (
auto&
index : indexList) {
324 inputTree->GetEntry(
i);
326 for (
const auto& idx : indexList) {
328 if (*(idx.first) < 0) {
329 *(idx.first) += minIndexOffset;
330 newMinIndexOffset = std::min(newMinIndexOffset, *(idx.first));
332 *(idx.first) += idx.second;
335 if (!alreadyCopied) {
336 int nbytes = outputTree->Fill();
338 currentDirSize += nbytes;
342 unassignedIndexOffset[treeName] = newMinIndexOffset;
343 }
else if (!alreadyCopied) {
344 auto nbytes = outputTree->CopyEntries(inputTree, -1, (fastCopy) ?
"fast" :
"");
346 currentDirSize += nbytes;
352 for (
auto&
buffer : indexPointers) {
355 for (
auto&
buffer : vlaPointers) {
365 for (
auto const&
tree : trees) {
366 bool found = (std::find(foundTrees.begin(), foundTrees.end(),
tree.first) != foundTrees.end());
367 if (found ==
false) {
368 printf(
" *** FATAL ***: The tree %s was not in the current dataframe\n",
tree.first.c_str());
380 for (
auto const&
tree : trees) {
387 if (maxDirSize > 0) {
389 printf(
"ERROR: Index on %s but no tree found\n",
offset.first.c_str());
395 if (maxDirSize == 0 || currentDirSize > maxDirSize) {
397 printf(
"Maximum size reached: %ld. Closing folder %s.\n", currentDirSize, dfName);
399 for (
auto const&
tree : trees) {
402 tree.second->Write();
405 sizeCompressed[
tree.first] +=
tree.second->GetZipBytes();
406 sizeUncompressed[
tree.first] +=
tree.second->GetTotBytes();
421 parentFiles->Write(
"parentFiles", TObject::kSingleKey);
424 for (
auto const&
tree : trees) {
426 tree.second->Write();
429 sizeCompressed[
tree.first] +=
tree.second->GetZipBytes();
430 sizeUncompressed[
tree.first] +=
tree.second->GetTotBytes();
438 if (totalMergedDFs == 0) {
439 printf(
"ERROR: Did not merge a single DF. This does not seem right.\n");
445 printf(
"Removing incomplete output file %s.\n", outputFile->GetName());
446 gSystem->Unlink(outputFile->GetName());
448 printf(
"AOD merger finished. Size overview follows:\n");
450 uint64_t totalCompressed = 0;
451 uint64_t totalUncompressed = 0;
452 for (
auto const&
tree : sizeCompressed) {
453 totalCompressed +=
tree.second;
454 totalUncompressed += sizeUncompressed[
tree.first];
456 if (totalCompressed > 0 && totalUncompressed > 0) {
457 for (
auto const&
tree : sizeCompressed) {
458 printf(
" Tree %20s | Compressed: %12" PRIu64
" (%2.0f%%) | Uncompressed: %12" PRIu64
" (%2.0f%%)\n",
tree.first.c_str(),
tree.second, 100.0 *
tree.second / totalCompressed, sizeUncompressed[
tree.first], 100.0 * sizeUncompressed[
tree.first] / totalUncompressed);