33int main(
int argc,
char* argv[])
35 std::string inputCollection(
"input.txt");
36 std::string outputFileName(
"AO2D.root");
37 long maxDirSize = 100000000;
38 bool skipNonExistingFiles =
false;
39 bool skipParentFilesList =
false;
42 int compression = 505;
45 static struct option long_options[] = {
46 {
"input", required_argument,
nullptr, 0},
47 {
"output", required_argument,
nullptr, 1},
48 {
"max-size", required_argument,
nullptr, 2},
49 {
"skip-non-existing-files", no_argument,
nullptr, 3},
50 {
"skip-parent-files-list", no_argument,
nullptr, 4},
51 {
"compression", required_argument,
nullptr, 5},
52 {
"verbosity", required_argument,
nullptr,
'v'},
53 {
"help", no_argument,
nullptr,
'h'},
54 {
nullptr, 0,
nullptr, 0}};
57 int c = getopt_long(argc, argv,
"", long_options, &option_index);
61 inputCollection = optarg;
63 outputFileName = optarg;
65 maxDirSize = atol(optarg);
67 skipNonExistingFiles =
true;
69 skipParentFilesList =
true;
71 compression = atoi(optarg);
72 }
else if (
c ==
'v') {
74 }
else if (
c ==
'h') {
75 printf(
"AO2D merging tool. Options: \n");
76 printf(
" --input <inputfile.txt> Contains path to files to be merged. Default: %s\n", inputCollection.c_str());
77 printf(
" --output <outputfile.root> Target output ROOT file. Default: %s\n", outputFileName.c_str());
78 printf(
" --max-size <size in Bytes> Target directory size. Default: %ld. Set to 0 if file is not self-contained.\n", maxDirSize);
79 printf(
" --skip-non-existing-files Flag to allow skipping of non-existing files in the input list.\n");
80 printf(
" --skip-parent-files-list Flag to allow skipping the merging of the parent files list.\n");
81 printf(
" --compression <root compression id> Compression algorithm / level to use (default: %d)\n", compression);
82 printf(
" --verbosity <flag> Verbosity of output (default: %d).\n",
verbosity);
89 printf(
"AOD merger started with:\n");
90 printf(
" Input file: %s\n", inputCollection.c_str());
91 printf(
" Output file name: %s\n", outputFileName.c_str());
92 printf(
" Maximal folder size (uncompressed): %ld\n", maxDirSize);
93 if (skipNonExistingFiles) {
94 printf(
" WARNING: Skipping non-existing files.\n");
97 std::map<std::string, TTree*> trees;
98 std::map<std::string, uint64_t> sizeCompressed;
99 std::map<std::string, uint64_t> sizeUncompressed;
100 std::map<std::string, int>
offsets;
101 std::map<std::string, int> unassignedIndexOffset;
103 auto outputFile = TFile::Open(outputFileName.c_str(),
"RECREATE",
"", compression);
104 TDirectory* outputDir =
nullptr;
105 long currentDirSize = 0;
108 in.open(inputCollection);
110 bool connectedToAliEn =
false;
111 TMap* metaData =
nullptr;
112 TMap* parentFiles =
nullptr;
113 int totalMergedDFs = 0;
115 while (in.good() && exitCode == 0) {
118 if (line.Length() == 0) {
122 if (line.BeginsWith(
"alien:") && !connectedToAliEn) {
123 printf(
"Connecting to AliEn...");
124 TGrid::Connect(
"alien:");
125 connectedToAliEn =
true;
128 printf(
"Processing input file: %s\n", line.Data());
130 auto inputFile = TFile::Open(line);
131 if (!inputFile || inputFile->IsZombie()) {
132 printf(
"Error: %s input file %s.\n", !inputFile ?
"Could not open" :
"Zombie", line.Data());
133 if (skipNonExistingFiles) {
136 printf(
"Aborting merge!\n");
142 TList* keyList = inputFile->GetListOfKeys();
145 for (
auto key1 : *keyList) {
146 if (((TObjString*)key1)->GetString().EqualTo(
"metaData")) {
147 auto metaDataCurrentFile = (TMap*)inputFile->Get(
"metaData");
148 if (metaData ==
nullptr) {
149 metaData = metaDataCurrentFile;
151 metaData->Write(
"metaData", TObject::kSingleKey);
153 for (
auto metaDataPair : *metaData) {
154 auto metaDataKey = ((TPair*)metaDataPair)->Key();
155 if (metaDataCurrentFile->Contains(((TObjString*)metaDataKey)->GetString())) {
156 auto value = (TObjString*)metaData->GetValue(((TObjString*)metaDataKey)->GetString());
157 auto valueCurrentFile = (TObjString*)metaDataCurrentFile->GetValue(((TObjString*)metaDataKey)->GetString());
158 if (!
value->GetString().EqualTo(valueCurrentFile->GetString())) {
159 printf(
"WARNING: Metadata differs between input files. Key %s : %s vs. %s\n", ((TObjString*)metaDataKey)->GetString().Data(),
160 value->GetString().Data(), valueCurrentFile->GetString().Data());
163 printf(
"WARNING: Metadata differs between input files. Key %s is not present in current file\n", ((TObjString*)metaDataKey)->GetString().Data());
169 if (((TObjString*)key1)->GetString().EqualTo(
"parentFiles") && !skipParentFilesList) {
170 auto parentFilesCurrentFile = (TMap*)inputFile->Get(
"parentFiles");
171 if (parentFiles ==
nullptr) {
172 parentFiles =
new TMap;
174 for (
auto pair : *parentFilesCurrentFile) {
175 parentFiles->Add(((TPair*)pair)->Key(), ((TPair*)pair)->Value());
177 delete parentFilesCurrentFile;
180 if (!((TObjString*)key1)->GetString().BeginsWith(
"DF_")) {
184 auto dfName = ((TObjString*)key1)->GetString().Data();
187 printf(
" Processing folder %s\n", dfName);
191 auto folder = (TDirectoryFile*)inputFile->Get(dfName);
192 auto treeList = folder->GetListOfKeys();
197 for (
auto i = 0;
i < treeList->GetEntries(); ++
i) {
198 TKey* ki = (TKey*)treeList->At(
i);
199 for (
int j =
i + 1;
j < treeList->GetEntries(); ++
j) {
200 TKey* kj = (TKey*)treeList->At(
j);
201 if (std::strcmp(ki->GetName(), kj->GetName()) == 0 && std::strcmp(ki->GetTitle(), kj->GetTitle()) == 0) {
202 if (ki->GetCycle() < kj->GetCycle()) {
203 printf(
" *** FATAL *** we had ordered the keys, first cycle should be higher, please check");
207 treeList->Remove(kj);
217 std::list<std::string> foundTrees;
219 for (
auto key2 : *treeList) {
220 auto treeName = ((TObjString*)key2)->GetString().Data();
221 bool found = (std::find(foundTrees.begin(), foundTrees.end(), treeName) != foundTrees.end());
223 printf(
" ***WARNING*** Tree %s was already merged (even if we purged duplicated trees before, so this should not happen), skipping\n", treeName);
226 foundTrees.push_back(treeName);
228 auto inputTree = (TTree*)inputFile->Get(Form(
"%s/%s", dfName, treeName));
229 bool fastCopy = (inputTree->GetTotBytes() > 10000000);
231 printf(
" Processing tree %s with %lld entries with total size %lld (fast copy: %d)\n", treeName, inputTree->GetEntries(), inputTree->GetTotBytes(), fastCopy);
234 bool alreadyCopied =
false;
235 if (trees.count(treeName) == 0) {
237 printf(
" *** FATAL ***: The tree %s was not in the previous dataframe(s)\n", treeName);
244 outputDir = outputFile->mkdir(dfName);
247 printf(
"Writing to output folder %s\n", dfName);
251 auto outputTree = inputTree->CloneTree(-1, (fastCopy) ?
"fast" :
"");
252 currentDirSize += inputTree->GetTotBytes();
253 alreadyCopied =
true;
254 outputTree->SetAutoFlush(0);
255 trees[treeName] = outputTree;
258 trees[treeName]->CopyAddresses(inputTree);
261 auto outputTree = trees[treeName];
263 std::vector<std::pair<int*, int>> indexList;
264 std::vector<char*> vlaPointers;
265 std::vector<int*> indexPointers;
266 TObjArray* branches = inputTree->GetListOfBranches();
267 for (
int i = 0;
i < branches->GetEntriesFast(); ++
i) {
268 TBranch* br = (TBranch*)branches->UncheckedAt(
i);
269 TString branchName(br->GetName());
272 if (((TLeaf*)br->GetListOfLeaves()->First())->GetLeafCount() !=
nullptr) {
273 int maximum = ((TLeaf*)br->GetListOfLeaves()->First())->GetLeafCount()->GetMaximum();
278 br->GetExpectedType(cls,
type);
279 auto typeSize = TDataType::GetDataType(
type)->Size();
281 char*
buffer =
new char[maximum * typeSize];
282 memset(
buffer, 0, maximum * typeSize);
283 vlaPointers.push_back(
buffer);
285 printf(
" Allocated VLA buffer of length %d with %d bytes each for branch name %s\n", maximum, typeSize, br->GetName());
287 inputTree->SetBranchAddress(br->GetName(),
buffer);
288 outputTree->SetBranchAddress(br->GetName(),
buffer);
290 if (branchName.BeginsWith(
"fIndexArray")) {
291 for (
int i = 0;
i < maximum;
i++) {
295 }
else if (branchName.BeginsWith(
"fIndexSlice")) {
298 vlaPointers.push_back(
reinterpret_cast<char*
>(
buffer));
300 inputTree->SetBranchAddress(br->GetName(),
buffer);
301 outputTree->SetBranchAddress(br->GetName(),
buffer);
305 }
else if (branchName.BeginsWith(
"fIndex") && !branchName.EndsWith(
"_size")) {
308 indexPointers.push_back(
buffer);
310 inputTree->SetBranchAddress(br->GetName(),
buffer);
311 outputTree->SetBranchAddress(br->GetName(),
buffer);
317 if (indexList.size() > 0) {
318 auto entries = inputTree->GetEntries();
319 int minIndexOffset = unassignedIndexOffset[treeName];
320 auto newMinIndexOffset = minIndexOffset;
321 for (
int i = 0;
i < entries;
i++) {
322 for (
auto&
index : indexList) {
325 inputTree->GetEntry(
i);
327 for (
const auto& idx : indexList) {
329 if (*(idx.first) < 0) {
330 *(idx.first) += minIndexOffset;
331 newMinIndexOffset = std::min(newMinIndexOffset, *(idx.first));
333 *(idx.first) += idx.second;
336 if (!alreadyCopied) {
337 int nbytes = outputTree->Fill();
339 currentDirSize += nbytes;
343 unassignedIndexOffset[treeName] = newMinIndexOffset;
344 }
else if (!alreadyCopied) {
345 auto nbytes = outputTree->CopyEntries(inputTree, -1, (fastCopy) ?
"fast" :
"");
347 currentDirSize += nbytes;
353 for (
auto&
buffer : indexPointers) {
356 for (
auto&
buffer : vlaPointers) {
366 for (
auto const&
tree : trees) {
367 bool found = (std::find(foundTrees.begin(), foundTrees.end(),
tree.first) != foundTrees.end());
368 if (found ==
false) {
369 printf(
" *** FATAL ***: The tree %s was not in the current dataframe\n",
tree.first.c_str());
381 for (
auto const&
tree : trees) {
388 if (maxDirSize > 0) {
390 printf(
"ERROR: Index on %s but no tree found\n",
offset.first.c_str());
396 if (maxDirSize == 0 || currentDirSize > maxDirSize) {
398 printf(
"Maximum size reached: %ld. Closing folder %s.\n", currentDirSize, dfName);
400 for (
auto const&
tree : trees) {
403 tree.second->Write();
406 sizeCompressed[
tree.first] +=
tree.second->GetZipBytes();
407 sizeUncompressed[
tree.first] +=
tree.second->GetTotBytes();
422 parentFiles->Write(
"parentFiles", TObject::kSingleKey);
425 for (
auto const&
tree : trees) {
427 tree.second->Write();
430 sizeCompressed[
tree.first] +=
tree.second->GetZipBytes();
431 sizeUncompressed[
tree.first] +=
tree.second->GetTotBytes();
439 if (totalMergedDFs == 0) {
440 printf(
"ERROR: Did not merge a single DF. This does not seem right.\n");
446 printf(
"Removing incomplete output file %s.\n", outputFile->GetName());
447 gSystem->Unlink(outputFile->GetName());
449 printf(
"AOD merger finished. Size overview follows:\n");
451 uint64_t totalCompressed = 0;
452 uint64_t totalUncompressed = 0;
453 for (
auto const&
tree : sizeCompressed) {
454 totalCompressed +=
tree.second;
455 totalUncompressed += sizeUncompressed[
tree.first];
457 if (totalCompressed > 0 && totalUncompressed > 0) {
458 for (
auto const&
tree : sizeCompressed) {
459 printf(
" Tree %20s | Compressed: %12" PRIu64
" (%2.0f%%) | Uncompressed: %12" PRIu64
" (%2.0f%%)\n",
tree.first.c_str(),
tree.second, 100.0 *
tree.second / totalCompressed, sizeUncompressed[
tree.first], 100.0 * sizeUncompressed[
tree.first] / totalUncompressed);