33int main(
int argc,
char* argv[])
35 std::string inputCollection(
"input.txt");
36 std::string outputFileName(
"AO2D.root");
37 long maxDirSize = 100000000;
38 bool skipNonExistingFiles =
false;
39 bool skipParentFilesList =
false;
42 int compression = 505;
45 static struct option long_options[] = {
46 {
"input", required_argument,
nullptr, 0},
47 {
"output", required_argument,
nullptr, 1},
48 {
"max-size", required_argument,
nullptr, 2},
49 {
"skip-non-existing-files", no_argument,
nullptr, 3},
50 {
"skip-parent-files-list", no_argument,
nullptr, 4},
51 {
"compression", required_argument,
nullptr, 5},
52 {
"verbosity", required_argument,
nullptr,
'v'},
53 {
"help", no_argument,
nullptr,
'h'},
54 {
nullptr, 0,
nullptr, 0}};
57 int c = getopt_long(argc, argv,
"", long_options, &option_index);
61 inputCollection = optarg;
63 outputFileName = optarg;
65 maxDirSize = atol(optarg);
67 skipNonExistingFiles =
true;
69 skipParentFilesList =
true;
71 compression = atoi(optarg);
72 }
else if (
c ==
'v') {
74 }
else if (
c ==
'h') {
75 printf(
"AO2D merging tool. Options: \n");
76 printf(
" --input <inputfile.txt> Contains path to files to be merged. Default: %s\n", inputCollection.c_str());
77 printf(
" --output <outputfile.root> Target output ROOT file. Default: %s\n", outputFileName.c_str());
78 printf(
" --max-size <size in Bytes> Target directory size. Default: %ld. Set to 0 if file is not self-contained.\n", maxDirSize);
79 printf(
" --skip-non-existing-files Flag to allow skipping of non-existing files in the input list.\n");
80 printf(
" --skip-parent-files-list Flag to allow skipping the merging of the parent files list.\n");
81 printf(
" --compression <root compression id> Compression algorithm / level to use (default: %d)\n", compression);
82 printf(
" --verbosity <flag> Verbosity of output (default: %d).\n",
verbosity);
89 printf(
"AOD merger started with:\n");
90 printf(
" Input file: %s\n", inputCollection.c_str());
91 printf(
" Output file name: %s\n", outputFileName.c_str());
92 printf(
" Maximal folder size (uncompressed): %ld\n", maxDirSize);
93 if (skipNonExistingFiles) {
94 printf(
" WARNING: Skipping non-existing files.\n");
97 std::map<std::string, TTree*> trees;
98 std::map<std::string, uint64_t> sizeCompressed;
99 std::map<std::string, uint64_t> sizeUncompressed;
100 std::map<std::string, int>
offsets;
101 std::map<std::string, int> unassignedIndexOffset;
103 auto outputFile = TFile::Open(outputFileName.c_str(),
"RECREATE",
"", compression);
104 TDirectory* outputDir =
nullptr;
105 long currentDirSize = 0;
108 in.open(inputCollection);
110 TMap* metaData =
nullptr;
111 TMap* parentFiles =
nullptr;
112 int totalMergedDFs = 0;
114 while (in.good() && exitCode == 0) {
117 if (line.Length() == 0) {
121 if (line.BeginsWith(
"alien:") && !gGrid) {
122 printf(
"Connecting to AliEn...");
123 TGrid::Connect(
"alien:");
126 printf(
"Processing input file: %s\n", line.Data());
128 auto inputFile = TFile::Open(line);
129 if (!inputFile || inputFile->IsZombie()) {
130 printf(
"Error: %s input file %s.\n", !inputFile ?
"Could not open" :
"Zombie", line.Data());
131 if (skipNonExistingFiles) {
134 printf(
"Aborting merge!\n");
140 TList* keyList = inputFile->GetListOfKeys();
143 for (
auto key1 : *keyList) {
144 if (((TObjString*)key1)->GetString().EqualTo(
"metaData")) {
145 auto metaDataCurrentFile = (TMap*)inputFile->Get(
"metaData");
146 if (metaData ==
nullptr) {
147 metaData = metaDataCurrentFile;
149 metaData->Write(
"metaData", TObject::kSingleKey);
151 for (
auto metaDataPair : *metaData) {
152 auto metaDataKey = ((TPair*)metaDataPair)->Key();
153 if (metaDataCurrentFile->Contains(((TObjString*)metaDataKey)->GetString())) {
154 auto value = (TObjString*)metaData->GetValue(((TObjString*)metaDataKey)->GetString());
155 auto valueCurrentFile = (TObjString*)metaDataCurrentFile->GetValue(((TObjString*)metaDataKey)->GetString());
156 if (!
value->GetString().EqualTo(valueCurrentFile->GetString())) {
157 printf(
"WARNING: Metadata differs between input files. Key %s : %s vs. %s\n", ((TObjString*)metaDataKey)->GetString().Data(),
158 value->GetString().Data(), valueCurrentFile->GetString().Data());
161 printf(
"WARNING: Metadata differs between input files. Key %s is not present in current file\n", ((TObjString*)metaDataKey)->GetString().Data());
167 if (((TObjString*)key1)->GetString().EqualTo(
"parentFiles") && !skipParentFilesList) {
168 auto parentFilesCurrentFile = (TMap*)inputFile->Get(
"parentFiles");
169 if (parentFiles ==
nullptr) {
170 parentFiles =
new TMap;
172 for (
auto pair : *parentFilesCurrentFile) {
173 parentFiles->Add(((TPair*)pair)->Key(), ((TPair*)pair)->Value());
175 delete parentFilesCurrentFile;
178 if (!((TObjString*)key1)->GetString().BeginsWith(
"DF_")) {
182 auto dfName = ((TObjString*)key1)->GetString().Data();
185 printf(
" Processing folder %s\n", dfName);
189 auto folder = (TDirectoryFile*)inputFile->Get(dfName);
190 auto treeList = folder->GetListOfKeys();
195 for (
auto i = 0;
i < treeList->GetEntries(); ++
i) {
196 TKey* ki = (TKey*)treeList->At(
i);
197 for (
int j =
i + 1;
j < treeList->GetEntries(); ++
j) {
198 TKey* kj = (TKey*)treeList->At(
j);
199 if (std::strcmp(ki->GetName(), kj->GetName()) == 0 && std::strcmp(ki->GetTitle(), kj->GetTitle()) == 0) {
200 if (ki->GetCycle() < kj->GetCycle()) {
201 printf(
" *** FATAL *** we had ordered the keys, first cycle should be higher, please check");
205 treeList->Remove(kj);
215 std::list<std::string> foundTrees;
217 for (
auto key2 : *treeList) {
218 auto treeName = ((TObjString*)key2)->GetString().Data();
219 bool found = (std::find(foundTrees.begin(), foundTrees.end(), treeName) != foundTrees.end());
221 printf(
" ***WARNING*** Tree %s was already merged (even if we purged duplicated trees before, so this should not happen), skipping\n", treeName);
224 foundTrees.push_back(treeName);
226 auto inputTree = (TTree*)inputFile->Get(Form(
"%s/%s", dfName, treeName));
227 bool fastCopy = (inputTree->GetTotBytes() > 10000000);
229 printf(
" Processing tree %s with %lld entries with total size %lld (fast copy: %d)\n", treeName, inputTree->GetEntries(), inputTree->GetTotBytes(), fastCopy);
232 bool alreadyCopied =
false;
233 if (trees.count(treeName) == 0) {
235 printf(
" *** FATAL ***: The tree %s was not in the previous dataframe(s)\n", treeName);
242 outputDir = outputFile->mkdir(dfName);
245 printf(
"Writing to output folder %s\n", dfName);
249 auto outputTree = inputTree->CloneTree(-1, (fastCopy) ?
"fast" :
"");
250 currentDirSize += inputTree->GetTotBytes();
251 alreadyCopied =
true;
252 outputTree->SetAutoFlush(0);
253 trees[treeName] = outputTree;
256 trees[treeName]->CopyAddresses(inputTree);
259 auto outputTree = trees[treeName];
261 std::vector<std::pair<int*, int>> indexList;
262 std::vector<char*> vlaPointers;
263 std::vector<int*> indexPointers;
264 TObjArray* branches = inputTree->GetListOfBranches();
265 for (
int i = 0;
i < branches->GetEntriesFast(); ++
i) {
266 TBranch* br = (TBranch*)branches->UncheckedAt(
i);
267 TString branchName(br->GetName());
270 if (((TLeaf*)br->GetListOfLeaves()->First())->GetLeafCount() !=
nullptr) {
271 int maximum = ((TLeaf*)br->GetListOfLeaves()->First())->GetLeafCount()->GetMaximum();
276 br->GetExpectedType(cls,
type);
277 auto typeSize = TDataType::GetDataType(
type)->Size();
279 char*
buffer =
new char[maximum * typeSize];
280 memset(
buffer, 0, maximum * typeSize);
281 vlaPointers.push_back(
buffer);
283 printf(
" Allocated VLA buffer of length %d with %d bytes each for branch name %s\n", maximum, typeSize, br->GetName());
285 inputTree->SetBranchAddress(br->GetName(),
buffer);
286 outputTree->SetBranchAddress(br->GetName(),
buffer);
288 if (branchName.BeginsWith(
"fIndexArray")) {
289 for (
int i = 0;
i < maximum;
i++) {
293 }
else if (branchName.BeginsWith(
"fIndexSlice")) {
296 vlaPointers.push_back(
reinterpret_cast<char*
>(
buffer));
298 inputTree->SetBranchAddress(br->GetName(),
buffer);
299 outputTree->SetBranchAddress(br->GetName(),
buffer);
303 }
else if (branchName.BeginsWith(
"fIndex") && !branchName.EndsWith(
"_size")) {
306 indexPointers.push_back(
buffer);
308 inputTree->SetBranchAddress(br->GetName(),
buffer);
309 outputTree->SetBranchAddress(br->GetName(),
buffer);
315 if (indexList.size() > 0) {
316 auto entries = inputTree->GetEntries();
317 int minIndexOffset = unassignedIndexOffset[treeName];
318 auto newMinIndexOffset = minIndexOffset;
319 for (
int i = 0;
i < entries;
i++) {
320 for (
auto&
index : indexList) {
323 inputTree->GetEntry(
i);
325 for (
const auto& idx : indexList) {
327 if (*(idx.first) < 0) {
328 *(idx.first) += minIndexOffset;
329 newMinIndexOffset = std::min(newMinIndexOffset, *(idx.first));
331 *(idx.first) += idx.second;
334 if (!alreadyCopied) {
335 int nbytes = outputTree->Fill();
337 currentDirSize += nbytes;
341 unassignedIndexOffset[treeName] = newMinIndexOffset;
342 }
else if (!alreadyCopied) {
343 auto nbytes = outputTree->CopyEntries(inputTree, -1, (fastCopy) ?
"fast" :
"");
345 currentDirSize += nbytes;
351 for (
auto&
buffer : indexPointers) {
354 for (
auto&
buffer : vlaPointers) {
364 for (
auto const&
tree : trees) {
365 bool found = (std::find(foundTrees.begin(), foundTrees.end(),
tree.first) != foundTrees.end());
366 if (found ==
false) {
367 printf(
" *** FATAL ***: The tree %s was not in the current dataframe\n",
tree.first.c_str());
379 for (
auto const&
tree : trees) {
386 if (maxDirSize > 0) {
388 printf(
"ERROR: Index on %s but no tree found\n",
offset.first.c_str());
394 if (maxDirSize == 0 || currentDirSize > maxDirSize) {
396 printf(
"Maximum size reached: %ld. Closing folder %s.\n", currentDirSize, dfName);
398 for (
auto const&
tree : trees) {
401 tree.second->Write();
404 sizeCompressed[
tree.first] +=
tree.second->GetZipBytes();
405 sizeUncompressed[
tree.first] +=
tree.second->GetTotBytes();
420 parentFiles->Write(
"parentFiles", TObject::kSingleKey);
423 for (
auto const&
tree : trees) {
425 tree.second->Write();
428 sizeCompressed[
tree.first] +=
tree.second->GetZipBytes();
429 sizeUncompressed[
tree.first] +=
tree.second->GetTotBytes();
437 if (totalMergedDFs == 0) {
438 printf(
"ERROR: Did not merge a single DF. This does not seem right.\n");
444 printf(
"Removing incomplete output file %s.\n", outputFile->GetName());
445 gSystem->Unlink(outputFile->GetName());
447 printf(
"AOD merger finished. Size overview follows:\n");
449 uint64_t totalCompressed = 0;
450 uint64_t totalUncompressed = 0;
451 for (
auto const&
tree : sizeCompressed) {
452 totalCompressed +=
tree.second;
453 totalUncompressed += sizeUncompressed[
tree.first];
455 if (totalCompressed > 0 && totalUncompressed > 0) {
456 for (
auto const&
tree : sizeCompressed) {
457 printf(
" Tree %20s | Compressed: %12" PRIu64
" (%2.0f%%) | Uncompressed: %12" PRIu64
" (%2.0f%%)\n",
tree.first.c_str(),
tree.second, 100.0 *
tree.second / totalCompressed, sizeUncompressed[
tree.first], 100.0 * sizeUncompressed[
tree.first] / totalUncompressed);