Skip to content

Commit

Permalink
Add a tool to merge several podio files into a single one (#681)
Browse files Browse the repository at this point in the history
* Add a tool to merge several podio files into a single one

* Generate a metadata frame if it doesn't exist

* Add configuration for the metadata parameter name

* Hardcode the metadata parameters

---------

Co-authored-by: jmcarcell <[email protected]>
  • Loading branch information
jmcarcell and jmcarcell authored Nov 20, 2024
1 parent b57b8de commit 960795b
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 0 deletions.
1 change: 1 addition & 0 deletions tools/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ install(PROGRAMS ${CMAKE_CURRENT_LIST_DIR}/podio-vis DESTINATION ${CMAKE_INSTALL
if(ENABLE_RNTUPLE)
install(PROGRAMS ${CMAKE_CURRENT_LIST_DIR}/podio-ttree-to-rntuple DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
install(PROGRAMS ${CMAKE_CURRENT_LIST_DIR}/podio-merge-files DESTINATION ${CMAKE_INSTALL_BINDIR})

# Add a very basic test of podio-vis
if(BUILD_TESTING)
Expand Down
67 changes: 67 additions & 0 deletions tools/podio-merge-files
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""podio-merge-files tool to merge any number of podio files into one"""

import argparse
import sys
import podio
import podio.root_io
from podio import reading

parser = argparse.ArgumentParser(
description="Merge any number of podio files into one, can merge TTree and RNTuple files"
)

parser.add_argument("--output-file", help="name of the output file", required=True)
parser.add_argument("files", nargs="+", help="which files to merge")
parser.add_argument(
"--metadata",
choices=["none", "all", "first"],
default="first",
help="metadata to include in the output file, default: "
"only the one from the first file, other options: all files, none",
)
args = parser.parse_args()

all_files = set()
for f in args.files:
if f in all_files:
raise ValueError(f"File {f} is present more than once in the input list")
all_files.add(f)

ROOT_FORMAT = reading._determine_root_format(args.files[0]) # pylint: disable=protected-access
if ROOT_FORMAT == reading.RootFileFormat.TTREE:
reader = podio.root_io.Reader(args.files)
writer = podio.root_io.Writer(args.output_file)
elif ROOT_FORMAT == reading.RootFileFormat.RNTUPLE:
reader = podio.root_io.RNTupleReader(args.files)
writer = podio.root_io.RNTupleWriter(args.output_file)
else:
raise ValueError(f"Input file {args.files[0]} is not a TTree or RNTuple file")

categories = list(reader.categories)
is_metadata_available = True # pylint: disable=invalid-name
try:
# All frames will be copied as they are except the metadata ones
categories.remove("metadata")
except ValueError:
is_metadata_available = False # pylint: disable=invalid-name

for category in categories:
all_frames = reader.get(category)
for frame in all_frames:
writer.write_frame(frame, category)

if args.metadata == "none":
sys.exit(0)

if not is_metadata_available:
print("Warning: metadata category 'metadata' not found in the input files, it will be created")
all_frames = [podio.Frame()]
else:
if args.metadata == "first":
all_frames = [reader.get("metadata")[0]]
else:
all_frames = reader.get("metadata")
for frame in all_frames:
frame.put_parameter("MergedInputFiles", args.files)
writer.write_frame(frame, "metadata")

0 comments on commit 960795b

Please sign in to comment.