Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract search for input files into separate function #418

Merged
merged 3 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions python/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def get_entries(inpath: str) -> int | None:
try:
nevents = infile.Get("events").GetEntries()
except AttributeError:
LOGGER.error('Input file is missing "events" TTree!\nAborting...')
LOGGER.error('Input file is missing "events" TTree!\n - %s'
'\nAborting...', inpath)
sys.exit(3)

return nevents
Expand Down Expand Up @@ -63,15 +64,15 @@ def get_entries_sow(infilepath: str, nevents_max: Optional[int] = None, get_loca
sumOfWeightsTTree = 0.

# check for empty chunk (can this be improved? exception from RDF cannot be caught it seems?)
tree =infile.Get("events")
tree = infile.Get("events")
if not tree:
print("Tree not found in file", infilepath, " possibly empty chunk - continuing with next one.")
infile.Close()
return processEvents, eventsTTree, processSumOfWeights, sumOfWeightsTTree

try:

#use a RDF here too so the nevents restriction option can be imposed easily for the local events
# use a RDF here too so the nevents restriction option can be imposed easily for the local events
rdf_tmp = ROOT.ROOT.RDataFrame("events", infilepath)

if nevents_max:
Expand All @@ -85,7 +86,7 @@ def get_entries_sow(infilepath: str, nevents_max: Optional[int] = None, get_loca
# infile.Get("events").Draw('EventHeader.weight[0]>>histo')
# histo=ROOT.gDirectory.Get('histo')
histo = rdf_tmp.Histo1D(weight_name)
sumOfWeightsTTree=float(eventsTTree)*histo.GetMean()
sumOfWeightsTTree = float(eventsTTree) * histo.GetMean()
except cppyy.gbl.std.runtime_error:
LOGGER.error('Error: Event weights requested with do_weighted,'
'but input file does not contain weight column. Aborting.')
Expand Down
3 changes: 2 additions & 1 deletion python/run_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,8 @@ def run_histmaker(args, rdf_module, anapath):

LOGGER.info('Writing out process %s, nEvents processed %s',
process, f'{evtcount.GetValue():,}')
with ROOT.TFile(f'{output_dir}/{process}.root', 'RECREATE'):
with ROOT.TFile(os.path.join(output_dir, f'{process}.root'),
'RECREATE'):
for hist in hists_to_write.values():
if do_scale:
hist.Scale(scale * int_lumi)
Expand Down
102 changes: 63 additions & 39 deletions python/run_final_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def get_entries(infilepath: str) -> tuple[int, int]:
try:
events_in_ttree = infile.Get("events").GetEntries()
except AttributeError:
LOGGER.error('Input file is missing "events" TTree!\nAborting...')
LOGGER.error('Input file is missing "events" TTree!\n - %s'
'\nAborting...')
sys.exit(3)

return events_processed, events_in_ttree
Expand All @@ -67,6 +68,42 @@ def get_processes(rdf_module: object) -> list[str]:
return process_list


# _____________________________________________________________________________
def find_sample_files(input_dir: str,
                      sample_name: str) -> list[str]:
    '''
    Locate the ROOT input files belonging to the given sample.

    Two layouts are supported:
      * a directory ``<input_dir>/<sample_name>/`` containing ``*.root`` files
        (e.g. batch-produced chunks), or
      * a single file ``<input_dir>/<sample_name>.root``.

    Returns the absolute paths of all files found; exits the program with
    status 3 when neither layout yields any input file.
    '''
    found: list[str] = []

    sample_path = os.path.abspath(os.path.join(input_dir, sample_name))

    # Case 1: the sample is a directory — collect every plain file ending in
    # `.root` (sub-directories with a `.root` suffix are deliberately skipped).
    if os.path.isdir(sample_path):
        found = [
            os.path.join(sample_path, entry)
            for entry in os.listdir(sample_path)
            if entry.endswith('.root')
            and os.path.isfile(os.path.join(sample_path, entry))
        ]

    # Case 2: nothing found so far — fall back to a single `<name>.root` file.
    if not found:
        single_file = sample_path + '.root'
        if os.path.isfile(single_file):
            found.append(single_file)
        else:
            LOGGER.debug('Input file "%s" does not exist!',
                         single_file)

    # No files from either layout: this sample cannot be processed.
    if not found:
        LOGGER.error('Can not find input files for "%s" sample!\nAborting...',
                     sample_name)
        sys.exit(3)

    return found


# _____________________________________________________________________________
def save_results(results: dict[str, dict[str, any]],
rdf_module: object) -> None:
Expand Down Expand Up @@ -279,47 +316,34 @@ def run(rdf_module, args) -> None:

file_list[process_name] = ROOT.vector('string')()

infilepath = input_dir + process_name + '.root' # input file

if not os.path.isfile(infilepath):
LOGGER.debug('File %s does not exist!\nTrying if it is a '
'directory as it might have been processed in batch.',
infilepath)
else:
LOGGER.info('Open file:\n %s', infilepath)
flist = find_sample_files(input_dir, process_name)
for filepath in flist:
# TODO: check in `get_entries()` if file is valid and remove it
# from the input list if it is not
if do_weighted:
process_events[process_name], events_ttree[process_name], \
sow_process[process_name], sow_ttree[process_name] = \
get_entries_sow(infilepath, weight_name="weight")
else:
process_events[process_name], events_ttree[process_name] = \
get_entries(infilepath)
file_list[process_name].push_back(infilepath)

indirpath = input_dir + process_name
if os.path.isdir(indirpath):
#reset the nevts/sow counters to avoid wrong counting in case a single file of same name (e.g. local test output) also exists in the same directory
process_events[process_name] = 0
events_ttree[process_name] = 0
sow_process[process_name] = 0.
sow_ttree[process_name] = 0.

info_msg = f'Open directory {indirpath}'
flist = glob.glob(indirpath + '/chunk*.root')
for filepath in flist:
info_msg += '\n\t' + filepath
if do_weighted:
chunk_process_events, chunk_events_ttree, \
chunk_process_events, chunk_events_ttree, \
chunk_sow_process, chunk_sow_ttree = \
get_entries_sow(filepath, weight_name="weight")
sow_process[process_name] += chunk_sow_process
sow_ttree[process_name] += chunk_sow_ttree
else:
chunk_process_events, chunk_events_ttree = \
get_entries(filepath)
process_events[process_name] += chunk_process_events
events_ttree[process_name] += chunk_events_ttree
file_list[process_name].push_back(filepath)
sow_process[process_name] += chunk_sow_process
sow_ttree[process_name] += chunk_sow_ttree
else:
chunk_process_events, chunk_events_ttree = \
get_entries(filepath)
process_events[process_name] += chunk_process_events
events_ttree[process_name] += chunk_events_ttree
file_list[process_name].push_back(filepath)
if len(file_list[process_name]) < 1:
LOGGER.error('No valid input files for sample "%s" '
'found!\nAborting..', process_name)
sys.exit(3)
if len(file_list[process_name]) == 1:
LOGGER.info('Loading events for sample "%s" from file:\n - %s',
process_name, file_list[process_name][0])
else:
info_msg = f'Loading events for sample "{process_name}"'
info_msg += ' from files:'
for filepath in file_list[process_name]:
info_msg += f'\n - {filepath}'
LOGGER.info(info_msg)

info_msg = 'Processed events:'
Expand Down