From e4150fcedd87b3e8ea8f44653819caf9c23cfc83 Mon Sep 17 00:00:00 2001 From: Birgit Stapf Date: Fri, 27 Sep 2024 19:36:57 +0200 Subject: [PATCH] add first version of using event weights for scaling etc --- analyzers/dataframe/src/Analysis_FCChh.cc | 20 +- examples/FCChh/ggHH_bbyy/analysis_final.py | 3 + examples/FCChh/ggHH_bbyy/analysis_stage1.py | 8 +- .../ggHH_bbyy/analysis_stage1_fullfile.py | 173 ------------------ python/process.py | 63 +++++++ python/run_fccanalysis.py | 90 ++++++--- python/run_final_analysis.py | 106 +++++++++-- 7 files changed, 241 insertions(+), 222 deletions(-) delete mode 100644 examples/FCChh/ggHH_bbyy/analysis_stage1_fullfile.py diff --git a/analyzers/dataframe/src/Analysis_FCChh.cc b/analyzers/dataframe/src/Analysis_FCChh.cc index 430ca03bcb..0bf52537db 100644 --- a/analyzers/dataframe/src/Analysis_FCChh.cc +++ b/analyzers/dataframe/src/Analysis_FCChh.cc @@ -924,6 +924,7 @@ AnalysisFCChh::get_tagged_jets( for (size_t pid_index = 0; pid_index < pids.size(); ++pid_index) { const auto tag = + // static_cast(tag_values[pid_index]); static_cast(tag_values[pids[pid_index].parameters_begin]); // std::cout << "Tag = " << tag << std::endl; @@ -1145,7 +1146,8 @@ ROOT::VecOps::RVec AnalysisFCChh::getOSPairs( // FOR DEBUG? // if (OS_pairs.size() > 1){ // std::cout << "Number of possible OS pairs: " << OS_pairs.size() << - // std::endl; std::cout << "Build from: " << leptons_pos.size() << " pos, " + // std::endl; std::cout << "Build from: " << leptons_pos.size() << " + // pos, " // << leptons_neg.size() << " neg." << std::endl; // } @@ -1570,8 +1572,8 @@ ROOT::VecOps::RVec AnalysisFCChh::get_mT_pseudo( // //if one of the input particles is empty, just fill a default value of // -999 as mT if (particle_1.size() < 1 || particle_2.size() < 1|| -// MET_obj.size() < 1 ){ m_strans_vector.push_back(-999.); return -// m_strans_vector; +// MET_obj.size() < 1 ){ m_strans_vector.push_back(-999.); +// return m_strans_vector; // } // TLorentzVector tlv_vis1 = getTLV_reco(particle_1.at(0)); @@ -1618,8 +1620,8 @@ ROOT::VecOps::RVec AnalysisFCChh::get_mT_pseudo( // //if one of the input particles is empty, just fill a default value of // -999 as mT if (particle_1.size() < 1 || particle_2.size() < 1|| -// MET_obj.size() < 1 ){ m_strans_vector.push_back(-999.); return -// m_strans_vector; +// MET_obj.size() < 1 ){ m_strans_vector.push_back(-999.); +// return m_strans_vector; // } // TLorentzVector tlv_vis1 = getTLV_reco(particle_1.at(0)); @@ -2496,7 +2498,7 @@ ROOT::VecOps::RVec AnalysisFCChh::getTruthTau( !isFromHiggsDirect( truth_part, parent_ids, truth_particles)) { //&& isFromHadron(truth_part, parent_ids, - //truth_particles) ) { + // truth_particles) ) { continue; } // select both from higgs or from hadrons @@ -2541,7 +2543,7 @@ ROOT::VecOps::RVec AnalysisFCChh::getTruthTauLeps( !isFromHiggsDirect( truth_part, parent_ids, truth_particles)) { //&& isFromHadron(truth_part, parent_ids, - //truth_particles) ) { + // truth_particles) ) { continue; } // select both from higgs or from hadrons @@ -2618,7 +2620,7 @@ ROOT::VecOps::RVec AnalysisFCChh::getTruthTauHads( !isFromHiggsDirect( truth_part, parent_ids, truth_particles)) { //&& isFromHadron(truth_part, parent_ids, - //truth_particles) ) { + // truth_particles) ) { continue; } // select both from higgs or from hadrons @@ -3336,7 +3338,7 @@ AnalysisFCChh::find_reco_matches_exclusive( } // if (reco_match_vector.size() > 1){ // std::cout << "Warning in AnalysisFCChh::find_reco_matches() - Truth - //particle matched to more than one reco 
particle." << std::endl; + // particle matched to more than one reco particle." << std::endl; // } // check that the reco particle is not already in the out_vector diff --git a/examples/FCChh/ggHH_bbyy/analysis_final.py b/examples/FCChh/ggHH_bbyy/analysis_final.py index fd62d5345c..44a9e8c360 100644 --- a/examples/FCChh/ggHH_bbyy/analysis_final.py +++ b/examples/FCChh/ggHH_bbyy/analysis_final.py @@ -22,6 +22,9 @@ saveTabular = True +# Optional: Use weighted events +do_weighted = True + ###Dictionnay of the list of cuts. The key is the name of the selection that will be added to the output file cutList = { "sel0_myy":"m_yy[0] > 100. && m_yy[0] < 180.", diff --git a/examples/FCChh/ggHH_bbyy/analysis_stage1.py b/examples/FCChh/ggHH_bbyy/analysis_stage1.py index 23eabc1cfe..e203a67af9 100644 --- a/examples/FCChh/ggHH_bbyy/analysis_stage1.py +++ b/examples/FCChh/ggHH_bbyy/analysis_stage1.py @@ -48,6 +48,9 @@ def __init__(self, cmdline_args): # Optional: running on HTCondor, default is False # self.run_batch = False + # Optional: Use weighted events + self.do_weighted = True + # Optional: test file that is used if you run with the --test argument (fccanalysis run ./examples/FCChh/ggHH_bbyy/analysis_stage1.py --test) self.test_file = 'root://eospublic.cern.ch//eos/experiment/fcc/hh/' \ 'tutorials/edm4hep_tutorial_data/' \ @@ -130,8 +133,8 @@ def analyzers(self, dframe): ########################################### APPLY PRE-SELECTION ########################################### #require at least two b-jets and two photons, both with invariant masses compatible with the Higgs mass - .Filter("sel_bjets.size()>1") - .Filter("sel_gamma.size()>1") + # .Filter("sel_bjets.size()>1") + # .Filter("sel_gamma.size()>1") # .Filter("m_bb[0] < 200.") # .Filter("m_bb[0] > 80.") # .Filter("m_yy[0] < 180.") @@ -147,6 +150,7 @@ def output(self): Output variables which will be saved to output root file. ''' branch_list = [ + 'weight', # Photons and H(yy) system: 'ngamma', 'g1_pt', 'g2_pt', 'g1_eta', 'g2_eta', 'm_yy', # b-jets and H(bb) system: diff --git a/examples/FCChh/ggHH_bbyy/analysis_stage1_fullfile.py b/examples/FCChh/ggHH_bbyy/analysis_stage1_fullfile.py deleted file mode 100644 index 16d0810216..0000000000 --- a/examples/FCChh/ggHH_bbyy/analysis_stage1_fullfile.py +++ /dev/null @@ -1,173 +0,0 @@ -''' -Analysis example for FCC-hh, using gg->HH->bbyy di-Higgs production events -''' -from argparse import ArgumentParser - - -# Mandatory: Analysis class where the user defines the operations on the -# dataframe. -class Analysis(): - ''' - Di-Higgs analysis in bbyy. - ''' - def __init__(self, cmdline_args): - parser = ArgumentParser( - description='Additional analysis arguments', - usage='Provide additional arguments after analysis script path') - # parser.add_argument('--bjet-pt', default='10.', type=float, - # help='Minimal pT of the selected b-jets.') - # Parse additional arguments not known to the FCCAnalyses parsers - # All command line arguments know to fccanalysis are provided in the - # `cmdline_arg` dictionary. 
- self.ana_args, _ = parser.parse_known_args(cmdline_args['unknown']) - - # Mandatory: List of processes to run over - self.process_list = { - # # Add your processes like this: - ## '':{'fraction':, 'chunks':, 'output': }, - # # - needs to correspond either the name of the input .root file, or the name of a directory containing root files - # # If you want to process only part of the events, split the output into chunks or give a different name to the output use the optional arguments - # # or leave blank to use defaults = run the full statistics in one output file named the same as the process: - 'pwp8_pp_hh_lambda100_5f_hhbbaa': {}, - } - - # Mandatory: Input directory where to find the samples, or a production tag when running over the centrally produced - # samples (this points to the yaml files for getting sample statistics) - self.input_dir = '/eos/experiment/fcc/hh/generation/DelphesEvents/fcc_v06_scenarioI/' - # self.prod_tag = 'FCCee/spring2021/IDEA/' - - # Optional: output directory, default is local running directory - self.output_dir = 'fcc_v06_bbyy/' \ - # f'stage1_{self.ana_args.muon_pt}' - - # Optional: analysisName, default is '' - # self.analysis_name = 'My Analysis' - - # Optional: number of threads to run on, default is 'all available' - # self.n_threads = 4 - - # Optional: running on HTCondor, default is False - # self.run_batch = False - - # Optional: test file that is used if you run with the --test argument (fccanalysis run ./examples/FCChh/ggHH_bbyy/analysis_stage1.py --test) - self.test_file = 'root://eospublic.cern.ch//eos/experiment/fcc/hh/' \ - 'tutorials/edm4hep_tutorial_data/' \ - 'p8_ee_ZH_ecm240.root' - - - # Mandatory: analyzers function to define the analysis graph, please make - # sure you return the dataframe, in this example it is dframe2 - def analyzers(self, dframe): - ''' - Analysis graph. 
- ''' - - dframe2 = ( - dframe - - .Define("weight", "EventHeader.weight") - - ########################################### PHOTONS ########################################### - - .Define("gamma", "FCCAnalyses::ReconstructedParticle::get(Photon_objIdx.index, ReconstructedParticles)") - .Define("selpt_gamma", "FCCAnalyses::ReconstructedParticle::sel_pt(30.)(gamma)") - .Define("sel_gamma_unsort", "FCCAnalyses::ReconstructedParticle::sel_eta(4)(selpt_gamma)") - .Define("sel_gamma", "AnalysisFCChh::SortParticleCollection(sel_gamma_unsort)") - - .Define("ngamma", "FCCAnalyses::ReconstructedParticle::get_n(sel_gamma)") - .Define("g1_e", "FCCAnalyses::ReconstructedParticle::get_e(sel_gamma)[0]") - .Define("g1_pt", "FCCAnalyses::ReconstructedParticle::get_pt(sel_gamma)[0]") - .Define("g1_eta", "FCCAnalyses::ReconstructedParticle::get_eta(sel_gamma)[0]") - .Define("g1_phi", "FCCAnalyses::ReconstructedParticle::get_phi(sel_gamma)[0]") - .Define("g2_e", "FCCAnalyses::ReconstructedParticle::get_e(sel_gamma)[1]") - .Define("g2_pt", "FCCAnalyses::ReconstructedParticle::get_pt(sel_gamma)[1]") - .Define("g2_eta", "FCCAnalyses::ReconstructedParticle::get_eta(sel_gamma)[1]") - .Define("g2_phi", "FCCAnalyses::ReconstructedParticle::get_phi(sel_gamma)[1]") - - #H(yy) if it exists, if there are no 2 selected photons, doesnt get filled - .Define("yy_pairs_unmerged", "AnalysisFCChh::getPairs(sel_gamma)") - .Define("yy_pairs", "AnalysisFCChh::merge_pairs(yy_pairs_unmerged)") - .Define("m_yy", "FCCAnalyses::ReconstructedParticle::get_mass(yy_pairs)") - - - ########################################### JETS ########################################### - - # jets after overlap removal is performed between jets and isolated electrons, muons and photons - - # selected jets above a pT threshold of 30 GeV, eta < 4, tight ID - .Define("selpt_jets", "FCCAnalyses::ReconstructedParticle::sel_pt(30.)(Jet)") - .Define("sel_jets_unsort", "FCCAnalyses::ReconstructedParticle::sel_eta(4)(selpt_jets)") - .Define("sel_jets", "AnalysisFCChh::SortParticleCollection(sel_jets_unsort)") - .Define("njets", "FCCAnalyses::ReconstructedParticle::get_n(sel_jets)") - .Define("j1_e", "FCCAnalyses::ReconstructedParticle::get_e(sel_jets)[0]") - .Define("j1_pt", "FCCAnalyses::ReconstructedParticle::get_pt(sel_jets)[0]") - .Define("j1_eta", "FCCAnalyses::ReconstructedParticle::get_eta(sel_jets)[0]") - .Define("j1_phi", "FCCAnalyses::ReconstructedParticle::get_phi(sel_jets)[0]") - - .Define("j2_e", "FCCAnalyses::ReconstructedParticle::get_e(sel_jets)[1]") - .Define("j2_pt", "FCCAnalyses::ReconstructedParticle::get_pt(sel_jets)[1]") - .Define("j2_eta", "FCCAnalyses::ReconstructedParticle::get_eta(sel_jets)[1]") - .Define("j2_phi", "FCCAnalyses::ReconstructedParticle::get_phi(sel_jets)[1]") - - #b-tagged jets: - # .Alias("Jet3","Jet#3.index") - .Alias("Jet3","_Jet_particles.index") - #LOOSE WP : bit 1 = medium WP, bit 2 = tight WP - #b tagged jets - .Define("bjets", "AnalysisFCChh::get_tagged_jets(ReconstructedParticles, ParticleIDs, _ParticleIDs_particle, _ParticleIDs_parameters, 1)") #bit 0 = loose WP, see: https://github.com/delphes/delphes/blob/master/cards/FCC/scenarios/FCChh_I.tcl - # .Define("bjets", "AnalysisFCChh::get_tagged_jets(Jet, Jet3, ParticleIDs, _ParticleIDs_parameters, 1)") #bit 0 = loose WP, see: https://github.com/delphes/delphes/blob/master/cards/FCC/scenarios/FCChh_I.tcl - # .Define("selpt_bjets", "FCCAnalyses::ReconstructedParticle::sel_pt(30.)(bjets)") - # .Define("sel_bjets_unsort", 
"FCCAnalyses::ReconstructedParticle::sel_eta(4)(selpt_bjets)") - # .Define("sel_bjets", "AnalysisFCChh::SortParticleCollection(sel_bjets_unsort)") - .Alias("sel_bjets", "bjets") - .Define("nbjets", "FCCAnalyses::ReconstructedParticle::get_n(sel_bjets)") - .Define("b1_e", "FCCAnalyses::ReconstructedParticle::get_e(sel_bjets)[0]") - .Define("b1_pt", "FCCAnalyses::ReconstructedParticle::get_pt(sel_bjets)[0]") - .Define("b1_eta", "FCCAnalyses::ReconstructedParticle::get_eta(sel_bjets)[0]") - .Define("b1_phi", "FCCAnalyses::ReconstructedParticle::get_phi(sel_bjets)[0]") - .Define("b2_e", "FCCAnalyses::ReconstructedParticle::get_e(sel_bjets)[1]") - .Define("b2_pt", "FCCAnalyses::ReconstructedParticle::get_pt(sel_bjets)[1]") - .Define("b2_eta", "FCCAnalyses::ReconstructedParticle::get_eta(sel_bjets)[1]") - .Define("b2_phi", "FCCAnalyses::ReconstructedParticle::get_phi(sel_bjets)[1]") - - - # #H(bb) system - using the loose WP b-jets -> if the events has < 2 bjets these variables do not get filled! - .Define("bb_pairs_unmerged", "AnalysisFCChh::getPairs(sel_bjets)") #currently gets only leading pT pair, as a RecoParticlePair - #then merge the bb pair into one object and get its kinematic properties - .Define("bb_pairs", "AnalysisFCChh::merge_pairs(bb_pairs_unmerged)") #merge into one object to access inv masses etc - .Define("m_bb", "FCCAnalyses::ReconstructedParticle::get_mass(bb_pairs)") - - # Filter at least one candidate - .Filter("sel_bjets.size()>1") - # .Filter("sel_gamma.size()>1") - # .Filter("m_bb[0] < 200.") - # .Filter("m_bb[0] > 80.") - # .Filter("m_yy[0] < 180.") - # .Filter("m_yy[0] > 100.") - - ) - return dframe2 - - # Mandatory: output function, please make sure you return the branch list - # as a python list - def output(self): - ''' - Output variables which will be saved to output root file. - ''' - branch_list = [ - 'nbjets', - # 'm_bb', - 'ngamma', - 'm_yy', - 'm_bb', - 'b1_pt', 'b2_pt', 'b1_eta', 'b2_eta', - # 'selected_muons_pt', - # 'selected_muons_y', - # 'selected_muons_p', - # 'selected_muons_e', - # 'zed_leptonic_pt', - # 'zed_leptonic_m', - # 'zed_leptonic_charge', - # 'zed_leptonic_recoil_m' - ] - return branch_list diff --git a/python/process.py b/python/process.py index 2b74c5c771..f6902ae258 100644 --- a/python/process.py +++ b/python/process.py @@ -7,6 +7,7 @@ import json import glob import logging +from typing import Optional import urllib.request import yaml # type: ignore import ROOT # type: ignore @@ -30,6 +31,68 @@ def get_entries(inpath: str) -> int: return nevents +def get_entries_sow(infilepath: str, nevents_max: Optional[int] = None, get_local: bool = True, weight_name: str = "EventHeader.weight") -> tuple[int, int, float, float]: + ''' + Get number of original entries and number of actual entries in the file, as well as the sum of weights + ''' + + infile = ROOT.TFile.Open(infilepath) + infile.cd() + + processEvents = 0 + processSumOfWeights = 0. + try: + processEvents = infile.Get('eventsProcessed').GetVal() + except AttributeError: + print('----> Warning: Input file is missing information about ' # should these warnings be kept? + 'original number of events!') + try: + processSumOfWeights = infile.Get('SumOfWeights').GetVal() + except AttributeError: + print('----> Warning: Input file is missing information about ' + 'original sum of weights!') + + if not get_local: + return processEvents, None, processSumOfWeights, None + + eventsTTree = 0 + sumOfWeightsTTree = 0. + + # check for empty chunk (can this be improved? 
exceptions from RDF cannot be caught here, it seems)
+    tree = infile.Get("events")
+    if not tree:
+        print(f'----> Warning: Tree "events" not found in file {infilepath},'
+              ' possibly an empty chunk - continuing with the next one.')
+        infile.Close()
+        return processEvents, eventsTTree, processSumOfWeights, sumOfWeightsTTree
+
+    try:
+        # Use an RDataFrame here as well, so that the nevents_max
+        # restriction can easily be imposed on the local events
+        rdf_tmp = ROOT.ROOT.RDataFrame("events", infilepath)
+
+        if nevents_max:
+            rdf_tmp = rdf_tmp.Range(0, nevents_max)
+
+        eventsTTree = rdf_tmp.Count().GetValue()
+
+        ROOT.gROOT.SetBatch(True)
+        try:
+            histo = rdf_tmp.Histo1D(weight_name)
+            sumOfWeightsTTree = float(eventsTTree) * histo.GetMean()
+        except AttributeError:
+            print('----> Warning: Input file has no event weights.')
+    except AttributeError:
+        print('----> Error: Input file is missing events TTree! '
+              'Probably an empty chunk.')
+
+    infile.Close()
+
+    return processEvents, eventsTTree, processSumOfWeights, sumOfWeightsTTree
+
+
 def get_process_info(process: str,
                      prod_tag: str,
                      input_dir: str) -> tuple[list[str], list[int]]:
diff --git a/python/run_fccanalysis.py b/python/run_fccanalysis.py
index e6f4f86c0c..e6e13b3f3b 100644
--- a/python/run_fccanalysis.py
+++ b/python/run_fccanalysis.py
@@ -14,7 +14,7 @@
 import ROOT  # type: ignore

 from anascript import get_element, get_element_dict, get_attribute
-from process import get_process_info
+from process import get_process_info, get_entries_sow
 from frame import generate_graph

 LOGGER = logging.getLogger('FCCAnalyses.run')

@@ -455,7 +455,7 @@ def apply_filepath_rewrites(filepath: str) -> str:


 # _____________________________________________________________________________
-def run_local(args, analysis, infile_list):
+def run_local(args, analysis, infile_list, do_weighted=False):
     '''
     Run analysis locally.
     '''
@@ -467,38 +467,65 @@ def run_local(args, analysis, infile_list):
     nevents_orig = 0
     # The amount of events in the input file(s)
     nevents_local = 0
+
+    # Same for the sum of weights
+    if do_weighted:
+        sow_orig = 0.
+        sow_local = 0.
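+
+    # The loop below accumulates, per input file, the original ("processed")
+    # and local event counts and, if do_weighted is set, the corresponding
+    # sums of generator weights used later for normalisation.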
     for filepath in infile_list:
         filepath = apply_filepath_rewrites(filepath)
         file_list.push_back(filepath)
         info_msg += f'- {filepath}\t\n'
-        infile = ROOT.TFile.Open(filepath, 'READ')
-        try:
-            nevents_orig += infile.Get('eventsProcessed').GetVal()
-        except AttributeError:
-            pass
-        try:
-            nevents_local += infile.Get("events").GetEntries()
-        except AttributeError:
-            LOGGER.error('Input file:\n%s\nis missing events TTree!\n'
-                         'Aborting...', filepath)
-            infile.Close()
-            sys.exit(3)
-        infile.Close()
+        if do_weighted:
+            if args.nevents > 0:
+                nevts_param, nevts_tree, sow_param, sow_tree = \
+                    get_entries_sow(filepath, args.nevents)
+            else:
+                nevts_param, nevts_tree, sow_param, sow_tree = \
+                    get_entries_sow(filepath)
+
+            nevents_orig += nevts_param
+            nevents_local += nevts_tree
+            sow_orig += sow_param
+            sow_local += sow_tree
+        else:
+            infile = ROOT.TFile.Open(filepath, 'READ')
+            try:
+                nevents_orig += infile.Get('eventsProcessed').GetVal()
+            except AttributeError:
+                pass
+
+            try:
+                nevents_local += infile.Get("events").GetEntries()
+            except AttributeError:
+                LOGGER.error('Input file:\n%s\nis missing events TTree!\n'
+                             'Aborting...', filepath)
+                infile.Close()
+                sys.exit(3)
+            infile.Close()

-    LOGGER.info(info_msg)
-
     # Adjust number of events in case --nevents was specified
     if args.nevents > 0 and args.nevents < nevents_local:
         nevents_local = args.nevents
+
+    LOGGER.info(info_msg)

     if nevents_orig > 0:
         LOGGER.info('Number of events:\n\t- original: %s\n\t- local: %s',
                     f'{nevents_orig:,}', f'{nevents_local:,}')
+        if do_weighted:
+            LOGGER.info('Sum of weights:\n\t- original: %s\n\t- local: %s',
+                        f'{sow_orig:,}', f'{sow_local:,}')
     else:
         LOGGER.info('Number of local events: %s', f'{nevents_local:,}')
+        if do_weighted:
+            LOGGER.info('Local sum of weights: %s', f'{sow_local:,}')

     output_dir = get_attribute(analysis, 'output_dir', '')

     if not args.batch:
@@ -529,7 +556,7 @@ def run_local(args, analysis, infile_list):
         if nevents_orig > 0:
             info_msg += f'\nReduction factor total: {outn/nevents_orig}'
         info_msg += '\n'
-        info_msg += 80 * '='
+        info_msg += 80 * '='
         info_msg += '\n'
         LOGGER.info(info_msg)

@@ -540,8 +567,15 @@ def run_local(args, analysis, infile_list):
             'eventsProcessed',
             nevents_orig if nevents_orig != 0 else inn)
         param.Write()
-        param = ROOT.TParameter(int)('eventsSelected', outn)
+        # Should the number of weighted, selected events be stored as well?
+        param = ROOT.TParameter(int)('eventsSelected', outn)
         param.Write()
+
+        if do_weighted:
+            param_sow = ROOT.TParameter(float)(
+                'SumOfWeights',
+                sow_orig if sow_orig != 0 else sow_local)
+            param_sow.Write()
+
         outfile.Write()

     if args.bench:
@@ -590,6 +624,12 @@ def run_fccanalysis(args, analysis_module):
     if output_dir_eos is not None and not os.path.exists(output_dir_eos):
         os.system(f'mkdir -p {output_dir_eos}')

+    # Check if using weighted events is requested
+    do_weighted = get_attribute(analysis, 'do_weighted', False)
+
+    if do_weighted:
+        LOGGER.info('Using generator weights')
+
     # Check if test mode is specified, and if so run the analysis on it (this
     # will exit after)
     if args.test:
@@ -598,7 +638,7 @@ def run_fccanalysis(args, analysis_module):
         directory, _ = os.path.split(args.output)
         if directory:
             os.system(f'mkdir -p {directory}')
-        run_local(args, analysis, [testfile_path])
+        run_local(args, analysis, [testfile_path], do_weighted=do_weighted)
         sys.exit(0)

     # Check if files are specified, and if so run the analysis on it/them (this
@@ -608,7 +648,7 @@ def run_fccanalysis(args, analysis_module):
         directory, _ = os.path.split(args.output)
         if directory:
             os.system(f'mkdir -p {directory}')
-        run_local(args, analysis, args.files_list)
+        run_local(args, analysis, args.files_list, do_weighted=do_weighted)
         sys.exit(0)

     # Check if batch mode is available
@@ -693,11 +735,11 @@ def run_fccanalysis(args, analysis_module):
             LOGGER.info('Running locally...')
             if len(chunk_list) == 1:
                 args.output = f'{output_stem}.root'
-                run_local(args, analysis, chunk_list[0])
+                run_local(args, analysis, chunk_list[0],
+                          do_weighted=do_weighted)
             else:
                 for index, chunk in enumerate(chunk_list):
                     args.output = f'{output_stem}/chunk{index}.root'
-                    run_local(args, analysis, chunk)
+                    run_local(args, analysis, chunk, do_weighted=do_weighted)

     if len(process_list) == 0:
         LOGGER.warning('No files processed (process_list not found)!\n'
diff --git a/python/run_final_analysis.py b/python/run_final_analysis.py
index 7eaa7740b0..f9943a15d5 100644
--- a/python/run_final_analysis.py
+++ b/python/run_final_analysis.py
@@ -15,7 +15,7 @@
 import ROOT  # type: ignore
 import cppyy
 from anascript import get_element, get_attribute
-from process import get_process_dict
+from process import get_process_dict, get_entries_sow
 from frame import generate_graph

 LOGGER = logging.getLogger('FCCAnalyses.run_final')
@@ -230,6 +230,14 @@ def run(rdf_module, args) -> None:
     file_list = {}
     results = {}

+    # Check if using weighted events is requested
+    do_weighted = get_attribute(rdf_module, 'do_weighted', False)
+
+    if do_weighted:
+        LOGGER.info('Using generator weights')
+        sow_process = {}
+        sow_ttree = {}
+
     # Checking input directory
     input_dir = get_attribute(rdf_module, 'inputDir', '')
     if not input_dir:
@@ -265,27 +273,50 @@ def run(rdf_module, args) -> None:
     for process_name in process_list:
         process_events[process_name] = 0
         events_ttree[process_name] = 0
+        if do_weighted:
+            sow_process[process_name] = 0.
+            sow_ttree[process_name] = 0.
+
         file_list[process_name] = ROOT.vector('string')()
         infilepath = input_dir + process_name + '.root'  # input file
+
         if not os.path.isfile(infilepath):
             LOGGER.debug('File %s does not exist!\nTrying if it is a '
                          'directory as it might have been processed in batch.',
                          infilepath)
         else:
             LOGGER.info('Open file:\n  %s', infilepath)
-            process_events[process_name], events_ttree[process_name] = \
-                get_entries(infilepath)
+            if do_weighted:
+                process_events[process_name], events_ttree[process_name], \
+                    sow_process[process_name], sow_ttree[process_name] = \
+                    get_entries_sow(infilepath, weight_name="weight")
+            else:
+                process_events[process_name], events_ttree[process_name] = \
+                    get_entries(infilepath)
             file_list[process_name].push_back(infilepath)

         indirpath = input_dir + process_name
         if os.path.isdir(indirpath):
+            # Reset the nevts/sow counters to avoid double counting in case
+            # a single file of the same name (e.g. local test output) also
+            # exists in the same directory
+            process_events[process_name] = 0
+            events_ttree[process_name] = 0
+            if do_weighted:
+                sow_process[process_name] = 0.
+                sow_ttree[process_name] = 0.
+
             info_msg = f'Open directory {indirpath}'
             flist = glob.glob(indirpath + '/chunk*.root')
             for filepath in flist:
                 info_msg += '\n\t' + filepath
-                chunk_process_events, chunk_events_ttree = \
-                    get_entries(filepath)
+                if do_weighted:
+                    chunk_process_events, chunk_events_ttree, \
+                        chunk_sow_process, chunk_sow_ttree = \
+                        get_entries_sow(filepath, weight_name="weight")
+                    sow_process[process_name] += chunk_sow_process
+                    sow_ttree[process_name] += chunk_sow_ttree
+                else:
+                    chunk_process_events, chunk_events_ttree = \
+                        get_entries(filepath)
                 process_events[process_name] += chunk_process_events
                 events_ttree[process_name] += chunk_events_ttree
                 file_list[process_name].push_back(filepath)
@@ -300,6 +331,17 @@ def run(rdf_module, args) -> None:
         info_msg += f'\n\t- {process_name}: {n_events:,}'
     LOGGER.info(info_msg)

+    if do_weighted:
+        info_msg = 'Processed sum of weights:'
+        for process_name, sow in sow_process.items():
+            info_msg += f'\n\t- {process_name}: {sow:,}'
+        LOGGER.info(info_msg)
+        info_msg = 'Sum of weights in the TTree:'
+        for process_name, sow in sow_ttree.items():
+            info_msg += f'\n\t- {process_name}: {sow:,}'
+        LOGGER.info(info_msg)
+
     # Check if there are any histograms defined
     histo_list: dict[str, dict[str, any]] = get_attribute(rdf_module,
                                                           "histoList", {})
@@ -446,6 +488,7 @@ def run(rdf_module, args) -> None:
         # Now perform the loop and evaluate everything at once.
         LOGGER.info('Evaluating...')
         all_events_raw = dframe.Count().GetValue()
+
         LOGGER.info('Done')

         nevents_real += all_events_raw

@@ -453,10 +496,19 @@ def run(rdf_module, args) -> None:
         if do_scale:
             LOGGER.info('Scaling cut yields...')
-            all_events = all_events_raw * 1. * gen_sf * \
-                int_lumi / process_events[process_name]
-            uncertainty = ROOT.Math.sqrt(all_events_raw) * gen_sf * \
-                int_lumi / process_events[process_name]
+            if do_weighted:
+                # TODO: check that the weight column exists; this only works
+                # if a "weight" column is defined in the analysis script.
+                # Should the weight name be an argument?
+                all_events_weighted = dframe.Sum("weight").GetValue()
+                LOGGER.debug('Sum of weights after cuts: %s',
+                             all_events_weighted)
+                all_events = all_events_weighted * 1. * gen_sf * \
+                    int_lumi / sow_process[process_name]
+                uncertainty = ROOT.Math.sqrt(all_events_weighted) * gen_sf * \
+                    int_lumi / sow_process[process_name]
+            else:
+                all_events = all_events_raw * 1. * gen_sf * \
+                    int_lumi / process_events[process_name]
+                uncertainty = ROOT.Math.sqrt(all_events_raw) * gen_sf * \
+                    int_lumi / process_events[process_name]
         else:
             all_events = all_events_raw
             uncertainty = ROOT.Math.sqrt(all_events_raw)
@@ -518,8 +570,12 @@ def run(rdf_module, args) -> None:
                     hist_name = hist.GetName() + '_raw'
                     outfile.WriteObject(hist.GetValue(), hist_name)
                 if do_scale:
-                    hist.Scale(gen_sf * int_lumi /
-                               process_events[process_name])
+                    if do_weighted:
+                        hist.Scale(gen_sf * int_lumi /
+                                   sow_process[process_name])
+                    else:
+                        hist.Scale(gen_sf * int_lumi /
+                                   process_events[process_name])
                 outfile.WriteObject(hist.GetValue())

             # write all metadata info to the output file
             param = ROOT.TParameter(int)("eventsProcessed",
                                          process_events[process_name])
             outfile.WriteObject(param)

-            param = ROOT.TParameter(float)("sumOfWeights",
-                                           process_events[process_name])
-            outfile.WriteObject(param)
+            if do_weighted:
+                param = ROOT.TParameter(float)("sumOfWeights",
+                                               sow_process[process_name])
+            else:
+                param = ROOT.TParameter(float)("sumOfWeights",
+                                               process_events[process_name])
+            outfile.WriteObject(param)

             param = ROOT.TParameter(bool)("scaled",
                                           do_scale)
@@ -567,6 +629,22 @@ def run(rdf_module, args) -> None:
                              'events!', cut, process_name)
                 sys.exit(3)

+        # Also store the TParameters with the total number of events and the
+        # sum of weights in the output files holding the trees
+        LOGGER.info('Updating file %s', fout_list[i])
+        outfile = ROOT.TFile(fout_list[i], 'update')
+        param = ROOT.TParameter(int)('eventsProcessed',
+                                     process_events[process_name])
+        LOGGER.info('Number of events processed: %s',
+                    process_events[process_name])
+        param.Write()
+        if do_weighted:
+            param_sow = ROOT.TParameter(float)('SumOfWeights',
+                                               sow_process[process_name])
+            LOGGER.info('Sum of weights: %s', sow_process[process_name])
+            param_sow.Write()
+        outfile.Write()
+        outfile.Close()
+
     # Save results either to JSON or LaTeX tables
     save_results(results, rdf_module)
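
Note on the normalisation this patch introduces: with do_weighted enabled,
yields and histograms are scaled by gen_sf * int_lumi / sum_of_weights (the
'SumOfWeights' TParameter propagated from stage 1) instead of
gen_sf * int_lumi / events_processed. Below is a minimal, self-contained
sketch of that rule; the function and variable names are illustrative, not
FCCAnalyses API, and the uncertainty mirrors the patch's choice of the
square root of the selected sum of weights.

import math

def scale_weighted_yield(sum_w_selected: float, sow_processed: float,
                         gen_sf: float, int_lumi: float) -> tuple[float, float]:
    """Return (yield, stat. uncertainty) scaled to the target luminosity.

    sum_w_selected -- sum of generator weights of the selected events
    sow_processed  -- sum of weights of all processed events, as stored in
                      the 'SumOfWeights' TParameter written at stage 1
    gen_sf         -- generator-level scale factor (e.g. cross-section)
    int_lumi       -- integrated luminosity in matching units
    """
    # Normalisation factor shared by the yield and its uncertainty
    norm = gen_sf * int_lumi / sow_processed
    return sum_w_selected * norm, math.sqrt(sum_w_selected) * norm

# Example: 42.0 selected weights out of 500.0 processed, unit scale factor,
# 30 ab^-1 written as 3.0e7 pb^-1:
print(scale_weighted_yield(42.0, 500.0, 1.0, 3.0e7))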