diff --git a/addons/CMakeLists.txt b/addons/CMakeLists.txt
index dee331a9c4..4684dae6fe 100644
--- a/addons/CMakeLists.txt
+++ b/addons/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_subdirectory(FastJet)
 add_subdirectory(ONNXRuntime)
+add_subdirectory(TMVAHelper)
 set(ADDONS_LIBRARIES ${ADDONS_LIBRARIES} PARENT_SCOPE)
 message(STATUS "add-ons--------------------------- ${ADDONS_LIBRARIES}")
diff --git a/addons/TMVAHelper/CMakeLists.txt b/addons/TMVAHelper/CMakeLists.txt
new file mode 100644
index 0000000000..1aa791382c
--- /dev/null
+++ b/addons/TMVAHelper/CMakeLists.txt
@@ -0,0 +1,37 @@
+
+
+find_package(TBB REQUIRED tbb)
+find_package(ROOT)
+find_package(ROOT COMPONENTS ROOTVecOps)
+find_package(ROOT COMPONENTS TMVA)
+
+
+message(STATUS "includes-------------------------- TEST: ${TBB_INCLUDE_DIRS}")
+
+file(GLOB sources src/*.cc)
+file(GLOB headers *.h)
+
+
+fccanalyses_addon_build(TMVAHelper
+                        SOURCES ${headers} ${sources}
+#                        EXT_LIBS ROOT::ROOTVecOps ROOT::TMVA TBB::tbb
+                        INSTALL_COMPONENT tmvahelper)
+
+add_custom_command(TARGET TMVAHelper POST_BUILD
+                   COMMAND ${CMAKE_COMMAND} -E copy
+                   ${CMAKE_CURRENT_SOURCE_DIR}/python/*
+                   ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+target_link_libraries(TMVAHelper PRIVATE TBB::tbb)
+target_link_libraries(TMVAHelper PRIVATE ROOT::ROOTVecOps)
+target_link_libraries(TMVAHelper PRIVATE ROOT::TMVA)
+target_compile_features(TMVAHelper PRIVATE cxx_std_11)
+
+install(FILES
+  ${CMAKE_CURRENT_LIST_DIR}/TMVAHelper.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/TMVAHelper
+  )
+
+file(GLOB _addon_python_files python/*.py)
+install(FILES ${_addon_python_files} DESTINATION ${CMAKE_INSTALL_PREFIX}/python/addons/TMVAHelper)
diff --git a/addons/TMVAHelper/TMVAHelper.h b/addons/TMVAHelper/TMVAHelper.h
new file mode 100644
index 0000000000..346e48b70d
--- /dev/null
+++ b/addons/TMVAHelper/TMVAHelper.h
@@ -0,0 +1,22 @@
+#ifndef TMVAHelper_TMVAHelper_h
+#define TMVAHelper_TMVAHelper_h
+
+#include <tbb/task_arena.h>
+#include "ROOT/RVec.hxx"
+#include "TMVA/RBDT.hxx"
+
+
+class tmva_helper_xgb {
+    public:
+        explicit tmva_helper_xgb(const std::string &filename, const std::string &name, const unsigned &nvars, const unsigned int nslots = 1);
+        virtual ~tmva_helper_xgb();
+        ROOT::VecOps::RVec<float> operator()(const ROOT::VecOps::RVec<float> vars);
+
+    private:
+        unsigned int nvars_;
+        TMVA::Experimental::RBDT<> model_;
+        std::vector<TMVA::Experimental::RBDT<>> interpreters_;
+};
+
+
+#endif
\ No newline at end of file
diff --git a/addons/TMVAHelper/python/TMVAHelper.py b/addons/TMVAHelper/python/TMVAHelper.py
new file mode 100644
index 0000000000..26af1741bf
--- /dev/null
+++ b/addons/TMVAHelper/python/TMVAHelper.py
@@ -0,0 +1,39 @@
+
+import ROOT
+import pathlib
+
+ROOT.gInterpreter.ProcessLine('#include "TMVAHelper/TMVAHelper.h"')
+ROOT.gSystem.Load("libTMVAHelper.so")
+
+class TMVAHelperXGB():
+
+    def __init__(self, model_input, model_name, variables=[]):
+
+        if len(variables) == 0: # try to get the variables from the model file (saved as a TList)
+            fIn = ROOT.TFile(model_input)
+            variables_ = fIn.Get("variables")
+            self.variables = [str(var.GetString()) for var in variables_]
+            fIn.Close()
+        else:
+            self.variables = variables
+        self.nvars = len(self.variables)
+        self.model_input = model_input
+        self.model_name = model_name
+        self.nthreads = ROOT.GetThreadPoolSize()
+
+        self.tmva_helper = ROOT.tmva_helper_xgb(self.model_input, self.model_name, self.nvars, self.nthreads)
+        self.var_col = f"tmva_vars_{self.model_name}"
+
+    def run_inference(self, df, col_name = "mva_score"):
+
+        # check if columns exist in the dataframe
+        cols = df.GetColumnNames()
+        for var in self.variables:
+            if not var in cols:
+                raise Exception(f"Variable {var} not defined in dataframe.")
+
+        vars_str = ', (float)'.join(self.variables)
+        df = df.Define(self.var_col, f"ROOT::VecOps::RVec<float>{{{vars_str}}}")
+        df = df.Define(col_name, self.tmva_helper, [self.var_col])
+        return df
+
diff --git a/addons/TMVAHelper/src/TMVAHelper.cc b/addons/TMVAHelper/src/TMVAHelper.cc
new file mode 100644
index 
0000000000..c0ddd64363
--- /dev/null
+++ b/addons/TMVAHelper/src/TMVAHelper.cc
@@ -0,0 +1,23 @@
+#include "TMVAHelper/TMVAHelper.h"
+
+
+tmva_helper_xgb::tmva_helper_xgb(const std::string &filename, const std::string &name, const unsigned &nvars, const unsigned int nslots) :
+    model_(name, filename), nvars_(nvars) {
+
+    const unsigned int nslots_actual = std::max(nslots, 1U);
+    interpreters_.reserve(nslots_actual);
+    for (unsigned int islot = 0; islot < nslots_actual; ++islot) {
+        interpreters_.emplace_back(model_);
+    }
+}
+
+ROOT::VecOps::RVec<float> tmva_helper_xgb::operator()(const ROOT::VecOps::RVec<float> vars) {
+    auto const tbb_slot = std::max(tbb::this_task_arena::current_thread_index(), 0);
+    if (tbb_slot >= interpreters_.size()) {
+        throw std::runtime_error("Not enough interpreters allocated for number of tbb threads");
+    }
+    auto &interpreter_data = interpreters_[tbb_slot];
+    return interpreter_data.Compute(vars);
+}
+
+tmva_helper_xgb::~tmva_helper_xgb() {}
diff --git a/bin/fccanalysis b/bin/fccanalysis
index b2628fd8f0..bcdc6484c5 100755
--- a/bin/fccanalysis
+++ b/bin/fccanalysis
@@ -14,6 +14,7 @@ from pin_analysis import PinAnalysis
 from run_analysis import run
 from run_final_analysis import run_final
 from do_plots import do_plots
+from do_combine import do_combine
 
 
 class MultiLineFormatter(logging.Formatter):
@@ -94,6 +95,8 @@ def main():
         run_final(parser)
     elif args.command == 'plots':
         do_plots(parser)
+    elif args.command == 'combine':
+        do_combine(parser)
     else:
         run(parser)
 
diff --git a/examples/FCCee/higgs/jetclustering/functions.h b/examples/FCCee/higgs/jetclustering/functions.h
new file mode 100644
index 0000000000..74079a0328
--- /dev/null
+++ b/examples/FCCee/higgs/jetclustering/functions.h
@@ -0,0 +1,81 @@
+#ifndef ZHfunctions_H
+#define ZHfunctions_H
+
+#include <cmath>
+#include <vector>
+#include <math.h>
+
+#include "TLorentzVector.h"
+#include "ROOT/RVec.hxx"
+#include "edm4hep/ReconstructedParticleData.h"
+#include "edm4hep/MCParticleData.h"
+#include "edm4hep/ParticleIDData.h"
+#include "ReconstructedParticle2MC.h"
+
+
+namespace FCCAnalyses {
+
+// make Lorentzvectors from pseudojets
+Vec_tlv makeLorentzVectors(Vec_f jets_px, Vec_f jets_py, Vec_f jets_pz, Vec_f jets_e) {
+    Vec_tlv result;
+    for(int i=0; i<jets_px.size(); i++) {
+        TLorentzVector tlv;
+        tlv.SetPxPyPzE(jets_px[i], jets_py[i], jets_pz[i], jets_e[i]);
+        result.push_back(tlv);
+    }
+    return result;
+}
+
+Vec_i jetTruthFinder(std::vector<std::vector<int>> constituents, Vec_rp reco, Vec_mc mc, Vec_i mcind, bool findGluons = false) {
+    // jet truth=finder: match the gen-level partons (eventually with gluons) with the jet constituents
+    // matching by mimimizing the sum of dr of the parton and all the jet constituents
+
+    Vec_tlv genQuarks; // Lorentz-vector of potential partons (gen-level)
+    Vec_i genQuarks_pdgId; // corresponding PDG ID
+    for(size_t i = 0; i < mc.size(); ++i) {
+        int pdgid = abs(mc.at(i).PDG);
+        if(pdgid > 6 and not findGluons) continue; // only quarks
+        if(pdgid > 6 and pdgid != 21 and findGluons) continue; // only quarks and gluons
+        TLorentzVector tlv;
+        tlv.SetXYZM(mc.at(i).momentum.x,mc.at(i).momentum.y,mc.at(i).momentum.z,mc.at(i).mass);
+        genQuarks.push_back(tlv);
+        genQuarks_pdgId.push_back(mc.at(i).PDG);
+    }
+
+    Vec_tlv recoParticles; // Lorentz-vector of all reconstructed particles
+    for(size_t i = 0; i < reco.size(); ++i) {
+        auto & p = reco[i];
+        TLorentzVector tlv;
+        tlv.SetXYZM(p.momentum.x, p.momentum.y, p.momentum.z, p.mass);
+        recoParticles.push_back(tlv);
+    }
+
+    Vec_i usedIdx;
+    Vec_i result;
+    for(size_t iJet = 0; iJet < constituents.size(); ++iJet) {
+        Vec_d dr;
+        for(size_t iGen = 0; iGen < genQuarks.size(); ++iGen) {
+            if(std::find(usedIdx.begin(), usedIdx.end(), iGen) != usedIdx.end()) {
+                dr.push_back(1e99); // set infinite dr, skip
+                continue;
+            }
+            dr.push_back(0);
+            for(size_t i = 0; i < constituents[iJet].size(); ++i) {
+                dr[iGen] += recoParticles[constituents[iJet][i]].DeltaR(genQuarks[iGen]);
+            }
+        }
+        int maxDrIdx = std::min_element(dr.begin(),dr.end()) - dr.begin();
+        usedIdx.push_back(maxDrIdx);
+        result.push_back(genQuarks_pdgId[maxDrIdx]);
+
+    }
+    return result;
+}
+
+
+
+
+
+}
+
+#endif
\ No newline at end of file
diff --git 
a/examples/FCCee/higgs/jetclustering/histmaker.py b/examples/FCCee/higgs/jetclustering/histmaker.py new file mode 100644 index 0000000000..488b6ebd80 --- /dev/null +++ b/examples/FCCee/higgs/jetclustering/histmaker.py @@ -0,0 +1,103 @@ + +processList = { + 'wzp6_ee_mumuH_Hbb_ecm240': {'fraction': 1}, +} + +# Production tag when running over EDM4Hep centrally produced events, this points to the yaml files for getting sample statistics (mandatory) +prodTag = "FCCee/winter2023/IDEA/" + +# Link to the dictonary that contains all the cross section informations etc... (mandatory) +procDict = "FCCee_procDict_winter2023_IDEA.json" + +# additional/custom C++ functions, defined in header files (optional) +includePaths = ["functions.h"] + +#Optional: output directory, default is local running directory +outputDir = f"outputs/FCCee/higgs/jetclustering/histmaker/" + +# optional: ncpus, default is 4, -1 uses all cores available +nCPUS = -1 + +# scale the histograms with the cross-section and integrated luminosity +doScale = True +intLumi = 7200000.0 # 7.2 /ab + + +# define some binning for various histograms +bins_p_mu = (250, 0, 250) +bins_m_ll = (250, 0, 250) +bins_p_ll = (250, 0, 250) +bins_recoil = (200, 120, 140) +bins_pdgid = (51, -25.5, 25.5) +bins_dijet_m = (80, 70, 150) + + +# build_graph function that contains the analysis logic, cuts and histograms (mandatory) +def build_graph(df, dataset): + + results = [] + df = df.Define("weight", "1.0") + weightsum = df.Sum("weight") + + df = df.Alias("Particle0", "Particle#0.index") + df = df.Alias("Particle1", "Particle#1.index") + df = df.Alias("MCRecoAssociations0", "MCRecoAssociations#0.index") + df = df.Alias("MCRecoAssociations1", "MCRecoAssociations#1.index") + + # select muons from Z decay and form Z/recoil mass + df = df.Alias("Muon0", "Muon#0.index") + df = df.Define("muons_all", "FCCAnalyses::ReconstructedParticle::get(Muon0, ReconstructedParticles)") + df = df.Define("muons", 
"FCCAnalyses::ReconstructedParticle::sel_p(25)(muons_all)") + df = df.Define("muons_p", "FCCAnalyses::ReconstructedParticle::get_p(muons)") + df = df.Define("muons_no", "FCCAnalyses::ReconstructedParticle::get_n(muons)") + df = df.Filter("muons_no >= 2") + + df = df.Define("zmumu", "ReconstructedParticle::resonanceBuilder(91)(muons)") + df = df.Define("zmumu_m", "ReconstructedParticle::get_mass(zmumu)[0]") + df = df.Define("zmumu_p", "ReconstructedParticle::get_p(zmumu)[0]") + df = df.Define("zmumu_recoil", "ReconstructedParticle::recoilBuilder(240)(zmumu)") + df = df.Define("zmumu_recoil_m", "ReconstructedParticle::get_mass(zmumu_recoil)[0]") + + # basic selection + df = df.Filter("zmumu_m > 70 && zmumu_m < 100") + df = df.Filter("zmumu_p > 20 && zmumu_p < 70") + df = df.Filter("zmumu_recoil_m < 140 && zmumu_recoil_m > 120") + + + # do jet clustering on all particles, except the muons + df = df.Define("rps_no_muons", "FCCAnalyses::ReconstructedParticle::remove(ReconstructedParticles, muons)") + df = df.Define("RP_px", "FCCAnalyses::ReconstructedParticle::get_px(rps_no_muons)") + df = df.Define("RP_py", "FCCAnalyses::ReconstructedParticle::get_py(rps_no_muons)") + df = df.Define("RP_pz","FCCAnalyses::ReconstructedParticle::get_pz(rps_no_muons)") + df = df.Define("RP_e", "FCCAnalyses::ReconstructedParticle::get_e(rps_no_muons)") + df = df.Define("pseudo_jets", "FCCAnalyses::JetClusteringUtils::set_pseudoJets(RP_px, RP_py, RP_pz, RP_e)") + + + # Implemented algorithms and arguments: https://github.com/HEP-FCC/FCCAnalyses/blob/master/addons/FastJet/JetClustering.h + # More info: https://indico.cern.ch/event/1173562/contributions/4929025/attachments/2470068/4237859/2022-06-FCC-jets.pdf + df = df.Define("clustered_jets", "JetClustering::clustering_ee_kt(2, 2, 0, 10)(pseudo_jets)") # 2-jet exclusive clustering + + df = df.Define("jets", "FCCAnalyses::JetClusteringUtils::get_pseudoJets(clustered_jets)") + df = df.Define("jetconstituents", 
"FCCAnalyses::JetClusteringUtils::get_constituents(clustered_jets)") # one-to-one mapping to the input collection (rps_no_muons) + df = df.Define("jets_e", "FCCAnalyses::JetClusteringUtils::get_e(jets)") + df = df.Define("jets_px", "FCCAnalyses::JetClusteringUtils::get_px(jets)") + df = df.Define("jets_py", "FCCAnalyses::JetClusteringUtils::get_py(jets)") + df = df.Define("jets_pz", "FCCAnalyses::JetClusteringUtils::get_pz(jets)") + df = df.Define("jets_phi", "FCCAnalyses::JetClusteringUtils::get_phi(jets)") + df = df.Define("jets_m", "FCCAnalyses::JetClusteringUtils::get_m(jets)") + + # convert jets to LorentzVectors + df = df.Define("jets_tlv", "FCCAnalyses::makeLorentzVectors(jets_px, jets_py, jets_pz, jets_e)") + df = df.Define("jets_truth", "FCCAnalyses::jetTruthFinder(jetconstituents, rps_no_muons, Particle, MCRecoAssociations1)") # returns best-matched PDG ID of the jets + df = df.Define("dijet_higgs_m", "(jets_tlv[0]+jets_tlv[1]).M()") + + # define histograms + results.append(df.Histo1D(("zmumu_m", "", *bins_m_ll), "zmumu_m")) + results.append(df.Histo1D(("zmumu_p", "", *bins_p_ll), "zmumu_p")) + results.append(df.Histo1D(("zmumu_recoil_m", "", *bins_recoil), "zmumu_recoil_m")) + + results.append(df.Histo1D(("jets_truth", "", *bins_pdgid), "jets_truth")) + results.append(df.Histo1D(("dijet_higgs_m", "", *bins_dijet_m), "dijet_higgs_m")) + + return results, weightsum + diff --git a/examples/FCCee/higgs/jetclustering/plots.py b/examples/FCCee/higgs/jetclustering/plots.py new file mode 100644 index 0000000000..4bf8eae2a4 --- /dev/null +++ b/examples/FCCee/higgs/jetclustering/plots.py @@ -0,0 +1,90 @@ +import ROOT + +# global parameters +intLumi = 1. 
# histograms already scaled +intLumiLabel = "L = 7.2 ab^{-1}" +ana_tex = 'e^{+}e^{-} #rightarrow Z(#mu^{+}#mu^{-})H(b#bar{b})' +delphesVersion = '3.4.2' +energy = 240.0 +collider = 'FCC-ee' +inputDir = f"outputs/FCCee/higgs/jetclustering/histmaker/" +formats = ['png','pdf'] +outdir = f"outputs/FCCee/higgs/jetclustering/plots/" +plotStatUnc = True + +colors = {} +colors['ZH'] = ROOT.kRed + +procs = {} +procs['signal'] = {'ZH':['wzp6_ee_mumuH_Hbb_ecm240']} +procs['backgrounds'] = {} + +legend = {} +legend['ZH'] = 'ZH' + +hists = {} + +hists["dijet_higgs_m"] = { + "output": "dijet_higgs_m", + "logy": False, + "stack": True, + "rebin": 1, + "xmin": 70, + "xmax": 150, + "ymin": 0, + "ymax": 3000, + "xtitle": "Dijet mass (GeV)", + "ytitle": "Events", +} + +hists["zmumu_recoil_m"] = { + "output": "zmumu_recoil_m", + "logy": False, + "stack": True, + "rebin": 1, + "xmin": 120, + "xmax": 140, + "ymin": 0, + "ymax": 2000, + "xtitle": "Recoil (GeV)", + "ytitle": "Events", +} + +hists["zmumu_p"] = { + "output": "zmumu_p", + "logy": False, + "stack": True, + "rebin": 1, + "xmin": 20, + "xmax": 70, + "ymin": 0, + "ymax": 5000, + "xtitle": "p(#mu^{#plus}#mu^{#minus}) (GeV)", + "ytitle": "Events", +} + +hists["zmumu_m"] = { + "output": "zmumu_m", + "logy": False, + "stack": True, + "rebin": 1, + "xmin": 70, + "xmax": 110, + "ymin": 0, + "ymax": 7000, + "xtitle": "m(#mu^{#plus}#mu^{#minus}) (GeV)", + "ytitle": "Events", +} + +hists["jets_truth"] = { + "output": "jets_truth", + "logy": True, + "stack": True, + "rebin": 1, + "xmin": -8, + "xmax": 8, + "ymin": 1, + "ymax": 1e6, + "xtitle": "Jet truth label PDGID", + "ytitle": "Events", +} diff --git a/examples/FCCee/higgs/mass_xsec/combine_recoil.py b/examples/FCCee/higgs/mass_xsec/combine_recoil.py new file mode 100644 index 0000000000..4391ee738b --- /dev/null +++ b/examples/FCCee/higgs/mass_xsec/combine_recoil.py @@ -0,0 +1,38 @@ +import ROOT + +flavor = "mumu" # mumu, ee + +intLumi = 1.0 # assume histograms are scaled in previous 
step +outputDir = f"outputs/FCCee/higgs/mass-xsec/combine/{flavor}/" +mc_stats = True +rebin = 10 + +# get histograms from histmaker step +#inputDir = f"outputs/FCCee/higgs/mass-xsec/histmaker/{flavor}/" + +# get histograms from final step, selection to be defined +inputDir = f"outputs/FCCee/higgs/mass-xsec/final_selection/{flavor}/" +selection = "sel3" + + +sig_procs = {'sig':['wzp6_ee_mumuH_ecm240']} +bkg_procs = {'bkg':['p8_ee_WW_ecm240', 'p8_ee_ZZ_ecm240']} + + +categories = ["recoil"] +hist_names = ["zll_recoil_m_final"] + + +systs = {} + +systs['bkg_norm'] = { + 'type': 'lnN', + 'value': 1.10, + 'procs': ['bkg'], +} + +systs['lumi'] = { + 'type': 'lnN', + 'value': 1.01, + 'procs': '.*', +} diff --git a/examples/FCCee/higgs/mass_xsec/final_selection_recoil.py b/examples/FCCee/higgs/mass_xsec/final_selection_recoil.py new file mode 100644 index 0000000000..540fbcd112 --- /dev/null +++ b/examples/FCCee/higgs/mass_xsec/final_selection_recoil.py @@ -0,0 +1,49 @@ + +flavor = "mumu" # mumu, ee + +#Input directory where the files produced at the pre-selection level are +inputDir = f"outputs/FCCee/higgs/mass-xsec/preselection/{flavor}/" + +#Input directory where the files produced at the pre-selection level are +#Optional: output directory, default is local running directory +outputDir = f"outputs/FCCee/higgs/mass-xsec/final_selection/{flavor}/" + +# if no processList or empty dictionary provided, run over all ROOT files in the input directory +processList = {} + +#Link to the dictonary that contains all the cross section informations etc... +procDict = "FCCee_procDict_winter2023_IDEA.json" + +#Number of CPUs to use +nCPUS = -1 + +#produces ROOT TTrees, default is False +doTree = False + + +# scale the histograms with the cross-section and integrated luminosity +doScale = True +intLumi = 7200000.0 # 7.2 /ab + +saveTabular = True + +###Dictionnay of the list of cuts. 
The key is the name of the selection that will be added to the output file +sel0 = "(zll_m > 86 && zll_m < 96)" +sel1 = "(zll_p > 20 && zll_p < 70)" +sel2 = "(cosTheta_miss < 0.98)" +sel3 = "(zll_recoil_m < 140 && zll_recoil_m > 120)" +cutList = { + "sel0": f"{sel0}", + "sel1": f"{sel0} && {sel1}", + "sel2": f"{sel0} && {sel1} && {sel2}", + "sel3": f"{sel0} && {sel1} && {sel2} && {sel3}" +} + + +#Dictionary for the ouput variable/hitograms. The key is the name of the variable in the output files. "name" is the name of the variable in the input file, "title" is the x-axis label of the histogram, "bin" the number of bins of the histogram, "xmin" the minimum x-axis value and "xmax" the maximum x-axis value. +histoList = { + "zll_m":{"cols": ["zll_m"], "title": "m_{Z} (GeV)", "bins": [(250,0,250)]}, + "zll_p":{"cols": ["zll_p"], "title": "p_{Z} (GeV)", "bins": [(250,0,250)]}, + "zll_recoil_m":{"cols": ["zll_recoil_m"], "title": "Recoil (GeV)", "bins": [(250,0,250)]}, + "zll_recoil_m_final":{"cols": ["zll_recoil_m"], "title": "Recoil (GeV)", "bins": [(200,120,140)]}, +} diff --git a/examples/FCCee/higgs/mass_xsec/functions.h b/examples/FCCee/higgs/mass_xsec/functions.h new file mode 100644 index 0000000000..c06b52f45c --- /dev/null +++ b/examples/FCCee/higgs/mass_xsec/functions.h @@ -0,0 +1,256 @@ +#ifndef ZHfunctions_H +#define ZHfunctions_H + +#include +#include +#include + +#include "TLorentzVector.h" +#include "ROOT/RVec.hxx" +#include "edm4hep/ReconstructedParticleData.h" +#include "edm4hep/MCParticleData.h" +#include "edm4hep/ParticleIDData.h" +#include "ReconstructedParticle2MC.h" + + +namespace FCCAnalyses { namespace ZHfunctions { + + +// build the Z resonance based on the available leptons. 
Returns the best lepton pair compatible with the Z mass and recoil at 125 GeV
+// technically, it returns a ReconstructedParticleData object with index 0 the di-lepton system, index and 2 the leptons of the pair
+struct resonanceBuilder_mass_recoil {
+    float m_resonance_mass;
+    float m_recoil_mass;
+    float chi2_recoil_frac;
+    float ecm;
+    bool m_use_MC_Kinematics;
+    resonanceBuilder_mass_recoil(float arg_resonance_mass, float arg_recoil_mass, float arg_chi2_recoil_frac, float arg_ecm, bool arg_use_MC_Kinematics);
+    Vec_rp operator()(Vec_rp legs, Vec_i recind, Vec_i mcind, Vec_rp reco, Vec_mc mc, Vec_i parents, Vec_i daugthers) ;
+};
+
+resonanceBuilder_mass_recoil::resonanceBuilder_mass_recoil(float arg_resonance_mass, float arg_recoil_mass, float arg_chi2_recoil_frac, float arg_ecm, bool arg_use_MC_Kinematics) {m_resonance_mass = arg_resonance_mass, m_recoil_mass = arg_recoil_mass, chi2_recoil_frac = arg_chi2_recoil_frac, ecm = arg_ecm, m_use_MC_Kinematics = arg_use_MC_Kinematics;}
+
+Vec_rp resonanceBuilder_mass_recoil::resonanceBuilder_mass_recoil::operator()(Vec_rp legs, Vec_i recind, Vec_i mcind, Vec_rp reco, Vec_mc mc, Vec_i parents, Vec_i daugthers) {
+
+    Vec_rp result;
+    result.reserve(3);
+    std::vector<std::vector<int>> pairs; // for each permutation, add the indices of the muons
+    int n = legs.size();
+
+    if(n > 1) {
+        ROOT::VecOps::RVec<bool> v(n);
+        std::fill(v.end() - 2, v.end(), true); // helper variable for permutations
+        do {
+            std::vector<int> pair;
+            rp reso;
+            reso.charge = 0;
+            TLorentzVector reso_lv;
+            for(int i = 0; i < n; ++i) {
+                if(v[i]) {
+                    pair.push_back(i);
+                    reso.charge += legs[i].charge;
+                    TLorentzVector leg_lv;
+
+                    if(m_use_MC_Kinematics) { // MC kinematics
+                        int track_index = legs[i].tracks_begin; // index in the Track array
+                        int mc_index = ReconstructedParticle2MC::getTrack2MC_index(track_index, recind, mcind, reco);
+                        if (mc_index >= 0 && mc_index < mc.size()) {
+                            leg_lv.SetXYZM(mc.at(mc_index).momentum.x, mc.at(mc_index).momentum.y, 
mc.at(mc_index).momentum.z, mc.at(mc_index).mass); + } + } + else { // reco kinematics + leg_lv.SetXYZM(legs[i].momentum.x, legs[i].momentum.y, legs[i].momentum.z, legs[i].mass); + } + + reso_lv += leg_lv; + } + } + + if(reso.charge != 0) continue; // neglect non-zero charge pairs + reso.momentum.x = reso_lv.Px(); + reso.momentum.y = reso_lv.Py(); + reso.momentum.z = reso_lv.Pz(); + reso.mass = reso_lv.M(); + result.emplace_back(reso); + pairs.push_back(pair); + + } while(std::next_permutation(v.begin(), v.end())); + } + else { + std::cout << "ERROR: resonanceBuilder_mass_recoil, at least two leptons required." << std::endl; + exit(1); + } + + if(result.size() > 1) { + Vec_rp bestReso; + + int idx_min = -1; + float d_min = 9e9; + for (int i = 0; i < result.size(); ++i) { + + // calculate recoil + auto recoil_p4 = TLorentzVector(0, 0, 0, ecm); + TLorentzVector tv1; + tv1.SetXYZM(result.at(i).momentum.x, result.at(i).momentum.y, result.at(i).momentum.z, result.at(i).mass); + recoil_p4 -= tv1; + + auto recoil_fcc = edm4hep::ReconstructedParticleData(); + recoil_fcc.momentum.x = recoil_p4.Px(); + recoil_fcc.momentum.y = recoil_p4.Py(); + recoil_fcc.momentum.z = recoil_p4.Pz(); + recoil_fcc.mass = recoil_p4.M(); + + TLorentzVector tg; + tg.SetXYZM(result.at(i).momentum.x, result.at(i).momentum.y, result.at(i).momentum.z, result.at(i).mass); + + float boost = tg.P(); + float mass = std::pow(result.at(i).mass - m_resonance_mass, 2); // mass + float rec = std::pow(recoil_fcc.mass - m_recoil_mass, 2); // recoil + float d = (1.0-chi2_recoil_frac)*mass + chi2_recoil_frac*rec; + + if(d < d_min) { + d_min = d; + idx_min = i; + } + } + if(idx_min > -1) { + bestReso.push_back(result.at(idx_min)); + auto & l1 = legs[pairs[idx_min][0]]; + auto & l2 = legs[pairs[idx_min][1]]; + bestReso.emplace_back(l1); + bestReso.emplace_back(l2); + } + else { + std::cout << "ERROR: resonanceBuilder_mass_recoil, no mininum found." 
<< std::endl;
+            exit(1);
+        }
+        return bestReso;
+    }
+    else {
+        auto & l1 = legs[0];
+        auto & l2 = legs[1];
+        result.emplace_back(l1);
+        result.emplace_back(l2);
+        return result;
+    }
+}
+
+
+
+
+struct sel_iso {
+    sel_iso(float arg_max_iso);
+    float m_max_iso = .25;
+    Vec_rp operator() (Vec_rp in, Vec_f iso);
+  };
+
+sel_iso::sel_iso(float arg_max_iso) : m_max_iso(arg_max_iso) {};
+ROOT::VecOps::RVec<edm4hep::ReconstructedParticleData> sel_iso::operator() (Vec_rp in, Vec_f iso) {
+    Vec_rp result;
+    result.reserve(in.size());
+    for (size_t i = 0; i < in.size(); ++i) {
+        auto & p = in[i];
+        if (iso[i] < m_max_iso) {
+            result.emplace_back(p);
+        }
+    }
+    return result;
+}
+
+
+// compute the cone isolation for reco particles
+struct coneIsolation {
+
+    coneIsolation(float arg_dr_min, float arg_dr_max);
+    double deltaR(double eta1, double phi1, double eta2, double phi2) { return TMath::Sqrt(TMath::Power(eta1-eta2, 2) + (TMath::Power(phi1-phi2, 2))); };
+
+    float dr_min = 0;
+    float dr_max = 0.4;
+    Vec_f operator() (Vec_rp in, Vec_rp rps) ;
+};
+
+coneIsolation::coneIsolation(float arg_dr_min, float arg_dr_max) : dr_min(arg_dr_min), dr_max( arg_dr_max ) { };
+Vec_f coneIsolation::coneIsolation::operator() (Vec_rp in, Vec_rp rps) {
+
+    Vec_f result;
+    result.reserve(in.size());
+
+    std::vector<ROOT::Math::PxPyPzEVector> lv_reco;
+    std::vector<ROOT::Math::PxPyPzEVector> lv_charged;
+    std::vector<ROOT::Math::PxPyPzEVector> lv_neutral;
+
+    for(size_t i = 0; i < rps.size(); ++i) {
+        ROOT::Math::PxPyPzEVector tlv;
+        tlv.SetPxPyPzE(rps.at(i).momentum.x, rps.at(i).momentum.y, rps.at(i).momentum.z, rps.at(i).energy);
+
+        if(rps.at(i).charge == 0) lv_neutral.push_back(tlv);
+        else lv_charged.push_back(tlv);
+    }
+
+    for(size_t i = 0; i < in.size(); ++i) {
+        ROOT::Math::PxPyPzEVector tlv;
+        tlv.SetPxPyPzE(in.at(i).momentum.x, in.at(i).momentum.y, in.at(i).momentum.z, in.at(i).energy);
+        lv_reco.push_back(tlv);
+    }
+
+    // compute the isolation (see https://github.com/delphes/delphes/blob/master/modules/Isolation.cc#L154)
+    for (auto & lv_reco_ : lv_reco) {
+        double sumNeutral = 0.0;
+        double 
sumCharged = 0.0; + // charged + for (auto & lv_charged_ : lv_charged) { + double dr = coneIsolation::deltaR(lv_reco_.Eta(), lv_reco_.Phi(), lv_charged_.Eta(), lv_charged_.Phi()); + if(dr > dr_min && dr < dr_max) sumCharged += lv_charged_.P(); + } + + // neutral + for (auto & lv_neutral_ : lv_neutral) { + double dr = coneIsolation::deltaR(lv_reco_.Eta(), lv_reco_.Phi(), lv_neutral_.Eta(), lv_neutral_.Phi()); + if(dr > dr_min && dr < dr_max) sumNeutral += lv_neutral_.P(); + } + double sum = sumCharged + sumNeutral; + double ratio= sum / lv_reco_.P(); + result.emplace_back(ratio); + } + return result; +} + + + +// returns missing energy vector, based on reco particles +Vec_rp missingEnergy(float ecm, Vec_rp in, float p_cutoff = 0.0) { + float px = 0, py = 0, pz = 0, e = 0; + for(auto &p : in) { + if (std::sqrt(p.momentum.x * p.momentum.x + p.momentum.y*p.momentum.y) < p_cutoff) continue; + px += -p.momentum.x; + py += -p.momentum.y; + pz += -p.momentum.z; + e += p.energy; + } + + Vec_rp ret; + rp res; + res.momentum.x = px; + res.momentum.y = py; + res.momentum.z = pz; + res.energy = ecm-e; + ret.emplace_back(res); + return ret; +} + +// calculate the cosine(theta) of the missing energy vector +float get_cosTheta_miss(Vec_rp met){ + float costheta = 0.; + if(met.size() > 0) { + TLorentzVector lv_met; + lv_met.SetPxPyPzE(met[0].momentum.x, met[0].momentum.y, met[0].momentum.z, met[0].energy); + costheta = fabs(std::cos(lv_met.Theta())); + } + return costheta; +} + + + +}} + +#endif \ No newline at end of file diff --git a/examples/FCCee/higgs/mass_xsec/histmaker_recoil.py b/examples/FCCee/higgs/mass_xsec/histmaker_recoil.py new file mode 100644 index 0000000000..6bcfd5002e --- /dev/null +++ b/examples/FCCee/higgs/mass_xsec/histmaker_recoil.py @@ -0,0 +1,189 @@ + +flavor = "mumu" # mumu, ee + +# list of processes (mandatory) +processList_mumu = { + 'p8_ee_ZZ_ecm240':{'fraction': 1}, + 'p8_ee_WW_ecm240':{'fraction': 1}, + 'wzp6_ee_mumuH_ecm240':{'fraction': 1}, +} + 
+processList_ee = { + 'p8_ee_ZZ_ecm240':{'fraction': 1}, + 'p8_ee_WW_ecm240':{'fraction': 1}, + 'wzp6_ee_eeH_ecm240':{'fraction': 1}, +} + +if flavor == "mumu": + processList = processList_mumu +else: + processList = processList_ee + +# Production tag when running over EDM4Hep centrally produced events, this points to the yaml files for getting sample statistics (mandatory) +prodTag = "FCCee/winter2023/IDEA/" + +# Link to the dictonary that contains all the cross section informations etc... (mandatory) +procDict = "FCCee_procDict_winter2023_IDEA.json" + +# additional/custom C++ functions, defined in header files (optional) +includePaths = ["functions.h"] + +# Define the input dir (optional) +#inputDir = "outputs/FCCee/higgs/mH-recoil/mumu/stage1" +#inputDir = "localSamples/" + +#Optional: output directory, default is local running directory +outputDir = f"outputs/FCCee/higgs/mass-xsec/histmaker/{flavor}/" + + +# optional: ncpus, default is 4, -1 uses all cores available +nCPUS = -1 + +# scale the histograms with the cross-section and integrated luminosity +doScale = True +intLumi = 7200000.0 # 7.2 /ab + + +# define some binning for various histograms +bins_p_mu = (250, 0, 250) # 100 MeV bins +bins_m_ll = (250, 0, 250) # 100 MeV bins +bins_p_ll = (250, 0, 250) # 100 MeV bins +bins_recoil = (250, 0, 250) # 1 GeV bins +bins_cosThetaMiss = (10000, 0, 1) + +bins_theta = (500, -5, 5) +bins_eta = (600, -3, 3) +bins_phi = (500, -5, 5) + +bins_count = (50, 0, 50) +bins_charge = (10, -5, 5) +bins_iso = (500, 0, 5) + +bins_recoil_final = (200, 120, 140) # 100 MeV bins + + +# build_graph function that contains the analysis logic, cuts and histograms (mandatory) +def build_graph(df, dataset): + + results = [] + df = df.Define("weight", "1.0") + weightsum = df.Sum("weight") + + # define some aliases to be used later on + df = df.Alias("Particle0", "Particle#0.index") + df = df.Alias("Particle1", "Particle#1.index") + df = df.Alias("MCRecoAssociations0", 
"MCRecoAssociations#0.index") + df = df.Alias("MCRecoAssociations1", "MCRecoAssociations#1.index") + df = df.Alias("Muon0", "Muon#0.index") + + # get all the leptons from the collection + df = df.Define("muons_all", "FCCAnalyses::ReconstructedParticle::get(Muon0, ReconstructedParticles)") + + # select leptons with momentum > 20 GeV + df = df.Define("muons", "FCCAnalyses::ReconstructedParticle::sel_p(20)(muons_all)") + df = df.Define("muons_p", "FCCAnalyses::ReconstructedParticle::get_p(muons)") + df = df.Define("muons_theta", "FCCAnalyses::ReconstructedParticle::get_theta(muons)") + df = df.Define("muons_phi", "FCCAnalyses::ReconstructedParticle::get_phi(muons)") + df = df.Define("muons_q", "FCCAnalyses::ReconstructedParticle::get_charge(muons)") + df = df.Define("muons_no", "FCCAnalyses::ReconstructedParticle::get_n(muons)") + + + # compute the muon isolation and store muons with an isolation cut of 0.25 in a separate column muons_sel_iso + df = df.Define("muons_iso", "FCCAnalyses::ZHfunctions::coneIsolation(0.01, 0.5)(muons, ReconstructedParticles)") + df = df.Define("muons_sel_iso", "FCCAnalyses::ZHfunctions::sel_iso(0.25)(muons, muons_iso)") + + + # baseline histograms, before any selection cuts (store with _cut0) + results.append(df.Histo1D(("muons_p_cut0", "", *bins_p_mu), "muons_p")) + results.append(df.Histo1D(("muons_theta_cut0", "", *bins_theta), "muons_theta")) + results.append(df.Histo1D(("muons_phi_cut0", "", *bins_phi), "muons_phi")) + results.append(df.Histo1D(("muons_q_cut0", "", *bins_charge), "muons_q")) + results.append(df.Histo1D(("muons_no_cut0", "", *bins_count), "muons_no")) + results.append(df.Histo1D(("muons_iso_cut0", "", *bins_iso), "muons_iso")) + + + ######### + ### CUT 0: all events + ######### + df = df.Define("cut0", "0") + results.append(df.Histo1D(("cutFlow", "", *bins_count), "cut0")) + + ######### + ### CUT 1: at least 1 muon with at least one isolated one + ######### + df = df.Filter("muons_no >= 1 && muons_sel_iso.size() > 0") 
+ df = df.Define("cut1", "1") + results.append(df.Histo1D(("cutFlow", "", *bins_count), "cut1")) + + + ######### + ### CUT 2 :at least 2 opposite-sign (OS) leptons + ######### + df = df.Filter("muons_no >= 2 && abs(Sum(muons_q)) < muons_q.size()") + df = df.Define("cut2", "2") + results.append(df.Histo1D(("cutFlow", "", *bins_count), "cut2")) + + # now we build the Z resonance based on the available leptons. + # the function resonanceBuilder_mass_recoil returns the best lepton pair compatible with the Z mass (91.2 GeV) and recoil at 125 GeV + # the argument 0.4 gives a weight to the Z mass and the recoil mass in the chi2 minimization + # technically, it returns a ReconstructedParticleData object with index 0 the di-lepton system, index and 2 the leptons of the pair + df = df.Define("zbuilder_result", "FCCAnalyses::ZHfunctions::resonanceBuilder_mass_recoil(91.2, 125, 0.4, 240, false)(muons, MCRecoAssociations0, MCRecoAssociations1, ReconstructedParticles, Particle, Particle0, Particle1)") + df = df.Define("zll", "Vec_rp{zbuilder_result[0]}") # the Z + df = df.Define("zll_muons", "Vec_rp{zbuilder_result[1],zbuilder_result[2]}") # the leptons + df = df.Define("zll_m", "FCCAnalyses::ReconstructedParticle::get_mass(zll)[0]") # Z mass + df = df.Define("zll_p", "FCCAnalyses::ReconstructedParticle::get_p(zll)[0]") # momentum of the Z + df = df.Define("zll_recoil", "FCCAnalyses::ReconstructedParticle::recoilBuilder(240)(zll)") # compute the recoil based on the reconstructed Z + df = df.Define("zll_recoil_m", "FCCAnalyses::ReconstructedParticle::get_mass(zll_recoil)[0]") # recoil mass + df = df.Define("zll_muons_p", "FCCAnalyses::ReconstructedParticle::get_p(zll_muons)") # get the momentum of the 2 muons from the Z resonance + + + ######### + ### CUT 3: Z mass window + ######### + results.append(df.Histo1D(("zll_m_cut2", "", *bins_m_ll), "zll_m")) + df = df.Filter("zll_m > 86 && zll_m < 96") + df = df.Define("cut3", "3") + results.append(df.Histo1D(("cutFlow", "", 
*bins_count), "cut3")) + + + ######### + ### CUT 4: Z momentum + ######### + results.append(df.Histo1D(("zll_p_cut3", "", *bins_p_ll), "zll_p")) + df = df.Filter("zll_p > 20 && zll_p < 70") + df = df.Define("cut4", "4") + results.append(df.Histo1D(("cutFlow", "", *bins_count), "cut4")) + + + ######### + ### CUT 5: cosThetaMiss + ######### + df = df.Define("missingEnergy", "FCCAnalyses::ZHfunctions::missingEnergy(240., ReconstructedParticles)") + df = df.Define("cosTheta_miss", "FCCAnalyses::ZHfunctions::get_cosTheta_miss(missingEnergy)") + results.append(df.Histo1D(("cosThetaMiss_cut4", "", *bins_cosThetaMiss), "cosTheta_miss")) # plot it before the cut + + df = df.Filter("cosTheta_miss < 0.98") + df = df.Define("cut5", "5") + results.append(df.Histo1D(("cutFlow", "", *bins_count), "cut5")) + + + ######### + ### CUT 6: recoil mass window + ######### + results.append(df.Histo1D(("zll_recoil_m", "", *bins_recoil), "zll_recoil_m")) # plot it before the cut + df = df.Filter("zll_recoil_m < 140 && zll_recoil_m > 120") + df = df.Define("cut6", "6") + results.append(df.Histo1D(("cutFlow", "", *bins_count), "cut6")) + + + ######################## + # Final histograms + ######################## + results.append(df.Histo1D(("zll_m_final", "", *bins_m_ll), "zll_m")) + results.append(df.Histo1D(("zll_recoil_m_final", "", *bins_recoil_final), "zll_recoil_m")) + results.append(df.Histo1D(("zll_p_final", "", *bins_p_ll), "zll_p")) + results.append(df.Histo1D(("zll_muons_p_final", "", *bins_p_mu), "zll_muons_p")) + + + return results, weightsum + diff --git a/examples/FCCee/higgs/mass_xsec/plots_recoil.py b/examples/FCCee/higgs/mass_xsec/plots_recoil.py new file mode 100644 index 0000000000..47a8205357 --- /dev/null +++ b/examples/FCCee/higgs/mass_xsec/plots_recoil.py @@ -0,0 +1,102 @@ +import ROOT + +flavor = "mumu" # mumu, ee + + +# global parameters +intLumi = 1. 
+intLumiLabel = "L = 7.2 ab^{-1}" +ana_tex = 'e^{+}e^{-} #rightarrow ZH #rightarrow #mu^{+}#mu^{-} + X' +delphesVersion = '3.4.2' +energy = 240.0 +collider = 'FCC-ee' +inputDir = f"outputs/FCCee/higgs/mass-xsec/histmaker/{flavor}/" +formats = ['png','pdf'] +outdir = f"outputs/FCCee/higgs/mass-xsec/plots/{flavor}/" +plotStatUnc = True + +colors = {} +colors['ZH'] = ROOT.kRed +colors['WW'] = ROOT.kBlue+1 +colors['ZZ'] = ROOT.kGreen+2 + +procs = {} +procs['signal'] = {'ZH':['wzp6_ee_mumuH_ecm240']} +procs['backgrounds'] = {'WW':['p8_ee_WW_ecm240'], 'ZZ':['p8_ee_ZZ_ecm240']} + + +legend = {} +legend['ZH'] = 'ZH' +legend['WW'] = 'WW' +legend['ZZ'] = 'ZZ' + + + +hists = {} + +hists["zll_recoil_m"] = { + "output": "zll_recoil_m", + "logy": False, + "stack": True, + "rebin": 100, + "xmin": 120, + "xmax": 140, + "ymin": 0, + "ymax": 2500, + "xtitle": "Recoil (GeV)", + "ytitle": "Events / 100 MeV", +} + +hists["zll_p"] = { + "output": "zll_p", + "logy": False, + "stack": True, + "rebin": 2, + "xmin": 0, + "xmax": 80, + "ymin": 0, + "ymax": 2000, + "xtitle": "p(#mu^{#plus}#mu^{#minus}) (GeV)", + "ytitle": "Events ", +} + +hists["zll_m"] = { + "output": "zll_m", + "logy": False, + "stack": True, + "rebin": 2, + "xmin": 86, + "xmax": 96, + "ymin": 0, + "ymax": 3000, + "xtitle": "m(#mu^{#plus}#mu^{#minus}) (GeV)", + "ytitle": "Events ", +} + +hists["cosThetaMiss_cut4"] = { + "output": "cosThetaMiss_cut4", + "logy": True, + "stack": True, + "rebin": 10, + "xmin": 0, + "xmax": 1, + "ymin": 10, + "ymax": 100000, + "xtitle": "cos(#theta_{miss})", + "ytitle": "Events ", + "extralab": "Before cos(#theta_{miss}) cut", +} + + +hists["cutFlow"] = { + "output": "cutFlow", + "logy": True, + "stack": False, + "xmin": 0, + "xmax": 6, + "ymin": 1e4, + "ymax": 1e11, + "xtitle": ["All events", "#geq 1 #mu^{#pm} + ISO", "#geq 2 #mu^{#pm} + OS", "86 < m_{#mu^{+}#mu^{#minus}} < 96", "20 < p_{#mu^{+}#mu^{#minus}} < 70", "|cos#theta_{miss}| < 0.98", "120 < m_{rec} < 140"], + "ytitle": "Events ", + 
"scaleSig": 10 +} diff --git a/examples/FCCee/higgs/mass_xsec/preselection_recoil.py b/examples/FCCee/higgs/mass_xsec/preselection_recoil.py new file mode 100644 index 0000000000..7f6fa210bc --- /dev/null +++ b/examples/FCCee/higgs/mass_xsec/preselection_recoil.py @@ -0,0 +1,96 @@ + +flavor = "mumu" # mumu, ee + +# list of processes (mandatory) +processList_mumu = { + 'p8_ee_ZZ_ecm240':{'fraction': 1}, + 'p8_ee_WW_ecm240':{'fraction': 1}, + 'wzp6_ee_mumuH_ecm240':{'fraction': 1}, +} + +processList_ee = { + 'p8_ee_ZZ_ecm240':{'fraction': 1}, + 'p8_ee_WW_ecm240':{'fraction': 1}, + 'wzp6_ee_eeH_ecm240':{'fraction': 1}, +} + +if flavor == "mumu": + processList = processList_mumu +else: + processList = processList_ee + +# Production tag when running over EDM4Hep centrally produced events, this points to the yaml files for getting sample statistics (mandatory) +prodTag = "FCCee/winter2023/IDEA/" + +# Link to the dictonary that contains all the cross section informations etc... (mandatory) +procDict = "FCCee_procDict_winter2023_IDEA.json" + +# Additional/custom C++ functions, defined in header files +includePaths = ["functions.h"] + +# Output directory +outputDir = f"outputs/FCCee/higgs/mass-xsec/preselection/{flavor}/" + +# Multithreading: -1 means using all cores +nCPUS = -1 + +# Batch settings +#runBatch = False +#batchQueue = "longlunch" +#compGroup = "group_u_FCC.local_gen" + +class RDFanalysis(): + + # encapsulate analysis logic, definitions and filters in the dataframe + def analysers(df): + + # define some aliases to be used later on + df = df.Alias("Particle0", "Particle#0.index") + df = df.Alias("Particle1", "Particle#1.index") + df = df.Alias("MCRecoAssociations0", "MCRecoAssociations#0.index") + df = df.Alias("MCRecoAssociations1", "MCRecoAssociations#1.index") + df = df.Alias("Muon0", "Muon#0.index") + + # get all the leptons from the collection + df = df.Define("muons_all", "FCCAnalyses::ReconstructedParticle::get(Muon0, ReconstructedParticles)") + + # 
select leptons with momentum > 20 GeV + df = df.Define("muons", "FCCAnalyses::ReconstructedParticle::sel_p(20)(muons_all)") + df = df.Define("muons_p", "FCCAnalyses::ReconstructedParticle::get_p(muons)") + df = df.Define("muons_theta", "FCCAnalyses::ReconstructedParticle::get_theta(muons)") + df = df.Define("muons_phi", "FCCAnalyses::ReconstructedParticle::get_phi(muons)") + df = df.Define("muons_q", "FCCAnalyses::ReconstructedParticle::get_charge(muons)") + df = df.Define("muons_no", "FCCAnalyses::ReconstructedParticle::get_n(muons)") + + # compute the muon isolation and store muons with an isolation cut of 0.25 in a separate column muons_sel_iso + df = df.Define("muons_iso", "FCCAnalyses::ZHfunctions::coneIsolation(0.01, 0.5)(muons, ReconstructedParticles)") + df = df.Define("muons_sel_iso", "FCCAnalyses::ZHfunctions::sel_iso(0.25)(muons, muons_iso)") + + + # Basic selection: at least 2 OS muons, one isolated + df = df.Filter("muons_no >= 1 && muons_sel_iso.size() > 0") + df = df.Filter("muons_no >= 2 && abs(Sum(muons_q)) < muons_q.size()") + + + # now we build the Z resonance based on the available leptons. 
+ # the function resonanceBuilder_mass_recoil returns the best lepton pair compatible with the Z mass (91.2 GeV) and recoil at 125 GeV + # the argument 0.4 gives a weight to the Z mass and the recoil mass in the chi2 minimization + # technically, it returns a ReconstructedParticleData object with index 0 the di-lepton system, index 1 and 2 the leptons of the pair + df = df.Define("zbuilder_result", "FCCAnalyses::ZHfunctions::resonanceBuilder_mass_recoil(91.2, 125, 0.4, 240, false)(muons, MCRecoAssociations0, MCRecoAssociations1, ReconstructedParticles, Particle, Particle0, Particle1)") + df = df.Define("zll", "Vec_rp{zbuilder_result[0]}") # the Z + df = df.Define("zll_muons", "Vec_rp{zbuilder_result[1],zbuilder_result[2]}") # the leptons + df = df.Define("zll_m", "FCCAnalyses::ReconstructedParticle::get_mass(zll)[0]") # Z mass + df = df.Define("zll_p", "FCCAnalyses::ReconstructedParticle::get_p(zll)[0]") # momentum of the Z + df = df.Define("zll_recoil", "FCCAnalyses::ReconstructedParticle::recoilBuilder(240)(zll)") # compute the recoil based on the reconstructed Z + df = df.Define("zll_recoil_m", "FCCAnalyses::ReconstructedParticle::get_mass(zll_recoil)[0]") # recoil mass + df = df.Define("zll_muons_p", "FCCAnalyses::ReconstructedParticle::get_p(zll_muons)") # get the momentum of the 2 muons from the Z resonance + + df = df.Define("missingEnergy", "FCCAnalyses::ZHfunctions::missingEnergy(240., ReconstructedParticles)") + df = df.Define("cosTheta_miss", "FCCAnalyses::ZHfunctions::get_cosTheta_miss(missingEnergy)") + + return df + + # define output branches to be saved + def output(): + branchList = ["zll_m", "zll_p", "cosTheta_miss", "zll_recoil_m"] + return branchList diff --git a/examples/FCCee/higgs/mva/evaluate_bdt.py b/examples/FCCee/higgs/mva/evaluate_bdt.py new file mode 100644 index 0000000000..7eb7e63dcc --- /dev/null +++ b/examples/FCCee/higgs/mva/evaluate_bdt.py @@ -0,0 +1,104 @@ +import sys, os +import numpy as np +import matplotlib.pyplot as plt +import 
pandas as pd +import sklearn +import pickle +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", type=str, default="outputs/FCCee/higgs/mva/bdt_model_example.pkl", help="Input pkl file") +parser.add_argument("-o", "--outDir", type=str, default="outputs/FCCee/higgs/mva/plots_training", help="Output directory") +args = parser.parse_args() + + +def plot_roc(): + + train_probs = bdt.predict_proba(train_data) + train_preds = train_probs[:,1] + train_fpr, train_tpr, threshold = sklearn.metrics.roc_curve(train_labels, train_preds) + train_roc_auc = sklearn.metrics.auc(train_fpr, train_tpr) + + test_probs = bdt.predict_proba(test_data) + test_preds = test_probs[:,1] + test_fpr, test_tpr, threshold = sklearn.metrics.roc_curve(test_labels, test_preds) + test_roc_auc = sklearn.metrics.auc(test_fpr, test_tpr) + + # Plot the ROC curve + plt.figure(figsize=(8, 6)) + plt.plot(train_fpr, train_tpr, color='blue', label=f"Training ROC (AUC = {train_roc_auc:.2f})") + plt.plot(test_fpr, test_tpr, color='red', label=f"Testing ROC (AUC = {test_roc_auc:.2f})") + plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess') + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver Operating Characteristic (ROC) Curve') + plt.legend() + plt.grid() + plt.savefig(f"{outDir}/roc.png") + plt.savefig(f"{outDir}/roc.pdf") + plt.close() + + +def plot_score(): + + train_predictions = bdt.predict_proba(train_data)[:,1] + test_predictions = bdt.predict_proba(test_data)[:,1] + + # Separate the data into signal and background samples + train_signal_scores = train_predictions[train_labels == 1] + train_background_scores = train_predictions[train_labels == 0] + test_signal_scores = test_predictions[test_labels == 1] + test_background_scores = test_predictions[test_labels == 0] + + # Plot the BDT scores for signal and background events + plt.figure(figsize=(8, 6)) + plt.hist(train_signal_scores, bins=50, range=(0, 
1), histtype='step', label='Training Signal', color='blue', density=True) + plt.hist(train_background_scores, bins=50, range=(0, 1), histtype='step', label='Training Background', color='red', density=True) + plt.hist(test_signal_scores, bins=50, range=(0, 1), histtype='step', label='Testing Signal', color='blue', linestyle='dashed', density=True) + plt.hist(test_background_scores, bins=50, range=(0, 1), histtype='step', label='Testing Background', color='red', linestyle='dashed', density=True) + plt.xlabel('BDT Score') + plt.ylabel('Number of Events (normalized)') + plt.title('BDT Score Distribution') + plt.legend() + plt.grid() + plt.savefig(f"{outDir}/score.png") + plt.savefig(f"{outDir}/score.pdf") + plt.close() + +def plot_importance(): + + fig, ax = plt.subplots(figsize=(12, 6)) + + importance = bdt.get_booster().get_score(importance_type='weight') + sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=False) + sorted_indices = [int(x[0][1:]) for x in sorted_importance] # sorted indices + + # Get the sorted variable names and their corresponding importances + sorted_vars = [variables[i] for i in sorted_indices] + sorted_values = [x[1] for x in sorted_importance] + + # Create a DataFrame and plot the feature importances + importance_df = pd.DataFrame({'Variable': sorted_vars, 'Importance': sorted_values}) + importance_df.plot(kind='barh', x='Variable', y='Importance', legend=None, ax=ax) + ax.set_xlabel('BDT score') + ax.set_title("BDT variable scores", fontsize=16) + plt.savefig(f"{outDir}/importance.png") + plt.savefig(f"{outDir}/importance.pdf") + plt.close() + + + +if __name__ == "__main__": + outDir = args.outDir + + res = pickle.load(open(args.input, "rb")) + bdt = res['model'] + train_data = res['train_data'] + test_data = res['test_data'] + train_labels = res['train_labels'] + test_labels = res['test_labels'] + variables = res['variables'] + + plot_score() + plot_roc() + plot_importance() \ No newline at end of file diff --git 
a/examples/FCCee/higgs/mva/final_selection.py b/examples/FCCee/higgs/mva/final_selection.py new file mode 100644 index 0000000000..ebd5d20054 --- /dev/null +++ b/examples/FCCee/higgs/mva/final_selection.py @@ -0,0 +1,43 @@ + + +#Input directory where the files produced at the pre-selection level are +inputDir = f"outputs/FCCee/higgs/mva/preselection/" + +#Input directory where the files produced at the pre-selection level are +#Optional: output directory, default is local running directory +outputDir = f"outputs/FCCee/higgs/mva/final_selection/" + +# if no processList or empty dictionary provided, run over all ROOT files in the input directory +processList = {} + +#Link to the dictionary that contains all the cross section information etc... +procDict = "FCCee_procDict_winter2023_IDEA.json" + +#Number of CPUs to use +nCPUS = -1 + +#produces ROOT TTrees, default is False +doTree = False + + +# scale the histograms with the cross-section and integrated luminosity +doScale = True +intLumi = 7200000.0 # 7.2 /ab + +saveTabular = True + +###Dictionary of the list of cuts. The key is the name of the selection that will be added to the output file +cutList = { + "sel0": "1==1", + "sel1": "mva_score[0] > 0.5", +} + + +#Dictionary for the output variable/histograms. The key is the name of the variable in the output files. "name" is the name of the variable in the input file, "title" is the x-axis label of the histogram, "bin" the number of bins of the histogram, "xmin" the minimum x-axis value and "xmax" the maximum x-axis value. 
+histoList = { + "mva_score":{"cols": ["mva_score"], "title": "MVA score", "bins": [(100,0,1)]}, + "zmumu_m":{"cols": ["zmumu_m"], "title": "m_{Z} (GeV)", "bins": [(250,0,250)]}, + "zmumu_p":{"cols": ["zmumu_p"], "title": "p_{Z} (GeV)", "bins": [(250,0,250)]}, + "zmumu_recoil_m":{"cols": ["zmumu_recoil_m"], "title": "Recoil (GeV)", "bins": [(250,0,250)]}, + "zmumu_recoil_m_final":{"cols": ["zmumu_recoil_m"], "title": "Recoil (GeV)", "bins": [(200,120,140)]}, +} diff --git a/examples/FCCee/higgs/mva/functions.h b/examples/FCCee/higgs/mva/functions.h new file mode 100644 index 0000000000..5e9faaee56 --- /dev/null +++ b/examples/FCCee/higgs/mva/functions.h @@ -0,0 +1,85 @@ +#ifndef ZHfunctions_H +#define ZHfunctions_H + +#include +#include +#include + +#include "TLorentzVector.h" +#include "ROOT/RVec.hxx" +#include "edm4hep/ReconstructedParticleData.h" +#include "edm4hep/MCParticleData.h" +#include "edm4hep/ParticleIDData.h" +#include "ReconstructedParticle2MC.h" + + +namespace FCCAnalyses { + +// acolinearity between two reco particles +float acolinearity(Vec_rp in) { + if(in.size() < 2) return -999; + + TLorentzVector p1; + p1.SetXYZM(in[0].momentum.x, in[0].momentum.y, in[0].momentum.z, in[0].mass); + + TLorentzVector p2; + p2.SetXYZM(in[1].momentum.x, in[1].momentum.y, in[1].momentum.z, in[1].mass); + + TVector3 v1 = p1.Vect(); + TVector3 v2 = p2.Vect(); + return std::acos(v1.Dot(v2)/(v1.Mag()*v2.Mag())*(-1.)); +} + +// acoplanarity between two reco particles +float acoplanarity(Vec_rp in) { + if(in.size() < 2) return -999; + + TLorentzVector p1; + p1.SetXYZM(in[0].momentum.x, in[0].momentum.y, in[0].momentum.z, in[0].mass); + + TLorentzVector p2; + p2.SetXYZM(in[1].momentum.x, in[1].momentum.y, in[1].momentum.z, in[1].mass); + + float acop = abs(p1.Phi() - p2.Phi()); + if(acop > M_PI) acop = 2 * M_PI - acop; + acop = M_PI - acop; + + return acop; +} + +// returns missing energy vector, based on reco particles +Vec_rp missingEnergy(float ecm, Vec_rp in, float 
p_cutoff = 0.0) { + float px = 0, py = 0, pz = 0, e = 0; + for(auto &p : in) { + if (std::sqrt(p.momentum.x * p.momentum.x + p.momentum.y*p.momentum.y) < p_cutoff) continue; + px += -p.momentum.x; + py += -p.momentum.y; + pz += -p.momentum.z; + e += p.energy; + } + + Vec_rp ret; + rp res; + res.momentum.x = px; + res.momentum.y = py; + res.momentum.z = pz; + res.energy = ecm-e; + ret.emplace_back(res); + return ret; +} + +// calculate the cosine(theta) of the missing energy vector +float get_cosTheta_miss(Vec_rp met){ + float costheta = 0.; + if(met.size() > 0) { + TLorentzVector lv_met; + lv_met.SetPxPyPzE(met[0].momentum.x, met[0].momentum.y, met[0].momentum.z, met[0].energy); + costheta = fabs(std::cos(lv_met.Theta())); + } + return costheta; +} + + +} + +#endif \ No newline at end of file diff --git a/examples/FCCee/higgs/mva/plots.py b/examples/FCCee/higgs/mva/plots.py new file mode 100644 index 0000000000..30bfd5bcb5 --- /dev/null +++ b/examples/FCCee/higgs/mva/plots.py @@ -0,0 +1,44 @@ +import ROOT + + +# global parameters +intLumi = 1. +intLumiLabel = "L = 7.2 ab^{-1}" +ana_tex = 'e^{+}e^{-} #rightarrow ZH #rightarrow #mu^{+}#mu^{-} + X' +delphesVersion = '3.4.2' +energy = 240.0 +collider = 'FCC-ee' +inputDir = f"outputs/FCCee/higgs/mva/final_selection/" +formats = ['png','pdf'] +outdir = f"outputs/FCCee/higgs/mva/plots/" +yaxis = ['lin','log'] +stacksig = ['nostack'] +plotStatUnc = True + + + + +variables = ['zmumu_recoil_m_final', 'mva_score'] +rebin = [1, 1] # uniform rebin per variable (optional) + +###Dictonnary with the analysis name as a key, and the list of selections to be plotted for this analysis. 
The name of the selections should be the same as in the final selection +selections = {} +selections['ZH'] = ["sel0", "sel1"] + +extralabel = {} +extralabel['sel0'] = "Basic selection" +extralabel['sel1'] = "MVA > 0.5" + +colors = {} +colors['ZH'] = ROOT.kRed +colors['WW'] = ROOT.kBlue+1 + + +plots = {} +plots['ZH'] = {'signal':{'ZH':['wzp6_ee_mumuH_ecm240']}, + 'backgrounds':{'WW':['p8_ee_WW_ecm240']} + } + +legend = {} +legend['ZH'] = 'ZH' +legend['WW'] = 'WW' diff --git a/examples/FCCee/higgs/mva/preselection.py b/examples/FCCee/higgs/mva/preselection.py new file mode 100644 index 0000000000..0fd081c912 --- /dev/null +++ b/examples/FCCee/higgs/mva/preselection.py @@ -0,0 +1,107 @@ + +from addons.TMVAHelper.TMVAHelper import TMVAHelperXGB + +# list of processes (mandatory) +processList = { + 'p8_ee_WW_ecm240': {'fraction': 0.1}, + 'wzp6_ee_mumuH_ecm240': {'fraction': 1}, +} + +# Production tag when running over EDM4Hep centrally produced events, this points to the yaml files for getting sample statistics (mandatory) +prodTag = "FCCee/winter2023/IDEA/" + +# Link to the dictionary that contains all the cross section information etc... 
(mandatory) +procDict = "FCCee_procDict_winter2023_IDEA.json" + +# Additional/custom C++ functions, defined in header files +includePaths = ["functions.h"] + +# Output directory +outputDir = f"outputs/FCCee/higgs/mva/preselection/" + +# Multithreading: -1 means using all cores +nCPUS = -1 + +# Batch settings +#runBatch = False +#batchQueue = "longlunch" +#compGroup = "group_u_FCC.local_gen" + +doInference = False + +class RDFanalysis(): + + # encapsulate analysis logic, definitions and filters in the dataframe + def analysers(df): + + df = df.Alias("Particle0", "Particle#0.index") + df = df.Alias("Particle1", "Particle#1.index") + df = df.Alias("MCRecoAssociations0", "MCRecoAssociations#0.index") + df = df.Alias("MCRecoAssociations1", "MCRecoAssociations#1.index") + df = df.Alias("Muon", "Muon#0.index") + df = df.Alias("Electron", "Electron#0.index") + + # all leptons (bare) + df = df.Define("muons_all", "FCCAnalyses::ReconstructedParticle::get(Muon, ReconstructedParticles)") + df = df.Define("electrons_all", "FCCAnalyses::ReconstructedParticle::get(Electron, ReconstructedParticles)") + + # define good muons and electrons + df = df.Define("muons", "FCCAnalyses::ReconstructedParticle::sel_p(20)(muons_all)") + df = df.Define("electrons", "FCCAnalyses::ReconstructedParticle::sel_p(20)(electrons_all)") + + # electron veto + df = df.Define("electrons_no", "FCCAnalyses::ReconstructedParticle::get_n(electrons)") + df = df.Filter("electrons_no == 0") + + # photon veto + df = df.Alias("Photon0", "Photon#0.index") + df = df.Define("photons_all", "FCCAnalyses::ReconstructedParticle::get(Photon0, ReconstructedParticles)") + df = df.Define("photons", "FCCAnalyses::ReconstructedParticle::sel_p(40)(photons_all)") + df = df.Define("photons_no", "FCCAnalyses::ReconstructedParticle::get_n(photons)") + df = df.Filter("photons_no == 0") + + # basic cuts: two OS leptons + df = df.Define("muons_p", "FCCAnalyses::ReconstructedParticle::get_p(muons)") + df = df.Define("muons_theta", 
"FCCAnalyses::ReconstructedParticle::get_theta(muons)") + df = df.Define("muons_phi", "FCCAnalyses::ReconstructedParticle::get_phi(muons)") + df = df.Define("muons_q", "FCCAnalyses::ReconstructedParticle::get_charge(muons)") + df = df.Define("muons_no", "FCCAnalyses::ReconstructedParticle::get_n(muons)") + df = df.Filter("muons_no == 2 && Sum(muons_q) == 0") + + # build Z resonance + df = df.Define("zmumu", "ReconstructedParticle::resonanceBuilder(91)(muons)") + df = df.Define("zmumu_m", "ReconstructedParticle::get_mass(zmumu)[0]") + df = df.Define("zmumu_p", "ReconstructedParticle::get_p(zmumu)[0]") + df = df.Define("zmumu_recoil", "ReconstructedParticle::recoilBuilder(240)(zmumu)") + df = df.Define("zmumu_recoil_m", "ReconstructedParticle::get_mass(zmumu_recoil)[0]") + + # kinematic cuts + df = df.Filter("zmumu_m > 86 && zmumu_m < 96") + df = df.Filter("zmumu_p > 20 && zmumu_p < 70") + df = df.Filter("zmumu_recoil_m < 140 && zmumu_recoil_m > 120") + + df = df.Define("muon1_p", "muons_p[0]") + df = df.Define("muon2_p", "muons_p[1]") + df = df.Define("muon1_theta", "muons_theta[0]") + df = df.Define("muon2_theta", "muons_theta[1]") + + df = df.Define("missingEnergy", "FCCAnalyses::missingEnergy(240., ReconstructedParticles)") + df = df.Define("cosTheta_miss", "FCCAnalyses::get_cosTheta_miss(missingEnergy)") + df = df.Filter("cosTheta_miss < 0.98") + + df = df.Define("acoplanarity", "FCCAnalyses::acoplanarity(muons)") + df = df.Define("acolinearity", "FCCAnalyses::acolinearity(muons)") + + + if doInference: + tmva_helper = TMVAHelperXGB("outputs/FCCee/higgs/mva/bdt_model_example.root", "bdt_model") # read the XGBoost training + df = tmva_helper.run_inference(df, col_name="mva_score") # by default, makes a new column mva_score + + return df + + # define output branches to be saved + def output(): + branchList = ["muon1_p", "muon2_p", "muon1_theta", "muon2_theta", "zmumu_p", "zmumu_m", "zmumu_recoil_m", "acoplanarity", "acolinearity", "cosTheta_miss"] + if 
doInference: + branchList.append("mva_score") + return branchList diff --git a/examples/FCCee/higgs/mva/train_bdt.py b/examples/FCCee/higgs/mva/train_bdt.py new file mode 100644 index 0000000000..30527e98e7 --- /dev/null +++ b/examples/FCCee/higgs/mva/train_bdt.py @@ -0,0 +1,107 @@ + +import uproot +import pandas as pd +import xgboost as xgb +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, roc_auc_score +import ROOT +import pickle + + +ROOT.gROOT.SetBatch(True) +# e.g. https://root.cern/doc/master/tmva101__Training_8py.html + +def load_process(fIn, variables, target=0, weight_sf=1.): + + f = uproot.open(fIn) + tree = f["events"] + #meta = f["meta"] + #weight = meta.values()[2]/meta.values()[1]*weight_sf + weight = 1.0/tree.num_entries*weight_sf + print("Load {} with {} events and weight {}".format(fIn.replace(".root", ""), tree.num_entries, weight)) + + df = tree.arrays(variables, library="pd") # convert the signal and background data to pandas DataFrames + df['target'] = target # add a target column to indicate signal (1) and background (0) + df['weight'] = weight + return df + + + +print("Parse inputs") + +# configuration of signal, background, variables, files, ... 
+variables = ['muon1_p', 'muon2_p', 'muon1_theta', 'muon2_theta', 'zmumu_p', 'zmumu_m', 'acoplanarity', 'acolinearity', 'cosTheta_miss'] +weight_sf = 1e9 +sig_df = load_process("outputs/FCCee/higgs/mva/preselection/wzp6_ee_mumuH_ecm240.root", variables, weight_sf=weight_sf, target=1) +bkg_df = load_process("outputs/FCCee/higgs/mva/preselection/p8_ee_WW_ecm240.root", variables, weight_sf=weight_sf) + + + + + +# Concatenate the dataframes into a single dataframe +data = pd.concat([sig_df, bkg_df], ignore_index=True) + + +# split data in train/test events +train_data, test_data, train_labels, test_labels, train_weights, test_weights = train_test_split( + data[variables], data['target'], data['weight'], test_size=0.2, random_state=42 +) + + + +# conversion to numpy needed to have default feature_names (fN), needed for conversion to TMVA +train_data = train_data.to_numpy() +test_data = test_data.to_numpy() +train_labels = train_labels.to_numpy() +test_labels = test_labels.to_numpy() +train_weights = train_weights.to_numpy() +test_weights = test_weights.to_numpy() + +# set hyperparameters for the XGBoost model +params = { + 'objective': 'binary:logistic', + 'eval_metric': 'logloss', + 'eta': 0.1, + 'max_depth': 5, + 'subsample': 0.5, + 'colsample_bytree': 0.5, + 'seed': 42, + 'n_estimators': 350, # low number for testing purposes (default 350) + 'early_stopping_rounds': 25, + 'num_rounds': 20, + 'learning_rate': 0.20, + 'gamma': 3, + 'min_child_weight': 10, + 'max_delta_step': 0, +} + + +# train the XGBoost model +print("Start training") +eval_set = [(train_data, train_labels), (test_data, test_labels)] +bdt = xgb.XGBClassifier(**params) +bdt.fit(train_data, train_labels, verbose=True, eval_set=eval_set, sample_weight=train_weights) + + +# export model (to ROOT and pkl) +print("Export model") +fOutName = "outputs/FCCee/higgs/mva/bdt_model_example.root" +ROOT.TMVA.Experimental.SaveXGBoost(bdt, "bdt_model", fOutName, num_inputs=len(variables)) + +# append the variables 
+variables_ = ROOT.TList() +for var in variables: + variables_.Add(ROOT.TObjString(var)) +fOut = ROOT.TFile(fOutName, "UPDATE") +fOut.WriteObject(variables_, "variables") + + +save = {} +save['model'] = bdt +save['train_data'] = train_data +save['test_data'] = test_data +save['train_labels'] = train_labels +save['test_labels'] = test_labels +save['variables'] = variables +pickle.dump(save, open("outputs/FCCee/higgs/mva/bdt_model_example.pkl", "wb")) diff --git a/python/anascript.py b/python/anascript.py index f06ed07b9d..17b01eb7bb 100644 --- a/python/anascript.py +++ b/python/anascript.py @@ -178,7 +178,7 @@ def get_element(rdf_module, element: str, is_final: bool = False): LOGGER.debug('The variable <%s> is optional in the analysis ' 'final step/histmaker.\nBy default no scaling is ' 'applied.', element) - return False + return True LOGGER.debug('The option <%s> is not available in the presel. ' 'stages of the analysis', element) diff --git a/python/do_combine.py b/python/do_combine.py new file mode 100644 index 0000000000..0256e727ca --- /dev/null +++ b/python/do_combine.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python + +import sys +import os +import os.path +import ntpath +import importlib +import copy +import re +import logging +import ROOT +import array + +ROOT.gROOT.SetBatch(True) +ROOT.gStyle.SetOptStat(0) +ROOT.gStyle.SetOptTitle(0) + + +LOGGER = logging.getLogger('FCCAnalyses.combine') + + +def get_param(obj, name, default=None): + if hasattr(obj, name): + return getattr(obj, name) + elif default != None: + LOGGER.info(f"Use default value of {default} for {name}") + return default + else: + LOGGER.error(f"Parameter {name} not defined but required. 
Aborting") + sys.exit(3) + +def rebin(h, newbins): + if isinstance(newbins, int): + return h.Rebin(newbins, h.GetName()) + else: + mybins = array.array('d', newbins) + return h.Rebin(len(mybins)-1, h.GetName(), mybins) + +def run(script_path): + + ROOT.gROOT.SetBatch(True) + ROOT.gErrorIgnoreLevel = ROOT.kWarning + + module_path = os.path.abspath(script_path) + module_dir = os.path.dirname(module_path) + base_name = os.path.splitext(ntpath.basename(script_path))[0] + + sys.path.insert(0, module_dir) + param = importlib.import_module(base_name) + + inputDir = get_param(param, "inputDir") + outputDir = get_param(param, "outputDir") + + + lspace = 12 + sig_procs = get_param(param, "sig_procs") + bkg_procs = get_param(param, "bkg_procs") + procs_sig_names = sorted(list(sig_procs.keys())) + procs_bkg_names = sorted(list(bkg_procs.keys())) + procs = procs_sig_names + procs_bkg_names + proc_dict = sig_procs | bkg_procs + nprocs = len(procs) + procs_idx = list(range(-len(sig_procs)+1, len(bkg_procs)+1, 1)) # negative or 0 for signal, positive for bkg + + categories = get_param(param, "categories") + hist_names = get_param(param, "hist_names") + ncats = len(categories) + + cats_str = "".join([f"{cat:{' '}{'<'}{lspace}}" for cat in categories]) + procs_str = "".join([f"{proc:{' '}{'<'}{lspace}}" for proc in procs] * ncats) + cats_procs_str = "".join([f"{cat:{' '}{'<'}{lspace}}" for cat in categories for _ in range(nprocs)]) + cats_procs_idx_str = "".join([f"{str(proc_idx):{' '}{'<'}{lspace}}" for proc_idx in procs_idx] * ncats) + rates_cats = "".join([f"{'-1':{' '}{'<'}{lspace}}"]*(ncats)) + rates_procs = "".join([f"{'-1':{' '}{'<'}{lspace}}"]*(ncats*nprocs)) + + + ## datacard header + dc = "" + dc += f"imax *\n" + dc += f"jmax *\n" + dc += "kmax *\n" + dc += f"########################################\n" + dc += f"shapes * * datacard.root $CHANNEL_$PROCESS $CHANNEL_$PROCESS_$SYSTEMATIC\n" + dc += f"shapes data_obs * datacard.root $CHANNEL_asimov\n" + dc += 
f"########################################\n" + dc += f"bin {cats_str}\n" + dc += f"observation {rates_cats}\n" + dc += f"########################################\n" + dc += f"bin {cats_procs_str}\n" + dc += f"process {procs_str}\n" + dc += f"process {cats_procs_idx_str}\n" + dc += f"rate {rates_procs}\n" + dc += f"########################################\n" + + ## systematic uncertainties + systs = get_param(param, "systs") + for systName, syst in systs.items(): + syst_type = syst['type'] + syst_val = str(syst['value']) + procs_to_apply = syst['procs'] + dc_tmp = f"{systName:{' '}{'<'}{15}} {syst_type:{' '}{'<'}{10}} " + for cat in categories: + for proc in procs: + apply_proc = (isinstance(procs_to_apply, list) and proc in procs_to_apply) or (isinstance(procs_to_apply, str) and re.search(procs_to_apply, proc)) + if apply_proc: + if syst_type == "shape": + LOGGER.warning('Shape uncertainties not yet supported! Skipping') + val = "-" + else: + val = str(syst_val) + else: + val = "-" + dc_tmp += f"{val:{' '}{'<'}{lspace}}" + dc += f"{dc_tmp}\n" + + ## auto MC stats + if get_param(param, "mc_stats"): + dc += "* autoMCStats 1 1" + + ## get histograms + new_bins = get_param(param, "rebin", 1) + sel = get_param(param, "selection", -1) + intLumi = get_param(param, "intLumi") + hists = [] + hists_asimov = {} + for procName, procList in proc_dict.items(): + for i,cat in enumerate(categories): + hist = None + for proc in procList: + if sel == -1: + fInName = f"{inputDir}/{proc}.root" + else: + fInName = f"{inputDir}/{proc}_{sel}_histo.root" + if not os.path.isfile(fInName): + LOGGER.error(f'File {fInName} not found! 
Aborting...') + sys.exit(3) + fIn = ROOT.TFile(fInName, 'READ') + h = copy.deepcopy(fIn.Get(hist_names[i])) + if hist == None: + hist = h + else: + hist.Add(h) + hist.SetName(f"{cat}_{procName}") + hist.Scale(intLumi) + hist = rebin(hist, new_bins) + hists.append(copy.deepcopy(hist)) + if not cat in hists_asimov: + hist_asimov = copy.deepcopy(hist) + hist_asimov.SetName(f"{cat}_asimov") + hists_asimov[cat] = hist_asimov + else: + hists_asimov[cat].Add(hist) + + # write cards + if not os.path.exists(outputDir): + os.system(f"mkdir -p {outputDir}") + + f = open(f"{outputDir}/datacard.txt", 'w') + f.write(dc) + f.close() + fOut = ROOT.TFile(f"{outputDir}/datacard.root", "RECREATE") + for hist in hists: + hist.Write() + for hist in hists_asimov.values(): + hist.Write() + fOut.Close() + + print(dc) + + +def do_combine(parser): + args, _ = parser.parse_known_args() + + if args.command != 'combine': + LOGGER.error('Wrong sub-command!\nAborting...') + + if not os.path.isfile(args.script_path): + LOGGER.error('Plots script "%s" not found!\nAborting...', + args.script_path) + sys.exit(3) + + run(args.script_path) \ No newline at end of file diff --git a/python/parsers.py b/python/parsers.py index 03df576aeb..3a971c5e18 100644 --- a/python/parsers.py +++ b/python/parsers.py @@ -144,6 +144,13 @@ def setup_run_parser_plots(parser): ''' parser.add_argument('script_path', help="path to the plots script") +def setup_run_parser_combine(parser): + ''' + Define command line arguments for the combine sub-command. 
+ ''' + parser.add_argument('script_path', help="path to the combine script") + + # _____________________________________________________________________________ def setup_subparsers(subparsers): @@ -173,6 +180,9 @@ def setup_subparsers(subparsers): parser_run_plots = subparsers.add_parser( 'plots', help="run a RDataFrame based FCC analysis plot configuration") + parser_run_combine = subparsers.add_parser( + 'combine', + help="prepare combine cards to run basic template fits") # Register sub-parsers setup_init_parser(parser_init) @@ -182,3 +192,4 @@ def setup_subparsers(subparsers): setup_run_parser(parser_run) setup_run_parser_final(parser_run_final) setup_run_parser_plots(parser_run_plots) + setup_run_parser_combine(parser_run_combine) \ No newline at end of file diff --git a/python/run_analysis.py b/python/run_analysis.py index f42bdc9d9e..23bd0f50d4 100644 --- a/python/run_analysis.py +++ b/python/run_analysis.py @@ -323,6 +323,7 @@ def run_rdf(rdf_module, dframe2 = dframe try: + evtcount_init = dframe2.Count() dframe3 = get_element(rdf_module.RDFanalysis, "analysers")(dframe2) branch_list = ROOT.vector('string')() @@ -330,7 +331,7 @@ def run_rdf(rdf_module, for bname in blist: branch_list.push_back(bname) - evtcount = dframe3.Count() + evtcount_final = dframe3.Count() # Generate computational graph of the analysis if args.graph: @@ -342,7 +343,7 @@ def run_rdf(rdf_module, 'occurred:\n%s', excp) sys.exit(3) - return evtcount.GetValue() + return evtcount_init.GetValue(), evtcount_final.GetValue() # _____________________________________________________________________________ @@ -459,7 +460,7 @@ def run_local(rdf_module, infile_list, args): Run analysis locally. 
''' # Create list of files to be processed - info_msg = 'Creating dataframe object from files:\n\t' + info_msg = 'Creating dataframe object from files:\n' file_list = ROOT.vector('string')() # Amount of events processed in previous stage (= 0 if it is the first # stage) @@ -511,18 +512,20 @@ def run_local(rdf_module, infile_list, args): # Run RDF start_time = time.time() - outn = run_rdf(rdf_module, file_list, outfile_path, args) + inn, outn = run_rdf(rdf_module, file_list, outfile_path, args) elapsed_time = time.time() - start_time + + # replace nevents_local by inn = the amount of processed events info_msg = f"{' SUMMARY ':=^80}\n" info_msg += 'Elapsed time (H:M:S): ' info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time)) info_msg += '\nEvents processed/second: ' - info_msg += f'{int(nevents_local/elapsed_time):,}' - info_msg += f'\nTotal events processed: {int(nevents_local):,}' + info_msg += f'{int(inn/elapsed_time):,}' + info_msg += f'\nTotal events processed: {int(inn):,}' info_msg += f'\nNo. 
result events: {int(outn):,}' - if nevents_local > 0: - info_msg += f'\nReduction factor local: {outn/nevents_local}' + if inn > 0: + info_msg += f'\nReduction factor local: {outn/inn}' if nevents_orig > 0: info_msg += f'\nReduction factor total: {outn/nevents_orig}' info_msg += '\n' @@ -531,10 +534,13 @@ def run_local(rdf_module, infile_list, args): LOGGER.info(info_msg) # Update resulting root file with number of processed events + # and number of selected events with ROOT.TFile(outfile_path, 'update') as outfile: param = ROOT.TParameter(int)( 'eventsProcessed', - nevents_orig if nevents_orig != 0 else nevents_local) + nevents_orig if nevents_orig != 0 else inn) + param.Write() + param = ROOT.TParameter(int)('eventsSelected', outn) param.Write() outfile.Write() diff --git a/python/run_final_analysis.py b/python/run_final_analysis.py index 582803e3c1..4f0cb48ebb 100644 --- a/python/run_final_analysis.py +++ b/python/run_final_analysis.py @@ -90,7 +90,14 @@ def run(rdf_module, args): else: LOGGER.debug('Process already in the dictionary. 
Skipping it...') - ROOT.ROOT.EnableImplicitMT(get_element(rdf_module, "nCPUS", True)) + + # set multithreading + ncpus = get_element(rdf_module, "nCPUS", 1) + if ncpus < 0: # use all available threads + ROOT.EnableImplicitMT() + ncpus = ROOT.GetThreadPoolSize() + ROOT.ROOT.EnableImplicitMT(ncpus) + ROOT.EnableThreadSafety() nevents_real = 0 start_time = time.time() @@ -136,7 +143,14 @@ def run(rdf_module, args): save_tab.append(cut_names) efficiency_list.append(cut_names) - process_list = get_element(rdf_module, "processList", True) + process_list = get_element(rdf_module, "processList", {}) + if len(process_list) == 0: + files = glob.glob(f"{input_dir}/*") + process_list = [os.path.basename(file.replace(".root", "")) for file in files] + info_msg = f"Found {len(process_list)} processes in the input directory:" + for process_name in process_list: + info_msg += f'\n\t- {process_name}' + LOGGER.info(info_msg) for process_name in process_list: process_events[process_name] = 0 events_ttree[process_name] = 0 @@ -204,6 +218,19 @@ def run(rdf_module, args): cuts_list.append(process_name) eff_list = [] eff_list.append(process_name) + + # get process information from prodDict + try: + xsec = process_dict[process_name]["crossSection"] + kfactor = process_dict[process_name]["kfactor"] + matchingEfficiency = process_dict[process_name]["matchingEfficiency"] + except KeyError: + xsec = 1.0 + kfactor = 1.0 + matchingEfficiency = 1.0 + LOGGER.error( + f'No value defined for process {process_name} in dictionary!') + gen_sf = xsec*kfactor*matchingEfficiency # Define all histos, snapshots, etc... LOGGER.info('Defining snapshots and histograms') @@ -284,15 +311,9 @@ def run(rdf_module, args): uncertainty = ROOT.Math.sqrt(all_events) if do_scale: - all_events = all_events * 1. 
* \ - process_dict[process_name]["crossSection"] * \ - process_dict[process_name]["kfactor"] * \ - process_dict[process_name]["matchingEfficiency"] * \ - int_lumi / process_events[process_name] - uncertainty = ROOT.Math.sqrt(all_events) * \ - process_dict[process_name]["crossSection"] * \ - process_dict[process_name]["kfactor"] * \ - process_dict[process_name]["matchingEfficiency"] * \ + all_events = all_events * 1. * gen_sf * \ + int_lumi / process_events[process_name] + uncertainty = ROOT.Math.sqrt(all_events) * gen_sf * \ int_lumi / process_events[process_name] LOGGER.info('Printing scaled number of events!!!') @@ -313,16 +334,10 @@ def run(rdf_module, args): uncertainty = ROOT.Math.sqrt(nevents_this_cut_raw) if do_scale: nevents_this_cut = \ - nevents_this_cut * 1. * \ - process_dict[process_name]["crossSection"] * \ - process_dict[process_name]["kfactor"] * \ - process_dict[process_name]["matchingEfficiency"] * \ + nevents_this_cut * 1. * gen_sf * \ int_lumi / process_events[process_name] uncertainty = \ - ROOT.Math.sqrt(nevents_this_cut_raw) * \ - process_dict[process_name]["crossSection"] * \ - process_dict[process_name]["kfactor"] * \ - process_dict[process_name]["matchingEfficiency"] * \ + ROOT.Math.sqrt(nevents_this_cut_raw) * gen_sf * \ int_lumi / process_events[process_name] info_msg += f'\n\t{"After selection " + cut:{cfn_width}} : ' info_msg += f'{nevents_this_cut:,}' @@ -357,20 +372,26 @@ def run(rdf_module, args): fhisto = output_dir + process_name + '_' + cut + '_histo.root' with ROOT.TFile(fhisto, 'RECREATE'): for h in histos_list[i]: - try: - h.Scale( - 1. 
* process_dict[process_name]["crossSection"] * - process_dict[process_name]["kfactor"] * - process_dict[process_name]["matchingEfficiency"] / - process_events[process_name]) - except KeyError: - LOGGER.warning( - 'No value defined for process %s in dictionary!', - process_name) - if h.Integral(0, -1) > 0: - h.Scale(1./h.Integral(0, -1)) + if do_scale: + h.Scale(gen_sf * int_lumi / process_events[process_name]) h.Write() + # write all meta info to the output file + p = ROOT.TParameter(int)("eventsProcessed", process_events[process_name]) + p.Write() + # take sum of weights=eventsProcessed for now (assume weights==1) + p = ROOT.TParameter(float)("sumOfWeights", process_events[process_name]) + p.Write() + p = ROOT.TParameter(float)("intLumi", int_lumi) + p.Write() + p = ROOT.TParameter(float)("crossSection", xsec) + p.Write() + p = ROOT.TParameter(float)("kfactor", kfactor) + p.Write() + p = ROOT.TParameter(float)("matchingEfficiency", + matchingEfficiency) + p.Write() + if do_tree: # test that the snapshot worked well validfile = testfile(fout_list[i])