From d9246538bdc27c3a90524df5d6ae1116bfc9685e Mon Sep 17 00:00:00 2001
From: Juraj Smiesko <34742917+kjvbrt@users.noreply.github.com>
Date: Thu, 25 Jan 2024 10:52:46 +0100
Subject: [PATCH] Splitting FCCAnalysisRun (#335)

* Splitting FCCAnalysisRun
* Check for HTCondor commands
* Improved batch submission
* Using with context for ROOT files
* Adding type hints
* Updated man pages
---
 .github/scripts/man2html.sh                |    4 +-
 bin/fccanalysis                            |   74 +-
 .../higgs/mH-recoil/mumu/analysis_final.py |    2 +
 man/man1/fccanalysis-build.1               |   60 +
 man/man1/fccanalysis-run.1                 |  150 +--
 man/man1/fccanalysis-test.1                |   69 +-
 man/man1/fccanalysis.1                     |  124 +-
 man/man7/fccanalysis-file.7                |  122 --
 man/man7/fccanalysis-script.7              |  202 +++
 python/FCCAnalysisRun.py                   | 1133 -----------------
 python/FCCAnalysisSetup.py                 |   12 -
 python/Parsers.py                          |  114 --
 python/analysis_builder.py                 |   65 -
 python/{anafile.py => anascript.py}        |   50 +-
 python/anascript.pyi                       |    8 +
 python/build_analysis.py                   |   14 +-
 python/doPlots.py                          |  712 -----------
 python/do_plots.py                         |  807 ++++++++++++
 python/init_analysis.py                    |  121 ++
 python/parsers.py                          |  174 +++
 python/process.py                          |  100 +-
 python/process.pyi                         |   12 +
 python/run_analysis.py                     |  924 ++++++++++++++
 python/run_final_analysis.py               |  496 ++++++++
 python/{test.py => test_fccanalyses.py}    |    0
 25 files changed, 3072 insertions(+), 2477 deletions(-)
 create mode 100644 man/man1/fccanalysis-build.1
 delete mode 100644 man/man7/fccanalysis-file.7
 create mode 100644 man/man7/fccanalysis-script.7
 delete mode 100644 python/FCCAnalysisRun.py
 delete mode 100644 python/FCCAnalysisSetup.py
 delete mode 100644 python/Parsers.py
 delete mode 100644 python/analysis_builder.py
 rename python/{anafile.py => anascript.py} (92%)
 create mode 100644 python/anascript.pyi
 delete mode 100644 python/doPlots.py
 create mode 100644 python/do_plots.py
 create mode 100644 python/init_analysis.py
 create mode 100644 python/parsers.py
 create mode 100644 python/process.pyi
 create mode 100644 python/run_analysis.py
 create mode 100644 python/run_final_analysis.py
 rename python/{test.py => test_fccanalyses.py} (100%)

diff --git a/.github/scripts/man2html.sh b/.github/scripts/man2html.sh
index 683e988df8..db72d29b87 100755
--- a/.github/scripts/man2html.sh
+++ b/.github/scripts/man2html.sh
@@ -3,9 +3,9 @@
 mkdir -p ./man/html
 cd ./man/man1
 for MANFILE in *; do
-    mandoc -Thtml -Ostyle=/css/mandoc.css "${MANFILE}" > "../html/${MANFILE::-1}html"
+    mandoc -Thtml -Ostyle=../../css/mandoc.css "${MANFILE}" > "../html/${MANFILE::-1}html"
 done
 cd ../man7
 for MANFILE in *; do
-    mandoc -Thtml -Ostyle=/css/mandoc.css "${MANFILE}" > "../html/${MANFILE::-1}html"
+    mandoc -Thtml -Ostyle=../../css/mandoc.css "${MANFILE}" > "../html/${MANFILE::-1}html"
 done
diff --git a/bin/fccanalysis b/bin/fccanalysis
index cf466e0c2a..64d51bd743 100755
--- a/bin/fccanalysis
+++ b/bin/fccanalysis
@@ -1,8 +1,20 @@
 #!/usr/bin/env python3
+'''
+Starting point (executable) for fccanalysis command
+'''
 
 import argparse
 import logging
 
+from parsers import setup_subparsers
+from init_analysis import init_analysis
+from build_analysis import build_analysis
+from test_fccanalyses import test_fccanalyses
+from pin_analysis import PinAnalysis
+from run_analysis import run
+from run_final_analysis import run_final
+from do_plots import do_plots
+
 
 class MultiLineFormatter(logging.Formatter):
     '''
@@ -30,9 +42,12 @@ class MultiLineFormatter(logging.Formatter):
 
 
 def main():
+    '''
+    Starting point for fccanalysis command
+    '''
     parser = argparse.ArgumentParser(description='FCCAnalyses v0.8.0')
 
-    # Verbosity settings
+ # Verbosity arguments verbosity_argument_group = parser.add_mutually_exclusive_group() verbosity_argument_group.add_argument('-v', '--verbose', action='store_true', @@ -42,42 +57,16 @@ def main(): help='make output more verbose') verbosity_argument_group.add_argument('-vvv', '--most-verbose', action='store_true', - help='make output most verbose') - - # Create sub-parsers - subparsers = parser.add_subparsers(help='types of running modes', - dest='command') - parser_init = subparsers.add_parser( - 'init', - help="generate a RDataFrame based FCC analysis") - parser_build = subparsers.add_parser( - 'build', - help='build and install local analysis') - parser_test = subparsers.add_parser( - 'test', - help='test whole or a part of the analysis framework') - parser_pin = subparsers.add_parser( - 'pin', - help='pin fccanalyses to the current version of Key4hep stack') - parser_run = subparsers.add_parser( - 'run', - help="run a RDataFrame based FCC analysis") - parser_run_final = subparsers.add_parser( - 'final', - help="run a RDataFrame based FCC analysis final configuration") - parser_run_plots = subparsers.add_parser( - 'plots', - help="run a RDataFrame based FCC analysis plot configuration") - - # Setup sub-parsers - import Parsers as fccpars - fccpars.setup_init_parser(parser_init) - fccpars.setup_build_parser(parser_build) - fccpars.setup_test_parser(parser_test) - fccpars.setup_pin_parser(parser_pin) - fccpars.setup_run_parser(parser_run) - fccpars.setup_run_parser_final(parser_run_final) - fccpars.setup_run_parser_plots(parser_run_plots) + help='make output even more verbose') + + # Sub-parsers + subparsers = parser.add_subparsers( + title='sub-commands', + metavar='sub-command', + dest='command', + help='one of the available sub-commands') + + setup_subparsers(subparsers) args = parser.parse_args() @@ -94,19 +83,18 @@ def main(): logger.addHandler(stream_handler) if args.command == 'init': - from FCCAnalysisSetup import setup - setup(parser) + init_analysis(parser) elif args.command == 'build': - from build_analysis import build_analysis build_analysis(parser) elif args.command == 'test': - from test import test_fccanalyses test_fccanalyses(parser) elif args.command == 'pin': - from pin_analysis import PinAnalysis PinAnalysis(parser) + elif args.command == 'final': + run_final(parser) + elif args.command == 'plots': + do_plots(parser) else: - from FCCAnalysisRun import run run(parser) diff --git a/examples/FCCee/higgs/mH-recoil/mumu/analysis_final.py b/examples/FCCee/higgs/mH-recoil/mumu/analysis_final.py index e9c3b1e3f4..e1075a43c3 100644 --- a/examples/FCCee/higgs/mH-recoil/mumu/analysis_final.py +++ b/examples/FCCee/higgs/mH-recoil/mumu/analysis_final.py @@ -22,6 +22,8 @@ #produces ROOT TTrees, default is False doTree = False +saveTabular = True + ###Dictionnay of the list of cuts. The key is the name of the selection that will be added to the output file cutList = {"sel0":"Zcand_q == 0", "sel1":"Zcand_q == -1 || Zcand_q == 1", diff --git a/man/man1/fccanalysis-build.1 b/man/man1/fccanalysis-build.1 new file mode 100644 index 0000000000..62e418ffcc --- /dev/null +++ b/man/man1/fccanalysis-build.1 @@ -0,0 +1,60 @@ +.\" Manpage for fccanalysis-build +.\" Contact FCC-PED-SoftwareAndComputing-Analysis@cern.ch to correct errors or typos. 
+.TH FCCANALYSIS\-BUILD 1 "17 Jan 2024" "0.8.0" "fccanalysis-build man page"
+.SH NAME
+\fBfccanalysis\-build\fR \(en helper to build FCCAnalyses framework locally
+.SH SYNOPSIS
+.B fccanalysis build
+[\fB\-h\fR | \fB\-\-help\fR]
+[\fB\-c\fR]
+[\fB\-j\fR \fIBUILD_THREADS\fR]
+.SH DESCRIPTION
+.B fccanalysis\-build
+will build the whole FCCAnalyses framework\&.
+
+This sub-command is not intended as a replacement for the CMake build
+procedure; it just makes the task a little easier\&. The helper builds the
+framework in the \fIbuild\fR directory and installs it into the \fIinstall\fR
+directory\&.
+.SH OPTIONS
+.TP
+.BR \-h ", " \-\-help
+Prints short help message and exits\&.
+.TP
+.BR \-c ", " \-\-clean\-build
+Completely clean the build directory before compilation\&.
+.TP
+\fB\-j\fR \fIBUILD_THREADS\fR, \fB\-\-build\-threads\fR \fIBUILD_THREADS\fR
+Specify the number of threads used when compiling (equivalent to `make -j`)\&.
+.SH SEE ALSO
+fccanalysis(1), fccanalysis\-test(1), cmake(1)
+.SH BUGS
+Many
+.SH AUTHORS
+There are many contributors to the FCCAnalyses framework, but the principal
+authors are:
+.in +4
+Clement Helsens
+.br
+Valentin Volk
+.br
+Gerardo Ganis
+.SH FCCANALYSES
+Part of the FCCAnalyses framework\&.
+.SH LINKS
+.PP
+.UR https://hep-fcc\&.github\&.io/FCCAnalyses/
+FCCAnalyses webpage
+.UE
+.PP
+.UR https://github\&.com/HEP\-FCC/FCCAnalyses/
+FCCAnalyses GitHub repository
+.UE
+.PP
+.UR https://fccsw\-forum\&.web\&.cern\&.ch/
+FCCSW Forum
+.UE
+.SH CONTACT
+.PP
+.MT FCC-PED-SoftwareAndComputing-Analysis@cern.ch
+FCC-PED-SoftwareAndComputing-Analysis
+.ME
diff --git a/man/man1/fccanalysis-run.1 b/man/man1/fccanalysis-run.1
index 1b7dfad0ca..f6b416be1f 100644
--- a/man/man1/fccanalysis-run.1
+++ b/man/man1/fccanalysis-run.1
@@ -1,125 +1,89 @@
-.\" Manpage for fccanalysis
-.\" Contact fcc-experiments-sw-dev@cern.ch to correct errors or typos.
-.TH FCCANALYSIS\-RUN 1 "24 May 2023" "0.8.0" "fccanalysis-run man page"
+.\" Manpage for fccanalysis-run
+.\" Contact FCC-PED-SoftwareAndComputing-Analysis@cern.ch to correct errors or typos.
+.TH FCCANALYSIS\-RUN 1 "17 Jan 2024" "0.8.0" "fccanalysis-run man page"
 .SH NAME
-fccanalysis\-run \- run FCC analysis
+\fBfccanalysis\-run\fR \- run FCC analysis
 .SH SYNOPSIS
-.sp
-.nf
-\fIfccanalysis run\fR [\-h | \-\-help] [\-\-files\-list FILES_LIST [FILES_LIST ...]]
-                 [\-\-output OUTPUT] [\-\-nevents NEVENTS] [\-\-test] [\-\-bench]
-                 [\-\-ncpus NCPUS] [\-\-preprocess] [\-\-validate] [\-\-rerunfailed]
-                 [\-\-jobdir JOBDIR]
-.fi
-.sp
+.B fccanalysis run
+[\fB\-h\fR | \fB\-\-help\fR]
+[\fB\-\-files\-list\fR \fIFILES_LIST\fR [\fIFILES_LIST\fR ...]]
+[\fB\-\-output\fR \fIOUTPUT\fR]
+[\fB\-\-nevents\fR \fINEVENTS\fR]
+[\fB\-\-test\fR]
+[\fB\-\-bench\fR]
+[\fB\-\-ncpus\fR \fINCPUS\fR]
+.I analysis-script
 .SH DESCRIPTION
-\fIfccanalysis-run\fR will run analysis provided in the analysis file\&. The
+.B fccanalysis-run
+will run the analysis provided in the analysis script\&. The
 analysis itself can be divided into several stages if desired\&. For all those
 stages \fIfccanalysis-run\fR is used\&.
-.sp
-When using \fIfccanalysis-run\fR the analysis is running in the managed mode,
-where the RDataFrame provided is steered by the framework and users can control
-some aspects of the running with additional global attributes, see
-\fBfccanalysis-file\fR(8).
+
+When using \fBfccanalysis-run\fR the analysis runs in the managed mode,
+where the RDataFrame is steered by the framework and users can control some
+aspects of the running with additional global attributes, see
+\fIfccanalysis-script\fR(7).
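+.PP
+For illustration, a typical invocation of a single analysis stage might look
+like this (the script and output file names are placeholders):
+.IP
+fccanalysis run analysis_stage1.py \-\-nevents 1000 \-\-output stage1.root
+.RE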
 .SH OPTIONS
-.PP
-
-.RS 4
-Path to analysis file\&.
-.RE
-.PP
-\-\-help
-.RS 4
+.TP
+.I analysis-script
+Path to analysis script\&.
+.TP
+.BR \-h ", " \-\-help
 Prints short help message and exits\&.
-.RE
-.PP
-\-\-files\-list FILES_LIST [FILES_LIST ...]
-.RS 4
+.TP
+\fB\-\-files\-list\fR \fIFILES_LIST\fR [\fIFILES_LIST\fR ...]
 Specify input file to bypass the processList\&.
-.RE
-.PP
-\-\-output OUTPUT
-.RS 4
+.TP
+\fB\-\-output\fR \fIOUTPUT\fR
 Specify output file name to bypass the processList and or outputList, default
 \fIoutput.root\fR\&.
-.RE
-.PP
-\-\-nevents NEVENTS
-.RS 4
+.TP
+\fB\-\-nevents\fR \fINEVENTS\fR
 Specify max number of events to process\&.
-.RE
-.PP
-\-\-test
-.RS 4
+.TP
+.B \-\-test
 Run over the test file\&.
-.RE
-.PP
-\-\-bench
-.RS 4
+.TP
+.B \-\-bench
 Output benchmark results to a JSON file\&.
-.RE
-.PP
-\-\-ncpus NCPUS
-.RS 4
+.TP
+\fB\-\-ncpus\fR \fINCPUS\fR
 Set number of threads\&.
-.RE
-.PP
-\-\-preprocess
-.RS 4
-Run preprocessing\&.
-.RE
-.PP
-\-\-validate
-.RS 4
-Validate a given production\&.
-.RE
-.PP
-\-\-rerunfailed
-.RS 4
-Rerun failed jobs\&.
-.RE
-.PP
-\-\-jobdir JOBDIR
-.RS 4
-Specify the batch job directory\&.
-.RE
 .SH ENVIRONMENT VARIABLES
-.PP
-\fBFCCDICTSDIR\fR
-.RS 4
+.TP
+.B FCCDICTSDIR
 Controls search path for the process dictionaries. The default value is
 \fI/cvmfs/fcc.cern.ch/FCCDicts/\fR\&.
-.RE
 .SH SEE ALSO
-fccanalysis(1), fccanalysis-file(7)
+fccanalysis(1), fccanalysis-script(7)
 .SH BUGS
Many
 .SH AUTHORS
 There are many contributors to the FCCAnalyses framework, but the principal
 authors are:
-.br
-.RS 4
+.in +4
 Clement Helsens
 .br
 Valentin Volk
 .br
 Gerardo Ganis
-.RE
 .SH FCCANALYSES
 Part of the FCCAnalyses framework\&.
 .SH LINKS
 .PP
-1\&. FCCAnalyses webpage
-.RS 4
-https://hep-fcc\&.github\&.io/FCCAnalyses/
-.RE
-.PP
-2\&. FCCAnalysises GitHub repository
-.RS
-https://github\&.com/HEP\-FCC/FCCAnalyses/
-.RE
-.PP
-3\&. FCCSW Forum
-.RS
-https://fccsw\-forum\&.web\&.cern\&.ch/
-.RE
+.UR https://hep-fcc\&.github\&.io/FCCAnalyses/
+FCCAnalyses webpage
+.UE
+.PP
+.UR https://github\&.com/HEP\-FCC/FCCAnalyses/
+FCCAnalyses GitHub repository
+.UE
+.PP
+.UR https://fccsw\-forum\&.web\&.cern\&.ch/
+FCCSW Forum
+.UE
+.SH CONTACT
+.PP
+.MT FCC-PED-SoftwareAndComputing-Analysis@cern.ch
+FCC-PED-SoftwareAndComputing-Analysis
+.ME
diff --git a/man/man1/fccanalysis-test.1 b/man/man1/fccanalysis-test.1
index 049fa80011..e7921c348f 100644
--- a/man/man1/fccanalysis-test.1
+++ b/man/man1/fccanalysis-test.1
@@ -1,30 +1,33 @@
 .\" Manpage for fccanalysis-test
-.\" Contact fcc-experiments-sw-dev@cern.ch to correct errors or typos.
-.TH FCCANALYSIS\-TEST 1 "17 June 2023" "0.8.0" "fccanalysis-test man page"
+.\" Contact FCC-PED-SoftwareAndComputing-Analysis@cern.ch to correct errors or typos.
+.TH FCCANALYSIS\-TEST 1 "17 Jan 2024" "0.8.0" "fccanalysis-test man page"
 .SH NAME
-fccanalysis\-test \- test FCCAnalyses framework
+\fBfccanalysis\-test\fR \(en test FCCAnalyses framework
 .SH SYNOPSIS
-.sp
-.nf
-\fIfccanalysis test\fR [\-h | \-\-help] [-R TESTS_REGEX] [-E EXCLUDE_REGEX]
-.fi
-.sp
+.B fccanalysis test
+[\fB\-h\fR | \fB\-\-help\fR]
+[\fB\-R\fR \fITESTS_REGEX\fR]
+[\fB\-E\fR \fIEXCLUDE_REGEX\fR]
+[\fB\-j\fR \fIPARALLEL\fR]
 .SH DESCRIPTION
-\fIfccanalysis\-test\fR will test the whole or a part of the FCC analysis
-framework\&.
+.B fccanalysis\-test
+will test the whole or a part of the FCC analysis framework\&.
+
 It is not intended as a replacement for running \fIctest\fR command directly in
 the build directory, it is just convenience shortcut.
 .SH OPTIONS
-.PP
-\-h, \-\-help
-.RS 4
+.TP
+.BR \-h ", " \-\-help
 Prints short help message and exits\&.
-.RE
-.PP
-[test_regex]
-.RS 4
+.TP
+\fB\-R\fR \fITESTS_REGEX\fR
 Specify subset of tests to be run\&.
-.RE
+.TP
+\fB\-E\fR \fIEXCLUDE_REGEX\fR
+Specify subset of tests to ignore\&.
+.TP
+\fB\-j\fR \fIPARALLEL\fR, \fB\-\-parallel\fR \fIPARALLEL\fR
+Number of tests running in parallel (equivalent to `ctest -j`)\&.
 .SH SEE ALSO
 fccanalysis(1), fccanalysis\-build(7), ctest(1)
 .SH BUGS
@@ -32,29 +35,29 @@ Many
 .SH AUTHORS
 There are many contributors to the FCCAnalyses framework, but the principal
 authors are:
-.br
-.RS 4
+.in +4
 Clement Helsens
 .br
 Valentin Volk
 .br
 Gerardo Ganis
-.RE
 .SH FCCANALYSES
 Part of the FCCAnalyses framework\&.
 .SH LINKS
 .PP
-1\&. FCCAnalyses webpage
-.RS 4
-https://hep-fcc\&.github\&.io/FCCAnalyses/
-.RE
+.UR https://hep-fcc\&.github\&.io/FCCAnalyses/
+FCCAnalyses webpage
+.UE
 .PP
-2\&. FCCAnalysises GitHub repository
-.RS
-https://github\&.com/HEP\-FCC/FCCAnalyses/
-.RE
+.UR https://github\&.com/HEP\-FCC/FCCAnalyses/
+FCCAnalyses GitHub repository
+.UE
 .PP
-3\&. FCCSW Forum
-.RS
-https://fccsw\-forum\&.web\&.cern\&.ch/
-.RE
+.UR https://fccsw\-forum\&.web\&.cern\&.ch/
+FCCSW Forum
+.UE
+.SH CONTACT
+.PP
+.MT FCC-PED-SoftwareAndComputing-Analysis@cern.ch
+FCC-PED-SoftwareAndComputing-Analysis
+.ME
diff --git a/man/man1/fccanalysis.1 b/man/man1/fccanalysis.1
index 07261a3c39..ece72b5cfb 100644
--- a/man/man1/fccanalysis.1
+++ b/man/man1/fccanalysis.1
@@ -1,102 +1,86 @@
 .\" Manpage for fccanalysis
-.\" Contact fcc-experiments-sw-dev@cern.ch to correct errors or typos.
-.TH FCCANALYSIS 1 "19 Oct 2023" "0.8.0" "fccanalysis man page"
+.\" Contact FCC-PED-SoftwareAndComputing-Analysis@cern.ch to correct errors or typos.
+.TH FCCANALYSIS 1 "17 Jan 2024" "0.8.0" "fccanalysis man page"
 .SH NAME
-fccanalysis \- build and run your FCC analysis
+\fBfccanalysis\fR \(en write, build and run your FCC analysis
 .SH SYNOPSIS
-.sp
-.nf
-\fIfccanalysis\fR [\-h] [\-v | \-vv | \-vvv] []
-.fi
-.sp
+.B fccanalysis
+[\fB\-h\fR]
+[\fB\-v\fR | \fB\-vv\fR | \fB\-vvv\fR]
+<\fIsub-command\fR>
+[<\fIargs\fR>]
 .SH DESCRIPTION
-\fIfccanalysis\fR is the manager program for the FCC analysis, which can build,
+.B fccanalysis
+is the manager program for the FCC analysis, which can build,
 run or submit analysis to the batch\&.
 .SH OPTIONS
-.PP
-\-h, \-\-help
-.RS 4
+.TP
+.BR \-h ", " \-\-help
 Prints short help message and exits\&.
-.RE
-.PP
-\-v, \-\-verbose
-.RS 4
+.TP
+.BR \-v ", " \-\-verbose
 Makes output verbose. RDataFrame verbosity level is set to
 ROOT.Experimental.ELogLevel.kInfo\&.
-.RE
-.PP
-\-vv, \-\-more\-verbose
-.RS 4
+.TP
+.BR \-vv ", " \-\-more\-verbose
 Makes output more verbose.
 RDataFrame verbosity level is set to
 ROOT.Experimental.ELogLevel.kDebug\&.
-.RE
-.PP
-\-vvv, \-\-most\-verbose
-.RS 4
+.TP
+.BR \-vvv ", " \-\-most\-verbose
 Makes output even more verbose.
 RDataFrame verbosity level is set to
 ROOT.Experimental.ELogLevel.kDebug+10 \-\-\- outputs also generated
 code\&.
-.RE
-.SH COMMANDS
-.PP
-\fBfccanalysis-init\fR
-.RS 4
-[Experimental] Generate a RDataFrame based FCC analysis\&.
-.RE
-.PP
-\fBfccanalysis-build\fR
-.RS 4
-Builds and installs the FCCAnalyses framework\&.
-.RE
-.PP
-\fBfccanalysis-pin\fR
-.RS 4
+.SH SUB-COMMANDS
+.TP
+.B fccanalysis-init
+[Experimental] Generate a RDataFrame based FCC analysis package\&.
+.TP
+.B fccanalysis-build
+Helper to build and install the FCCAnalyses framework\&.
+.TP
+.B fccanalysis-pin
 Pin analysis to the currently sourced version of the Key4hep stack\&.
-.RE
-.PP
-\fBfccanalysis-run\fR(1)
-.RS 4
-Runs the stage analysis file provided\&.
-.RE
-.PP
-\fBfccanalysis-final\fR
-.RS 4
-Runs the finalization analysis file provided\&.
-.RE
-.PP
-\fBfccanalysis-plots\fR
-.RS 4
+.TP
+.B fccanalysis-run
+Runs the analysis script provided\&.
+.TP
+.B fccanalysis-final
+Runs the finalization analysis script provided\&.
+.TP
+.B fccanalysis-plots
 Generate plots based on the plots analysis file provided\&.
-.RE
+.TP
+.B fccanalysis-test
+Helper to run tests of the full FCCAnalyses framework\&.
 .SH SEE ALSO
-fccanalysis-run(1), fccanalysis-file(7)
+fccanalysis\-run(1), fccanalysis\-script(7)
 .SH BUGS
 Many
 .SH AUTHORS
 There are many contributors to the FCCAnalyses framework, but the principal
 authors are:
-.br
-.RS 4
+.in +4
 Clement Helsens
 .br
 Valentin Volk
 .br
 Gerardo Ganis
-.RE
 .SH FCCANALYSES
 Part of the FCCAnalyses framework\&.
 .SH LINKS
 .PP
-1\&. FCCAnalyses webpage
-.RS 4
-https://hep\-fcc\&.github\&.io/FCCAnalyses/
-.RE
+.UR https://hep-fcc\&.github\&.io/FCCAnalyses/
+FCCAnalyses webpage
+.UE
 .PP
-2\&. FCCAnalysises GitHub repository
-.RS
-https://github\&.com/HEP-FCC/FCCAnalyses/
-.RE
+.UR https://github\&.com/HEP\-FCC/FCCAnalyses/
+FCCAnalyses GitHub repository
+.UE
 .PP
-3\&. FCCSW Forum
-.RS
-https://fccsw\-forum\&.web\&.cern\&.ch/
-.RE
+.UR https://fccsw\-forum\&.web\&.cern\&.ch/
+FCCSW Forum
+.UE
+.SH CONTACT
+.PP
+.MT FCC-PED-SoftwareAndComputing-Analysis@cern.ch
+FCC-PED-SoftwareAndComputing-Analysis
+.ME
diff --git a/man/man7/fccanalysis-file.7 b/man/man7/fccanalysis-file.7
deleted file mode 100644
index 1c535081e9..0000000000
--- a/man/man7/fccanalysis-file.7
+++ /dev/null
@@ -1,122 +0,0 @@
-.\" Manpage for fccanalysis-file
-.\" Contact fcc-experiments-sw-dev@cern.ch to correct errors or typos.
-.TH FCCANALYSIS\-FILE 7 "24 May 2023" "0.8.0" "fccanalysis-file man page"
-.SH NAME
-fccanalysis\-file \- analysis file specification
-.SH SYNOPSIS
-.sp
-*
-.sp
-.SH DESCRIPTION
-The analysis file is expected to be a valid Python script containing either
-part of or the full analysis. There are two basic modes how to run a FCC
-analysis, one is to run in the managed mode like so:
-.PP
-.RS 4
-fcc run
-.RE
-.PP
-or
-.PP
-.RS 4
-fcc final
-.RE
-.PP
-where user needs to provide minimal number of variables and settings. In this
-mode the RDataFrame is managed for the user and it can be controlled by defining
-several global attributes in the analysis file. The other mode is to run the
-analysis file as regular python script:
-.PP
-.RS 4
-python
-.RE
-.PP
-here user has full control over the RDataFrame, but has to create all necessary
-scaffolding\&.
-.SH ATTRIBUTES
-In case of running the FCCAnalysis in the managed mode user can use the
-following global attributes to control the behavior of the analysis.
-.PP
-\fBprocDict\fR
-.RS 4
-This variable controls which process dictionary will be used. It can be either
-simple file name, absolute path or url. In the case of simple filename, the file
-is being searched for first in the working directory and then at the locations
-indicated in the $FCCDICTSDIR environment variable.
-.RE
-.PP
-\fBprodTag\fR
-.RS 4
-Provides information where to find input files. There are several way how to
-find the information, one of them uses YAML file which is being searched for in
-the subfolders of $FCCDICTSDIR.
-.RE
-.PP
-\fBprocessList\fR
-.RS 4
-Dictionary of process samples to be run over. Each process can have several
-parameters:
-.br
-\fIfraction\fR
-.RS 4
-The analysis will run over reduced number of input files roughly corresponding
-to the fraction of total events specified\&.
-.br
-Default value: 1 (full process sample)
-.RE
-\fIoutput\fR
-.RS 4
-Specifies the stem for the output file(s)\&. The stem will be used to create
-output directory if there is more than one chunk or as a filename if there is
-only one\&.
-.br
-Default value: output\&.root
-.RE
-\fIchunks\fR
-.RS 4
-The analysis RDataFrame can be split into several chunks\&.
-.br
-Default value: 1
-.RE
-.RE
-.PP
-\fBoutputDir\fR
-.RS 4
-User can specify the directory for the output files. The output directory can be
-overwriten by specifiing absolute path with `\-\-output` commandline argument\&.
-.RE
-.PP
-This section is under construction. You are invited to help :)
-.SH SEE ALSO
-fccanalysis(1), fccanalysis-run(1)
-.SH BUGS
-Many
-.SH AUTHORS
-There are many contributors to the FCCAnalyses framework, but the principal
-authors are:
-.br
-.RS 4
-Clement Helsens
-.br
-Valentin Volk
-.br
-Gerardo Ganis
-.RE
-.SH FCCANALYSES
-Part of the FCCAnalyses framework\&.
-.SH LINKS
-.PP
-1\&. FCCAnalyses webpage
-.RS 4
-https://hep-fcc\&.github\&.io/FCCAnalyses/
-.RE
-.PP
-2\&. FCCAnalysises GitHub repository
-.RS
-https://github\&.com/HEP\-FCC/FCCAnalyses/
-.RE
-.PP
-3\&. FCCSW Forum
-.RS
-https://fccsw\-forum\&.web\&.cern\&.ch/
-.RE
diff --git a/man/man7/fccanalysis-script.7 b/man/man7/fccanalysis-script.7
new file mode 100644
index 0000000000..eb68ef1707
--- /dev/null
+++ b/man/man7/fccanalysis-script.7
@@ -0,0 +1,202 @@
+.\" Manpage for fccanalysis-script
+.\" Contact FCC-PED-SoftwareAndComputing-Analysis@cern.ch to correct errors or typos.
+.TH FCCANALYSIS\-SCRIPT 7 "17 Jan 2024" "0.8.0" "fccanalysis-script man page"
+.SH NAME
+\fBfccanalysis\-script\fR \(en analysis steering script specification
+.SH SYNOPSIS
+.IP
+*
+.SH DESCRIPTION
+.PP
+The analysis script is expected to be a valid Python script containing either
+part of or the full analysis. There are two basic modes of running an
+analysis; one is to run in the managed mode like so:
+.IP
+fccanalysis run \fIanalysis_script.py\fR
+
+.RE
+or
+.IP
+fccanalysis final \fIanalysis_script.py\fR
+
+.RE
+.PP
+where the user needs to provide a minimal number of variables and settings. In
+this mode the RDataFrame is managed for the user and can be controlled by
+defining several global attributes in the analysis script. The other mode is
+to run the analysis script as a regular Python script:
+.IP
+python \fIanalysis_script.py\fR
+.RE
+.PP
+here the user has full control over the RDataFrame, but has to create all
+necessary scaffolding\&.
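+.PP
+For illustration only, minimal scaffolding for the standalone mode might look
+like the following sketch (the input/output file names and the defined column
+are placeholders):
+.IP
+import ROOT
+
+# create the dataframe from the "events" tree
+df = ROOT.RDataFrame('events', 'input.root')
+# define new columns to build the analysis graph
+df = df.Define('n_muons', 'Muon0.size()')
+# write the selected columns into a new file
+df.Snapshot('events', 'output.root', ['n_muons'])
+.RE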
+.PP
+It is expected that the whole analysis will be split into several stages, and
+this can be done in one of two styles:
+.IP
+anascript_stage1.py \-> anascript_stage2.py \-> ... \-> anascript_stage_final.py \-> plots.py
+
+.RE
+or
+.IP
+analysis_histmaker.py \-> plots.py
+
+.RE
+In the case of the first style at least three stages are required
+(anascript_stage1.py, anascript_stage_final.py, plots.py) and there is no upper
+limit on the number of stages. In the case of the second style only two stages
+are required. The first style is called "staged analysis" and the second
+"histmaker analysis".
+.TP
+\fBstaged analysis\fR
+The analysis script needs to contain an \fIRDFanalysis\fR class of the
+following structure:
+.IP
+class RDFanalysis():
+    def analysers(df):
+        df2 = (
+            df
+            # define the muon collection
+            .Define("muons", "ReconstructedParticle::get(Muon0, ReconstructedParticles)")
+            ...
+        )
+        return df2
+    def output():
+        return ["muons", "muon_mass"]
+.TP
+\fBhistmaker analysis\fR
+The analysis script needs to contain a \fIbuild_graph\fR function of the
+following structure:
+.IP
+def build_graph(df, dataset):
+    results = []
+    df = df.Define("weight", "1.0")
+    weightsum = df.Sum("weight")
+    df = df.Define("muons", "FCCAnalyses::ReconstructedParticle::sel_p(20)(muons_all)")
+    ...
+    results.append(df.Histo1D(("muons_p_cut0", "", *bins_p_mu), "muons_p"))
+    return results, weightsum
+.TP
+\fBplots script\fR
+This stage requires neither the \fIRDFanalysis\fR class nor the
+\fIbuild_graph\fR function; it has its own set of attributes, please see the
+examples in the \fIexamples\fR directory.
+.SH ATTRIBUTES
+When running FCCAnalyses in the managed mode, the user can use the following
+global attributes to control the behavior of the analysis.
+.TP
+\fBprocessList\fR (mandatory)
+Dictionary of process samples to be run over. Each process can have several
+parameters:
+\fIfraction\fR
+.in +4
+The analysis will run over a reduced number of input files roughly
+corresponding to the fraction of total events specified\&.
+.br
+Default value: 1 (full process sample)
+.in -4
+\fIoutput\fR
+.in +4
+Specifies the stem for the output file(s)\&. The stem will be used to create
+an output directory if there is more than one chunk or as a filename if there
+is only one\&.
+.br
+Default value: output\&.root
+.in -4
+\fIchunks\fR
+.in +4
+The analysis RDataFrame can be split into several chunks\&.
+.br
+Default value: 1
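+.PP
+For illustration, a \fBprocessList\fR with two samples might look like this
+(the sample names are examples only):
+.IP
+processList = {
+    # run over 50% of the sample and split the output into 10 chunks
+    'p8_ee_ZZ_ecm240': {'fraction': 0.5, 'chunks': 10},
+    # run over the full sample and write the output to ZH_mumu.root
+    'p8_ee_ZH_ecm240': {'output': 'ZH_mumu'},
+}
+.RE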
+.TP
+\fBprodTag\fR (mandatory)
+Provides information about where to find input files. There are several ways
+to find this information; one of them uses a YAML file which is searched for
+in the subfolders of $FCCDICTSDIR\&.
+.TP
+\fBoutputDir\fR (mandatory)
+User can specify the directory for the output files. The output directory can
+be overwritten by specifying an absolute path with the `\-\-output`
+command-line argument\&.
+.TP
+\fBanalysisName\fR (optional)
+Optional name for the analysis\&.
+.br
+Default value: empty string
+.TP
+\fBnCPUS\fR (optional)
+Number of threads the RDataFrame will use\&.
+.br
+Default value: 4
+.TP
+\fBrunBatch\fR (optional)
+Run the analysis on the HTCondor batch system.
+.br
+Default value: False
+.TP
+\fBbatchQueue\fR (optional)
+Batch queue name when running on HTCondor.
+.br
+Default value: "longlunch"
+.TP
+\fBcompGroup\fR (optional)
+Computing account when running on HTCondor.
+.br
+Default value: "group_u_FCC.local_gen"
+.TP
+\fBoutputDirEos\fR (optional)
+Output directory on EOS; if specified, files will be copied there once the
+batch job is done.
+.br
+Default value: empty string
+.TP
+\fBeosType\fR (mandatory if \fIoutputDirEos\fR is used)
+Type of the EOS proxy to be used.
+.br
+Default value: empty string
+.TP
+\fBtestFile\fR (optional)
+Location of the test file.
+.br
+Default value: empty string
+.TP
+.B procDict
+This variable controls which process dictionary will be used. It can be either
+a simple file name, an absolute path, or a URL. In the case of a simple file
+name, the file is searched for first in the working directory and then at the
+locations indicated in the $FCCDICTSDIR environment variable.
+.PP
+This section is under construction. You are invited to help :)
+.SH SEE ALSO
+fccanalysis(1), fccanalysis-run(1)
+.SH BUGS
+Many
+.SH AUTHORS
+There are many contributors to the FCCAnalyses framework, but the principal
+authors are:
+.in +4
+Clement Helsens
+.br
+Valentin Volk
+.br
+Gerardo Ganis
+.SH FCCANALYSES
+Part of the FCCAnalyses framework\&.
+.SH LINKS
+.PP
+.UR https://hep-fcc\&.github\&.io/FCCAnalyses/
+FCCAnalyses webpage
+.UE
+.PP
+.UR https://github\&.com/HEP\-FCC/FCCAnalyses/
+FCCAnalyses GitHub repository
+.UE
+.PP
+.UR https://fccsw\-forum\&.web\&.cern\&.ch/
+FCCSW Forum
+.UE
+.SH CONTACT
+.PP
+.MT FCC-PED-SoftwareAndComputing-Analysis@cern.ch
+FCC-PED-SoftwareAndComputing-Analysis
+.ME
diff --git a/python/FCCAnalysisRun.py b/python/FCCAnalysisRun.py
deleted file mode 100644
index b1f7072497..0000000000
--- a/python/FCCAnalysisRun.py
+++ /dev/null
@@ -1,1133 +0,0 @@
-import os
-import sys
-import time
-import yaml
-import glob
-import json
-import logging
-import subprocess
-import importlib.util
-import datetime
-import numpy as np
-
-import ROOT
-from anafile import getElement, getElementDict
-from process import getProcessInfo, get_process_dict
-
-LOGGER = logging.getLogger('FCCAnalyses.run')
-
-ROOT.gROOT.SetBatch(True)
-
-# __________________________________________________________
-def get_entries(infilepath):
-    '''
-    Get number of original entries and number of actual entries in the file
-    '''
-    infile = ROOT.TFile.Open(infilepath)
-    infile.cd()
-
-    processEvents = 0
-    try:
-        processEvents = infile.Get('eventsProcessed').GetVal()
-    except AttributeError:
-        LOGGER.warning('Input file is missing information about '
-                       'original number of events!')
-
-    eventsTTree = 0
-    try:
-        eventsTTree = infile.Get("events").GetEntries()
-    except AttributeError:
-        LOGGER.error('Input file is missing "events" TTree!\nAborting...')
-        infile.Close()
-        sys.exit(3)
-
-    infile.Close()
-
-    return processEvents, eventsTTree
-
-
-# __________________________________________________________
-def getsubfileList(in_file_list, event_list, fraction):
-    nevts_total = sum(event_list)
-    nevts_target = int(nevts_total * fraction)
-
-    if nevts_target <= 0:
-        LOGGER.error('The reduction fraction %f too stringent, no events '
-                     'left!\nAborting...', fraction)
-        sys.exit(3)
-
-    nevts_real = 0
-    out_file_list = []
-    for i in range(len(event_list)):
-        if nevts_real >= nevts_target:
-            break
-        nevts_real += event_list[i]
-        out_file_list.append(in_file_list[i])
-
-    info_msg = f'Reducing the input file list by fraction "{fraction}" of '
-    info_msg += 'total events:\n\t'
-    info_msg += f'- total number of events: {nevts_total:,}\n\t'
-    info_msg += f'- targeted number of events: {nevts_target:,}\n\t'
-    info_msg += '- number of events in the resulting file list: '
-    info_msg += f'{nevts_real:,}\n\t'
-    info_msg += '- number of files after reduction: '
-    info_msg += str((len(out_file_list)))
-    LOGGER.info(info_msg)
-
-    return out_file_list
-
-
-# 
__________________________________________________________ -def getchunkList(fileList, chunks): - chunk_list = list(np.array_split(fileList, chunks)) - chunk_list = [chunk for chunk in chunk_list if chunk.size > 0] - - return chunk_list - - -# __________________________________________________________ -def saveBenchmark(outfile, benchmark): - benchmarks = [] - try: - with open(outfile, 'r') as benchin: - benchmarks = json.load(benchin) - except OSError: - pass - - benchmarks = [b for b in benchmarks if b['name'] != benchmark['name']] - benchmarks.append(benchmark) - - with open(outfile, 'w') as benchout: - json.dump(benchmarks, benchout, indent=2) - - -#__________________________________________________________ -def getCommandOutput(command): - p = subprocess.Popen(command, shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE,universal_newlines=True) - (stdout,stderr) = p.communicate() - return {"stdout":stdout, "stderr":stderr, "returncode":p.returncode} - - -# __________________________________________________________ -def SubmitToCondor(cmd,nbtrials): - submissionStatus=0 - cmd=cmd.replace('//','/') # -> dav : is it needed? - for i in range(nbtrials): - outputCMD = getCommandOutput(cmd) - stderr=outputCMD["stderr"].split('\n') - stdout=outputCMD["stdout"].split('\n') # -> dav : is it needed? - - if len(stderr)==1 and stderr[0]=='' : - LOGGER.info('GOOD SUBMISSION') - submissionStatus=1 - else: - LOGGER.warning('Error while submitting, retrying...\n\t' - 'Trial: %i / %i\n\tError: %s', i, nbtrials, stderr) - time.sleep(10) - - if submissionStatus == 1: - return 1 - - if i == nbtrials-1: - LOGGER.error('Failed submitting after: %i trials, stop trying to ' - 'submit', nbtrials) - return 0 - -# __________________________________________________________ -def initialize(args, rdfModule, analysisFile): - - # for convenience and compatibility with user code - ROOT.gInterpreter.Declare("using namespace FCCAnalyses;") - geometryFile = getElement(rdfModule, "geometryFile") - readoutName = getElement(rdfModule, "readoutName") - if geometryFile!="" and readoutName!="": - ROOT.CaloNtupleizer.loadGeometry(geometryFile, readoutName) - - # set multithreading (no MT if number of events is specified) - ncpus = 1 - if args.nevents < 0: - if isinstance(args.ncpus, int) and args.ncpus >= 1: - ncpus = args.ncpus - else: - ncpus = getElement(rdfModule, "nCPUS") - if ncpus < 0: # use all available threads - ROOT.EnableImplicitMT() - ncpus = ROOT.GetThreadPoolSize() - ROOT.ROOT.EnableImplicitMT(ncpus) - ROOT.EnableThreadSafety() - - if ROOT.IsImplicitMTEnabled(): - LOGGER.info('Multithreading enabled. Running over %i threads', - ROOT.GetThreadPoolSize()) - else: - LOGGER.info('No multithreading enabled. Running in single thread...') - - # custom header files - includePaths = getElement(rdfModule, "includePaths") - if includePaths: - ROOT.gInterpreter.ProcessLine(".O3") - basepath = os.path.dirname(os.path.abspath(analysisFile))+"/" - for path in includePaths: - LOGGER.info('Loading %s...', path) - ROOT.gInterpreter.Declare(f'#include "{basepath}/{path}"') - - # check if analyses plugins need to be loaded before anything - # still in use? 
- analysesList = getElement(rdfModule, "analysesList") - if analysesList and len(analysesList) > 0: - _ana = [] - for analysis in analysesList: - LOGGER.info('Load cxx analyzers from %s...', analysis) - if analysis.startswith('libFCCAnalysis_'): - ROOT.gSystem.Load(analysis) - else: - ROOT.gSystem.Load(f'libFCCAnalysis_{analysis}') - if not hasattr(ROOT, analysis): - ROOT.error('Analysis %s not properly loaded!\nAborting...', - analysis) - sys.exit(4) - _ana.append(getattr(ROOT, analysis).dictionary) - -# __________________________________________________________ -def runRDF(rdfModule, inputlist, outFile, nevt, args): - df = ROOT.RDataFrame("events", inputlist) - - # limit number of events processed - if args.nevents > 0: - df = df.Range(0, args.nevents) - - try: - df2 = getElement(rdfModule.RDFanalysis, "analysers")(df) - - branch_list = ROOT.vector('string')() - blist = getElement(rdfModule.RDFanalysis, "output")() - for bname in blist: - branch_list.push_back(bname) - - df2.Snapshot("events", outFile, branch_list) - except Exception as excp: - LOGGER.error('During the execution of the analysis file exception ' - 'occurred:\n%s', excp) - sys.exit(3) - - return df2.Count() - - -# __________________________________________________________ -def sendToBatch(rdfModule, chunkList, process, analysisFile): - localDir = os.environ["LOCAL_DIR"] - current_date = datetime.datetime.fromtimestamp( - datetime.datetime.now().timestamp()).strftime('%Y-%m-%d_%H-%M-%S') - logDir = localDir + "/BatchOutputs/{}/{}".format(current_date, process) - if not os.path.exists(logDir): - os.system("mkdir -p {}".format(logDir)) - - # Making sure the FCCAnalyses libraries are compiled and installed - try: - subprocess.check_output(['make', 'install'], - cwd=localDir+'/build', - stderr=subprocess.DEVNULL - ) - except subprocess.CalledProcessError as e: - LOGGER.error('The FCCanalyses libraries are not properly build and ' - 'installed!\nAborting job submission...') - sys.exit(3) - - outputDir = getElement(rdfModule, "outputDir") - outputDirEos = getElement(rdfModule, "outputDirEos") - eosType = getElement(rdfModule, "eosType") - userBatchConfig = getElement(rdfModule, "userBatchConfig") - - if outputDir!="" and outputDir[-1]!="/": outputDir+="/" - - condor_file_str = '' - for ch in range(len(chunkList)): - frunname = '{}/job{}_chunk{}.sh'.format(logDir, process, ch) - LOGGER.info('Script to run %s: ', frunname) - condor_file_str += frunname + " " - - frun = None - try: - frun = open(frunname, 'w') - except IOError as e: - LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) - time.sleep(10) - frun = open(frunname, 'w') - - subprocess.getstatusoutput('chmod 777 %s'%(frunname)) - frun.write('#!/bin/bash\n') - frun.write('source ' + localDir + '/setup.sh\n') - - # add userBatchConfig if any - if userBatchConfig != "": - if not os.path.isfile(userBatchConfig): - LOGGER.warning('userBatchConfig file does not exist, will not ' - 'add it to default config, please check') - else: - configFile=open(userBatchConfig) - for line in configFile: - frun.write(line+'\n') - - frun.write('mkdir job{}_chunk{}\n'.format(process,ch)) - frun.write('cd job{}_chunk{}\n'.format(process,ch)) - - if not os.path.isabs(outputDir): - frun.write(localDir + '/bin/fccanalysis run {} --batch --output {}chunk{}.root --files-list '.format(analysisFile, outputDir, ch)) - else: - frun.write(localDir + '/bin/fccanalysis run {} --batch --output {}{}/chunk{}.root --files-list '.format(analysisFile, outputDir, process,ch)) - - for ff in 
range(len(chunkList[ch])): - frun.write(' %s'%(chunkList[ch][ff])) - frun.write('\n') - if not os.path.isabs(outputDir): - if outputDirEos=="": - frun.write('cp {}chunk{}.root {}/{}/{}/chunk{}.root\n'.format(outputDir,ch,localDir,outputDir,process,ch)) - else: - frun.write('xrdcp {}chunk{}.root root://{}.cern.ch/{}/{}/chunk{}.root\n'.format(outputDir,ch,eosType,outputDirEos,process,ch)) - else: - if outputDirEos!="": - frun.write('xrdcp {}chunk{}.root root://{}.cern.ch/{}/{}/chunk{}.root\n'.format(outputDir,ch,eosType,outputDirEos,process,ch)) - - frun.close() - - - condor_file_str=condor_file_str.replace("//","/") - frunname_condor = 'job_desc_{}.cfg'.format(process) - frunfull_condor = '%s/%s'%(logDir,frunname_condor) - frun_condor = None - try: - frun_condor = open(frunfull_condor, 'w') - except IOError as e: - LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) - time.sleep(10) - frun_condor = open(frunfull_condor, 'w') - sysVer_str = '' - try: - f_make = open(localDir+'/build/CMakeFiles/CMakeConfigureLog.yaml', 'r') - except IOError as e: - LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) - LOGGER.warning('File not found: ' + localDir+'/build/CMakeFiles/CMakeConfigureLog.yaml') - else: - with open(localDir+'/build/CMakeFiles/CMakeConfigureLog.yaml', 'r') as makeConfig: - make_content = makeConfig.read() - if 'centos7' in make_content: - sysVer_str = '(OpSysAndVer =?= "CentOS7")' + ' &&' - if 'almalinux9' in make_content: - sysVer_str = '(OpSysAndVer =?= "AlmaLinux9")' + ' &&' - if sysVer_str == '': - LOGGER.warning('FCCAnalysis was compiled in an environment not available in lxplus HTcondor. Please check.' - 'Submitting jobs to default operating system. There may be compatibility issues.') - subprocess.getstatusoutput('chmod 777 {}'.format(frunfull_condor)) - frun_condor.write('executable = $(filename)\n') - frun_condor.write('Log = {}/condor_job.{}.$(ClusterId).$(ProcId).log\n'.format(logDir,process)) - frun_condor.write('Output = {}/condor_job.{}.$(ClusterId).$(ProcId).out\n'.format(logDir,process)) - frun_condor.write('Error = {}/condor_job.{}.$(ClusterId).$(ProcId).error\n'.format(logDir,process)) - frun_condor.write('getenv = False\n') - frun_condor.write('environment = "LS_SUBCWD={}"\n'.format(logDir)) # not sure - frun_condor.write('requirements = ({} (Machine =!= LastRemoteHost) && (TARGET.has_avx2 =?= True) )\n'.format(sysVer_str)) - frun_condor.write('on_exit_remove = (ExitBySignal == False) && (ExitCode == 0)\n') - frun_condor.write('max_retries = 3\n') - frun_condor.write('+JobFlavour = "{}"\n'.format(getElement(rdfModule, "batchQueue"))) - frun_condor.write('+AccountingGroup = "{}"\n'.format(getElement(rdfModule, "compGroup"))) - frun_condor.write('RequestCpus = {}\n'.format(getElement(rdfModule, "nCPUS"))) - frun_condor.write('queue filename matching files {}\n'.format(condor_file_str)) - frun_condor.close() - - cmdBatch="condor_submit {}".format(frunfull_condor) - LOGGER.info('Batch command: %s', cmdBatch) - job=SubmitToCondor(cmdBatch, 10) - - -#__________________________________________________________ -def apply_filepath_rewrites(filepath): - ''' - Apply path rewrites if applicable. 
- ''' - # Stripping leading and trailing white spaces - filepath_stripped = filepath.strip() - # Stripping leading and trailing slashes - filepath_stripped = filepath_stripped.strip('/') - - # Splitting the path along slashes - filepath_splitted = filepath_stripped.split('/') - - if len(filepath_splitted) > 1 and filepath_splitted[0] == 'eos': - if filepath_splitted[1] == 'experiment': - filepath = 'root://eospublic.cern.ch//' + filepath_stripped - elif filepath_splitted[1] == 'user': - filepath = 'root://eosuser.cern.ch//' + filepath_stripped - elif 'home-' in filepath_splitted[1]: - filepath = 'root://eosuser.cern.ch//eos/user/' + \ - filepath_stripped.replace('eos/home-', '') - else: - LOGGER.warning('Unknown EOS path type!\nPlease check with the ' - 'developers as this might impact performance of ' - 'the analysis.') - return filepath - - -# __________________________________________________________ -def runLocal(rdfModule, infile_list, args): - # Create list of files to be processed - info_msg = 'Creating dataframe object from files:\n\t' - file_list = ROOT.vector('string')() - nevents_orig = 0 # Amount of events processed in previous stage (= 0 if it is the first stage) - nevents_local = 0 # The amount of events in the input file(s) - for filepath in infile_list: - - filepath = apply_filepath_rewrites(filepath) - - file_list.push_back(filepath) - info_msg += f'- {filepath}\t\n' - infile = ROOT.TFile.Open(filepath, 'READ') - try: - nevents_orig += infile.Get('eventsProcessed').GetVal() - except AttributeError: - pass - - try: - nevents_local += infile.Get("events").GetEntries() - except AttributeError: - LOGGER.error('Input file:\n%s\nis missing events TTree!\n' - 'Aborting...', filepath) - infile.Close() - sys.exit(3) - - LOGGER.info(info_msg) - - # Adjust number of events in case --nevents was specified - if args.nevents > 0 and args.nevents < nevents_local: - nevents_local = args.nevents - - if nevents_orig > 0: - LOGGER.info('Number of events:\n\t- original: %s\n\t- local: %s', - f'{nevents_orig:,}', f'{nevents_local:,}') - else: - LOGGER.info('Number of local events: %s', f'{nevents_local:,}') - - output_dir = getElement(rdfModule, "outputDir") - if not args.batch: - if os.path.isabs(args.output): - LOGGER.warning('Provided output path is absolute, "outputDir" ' - 'from analysis script will be ignored!') - outfile_path = os.path.join(output_dir, args.output) - else: - outfile_path = args.output - LOGGER.info('Output file path:\n%s', outfile_path) - - # Run RDF - start_time = time.time() - outn = runRDF(rdfModule, file_list, outfile_path, nevents_local, args) - outn = outn.GetValue() - - outfile = ROOT.TFile(outfile_path, 'update') - param = ROOT.TParameter(int)( - 'eventsProcessed', - nevents_orig if nevents_orig != 0 else nevents_local) - param.Write() - outfile.Write() - outfile.Close() - - elapsed_time = time.time() - start_time - info_msg = f"{' SUMMARY ':=^80}\n" - info_msg += 'Elapsed time (H:M:S): ' - info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time)) - info_msg += '\nEvents processed/second: ' - info_msg += f'{int(nevents_local/elapsed_time):,}' - info_msg += f'\nTotal events processed: {int(nevents_local):,}' - info_msg += f'\nNo. 
result events: {int(outn):,}' - if nevents_local > 0: - info_msg += f'\nReduction factor local: {outn/nevents_local}' - if nevents_orig > 0: - info_msg += f'\nReduction factor total: {outn/nevents_orig}' - info_msg += '\n' - info_msg += 80 * '=' - info_msg += '\n' - LOGGER.info(info_msg) - - if args.bench: - analysis_name = getElement(rdfModule, 'analysisName') - if not analysis_name: - analysis_name = args.anafile_path - - bench_time = {} - bench_time['name'] = 'Time spent running the analysis: ' - bench_time['name'] += analysis_name - bench_time['unit'] = 'Seconds' - bench_time['value'] = elapsed_time - bench_time['range'] = 10 - bench_time['extra'] = 'Analysis path: ' + args.anafile_path - saveBenchmark('benchmarks_smaller_better.json', bench_time) - - bench_evt_per_sec = {} - bench_evt_per_sec['name'] = 'Events processed per second: ' - bench_evt_per_sec['name'] += analysis_name - bench_evt_per_sec['unit'] = 'Evt/s' - bench_evt_per_sec['value'] = nevents_local / elapsed_time - bench_time['range'] = 1000 - bench_time['extra'] = 'Analysis path: ' + args.anafile_path - saveBenchmark('benchmarks_bigger_better.json', bench_evt_per_sec) - - -#__________________________________________________________ -def runStages(args, rdfModule, preprocess, analysisFile): - ''' - Run regular stage. - ''' - - # Set ncpus, load header files, custom dicts, ... - initialize(args, rdfModule, analysisFile) - - # Check if outputDir exist and if not create it - outputDir = getElement(rdfModule, "outputDir") - if not os.path.exists(outputDir) and outputDir: - os.system("mkdir -p {}".format(outputDir)) - - # Check if outputDir exist and if not create it - outputDirEos = getElement(rdfModule,"outputDirEos") - if not os.path.exists(outputDirEos) and outputDirEos: - os.system("mkdir -p {}".format(outputDirEos)) - - # Check if test mode is specified, and if so run the analysis on it (this - # will exit after) - if args.test: - LOGGER.info('Running over test file...') - testfile_path = getElement(rdfModule, "testFile") - directory, _ = os.path.split(args.output) - if directory: - os.system("mkdir -p {}".format(directory)) - runLocal(rdfModule, [testfile_path], args) - sys.exit(0) - - # Check if files are specified, and if so run the analysis on it/them (this - # will exit after) - if len(args.files_list) > 0: - LOGGER.info('Running over files provided in command line argument...') - directory, _ = os.path.split(args.output) - if directory: - os.system(f'mkdir -p {directory}') - runLocal(rdfModule, args.files_list, args) - sys.exit(0) - - # Check if batch mode and set start and end file from original list - run_batch = getElement(rdfModule, "runBatch") - - # Check if the process list is specified - process_list = getElement(rdfModule, "processList") - - for process_name in process_list: - file_list, event_list = getProcessInfo( - process_name, - getElement(rdfModule, "prodTag"), - getElement(rdfModule, "inputDir")) - - if len(file_list) <= 0: - LOGGER.error('No files to process!\nAborting...') - sys.exit(3) - - # Determine the fraction of the input to be processed - fraction = 1 - if getElementDict(process_list[process_name], 'fraction'): - fraction = getElementDict(process_list[process_name], 'fraction') - # Put together output path - output_stem = process_name - if getElementDict(process_list[process_name], 'output'): - output_stem = getElementDict(process_list[process_name], 'output') - # Determine the number of chunks the output will be split into - chunks = 1 - if getElementDict(process_list[process_name], 
'chunks'): - chunks = getElementDict(process_list[process_name], 'chunks') - - info_msg = f'Adding process "{process_name}" with:' - if fraction < 1: - info_msg += f'\n\t- fraction: {fraction}' - info_msg += f'\n\t- number of files: {len(file_list):,}' - info_msg += f'\n\t- output stem: {output_stem}' - if chunks > 1: - info_msg += f'\n\t- number of chunks: {chunks}' - - if fraction < 1: - file_list = getsubfileList(file_list, event_list, fraction) - - chunk_list = [file_list] - if chunks > 1: - chunk_list = getchunkList(file_list, chunks) - LOGGER.info('Number of the output files: %s', f'{len(chunk_list):,}') - - # Create directory if more than 1 chunk - if chunks > 1: - output_directory = os.path.join(outputDir, output_stem) - - if not os.path.exists(output_directory): - os.system("mkdir -p {}".format(output_directory)) - - if run_batch: - # Sending to the batch system - LOGGER.info('Running on the batch...') - if len(chunk_list) == 1: - LOGGER.warning('\033[4m\033[1m\033[91mRunning on batch with ' - 'only one chunk might not be optimal\033[0m') - - sendToBatch(rdfModule, chunk_list, process_name, analysisFile) - - else: - # Running locally - LOGGER.info('Running locally...') - if len(chunk_list) == 1: - args.output = '{}.root'.format(output_stem) - runLocal(rdfModule, chunk_list[0], args) - else: - for index, chunk in enumerate(chunk_list): - args.output = '{}/chunk{}.root'.format(output_stem, index) - runLocal(rdfModule, chunk, args) - - -# __________________________________________________________ -def testfile(f): - tf=ROOT.TFile.Open(f) - tt=None - try: - tt=tf.Get("events") - if tt == None: - LOGGER.warning('File does not contains events, selection was too ' - 'tight, skipping it: %s', f) - return False - except IOError as e: - LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) - return False - except ValueError: - LOGGER.warning('Could read the file') - return False - except: - LOGGER.warning('Unexpected error: %s\nfile ===%s=== must be deleted', - sys.exc_info()[0], f) - return False - return True - - -# __________________________________________________________ -def runFinal(rdfModule): - proc_dict_location = getElement(rdfModule, "procDict", True) - if not proc_dict_location: - LOGGER.error('Location of the procDict not provided!\nAborting...') - sys.exit(3) - - procDict = get_process_dict(proc_dict_location) - - procDictAdd = getElement(rdfModule, "procDictAdd", True) - for procAdd in procDictAdd: - if getElementDict(procDict, procAdd) == None: - procDict[procAdd] = procDictAdd[procAdd] - - ROOT.ROOT.EnableImplicitMT(getElement(rdfModule, "nCPUS", True)) - - nevents_real = 0 - start_time = time.time() - - processEvents={} - eventsTTree={} - processList={} - saveTab=[] - efficiencyList=[] - - inputDir = getElement(rdfModule,"inputDir", True) - if not inputDir: - LOGGER.error('The inputDir variable is mandatory for the final stage ' - 'of the analysis!\nAborting...') - sys.exit(3) - - if inputDir[-1]!="/":inputDir+="/" - - outputDir = getElement(rdfModule,"outputDir", True) - if outputDir!="": - if outputDir[-1]!="/":outputDir+="/" - - if not os.path.exists(outputDir) and outputDir!='': - os.system("mkdir -p {}".format(outputDir)) - - cutList = getElement(rdfModule,"cutList", True) - length_cuts_names = max([len(cut) for cut in cutList]) - cutLabels = getElement(rdfModule,"cutLabels", True) - - # save a table in a separate tex file - saveTabular = getElement(rdfModule,"saveTabular", True) - if saveTabular: - # option to rewrite the cuts in a better way for the table. 
otherwise, take them from the cutList - if cutLabels: - cutNames = list(cutLabels.values()) - else: - cutNames = [cut for cut in cutList] - - cutNames.insert(0, ' ') - saveTab.append(cutNames) - efficiencyList.append(cutNames) - - for process_id in getElement(rdfModule, "processList", True): - processEvents[process_id] = 0 - eventsTTree[process_id] = 0 - - fileListRoot = ROOT.vector('string')() - infilepath = inputDir + process_id + '.root' # input file - if not os.path.isfile(infilepath): - LOGGER.warning('File %s does not exist!\nTrying if it is a ' - 'directory as it was processed with batch.', - infilepath) - else: - LOGGER.info('Open file:\n\t%s', infilepath) - processEvents[process_id], eventsTTree[process_id] = get_entries(infilepath) - fileListRoot.push_back(infilepath) - - indirpath = inputDir + process_id - if os.path.isdir(indirpath): - info_msg = f'Open directory {indirpath}' - flist = glob.glob(indirpath + '/chunk*.root') - for filepath in flist: - info_msg += '\n\t' + filepath - chunkProcessEvents, chunkEventsTTree = get_entries(filepath) - processEvents[process_id] += chunkProcessEvents - eventsTTree[process_id] += chunkEventsTTree - fileListRoot.push_back(filepath) - LOGGER.info(info_msg) - processList[process_id] = fileListRoot - - info_msg = 'Processed events:' - for process_id, n_events in processEvents.items(): - info_msg += f'\n\t- {process_id}: {n_events:,}' - LOGGER.info(info_msg) - info_msg = 'Events in the TTree:' - for process_id, n_events in eventsTTree.items(): - info_msg += f'\n\t- {process_id}: {n_events:,}' - LOGGER.info(info_msg) - - histoList = getElement(rdfModule, "histoList", True) - doScale = getElement(rdfModule, "doScale", True) - intLumi = getElement(rdfModule, "intLumi", True) - - doTree = getElement(rdfModule, "doTree", True) - for pr in getElement(rdfModule, "processList", True): - LOGGER.info('Running over process: %s', pr) - - if processEvents[pr] == 0: - LOGGER.error('Can\'t scale histograms, the number of processed ' - 'events for the process "%s" seems to be zero!', pr) - sys.exit(3) - - RDF = ROOT.ROOT.RDataFrame - df = RDF("events", processList[pr]) - defineList = getElement(rdfModule,"defineList", True) - if len(defineList) > 0: - LOGGER.info('Registering extra DataFrame defines...') - for define in defineList: - df = df.Define(define, defineList[define]) - - fout_list = [] - histos_list = [] - tdf_list = [] - count_list = [] - cuts_list = [] - cuts_list.append(pr) - eff_list = [] - eff_list.append(pr) - - # Define all histos, snapshots, etc... 
- LOGGER.info('Defining snapshots and histograms') - for cut in cutList: - fout = outputDir+pr+'_'+cut+'.root' #output file for tree - fout_list.append(fout) - - df_cut = df.Filter(cutList[cut]) - count_list.append(df_cut.Count()) - - histos = [] - - for v in histoList: - if "name" in histoList[v]: # default 1D histogram - model = ROOT.RDF.TH1DModel(v, ";{};".format(histoList[v]["title"]), histoList[v]["bin"], histoList[v]["xmin"], histoList[v]["xmax"]) - histos.append(df_cut.Histo1D(model,histoList[v]["name"])) - elif "cols" in histoList[v]: # multi dim histogram (1, 2 or 3D) - cols = histoList[v]['cols'] - bins = histoList[v]['bins'] - bins_unpacked = tuple([i for sub in bins for i in sub]) - if len(bins) != len(cols): - LOGGER.error('Amount of columns should be equal to ' - 'the amount of bin configs!\nAborting...') - sys.exit(3) - if len(cols) == 1: - histos.append(df_cut.Histo1D((v, "", *bins_unpacked), cols[0])) - elif len(cols) == 2: - histos.append(df_cut.Histo2D((v, "", *bins_unpacked), cols[0], cols[1])) - elif len(cols) == 3: - histos.append(df_cut.Histo3D((v, "", *bins_unpacked), cols[0], cols[1], cols[2])) - else: - LOGGER.error('Only 1, 2 or 3D histograms supported.') - sys.exit(3) - else: - LOGGER.error('Error parsing the histogram config. Provide ' - 'either name or cols.') - sys.exit(3) - histos_list.append(histos) - - if doTree: - opts = ROOT.RDF.RSnapshotOptions() - opts.fLazy = True - try: - snapshot_tdf = df_cut.Snapshot("events", fout, "", opts) - except Exception as excp: - LOGGER.error('During the execution of the final stage ' - 'exception occurred:\n%s', excp) - sys.exit(3) - - # Needed to avoid python garbage collector messing around with - # the snapshot - tdf_list.append(snapshot_tdf) - - # Now perform the loop and evaluate everything at once. - LOGGER.info('Evaluating...') - all_events = df.Count().GetValue() - LOGGER.info('Done') - - nevents_real += all_events - uncertainty = ROOT.Math.sqrt(all_events) - - if doScale: - all_events = all_events*1.*procDict[pr]["crossSection"]*procDict[pr]["kfactor"]*procDict[pr]["matchingEfficiency"]*intLumi/processEvents[pr] - uncertainty = ROOT.Math.sqrt(all_events)*procDict[pr]["crossSection"]*procDict[pr]["kfactor"]*procDict[pr]["matchingEfficiency"]*intLumi/processEvents[pr] - LOGGER.info('Printing scaled number of events!!!') - - cfn_width = 16 + length_cuts_names # Cutflow name width - info_msg = 'Cutflow:' - info_msg += f'\n\t{"All events":{cfn_width}} : {all_events:,}' - - if saveTabular: - cuts_list.append('{nevents:.2e} $\\pm$ {uncertainty:.2e}'.format(nevents=all_events,uncertainty=uncertainty)) # scientific notation - recomended for backgrounds - # cuts_list.append('{nevents:.3f} $\\pm$ {uncertainty:.3f}'.format(nevents=all_events,uncertainty=uncertainty)) # float notation - recomended for signals with few events - eff_list.append(1.) 
#start with 100% efficiency - - for i, cut in enumerate(cutList): - neventsThisCut = count_list[i].GetValue() - neventsThisCut_raw = neventsThisCut - uncertainty = ROOT.Math.sqrt(neventsThisCut_raw) - if doScale: - neventsThisCut = neventsThisCut*1.*procDict[pr]["crossSection"]*procDict[pr]["kfactor"]*procDict[pr]["matchingEfficiency"]*intLumi/processEvents[pr] - uncertainty = ROOT.Math.sqrt(neventsThisCut_raw)*procDict[pr]["crossSection"]*procDict[pr]["kfactor"]*procDict[pr]["matchingEfficiency"]*intLumi/processEvents[pr] - info_msg += f'\n\t{"After selection " + cut:{cfn_width}} : ' - info_msg += f'{neventsThisCut:,}' - - # Saving the number of events, uncertainty and efficiency for the output-file - if saveTabular and cut != 'selNone': - if neventsThisCut != 0: - cuts_list.append('{nevents:.2e} $\\pm$ {uncertainty:.2e}'.format(nevents=neventsThisCut,uncertainty=uncertainty)) # scientific notation - recomended for backgrounds - # cuts_list.append('{nevents:.3f} $\\pm$ {uncertainty:.3f}'.format(nevents=neventsThisCut,uncertainty=uncertainty)) # # float notation - recomended for signals with few events - prevNevents = cuts_list[-2].split() - eff_list.append('{eff:.3f}'.format(eff=1.*neventsThisCut/all_events)) - # if number of events is zero, the previous uncertainty is saved instead: - elif '$\\pm$' in cuts_list[-1]: - cut = (cuts_list[-1]).split() - cuts_list.append('$\\leq$ {uncertainty}'.format(uncertainty=cut[2])) - eff_list.append('0.') - else: - cuts_list.append(cuts_list[-1]) - eff_list.append('0.') - - LOGGER.info(info_msg) - - # And save everything - LOGGER.info('Saving the outputs...') - for i, cut in enumerate(cutList): - fhisto = outputDir+pr+'_'+cut+'_histo.root' #output file for histograms - tf = ROOT.TFile.Open(fhisto,'RECREATE') - for h in histos_list[i]: - try : - h.Scale(1.*procDict[pr]["crossSection"]*procDict[pr]["kfactor"]*procDict[pr]["matchingEfficiency"]/processEvents[pr]) - except KeyError: - LOGGER.warning( - 'No value defined for process %s in dictionary!', pr) - if h.Integral(0,-1)>0:h.Scale(1./h.Integral(0,-1)) - h.Write() - tf.Close() - - if doTree: - # test that the snapshot worked well - validfile = testfile(fout_list[i]) - if not validfile: continue - - if saveTabular and cut != 'selNone': - saveTab.append(cuts_list) - efficiencyList.append(eff_list) - - if saveTabular: - f = open(outputDir+"outputTabular.txt","w") - # Printing the number of events in format of a LaTeX table - print('\\begin{table}[H] \n \\centering \n \\resizebox{\\textwidth}{!}{ \n \\begin{tabular}{|l||',end='',file=f) - print('c|' * (len(cuts_list)-1),end='',file=f) - print('} \hline',file=f) - for i, row in enumerate(saveTab): - print(' ', end='', file=f) - print(*row, sep = ' & ', end='', file=f) - print(' \\\\ ', file=f) - if (i == 0): - print(' \\hline',file=f) - print(' \\hline \n \\end{tabular}} \n \\caption{Caption} \n \\label{tab:my_label} \n\\end{table}', file=f) - - # Efficiency: - print('\n\nEfficiency: ', file=f) - print('\\begin{table}[H] \n \\centering \n \\resizebox{\\textwidth}{!}{ \n \\begin{tabular}{|l||',end='',file=f) - print('c|' * (len(cuts_list)-1),end='',file=f) - print('} \hline',file=f) - for i in range(len(eff_list)): - print(' ', end='', file=f) - v = [row[i] for row in efficiencyList] - print(*v, sep = ' & ', end='', file=f) - print(' \\\\ ', file=f) - if (i == 0): - print(' \\hline',file=f) - print(' \\hline \n \\end{tabular}} \n \\caption{Caption} \n \\label{tab:my_label} \n\\end{table}', file=f) - f.close() - - elapsed_time = time.time() - start_time - - 
info_msg = f"{' SUMMARY ':=^80}\n" - info_msg += 'Elapsed time (H:M:S): ' - info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time)) - info_msg += '\nEvents processed/second: ' - info_msg += f'{int(nevents_real/elapsed_time):,}' - info_msg += f'\nTotal events processed: {nevents_real:,}' - info_msg += '\n' - info_msg += 80 * '=' - info_msg += '\n' - LOGGER.info(info_msg) - - -def runHistmaker(args, rdfModule, analysisFile): - - # set ncpus, load header files, custom dicts, ... - initialize(args, rdfModule, analysisFile) - - # load process dictionary - proc_dict_location = getElement(rdfModule, "procDict", True) - if not proc_dict_location: - LOGGER.error('Location of the procDict not provided.\nAborting...') - sys.exit(3) - - procDict = get_process_dict(proc_dict_location) - - # check if outputDir exist and if not create it - outputDir = getElement(rdfModule,"outputDir") - if not os.path.exists(outputDir) and outputDir!='': - os.system("mkdir -p {}".format(outputDir)) - - doScale = getElement(rdfModule,"doScale", True) - intLumi = getElement(rdfModule,"intLumi", True) - - # check if the process list is specified, and create graphs for them - processList = getElement(rdfModule,"processList") - graph_function = getattr(rdfModule, "build_graph") - results = [] # all the histograms - hweights = [] # all the weights - evtcounts = [] # event count of the input file - eventsProcessedDict = {} # number of events processed per process, in a potential previous step - for process in processList: - fileList, eventList = getProcessInfo(process, getElement(rdfModule,"prodTag"), getElement(rdfModule, "inputDir")) - if len(fileList) == 0: - LOGGER.error('No files to process!\nAborting...') - sys.exit(3) - processDict={} - fraction = 1 - output = process - chunks = 1 - try: - processDict = processList[process] - if getElementDict(processList[process], 'fraction') != None: fraction = getElementDict(processList[process], 'fraction') - if getElementDict(processList[process], 'output') != None: output = getElementDict(processList[process], 'output') - if getElementDict(processList[process], 'chunks') != None: chunks = getElementDict(processList[process], 'chunks') - except TypeError: - LOGGER.warning('No values set for process %s will use default ' - 'values!', process) - if fraction < 1:fileList = getsubfileList(fileList, eventList, fraction) - - # get the number of events processed, in a potential previous step - fileListRoot = ROOT.vector('string')() - nevents_meta = 0 # amount of events processed in previous stage (= 0 if it is the first stage) - for fileName in fileList: - fileName = apply_filepath_rewrites(fileName) - fileListRoot.push_back(fileName) - if getElement(rdfModule,"prodTag") == None: # skip check for processed events in case of first stage - tf=ROOT.TFile.Open(str(fileName),"READ") - tf.cd() - for key in tf.GetListOfKeys(): - if 'eventsProcessed' == key.GetName(): - nevents_meta += tf.eventsProcessed.GetVal() - break - if args.test: - break - eventsProcessedDict[process] = nevents_meta - info_msg = f'Add process "{process}" with:' - info_msg += f'\n\tfraction = {fraction}' - info_msg += f'\n\tnFiles = {len(fileListRoot):,}' - info_msg += f'\n\toutput = {output}\n\tchunks = {chunks}' - LOGGER.info(info_msg) - - df = ROOT.ROOT.RDataFrame("events", fileListRoot) - evtcount = df.Count() - res, hweight = graph_function(df, process) - results.append(res) - hweights.append(hweight) - evtcounts.append(evtcount) - - LOGGER.info('Starting the event loop...') - start_time = time.time() - 
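Nothing has actually run at this point: the Count() handles and the histograms returned by build_graph are only booked, and the RunGraphs call below then triggers every event loop in a single concurrent pass. A minimal sketch of the same lazy-booking pattern on toy dataframes (illustrative only, not the module's code):

import ROOT

ROOT.ROOT.EnableImplicitMT()  # optional: lets each graph also use several threads

evtcounts = []
for _ in range(3):
    df = ROOT.RDataFrame(1000)    # toy dataframe with 1000 empty entries
    evtcounts.append(df.Count())  # booked lazily; no event loop runs here

ROOT.ROOT.RDF.RunGraphs(evtcounts)        # all three loops run concurrently
print([c.GetValue() for c in evtcounts])  # -> [1000, 1000, 1000]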
ROOT.ROOT.RDF.RunGraphs(evtcounts) - LOGGER.info('Event loop done!') - elapsed_time = time.time() - start_time - - LOGGER.info('Writing out output files...') - nevents_tot = 0 - for process, res, hweight, evtcount in zip(processList, results, hweights, evtcounts): - LOGGER.info('Writing out process %s, nEvents processed %s', - process, f'{evtcount.GetValue():,}') - fOut = ROOT.TFile(f"{outputDir}/{process}.root", "RECREATE") - - # get the cross-sections etc. First try locally, then the procDict - if 'crossSection' in processList[process]: - crossSection = processList[process]['crossSection'] - elif process in procDict and 'crossSection' in procDict[process]: - crossSection = procDict[process]['crossSection'] - else: - LOGGER.warning('Can\'t find cross-section for process %s in ' - 'processList or procDict!\nUsing default value of 1' - , process) - crossSection = 1 - - if 'kfactor' in processList[process]: - kfactor = processList[process]['kfactor'] - elif process in procDict and 'kfactor' in procDict[process]: - kfactor = procDict[process]['kfactor'] - else: - kfactor = 1 - - if 'matchingEfficiency' in processList[process]: - matchingEfficiency = processList[process]['matchingEfficiency'] - elif process in procDict and 'matchingEfficiency' in procDict[process]: - matchingEfficiency = procDict[process]['matchingEfficiency'] - else: - matchingEfficiency = 1 - - eventsProcessed = eventsProcessedDict[process] if eventsProcessedDict[process] != 0 else evtcount.GetValue() - scale = crossSection*kfactor*matchingEfficiency/eventsProcessed - - histsToWrite = {} - for r in res: - hist = r.GetValue() - hName = hist.GetName() - if hist.GetName() in histsToWrite: # merge histograms in case histogram exists - histsToWrite[hName].Add(hist) - else: - histsToWrite[hName] = hist - - for hist in histsToWrite.values(): - if doScale: - hist.Scale(scale*intLumi) - hist.Write() - - # write all meta info to the output file - p = ROOT.TParameter(int)("eventsProcessed", eventsProcessed) - p.Write() - p = ROOT.TParameter(float)("sumOfWeights", hweight.GetValue()) - p.Write() - p = ROOT.TParameter(float)("intLumi", intLumi) - p.Write() - p = ROOT.TParameter(float)("crossSection", crossSection) - p.Write() - p = ROOT.TParameter(float)("kfactor", kfactor) - p.Write() - p = ROOT.TParameter(float)("matchingEfficiency", matchingEfficiency) - p.Write() - nevents_tot += evtcount.GetValue() - - info_msg = f"{' SUMMARY ':=^80}\n" - info_msg += 'Elapsed time (H:M:S): ' - info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time)) - info_msg += '\nEvents processed/second: ' - info_msg += f'{int(nevents_tot/elapsed_time):,}' - info_msg += f'\nTotal events processed: {nevents_tot:,}' - info_msg += '\n' - info_msg += 80 * '=' - info_msg += '\n' - LOGGER.info(info_msg) - - -# __________________________________________________________ -def runPlots(analysisFile): - import doPlots as dp - dp.run(analysisFile) - - -def run(parser): - ''' - Set things in motion. - ''' - - args, _ = parser.parse_known_args() - - # Check that the analysis file exists - analysisFile = args.anafile_path - if not os.path.isfile(analysisFile): - LOGGER.error('Analysis script %s not found!\nAborting...', - analysisFile) - sys.exit(3) - - # Load pre compiled analyzers - LOGGER.info('Loading analyzers from libFCCAnalyses...') - ROOT.gSystem.Load("libFCCAnalyses") - # Is this still needed?? 
01/04/2022 still to be the case - _fcc = ROOT.dummyLoader - - # Set verbosity level - if args.verbose: - # ROOT.Experimental.ELogLevel.kInfo verbosity level is more - # equivalent to DEBUG in other log systems - LOGGER.debug('Setting verbosity level "kInfo" for RDataFrame...') - verbosity = ROOT.Experimental.RLogScopedVerbosity( - ROOT.Detail.RDF.RDFLogChannel(), - ROOT.Experimental.ELogLevel.kInfo) - if args.more_verbose: - LOGGER.debug('Setting verbosity level "kDebug" for RDataFrame...') - verbosity = ROOT.Experimental.RLogScopedVerbosity( - ROOT.Detail.RDF.RDFLogChannel(), - ROOT.Experimental.ELogLevel.kDebug) - if args.most_verbose: - LOGGER.debug('Setting verbosity level "kDebug+10" for ' - 'RDataFrame...') - verbosity = ROOT.Experimental.RLogScopedVerbosity( - ROOT.Detail.RDF.RDFLogChannel(), - ROOT.Experimental.ELogLevel.kDebug+10) - - # Load the analysis - analysisFile = os.path.abspath(analysisFile) - LOGGER.info('Loading analysis file:\n%s', analysisFile) - rdfSpec = importlib.util.spec_from_file_location("rdfanalysis", analysisFile) - rdfModule = importlib.util.module_from_spec(rdfSpec) - rdfSpec.loader.exec_module(rdfModule) - - if not hasattr(args, 'command'): - LOGGER.error('Unknow sub-command "%s"!\nAborting...') - sys.exit(3) - - if args.command == "run": - if hasattr(rdfModule, "build_graph") and hasattr(rdfModule, "RDFanalysis"): - LOGGER.error('Analysis file ambiguous!\nBoth "RDFanalysis" ' - 'class and "build_graph" function are defined.') - sys.exit(3) - elif hasattr(rdfModule, "build_graph") and not hasattr(rdfModule, "RDFanalysis"): - runHistmaker(args, rdfModule, analysisFile) - elif not hasattr(rdfModule, "build_graph") and hasattr(rdfModule, "RDFanalysis"): - runStages(args, rdfModule, args.preprocess, analysisFile) - else: - LOGGER.error('Analysis file does not contain required ' - 'objects!\nProvide either "RDFanalysis" class or ' - '"build_graph" function.') - sys.exit(3) - elif args.command == "final": - runFinal(rdfModule) - elif args.command == "plots": - runPlots(analysisFile) diff --git a/python/FCCAnalysisSetup.py b/python/FCCAnalysisSetup.py deleted file mode 100644 index cfcebc5318..0000000000 --- a/python/FCCAnalysisSetup.py +++ /dev/null @@ -1,12 +0,0 @@ -#__________________________________________________________ -def setup(mainparser): - from analysis_builder import setup_analysis - - args, _ = mainparser.parse_known_args() - if args.command == 'init': - setup_analysis(package=args.package, - name=args.name, - author=args.author, - description=args.description, - standalone=args.standalone, - output_dir=args.output_dir) diff --git a/python/Parsers.py b/python/Parsers.py deleted file mode 100644 index ae7257a408..0000000000 --- a/python/Parsers.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse - -def setup_init_parser(parser): - publicOptions = parser.add_argument_group('User options') - publicOptions.add_argument('package', help='name of the analysis package to be built') - publicOptions.add_argument('--name', help='name of the main analysis utility', default='DummyAnalysis') - publicOptions.add_argument('--author', help="author's \"name \" (will use git-config if not specified)") - publicOptions.add_argument('--description', help='analysis package description') - publicOptions.add_argument('--standalone', action='store_true', help="also add CMake directive to build standalone package", default=False) - publicOptions.add_argument('--output-dir', help='output directory where the analysis package will be written') - -def setup_build_parser(parser): 
- publicOptions = parser.add_argument_group('User build options') - publicOptions.add_argument('-c', '--clean-build', - action='store_true', - default=False, - help='do a clean build') - publicOptions.add_argument( - '-j', '--build-threads', - type=int, - default=1, - help='number of threads when building (equivalent to `make -j`)' - ) - - -def setup_test_parser(parser): - ''' - Adds test options - ''' - test_args = parser.add_argument_group('Test options') - test_args.add_argument( - '-R', '--tests-regex', - type=str, - help='Run tests matching regular expression (e.g. run only unit tests ' - 'with "^UT")' - ) - test_args.add_argument( - '-E', '--exclude-regex', - type=str, - help='Exclude tests matching regular expression' - ) - test_args.add_argument( - '-j', '--parallel', - type=int, - default=-1, - help='number of tests running in parallel (equivalent to `ctest -j`)' - ) - - -def setup_pin_parser(parser): - publicOptions = parser.add_argument_group('User pin options') - publicOptions.add_argument('-c', '--clear', - action='store_true', - default=False, - help='clear analysis pin') - publicOptions.add_argument('-f', '--force', - action='store_true', - default=False, - help='force recreate analysis pin') - publicOptions.add_argument('-s', '--show', - action='store_true', - default=False, - help='show pinned stack') - - -def setup_run_parser(parser): - ''' - Define command line arguments for the run subcommand. - ''' - parser.add_argument('anafile_path', - help='path to analysis file') - parser.add_argument('--files-list', default=[], nargs='+', - help='specify input file to bypass the processList') - parser.add_argument('--output', type=str, default='output.root', - help='specify output file name to bypass the processList and or outputList') - parser.add_argument('--nevents', type=int, default=-1, - help='specify max number of events to process') - parser.add_argument('--test', action='store_true', default=False, - help='run over the test file') - parser.add_argument('--bench', action='store_true', default=False, - help='output benchmark results to a JSON file') - parser.add_argument('--ncpus', type=int, default=-1, - help='set number of threads') - parser.add_argument('--final', action='store_true', default=False, - help='run final analysis (produces final histograms and trees)') - parser.add_argument('--plots', action='store_true', default=False, - help='run analysis plots') - parser.add_argument('--preprocess', action='store_true', default=False, - help='run preprocessing') - parser.add_argument('--validate', action='store_true', default=False, - help='validate a given production') - parser.add_argument('--rerunfailed', action='store_true', default=False, - help='rerun failed jobs') - parser.add_argument('--jobdir', type=str, default='output.root', - help='specify the batch job directory') - - # Internal argument, not to be used by the users - parser.add_argument('--batch', action='store_true', default=False, - help=argparse.SUPPRESS) - - -def setup_run_parser_final(parser): - ''' - Define command line arguments for the final subcommand. - ''' - parser.add_argument('anafile_path', - help='path to analysis_final script') - - -def setup_run_parser_plots(parser): - ''' - Define command line arguments for the plots subcommand. 
- ''' - parser.add_argument('anafile_path', help="path to analysis_plots script") diff --git a/python/analysis_builder.py b/python/analysis_builder.py deleted file mode 100644 index 689c84c403..0000000000 --- a/python/analysis_builder.py +++ /dev/null @@ -1,65 +0,0 @@ -import logging - -LOGGER = logging.getLogger('FCCAnalyses.analysis_builder') - -def find_author(): - from subprocess import getoutput - return getoutput('git config --global --get user.name') + ' <' + getoutput('git config --global --get user.email') + '>' - -def replace_all(input: str, repl) -> str: - output = input - for a, b in repl.items(): - output = output.replace(a, b) - return output - -def setup_analysis(package: str, - author: str='', - description: str='', - name: str='', - standalone: bool=False, - output_dir: str=''): - if not author: - author = find_author() - if not description: - description = '[...]' - elif '\n' in description: - raise RuntimeError('Multiline description is not supported. Please edit the output analysis header file directly.') - from subprocess import getoutput - fccanalyses_path = getoutput('git rev-parse --show-toplevel') - replacement_dict = { - '__pkgname__': package, - '__pkgdesc__': description, - '__name__': name, - '__author__': author, - '__fccpath__': fccanalyses_path - } - - if not output_dir: - path = f'{fccanalyses_path}/case-studies/{package}' - else: - path = output_dir - - import os - for p in [path, f'{path}/src', f'{path}/include', f'{path}/scripts']: - try: - os.mkdir(p) - except FileExistsError: - LOGGER.warning('FCCAnalysis package "%s" already exists.', package) - try: - tmpl_dir = os.path.join(fccanalyses_path, 'templates') - with open(f'{path}/src/classes.h', 'w') as f: - f.write(replace_all(open(f'{tmpl_dir}/classes.h', 'r').read(), replacement_dict)) - with open(f'{path}/src/classes_def.xml', 'w') as f: - f.write(replace_all(open(f'{tmpl_dir}/classes_def.xml', 'r').read(), replacement_dict)) - with open(f'{path}/src/{name}.cc', 'w') as f: - f.write(replace_all(open(f'{tmpl_dir}/Package.cc', 'r').read(), replacement_dict)) - with open(f'{path}/include/{name}.h', 'w') as f: - f.write(replace_all(open(f'{tmpl_dir}/Package.h', 'r').read(), replacement_dict)) - with open(f'{path}/scripts/analysis_cfg.py', 'w') as f: - f.write(replace_all(open(f'{tmpl_dir}/analysis_cfg.py', 'r').read(), replacement_dict)) - if standalone: - with open(f'{path}/CMakeLists.txt', 'w') as f: - f.write(replace_all(open(f'{tmpl_dir}/CMakeLists.txt', 'r').read(), replacement_dict)) - except OSError as error: - LOGGER.error('FCCAnalysis package "%s" creation error:\n%s', - package, error) diff --git a/python/anafile.py b/python/anascript.py similarity index 92% rename from python/anafile.py rename to python/anascript.py index 34b319069b..dc7ed2cbd3 100644 --- a/python/anafile.py +++ b/python/anascript.py @@ -7,15 +7,15 @@ import logging -LOGGER = logging.getLogger('FCCAnalyses.run') +LOGGER: logging.Logger = logging.getLogger('FCCAnalyses.run') -def getElement(rdfModule, element, isFinal=False): +def get_element(rdf_module, element: str, is_final: bool = False): ''' Pick up the attribute from the analysis file. 
''' try: - return getattr(rdfModule, element) + return getattr(rdf_module, element) except AttributeError: # return default values or crash if mandatory @@ -27,7 +27,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'analysers': LOGGER.error('The function <%s> is mandatory in your analysis ' 'script!.\nAborting...', element) - if isFinal: + if is_final: LOGGER.error('The function <%s> is not part of the final ' 'stage of the analysis!', element) sys.exit(3) @@ -35,7 +35,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'output': LOGGER.error('The function <%s> is mandatory in your analysis ' 'script.\nAborting...', element) - if isFinal: + if is_final: LOGGER.error('The function <%s> is not part of the final ' 'stage of the analysis!', element) sys.exit(3) @@ -53,7 +53,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'runBatch': LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning default value: False') - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return False @@ -68,7 +68,7 @@ def getElement(rdfModule, element, isFinal=False): LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning default value: "workday"', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return 'workday' @@ -77,7 +77,7 @@ def getElement(rdfModule, element, isFinal=False): LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning default value: ' '"group_u_FCC.local_gen"', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return 'group_u_FCC.local_gen' @@ -85,7 +85,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'outputDirEos': LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning empty string.', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return '' @@ -94,7 +94,7 @@ def getElement(rdfModule, element, isFinal=False): LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning default value: "eospublic"', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return 'eospublic' @@ -102,7 +102,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'userBatchConfig': LOGGER.debug('The variable <%s> is optional in your your analysis ' 'script.\nReturning empty string.', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return '' @@ -115,13 +115,13 @@ def getElement(rdfModule, element, isFinal=False): LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning default test file:\n\t%s', element, test_file_path) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return test_file_path elif element == 'procDict': - if isFinal: + if is_final: LOGGER.error('The variable <%s> is mandatory in the final ' 'stage of the analysis.\nAborting...', element) sys.exit(3) @@ -129,7 +129,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'cutList': - if 
isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in your final ' 'analysis script.\nReturning empty dictionary.', element) @@ -138,7 +138,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'defineList': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in your final ' 'analysis script.\nReturning empty dictionary.', element) @@ -147,7 +147,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'histoList': - if isFinal: + if is_final: LOGGER.error('The variable <%s> is mandatory in the final ' 'stage of the analysis.\nAborting...', element) sys.exit(3) @@ -155,7 +155,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'doTree': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in your final ' 'analysis script.\nReturning default value: ' 'False', @@ -165,7 +165,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'procDictAdd': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in your final ' 'analysis script.\nReturning empty dictionary.', element) @@ -174,7 +174,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'doScale': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in the analysis ' 'final step/histmaker.\nBy default no scaling is ' 'applied.', element) @@ -183,7 +183,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'intLumi': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in the analysis ' 'final step/histmaker.\nUsing the default value: ' '1', element) @@ -192,7 +192,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'saveTabular': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in your final ' 'analysis script.\nReturning empty dictionary.', element) @@ -201,7 +201,7 @@ def getElement(rdfModule, element, isFinal=False): 'stages of the analysis', element) elif element == 'cutLabels': - if isFinal: + if is_final: LOGGER.debug('The variable <%s> is optional in your final ' 'analysis script.\nReturning empty dictionary.', element) @@ -212,7 +212,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'geometryFile': LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning empty string.', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return '' @@ -220,7 +220,7 @@ def getElement(rdfModule, element, isFinal=False): elif element == 'readoutName': LOGGER.debug('The variable <%s> is optional in your analysis ' 'script.\nReturning empty string.', element) - if isFinal: + if is_final: LOGGER.debug('The option <%s> is not available in the final ' 'stage of the analysis.', element) return '' @@ -228,7 +228,7 @@ def getElement(rdfModule, element, isFinal=False): return None -def getElementDict(_dict, element): +def get_element_dict(_dict, element: str): ''' Returns None if the key is not found in the dictionary. 
''' diff --git a/python/anascript.pyi b/python/anascript.pyi new file mode 100644 index 0000000000..befa2f0a28 --- /dev/null +++ b/python/anascript.pyi @@ -0,0 +1,8 @@ +# generated with `stubgen anascript.py` + +import logging + +LOGGER: logging.Logger + +def get_element(rdf_module, element: str, is_final: bool = False): ... +def get_element_dict(_dict, element: str): ... diff --git a/python/build_analysis.py b/python/build_analysis.py index 98498463b5..9eefaf0bce 100644 --- a/python/build_analysis.py +++ b/python/build_analysis.py @@ -13,19 +13,19 @@ LOGGER = logging.getLogger('FCCAnalyses.build') -def run_subprocess(command, run_dir): +def run_subprocess(command: str, run_dir: str): ''' Run subprocess in specified directory. Check only the return value, otherwise keep the subprocess connected to stdin/stout/stderr. ''' try: - proc = subprocess.Popen(command, cwd=run_dir) - status = proc.wait() + with subprocess.Popen(command, cwd=run_dir) as proc: + status: int = proc.wait() - if status != 0: - LOGGER.error('Error encountered!\nAborting...') - sys.exit(3) + if status != 0: + LOGGER.error('Error encountered!\nAborting...') + sys.exit(3) except KeyboardInterrupt: LOGGER.error('Aborting...') @@ -67,5 +67,5 @@ def build_analysis(mainparser): LOGGER.info('Creating install directory...') os.makedirs(install_path) - run_subprocess(['make', '-j{}'.format(args.build_threads), 'install'], + run_subprocess(['make', f'-j{args.build_threads}', 'install'], local_dir + '/build') diff --git a/python/doPlots.py b/python/doPlots.py deleted file mode 100644 index ee4052431f..0000000000 --- a/python/doPlots.py +++ /dev/null @@ -1,712 +0,0 @@ -#!/usr/bin/env python -''' -Create plots out of the histograms produced in previous stages -''' -import sys -import os -import os.path -import ntpath -import importlib -import copy -import re -import logging -import ROOT - -ROOT.gROOT.SetBatch(True) -ROOT.gStyle.SetOptStat(0) -ROOT.gStyle.SetOptTitle(0) - - -LOGGER = logging.getLogger('FCCAnalyses.plot') - -# __________________________________________________________ -def removekey(d, key): - r = dict(d) - del r[key] - return r - -def sortedDictValues(dic): - keys = sorted(dic) - return [dic[key] for key in keys] - -def formatStatUncHist(hists, name, hstyle=3254): - hTot = hists[0].Clone(name + "_unc") - for h in hists[1:]: - hTot.Add(h) - hTot.SetFillColor(ROOT.kBlack) - hTot.SetMarkerSize(0) - hTot.SetLineWidth(0) - hTot.SetFillStyle(hstyle) - return hTot - -#__________________________________________________________ -def mapHistos(var, label, sel, param, rebin): - LOGGER.info('Run plots for var:%s label:%s selection:%s', - var, label, sel) - signal=param.plots[label]['signal'] - backgrounds=param.plots[label]['backgrounds'] - - hsignal = {} - for s in signal: - hsignal[s]=[] - for f in signal[s]: - fin=param.inputDir+f+'_'+sel+'_histo.root' - if not os.path.isfile(fin): - LOGGER.info('File %s does not exist!\nSkipping it...', fin) - else: - tf=ROOT.TFile(fin) - h=tf.Get(var) - hh = copy.deepcopy(h) - scaleSig=1. 
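Note the resource handling here: this deleted version keeps bare ROOT.TFile handles open while histograms are deep-copied out of them. The rewritten do_plots.py further below opens every file in a with context and detaches each histogram before the file closes. A minimal self-contained sketch of that pattern (file and histogram names are made up for illustration):

import ROOT

# create a throwaway input file so the sketch runs on its own
with ROOT.TFile('histos.root', 'RECREATE') as tf:
    h = ROOT.TH1D('hvar', '', 10, 0., 1.)
    h.Write()

with ROOT.TFile('histos.root', 'READ') as tf:
    hist = tf.Get('hvar')
    hist.SetDirectory(0)  # detach, so the histogram survives the file closing
hist.Scale(2.)            # still usable after the with block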
- try: - scaleSig=param.scaleSig - except AttributeError: - LOGGER.info('No scale signal, using 1.') - param.scaleSig=scaleSig - LOGGER.info('ScaleSig: %g', scaleSig) - hh.Scale(param.intLumi*scaleSig) - hh.Rebin(rebin) - - if len(hsignal[s])==0: - hsignal[s].append(hh) - else: - hh.Add(hsignal[s][0]) - hsignal[s][0]=hh - - - hbackgrounds = {} - for b in backgrounds: - hbackgrounds[b]=[] - for f in backgrounds[b]: - fin=param.inputDir+f+'_'+sel+'_histo.root' - if not os.path.isfile(fin): - LOGGER.info('File %s does not exist!\nSkipping it...', fin) - else: - tf=ROOT.TFile(fin) - h=tf.Get(var) - hh = copy.deepcopy(h) - hh.Scale(param.intLumi) - hh.Rebin(rebin) - if len(hbackgrounds[b])==0: - hbackgrounds[b].append(hh) - else: - hh.Add(hbackgrounds[b][0]) - hbackgrounds[b][0]=hh - - for s in hsignal: - if len(hsignal[s])==0: - hsignal=removekey(hsignal,s) - - for b in hbackgrounds: - if len(hbackgrounds[b])==0: - hbackgrounds=removekey(hbackgrounds,b) - - return hsignal,hbackgrounds - - -#__________________________________________________________ -def mapHistosFromHistmaker(hName, param, plotCfg): - rebin = plotCfg['rebin'] if 'rebin' in plotCfg else 1 - LOGGER('Get histograms for %s', hName) - signal=param.procs['signal'] - backgrounds=param.procs['backgrounds'] - scaleSig = plotCfg['scaleSig'] if 'scaleSig' in plotCfg else 1 - - hsignal = {} - for s in signal: - hsignal[s]=[] - for f in signal[s]: - fin=f"{param.inputDir}/{f}.root" - if not os.path.isfile(fin): - LOGGER.info('File %s does not exist!\nSkipping it...', fin) - else: - tf=ROOT.TFile(fin) - h=tf.Get(hName) - hh = copy.deepcopy(h) - LOGGER.info('ScaleSig: ', scaleSig) - hh.Scale(param.intLumi*scaleSig) - hh.Rebin(rebin) - if len(hsignal[s])==0: - hsignal[s].append(hh) - else: - hh.Add(hsignal[s][0]) - hsignal[s][0]=hh - - - hbackgrounds = {} - for b in backgrounds: - hbackgrounds[b]=[] - for f in backgrounds[b]: - fin=f"{param.inputDir}/{f}.root" - if not os.path.isfile(fin): - LOGGER.info('File %s does not exist!\nSkipping it...', fin) - else: - tf=ROOT.TFile(fin) - h=tf.Get(hName) - hh = copy.deepcopy(h) - hh.Scale(param.intLumi) - hh.Rebin(rebin) - if len(hbackgrounds[b])==0: - hbackgrounds[b].append(hh) - else: - hh.Add(hbackgrounds[b][0]) - hbackgrounds[b][0]=hh - - for s in hsignal: - if len(hsignal[s])==0: - hsignal=removekey(hsignal,s) - - for b in hbackgrounds: - if len(hbackgrounds[b])==0: - hbackgrounds=removekey(hbackgrounds,b) - - return hsignal,hbackgrounds - -#__________________________________________________________ -def runPlots(var,sel,param,hsignal,hbackgrounds,extralab,splitLeg,plotStatUnc): - - ###Below are settings for separate signal and background legends - if(splitLeg): - legsize = 0.04*(len(hsignal)) - legsize2 = 0.04*(len(hbackgrounds)) - legCoord = [0.15,0.60 - legsize,0.50,0.62] - leg2 = ROOT.TLegend(0.60,0.60 - legsize2,0.88,0.62) - leg2.SetFillColor(0) - leg2.SetFillStyle(0) - leg2.SetLineColor(0) - leg2.SetShadowColor(10) - leg2.SetTextSize(0.035) - leg2.SetTextFont(42) - else: - legsize = 0.04*(len(hbackgrounds)+len(hsignal)) - legCoord=[0.68, 0.86-legsize, 0.96, 0.88] - try: - legCoord=param.legendCoord - except AttributeError: - LOGGER.info('No legCoord, using default one...') - legCoord=[0.68, 0.86-legsize, 0.96, 0.88] - leg2 = None - - leg = ROOT.TLegend(legCoord[0],legCoord[1],legCoord[2],legCoord[3]) - leg.SetFillColor(0) - leg.SetFillStyle(0) - leg.SetLineColor(0) - leg.SetShadowColor(10) - leg.SetTextSize(0.035) - leg.SetTextFont(42) - - for b in hbackgrounds: - if(splitLeg): - 
leg2.AddEntry(hbackgrounds[b][0],param.legend[b],"f") - else: - leg.AddEntry(hbackgrounds[b][0],param.legend[b],"f") - for s in hsignal: - leg.AddEntry(hsignal[s][0],param.legend[s],"l") - - - yields={} - for s in hsignal: - yields[s]=[param.legend[s],hsignal[s][0].Integral(0,-1), hsignal[s][0].GetEntries()] - for b in hbackgrounds: - yields[b]=[param.legend[b],hbackgrounds[b][0].Integral(0,-1), hbackgrounds[b][0].GetEntries()] - - histos=[] - colors=[] - - nsig=len(hsignal) - nbkg=len(hbackgrounds) - - for s in hsignal: - histos.append(hsignal[s][0]) - colors.append(param.colors[s]) - - for b in hbackgrounds: - histos.append(hbackgrounds[b][0]) - colors.append(param.colors[b]) - - intLumiab = param.intLumi/1e+06 - intLumi = f'L = {intLumiab:.0f} ab^{{-1}}' - if hasattr(param, "intLumiLabel"): - intLumi = getattr(param, "intLumiLabel") - - lt = "FCCAnalyses: FCC-hh Simulation (Delphes)" - rt = "#sqrt{{s}} = {:.1f} TeV, L = {}".format(param.energy,intLumi) - - if 'ee' in param.collider: - lt = "FCCAnalyses: FCC-ee Simulation (Delphes)" - rt = "#sqrt{{s}} = {:.1f} GeV, {}".format(param.energy,intLumi) - - - - customLabel="" - try: - customLabel=param.customLabel - except AttributeError: - LOGGER.info('No customLable, using nothing...') - - scaleSig=1. - try: - scaleSig=param.scaleSig - except AttributeError: - LOGGER('No scale signal, using 1.') - param.scaleSig=scaleSig - - if 'AAAyields' in var: - drawStack(var, 'events', leg, lt, rt, param.formats, param.outdir+"/"+sel, False , True , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc) - return - - if 'stack' in param.stacksig: - if 'lin' in param.yaxis: - drawStack(var+"_stack_lin", 'events', leg, lt, rt, param.formats, param.outdir+"/"+sel, False , True , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc) - if 'log' in param.yaxis: - drawStack(var+"_stack_log", 'events', leg, lt, rt, param.formats, param.outdir+"/"+sel, True , True , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc) - if 'lin' not in param.yaxis and 'log' not in param.yaxis: - LOGGER.info('Unrecognised option in formats, should be ' - '[\'lin\',\'log\']') - - if 'nostack' in param.stacksig: - if 'lin' in param.yaxis: - drawStack(var+"_nostack_lin", 'events', leg, lt, rt, param.formats, param.outdir+"/"+sel, False , False , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc) - if 'log' in param.yaxis: - drawStack(var+"_nostack_log", 'events', leg, lt, rt, param.formats, param.outdir+"/"+sel, True , False , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc) - if 'lin' not in param.yaxis and 'log' not in param.yaxis: - LOGGER.info('Unrecognised option in formats, should be ' - '[\'lin\',\'log\']') - if 'stack' not in param.stacksig and 'nostack' not in param.stacksig: - LOGGER.info('Unrecognised option in stacksig, should be ' - '[\'stack\',\'nostack\']') - -#__________________________________________________________ -def runPlotsHistmaker(hName, param, plotCfg): - - output = plotCfg['output'] - hsignal,hbackgrounds=mapHistosFromHistmaker(hName, param, plotCfg) - - if hasattr(param, "splitLeg"): - splitLeg = param.splitLeg - else: - splitLeg = False - - if hasattr(param, "plotStatUnc"): - plotStatUnc = param.plotStatUnc - else: - plotStatUnc = False - - ###Below are settings for separate signal and background legends - 
if(splitLeg): - legsize = 0.04*(len(hsignal)) - legsize2 = 0.04*(len(hbackgrounds)) - legCoord = [0.15,0.60 - legsize,0.50,0.62] - leg2 = ROOT.TLegend(0.60,0.60 - legsize2,0.88,0.62) - leg2.SetFillColor(0) - leg2.SetFillStyle(0) - leg2.SetLineColor(0) - leg2.SetShadowColor(10) - leg2.SetTextSize(0.035) - leg2.SetTextFont(42) - else: - legsize = 0.04*(len(hbackgrounds)+len(hsignal)) - legCoord=[0.68, 0.86-legsize, 0.96, 0.88] - try: - legCoord=param.legendCoord - except AttributeError: - LOGGER.info('No legCoord, using default one...') - legCoord=[0.68, 0.86-legsize, 0.96, 0.88] - leg2 = None - - leg = ROOT.TLegend(legCoord[0],legCoord[1],legCoord[2],legCoord[3]) - leg.SetFillColor(0) - leg.SetFillStyle(0) - leg.SetLineColor(0) - leg.SetShadowColor(10) - leg.SetTextSize(0.035) - leg.SetTextFont(42) - - for b in hbackgrounds: - if(splitLeg): - leg2.AddEntry(hbackgrounds[b][0],param.legend[b],"f") - else: - leg.AddEntry(hbackgrounds[b][0],param.legend[b],"f") - for s in hsignal: - leg.AddEntry(hsignal[s][0],param.legend[s],"l") - - - yields={} - for s in hsignal: - yields[s]=[param.legend[s],hsignal[s][0].Integral(0,-1), hsignal[s][0].GetEntries()] - for b in hbackgrounds: - yields[b]=[param.legend[b],hbackgrounds[b][0].Integral(0,-1), hbackgrounds[b][0].GetEntries()] - - histos=[] - colors=[] - - nsig=len(hsignal) - nbkg=len(hbackgrounds) - - for s in hsignal: - histos.append(hsignal[s][0]) - colors.append(param.colors[s]) - - for b in hbackgrounds: - histos.append(hbackgrounds[b][0]) - colors.append(param.colors[b]) - - xtitle = plotCfg['xtitle'] if 'xtitle' in plotCfg else "" - ytitle = plotCfg['ytitle'] if 'ytitle' in plotCfg else "Events" - xmin = plotCfg['xmin'] if 'xmin' in plotCfg else -1 - xmax = plotCfg['xmax'] if 'xmax' in plotCfg else -1 - ymin = plotCfg['ymin'] if 'ymin' in plotCfg else -1 - ymax = plotCfg['ymax'] if 'ymax' in plotCfg else -1 - stack = plotCfg['stack'] if 'stack' in plotCfg else False - logy = plotCfg['logy'] if 'logy' in plotCfg else False - extralab = plotCfg['extralab'] if 'extralab' in plotCfg else "" - scaleSig = plotCfg['scaleSig'] if 'scaleSig' in plotCfg else 1 - - - intLumiab = param.intLumi/1e+06 - intLumi = f'L = {intLumiab:.0f} ab^{{-1}}' - if hasattr(param, "intLumiLabel"): - intLumi = getattr(param, "intLumiLabel") - - lt = "FCCAnalyses: FCC-hh Simulation (Delphes)" - rt = "#sqrt{{s}} = {:.1f} TeV, L = {}".format(param.energy,intLumi) - - if 'ee' in param.collider: - lt = "FCCAnalyses: FCC-ee Simulation (Delphes)" - rt = "#sqrt{{s}} = {:.1f} GeV, {}".format(param.energy,intLumi) - - customLabel="" - try: - customLabel=param.customLabel - except AttributeError: - LOGGER.info('No customLable, using nothing...') - - - - if stack: - if logy: - drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir, True , True , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, xtitle=xtitle) - else: - drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir, False , True , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, xtitle=xtitle) - - else: - if logy: - drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir, True , False , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, xtitle=xtitle) - else: - drawStack(output, ytitle, leg, lt, rt, 
param.formats, param.outdir, False , False , histos, colors, param.ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, leg2, yields, plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, xtitle=xtitle) - - - - -#_____________________________________________________________________________________________________________ -def drawStack(name, ylabel, legend, leftText, rightText, formats, directory, logY, stacksig, histos, colors, ana_tex, extralab, scaleSig, customLabel, nsig, nbkg, legend2=None, yields=None, plotStatUnc=False, xmin=-1, xmax=-1, ymin=-1, ymax=-1, xtitle="", ytitle=""): - - canvas = ROOT.TCanvas(name, name, 800, 800) - canvas.SetLogy(logY) - canvas.SetTicks(1,1) - canvas.SetLeftMargin(0.14) - canvas.SetRightMargin(0.08) - - sumhistos = histos[0].Clone() - iterh = iter(histos) - next(iterh) - - unit = 'GeV' - if 'TeV' in str(histos[0].GetXaxis().GetTitle()): - unit = 'TeV' - - if unit in str(histos[0].GetXaxis().GetTitle()): - bwidth=sumhistos.GetBinWidth(1) - if bwidth.is_integer(): - ylabel+=' / {} {}'.format(int(bwidth), unit) - else: - ylabel+=' / {:.2f} {}'.format(bwidth, unit) - - - nbins = 1 if not isinstance(xtitle, list) else len(xtitle) - h_dummy = ROOT.TH1D("h_dummy", "", nbins, 0, nbins) - if nbins == 1: - h_dummy.GetXaxis().SetTitle(histos[0].GetXaxis().GetTitle() if xtitle == "" else xtitle) - h_dummy.GetYaxis().SetTitleOffset(1.95) - h_dummy.GetXaxis().SetTitleOffset(1.2*h_dummy.GetXaxis().GetTitleOffset()) - else: # for cutflow plots - for i,label in enumerate(xtitle): h_dummy.GetXaxis().SetBinLabel(i+1, label) - h_dummy.GetXaxis().LabelsOption("u") - h_dummy.GetXaxis().SetLabelSize(1.1*h_dummy.GetXaxis().GetLabelSize()) - h_dummy.GetXaxis().SetLabelOffset(1.5*h_dummy.GetXaxis().GetLabelOffset()) - h_dummy.GetYaxis().SetTitle(ylabel) - - - for h in iterh: - sumhistos.Add(h) - - maxh = sumhistos.GetMaximum() - minh = sumhistos.GetMinimum() - - if logY: - canvas.SetLogy(1) - - # define stacked histo - hStack = ROOT.THStack("hstack", "") - hStackBkg = ROOT.THStack("hstackbkg", "") - hStackSig = ROOT.THStack("hstacksig","") - BgMCHistYieldsDic = {} - - # first plot backgrounds (sorted by the yields) - for i in range(nsig, nsig+nbkg): - h = histos[i] - h.SetLineWidth(1) - h.SetLineColor(ROOT.kBlack) - h.SetFillColor(colors[i]) - if h.Integral() > 0: - BgMCHistYieldsDic[h.Integral()] = h - else: - BgMCHistYieldsDic[-1*nbkg] = h - # sort stack by yields (smallest to largest) - BgMCHistYieldsDic = sortedDictValues(BgMCHistYieldsDic) - for h in BgMCHistYieldsDic: - hStack.Add(h) - hStackBkg.Add(h) - - # add the signal histograms - for i in range(nsig): - h = histos[i] - h.SetLineWidth(3) - h.SetLineColor(colors[i]) - hStack.Add(h) - hStackSig.Add(h) - - - if xmin != -1 and xmax != -1: - h_dummy.GetXaxis().SetLimits(xmin, xmax) - - h_dummy.Draw("HIST") - if stacksig: - hStack.Draw("HIST SAME") - if plotStatUnc: - hUnc_sig_bkg = formatStatUncHist(hStack.GetHists(), "sig_bkg") # sig+bkg uncertainty - hUnc_sig_bkg.Draw("E2 SAME") - else: - hStackBkg.Draw("HIST SAME") - hStackSig.Draw("HIST SAME NOSTACK") - if plotStatUnc: - hUnc_bkg = formatStatUncHist(hStackBkg.GetHists(), "bkg_only") # bkg-only uncertainty - hUnc_bkg.Draw("E2 SAME") - for sHist in hStackSig.GetHists(): - hUnc_sig = formatStatUncHist([sHist], "sig", 3245) # sigs uncertainty - hUnc_sig.Draw("E2 SAME") - - # x limits - if xmin == -1: - h_tmp = hStack.GetStack().Last() - xmin = h_tmp.GetBinLowEdge(1) - if xmax == -1: - h_tmp = hStack.GetStack().Last() - xmax = h_tmp.GetBinLowEdge(h_tmp.GetNbinsX()+1) - 
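The pair of lookups just above (retained verbatim in the new do_plots.py) relies on the fact that in ROOT GetBinLowEdge(1) is the axis minimum and GetBinLowEdge(GetNbinsX()+1), the lower edge of the overflow bin, is the axis maximum; the SetLimits call that follows then applies the recovered range. A tiny sketch:

import ROOT

h = ROOT.TH1D('h', '', 50, -2., 3.)
print(h.GetBinLowEdge(1))                  # -> -2.0 (axis minimum)
print(h.GetBinLowEdge(h.GetNbinsX() + 1))  # -> 3.0  (axis maximum)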
h_dummy.GetXaxis().SetLimits(xmin, xmax) - - # y limits - def getMinMaxRange(hists, xmin, xmax): - hTot = hists[0].Clone(name + "_unc") - for h in hists[1:]: hTot.Add(h) - vals = [] - for i in range(0, hTot.GetNbinsX()+1): - if hTot.GetBinLowEdge(i) > xmin or hTot.GetBinLowEdge(i+1) < xmax: - if hTot.GetBinContent(i) != 0: - vals.append(hTot.GetBinContent(i)) - if len(vals) == 0: - return 1e-5, 1 - return min(vals), max(vals) - - if stacksig: - ymin_, ymax_ = getMinMaxRange(hStack.GetHists(), xmin, xmax) - else: - yminSig, ymaxSig = getMinMaxRange(hStackSig.GetHists(), xmin, xmax) - yminBkg, ymaxBkg = getMinMaxRange(hStackBkg.GetHists(), xmin, xmax) - ymin_ = min(yminSig, yminBkg) - ymax_ = max(ymaxSig, ymaxBkg) - if ymin == -1: - ymin = ymin_*0.1 if logY else 0 - if ymax == -1: - ymax = ymax_*1000. if logY else 1.4*ymax_ - if ymin <= 0 and logY: - LOGGER.error('Log scale can\'t start at: %i', ymin) - sys.exit(3) - h_dummy.SetMaximum(ymax) - h_dummy.SetMinimum(ymin) - - legend.Draw() - if legend2 != None: - legend2.Draw() - - Text = ROOT.TLatex() - Text.SetNDC() - Text.SetTextAlign(31); - Text.SetTextSize(0.04) - - text = '#it{' + leftText +'}' - Text.DrawLatex(0.90, 0.94, text) - - text = '#it{'+customLabel+'}' - Text.SetTextAlign(12); - Text.SetNDC(ROOT.kTRUE) - Text.SetTextSize(0.04) - Text.DrawLatex(0.18, 0.85, text) - - rightText = re.split(",", rightText) - text = '#bf{#it{' + rightText[0] +'}}' - - Text.SetTextAlign(12); - Text.SetNDC(ROOT.kTRUE) - Text.SetTextSize(0.04) - Text.DrawLatex(0.18, 0.81, text) - - rightText[1]=rightText[1].replace(" ","") - text = '#bf{#it{' + rightText[1] +'}}' - Text.SetTextSize(0.035) - Text.DrawLatex(0.18, 0.76, text) - - text = '#bf{#it{' + ana_tex +'}}' - Text.SetTextSize(0.04) - Text.DrawLatex(0.18, 0.71, text) - - text = '#bf{#it{' + extralab +'}}' - Text.SetTextSize(0.025) - Text.DrawLatex(0.18, 0.66, text) - - text = '#bf{#it{' + 'Signal scale=' + str(scaleSig)+'}}' - Text.SetTextSize(0.025) - if scaleSig!=1:Text.DrawLatex(0.18, 0.63, text) - - canvas.RedrawAxis() - canvas.GetFrame().SetBorderSize( 12 ) - canvas.Modified() - canvas.Update() - - if 'AAAyields' in name: - dummyh=ROOT.TH1F("","",1,0,1) - dummyh.SetStats(0) - dummyh.GetXaxis().SetLabelOffset(999) - dummyh.GetXaxis().SetLabelSize(0) - dummyh.GetYaxis().SetLabelOffset(999) - dummyh.GetYaxis().SetLabelSize(0) - dummyh.Draw("AH") - legend.Draw() - - Text.SetNDC() - Text.SetTextAlign(31); - Text.SetTextSize(0.04) - - text = '#it{' + leftText +'}' - Text.DrawLatex(0.90, 0.92, text) - - text = '#bf{#it{' + rightText[0] +'}}' - Text.SetTextAlign(12); - Text.SetNDC(ROOT.kTRUE) - Text.SetTextSize(0.04) - Text.DrawLatex(0.18, 0.83, text) - - text = '#bf{#it{' + rightText[1] +'}}' - Text.SetTextSize(0.035) - Text.DrawLatex(0.18, 0.78, text) - - text = '#bf{#it{' + ana_tex +'}}' - Text.SetTextSize(0.04) - Text.DrawLatex(0.18, 0.73, text) - - text = '#bf{#it{' + extralab +'}}' - Text.SetTextSize(0.025) - Text.DrawLatex(0.18, 0.68, text) - - text = '#bf{#it{' + 'Signal scale=' + str(scaleSig)+'}}' - Text.SetTextSize(0.04) - Text.DrawLatex(0.18, 0.55, text) - - dy=0 - text = '#bf{#it{' + 'Process' +'}}' - Text.SetTextSize(0.035) - Text.DrawLatex(0.18, 0.45, text) - - text = '#bf{#it{' + 'Yields' +'}}' - Text.SetTextSize(0.035) - Text.DrawLatex(0.5, 0.45, text) - - text = '#bf{#it{' + 'Raw MC' +'}}' - Text.SetTextSize(0.035) - Text.DrawLatex(0.75, 0.45, text) - - for y in yields: - text = '#bf{#it{' + yields[y][0] +'}}' - Text.SetTextSize(0.035) - Text.DrawLatex(0.18, 0.4-dy*0.05, text) - - 
stry=str(yields[y][1])
-            stry=stry.split('.')[0]
-            text = '#bf{#it{' + stry +'}}'
-            Text.SetTextSize(0.035)
-            Text.DrawLatex(0.5, 0.4-dy*0.05, text)
-
-            stry=str(yields[y][2])
-            stry=stry.split('.')[0]
-            text = '#bf{#it{' + stry +'}}'
-            Text.SetTextSize(0.035)
-            Text.DrawLatex(0.75, 0.4-dy*0.05, text)
-
-
-            dy+=1
-        #canvas.Modified()
-        #canvas.Update()
-
-
-    printCanvas(canvas, name, formats, directory)
-
-
-
-
-#____________________________________________________
-def printCanvas(canvas, name, formats, directory):
-
-    if format != "":
-        if not os.path.exists(directory) :
-            os.system("mkdir -p "+directory)
-        for f in formats:
-            outFile = os.path.join(directory, name) + "." + f
-            canvas.SaveAs(outFile)
-
-
-
-#__________________________________________________________
-def run(paramFile):
-    ROOT.gROOT.SetBatch(True)
-    ROOT.gErrorIgnoreLevel = ROOT.kWarning
-
-    module_path = os.path.abspath(paramFile)
-    module_dir = os.path.dirname(module_path)
-    base_name = os.path.splitext(ntpath.basename(paramFile))[0]
-
-    sys.path.insert(0, module_dir)
-    param = importlib.import_module(base_name)
-
-    if hasattr(param, "splitLeg"):
-        splitLeg = param.splitLeg
-    else:
-        splitLeg = False
-
-    if hasattr(param, "plotStatUnc"):
-        plotStatUnc = param.plotStatUnc
-    else:
-        plotStatUnc = False
-
-    if hasattr(param, "hists"):
-        for hName,plotCfg in param.hists.items():
-            runPlotsHistmaker(hName, param, plotCfg)
-        quit()
-
-    counter=0
-    for iVar,var in enumerate(param.variables):
-        for label, sels in param.selections.items():
-            for sel in sels:
-                hsignal,hbackgrounds=mapHistos(var,label,sel, param, rebin=param.rebin[iVar] if hasattr(param, "rebin") and len(param.rebin) == len(param.variables) else 1)
-                runPlots(var+"_"+label,sel,param,hsignal,hbackgrounds,param.extralabel[sel],splitLeg,plotStatUnc)
-                if counter==0: runPlots("AAAyields_"+label,sel,param,hsignal,hbackgrounds,param.extralabel[sel],splitLeg,plotStatUnc)
-                counter+=1
diff --git a/python/do_plots.py b/python/do_plots.py
new file mode 100644
index 0000000000..927d2280c0
--- /dev/null
+++ b/python/do_plots.py
@@ -0,0 +1,807 @@
+#!/usr/bin/env python
+'''
+Create plots out of the histograms produced in previous stages
+'''
+import sys
+import os
+import os.path
+import ntpath
+import importlib
+import copy
+import re
+import logging
+import ROOT  # type: ignore
+
+ROOT.gROOT.SetBatch(True)
+ROOT.gStyle.SetOptStat(0)
+ROOT.gStyle.SetOptTitle(0)
+
+
+LOGGER = logging.getLogger('FCCAnalyses.plot')
+
+
+# _____________________________________________________________________________
+def removekey(d: dict, key: str) -> dict:
+    '''
+    Remove dictionary element.
+    '''
+    r = dict(d)
+    del r[key]
+    return r
+
+
+def sorted_dict_values(dic: dict) -> list:
+    '''
+    Sort values in the dictionary.
+ ''' + keys = sorted(dic) + return [dic[key] for key in keys] + + +def formatStatUncHist(hists, name, hstyle=3254): + hist_tot = hists[0].Clone(name + "_unc") + for h in hists[1:]: + hist_tot.Add(h) + hist_tot.SetFillColor(ROOT.kBlack) + hist_tot.SetMarkerSize(0) + hist_tot.SetLineWidth(0) + hist_tot.SetFillStyle(hstyle) + return hist_tot + + +# _____________________________________________________________________________ +def mapHistos(var, label, sel, param, rebin): + LOGGER.info('Run plots for var:%s label:%s selection:%s', + var, label, sel) + signal = param.plots[label]['signal'] + backgrounds = param.plots[label]['backgrounds'] + + hsignal = {} + for s in signal: + hsignal[s] = [] + for f in signal[s]: + fin = param.inputDir+f+'_'+sel+'_histo.root' + if not os.path.isfile(fin): + LOGGER.info('File "%s" not found!\nSkipping it...', fin) + continue + + with ROOT.TFile(fin, 'READ') as tf: + h = tf.Get(var) + hh = copy.deepcopy(h) + hh.SetDirectory(0) + scaleSig = 1. + try: + scaleSig = param.scaleSig + except AttributeError: + LOGGER.debug('No scale signal, using 1.') + param.scaleSig = scaleSig + LOGGER.info('ScaleSig: %g', scaleSig) + hh.Scale(param.intLumi*scaleSig) + hh.Rebin(rebin) + + if len(hsignal[s]) == 0: + hsignal[s].append(hh) + else: + hh.Add(hsignal[s][0]) + hsignal[s][0] = hh + + hbackgrounds = {} + for b in backgrounds: + hbackgrounds[b] = [] + for f in backgrounds[b]: + fin = param.inputDir+f+'_'+sel+'_histo.root' + if not os.path.isfile(fin): + LOGGER.info('File "%s" not found!\nSkipping it...', fin) + continue + + with ROOT.TFile(fin) as tf: + h = tf.Get(var) + hh = copy.deepcopy(h) + hh.SetDirectory(0) + hh.Scale(param.intLumi) + hh.Rebin(rebin) + if len(hbackgrounds[b]) == 0: + hbackgrounds[b].append(hh) + else: + hh.Add(hbackgrounds[b][0]) + hbackgrounds[b][0] = hh + + for s in hsignal: + if len(hsignal[s]) == 0: + hsignal = removekey(hsignal, s) + + for b in hbackgrounds: + if len(hbackgrounds[b]) == 0: + hbackgrounds = removekey(hbackgrounds, b) + + if not hsignal: + LOGGER.error('No signal input files found!\nAborting...') + sys.exit(3) + + return hsignal, hbackgrounds + + +# _____________________________________________________________________________ +def mapHistosFromHistmaker(hName, param, plotCfg): + rebin = plotCfg['rebin'] if 'rebin' in plotCfg else 1 + LOGGER.info('Get histograms for %s', hName) + signal = param.procs['signal'] + backgrounds = param.procs['backgrounds'] + scaleSig = plotCfg['scaleSig'] if 'scaleSig' in plotCfg else 1 + + hsignal = {} + for s in signal: + hsignal[s] = [] + for f in signal[s]: + fin = f"{param.inputDir}/{f}.root" + if not os.path.isfile(fin): + LOGGER.info('File "%s" not found!\nSkipping it...', fin) + continue + + with ROOT.TFile(fin) as tf: + h = tf.Get(hName) + hh = copy.deepcopy(h) + hh.SetDirectory(0) + LOGGER.info('ScaleSig: %g', scaleSig) + hh.Scale(param.intLumi*scaleSig) + hh.Rebin(rebin) + if len(hsignal[s]) == 0: + hsignal[s].append(hh) + else: + hh.Add(hsignal[s][0]) + hsignal[s][0] = hh + + hbackgrounds = {} + for b in backgrounds: + hbackgrounds[b] = [] + for f in backgrounds[b]: + fin = f"{param.inputDir}/{f}.root" + if not os.path.isfile(fin): + LOGGER.info('File "%s" not found!\nSkipping it...', fin) + continue + + with ROOT.TFile(fin) as tf: + h = tf.Get(hName) + hh = copy.deepcopy(h) + hh.SetDirectory(0) + hh.Scale(param.intLumi) + hh.Rebin(rebin) + if len(hbackgrounds[b]) == 0: + hbackgrounds[b].append(hh) + else: + hh.Add(hbackgrounds[b][0]) + hbackgrounds[b][0] = hh + + for s in hsignal: + if 
len(hsignal[s]) == 0:
+            hsignal = removekey(hsignal, s)
+
+    for b in hbackgrounds:
+        if len(hbackgrounds[b]) == 0:
+            hbackgrounds = removekey(hbackgrounds, b)
+
+    if not hsignal:
+        LOGGER.error('No signal input files found!\nAborting...')
+        sys.exit(3)
+
+    return hsignal, hbackgrounds
+
+
+# _____________________________________________________________________________
+def runPlots(var, sel, param, hsignal, hbackgrounds, extralab, splitLeg,
+             plotStatUnc):
+
+    # Below are settings for separate signal and background legends
+    if splitLeg:
+        legsize = 0.04*(len(hsignal))
+        legsize2 = 0.04*(len(hbackgrounds))
+        legCoord = [0.15, 0.60 - legsize, 0.50, 0.62]
+        leg2 = ROOT.TLegend(0.60, 0.60 - legsize2, 0.88, 0.62)
+        leg2.SetFillColor(0)
+        leg2.SetFillStyle(0)
+        leg2.SetLineColor(0)
+        leg2.SetShadowColor(10)
+        leg2.SetTextSize(0.035)
+        leg2.SetTextFont(42)
+    else:
+        legsize = 0.04*(len(hbackgrounds)+len(hsignal))
+        legCoord = [0.68, 0.86-legsize, 0.96, 0.88]
+        try:
+            legCoord = param.legendCoord
+        except AttributeError:
+            LOGGER.debug('No legCoord, using default one...')
+            legCoord = [0.68, 0.86-legsize, 0.96, 0.88]
+        leg2 = None
+
+    leg = ROOT.TLegend(legCoord[0], legCoord[1], legCoord[2], legCoord[3])
+    leg.SetFillColor(0)
+    leg.SetFillStyle(0)
+    leg.SetLineColor(0)
+    leg.SetShadowColor(10)
+    leg.SetTextSize(0.035)
+    leg.SetTextFont(42)
+
+    for b in hbackgrounds:
+        if splitLeg:
+            leg2.AddEntry(hbackgrounds[b][0], param.legend[b], "f")
+        else:
+            leg.AddEntry(hbackgrounds[b][0], param.legend[b], "f")
+    for s in hsignal:
+        leg.AddEntry(hsignal[s][0], param.legend[s], "l")
+
+    yields = {}
+    for s in hsignal:
+        yields[s] = [param.legend[s],
+                     hsignal[s][0].Integral(0, -1),
+                     hsignal[s][0].GetEntries()]
+    for b in hbackgrounds:
+        yields[b] = [param.legend[b],
+                     hbackgrounds[b][0].Integral(0, -1),
+                     hbackgrounds[b][0].GetEntries()]
+
+    histos = []
+    colors = []
+
+    nsig = len(hsignal)
+    nbkg = len(hbackgrounds)
+
+    for s in hsignal:
+        histos.append(hsignal[s][0])
+        colors.append(param.colors[s])
+
+    for b in hbackgrounds:
+        histos.append(hbackgrounds[b][0])
+        colors.append(param.colors[b])
+
+    intLumiab = param.intLumi/1e+06
+    intLumi = f'L = {intLumiab:.0f} ab^{{-1}}'
+    if hasattr(param, "intLumiLabel"):
+        intLumi = getattr(param, "intLumiLabel")
+
+    lt = 'FCCAnalyses: FCC-hh Simulation (Delphes)'
+    rt = f'#sqrt{{s}} = {param.energy:.1f} TeV, {intLumi}'
+
+    if 'ee' in param.collider:
+        lt = 'FCCAnalyses: FCC-ee Simulation (Delphes)'
+        rt = f'#sqrt{{s}} = {param.energy:.1f} GeV, {intLumi}'
+
+    customLabel = ""
+    try:
+        customLabel = param.customLabel
+    except AttributeError:
+        LOGGER.debug('No customLabel, using nothing...')
+
+    scaleSig = 1.
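The scaleSig default just above, together with the try/except AttributeError that follows, is the lookup-with-fallback pattern used throughout for optional attributes of the plot-config module; getattr with a default expresses the same in one line (the original additionally caches the default back onto param). A sketch, with a SimpleNamespace standing in for the imported config module:

import types

param = types.SimpleNamespace(intLumi=1e7)  # toy config that omits scaleSig

scale_sig = getattr(param, 'scaleSig', 1.)        # falls back to 1.
custom_label = getattr(param, 'customLabel', '')  # falls back to ''
print(scale_sig, repr(custom_label))              # -> 1.0 ''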
+    try:
+        scaleSig = param.scaleSig
+    except AttributeError:
+        LOGGER.debug('No scale signal, using 1.')
+        param.scaleSig = scaleSig
+
+    if 'AAAyields' in var:
+        drawStack(var, 'events', leg, lt, rt, param.formats,
+                  param.outdir+"/"+sel, False, True, histos, colors,
+                  param.ana_tex, extralab, scaleSig, customLabel, nsig,
+                  nbkg, leg2, yields, plotStatUnc)
+        return
+
+    if 'stack' in param.stacksig:
+        if 'lin' in param.yaxis:
+            drawStack(var+"_stack_lin", 'events', leg, lt, rt, param.formats,
+                      param.outdir+"/"+sel, False, True, histos, colors,
+                      param.ana_tex, extralab, scaleSig, customLabel, nsig,
+                      nbkg, leg2, yields, plotStatUnc)
+        if 'log' in param.yaxis:
+            drawStack(var+"_stack_log", 'events', leg, lt, rt, param.formats,
+                      param.outdir+"/"+sel, True, True, histos, colors,
+                      param.ana_tex, extralab, scaleSig, customLabel, nsig,
+                      nbkg, leg2, yields, plotStatUnc)
+        if 'lin' not in param.yaxis and 'log' not in param.yaxis:
+            LOGGER.info('Unrecognised option in yaxis, should be '
+                        '[\'lin\',\'log\']')
+
+    if 'nostack' in param.stacksig:
+        if 'lin' in param.yaxis:
+            drawStack(var+"_nostack_lin", 'events', leg, lt, rt, param.formats,
+                      param.outdir+"/"+sel, False, False, histos, colors,
+                      param.ana_tex, extralab, scaleSig, customLabel, nsig,
+                      nbkg, leg2, yields, plotStatUnc)
+        if 'log' in param.yaxis:
+            drawStack(var+"_nostack_log", 'events', leg, lt, rt, param.formats,
+                      param.outdir+"/"+sel, True, False, histos, colors,
+                      param.ana_tex, extralab, scaleSig, customLabel, nsig,
+                      nbkg, leg2, yields, plotStatUnc)
+        if 'lin' not in param.yaxis and 'log' not in param.yaxis:
+            LOGGER.info('Unrecognised option in yaxis, should be '
+                        '[\'lin\',\'log\']')
+    if 'stack' not in param.stacksig and 'nostack' not in param.stacksig:
+        LOGGER.info('Unrecognised option in stacksig, should be '
+                    '[\'stack\',\'nostack\']')
+
+
+# _____________________________________________________________________________
+def runPlotsHistmaker(hName, param, plotCfg):
+
+    output = plotCfg['output']
+    hsignal, hbackgrounds = mapHistosFromHistmaker(hName, param, plotCfg)
+
+    if hasattr(param, "splitLeg"):
+        splitLeg = param.splitLeg
+    else:
+        splitLeg = False
+
+    if hasattr(param, "plotStatUnc"):
+        plotStatUnc = param.plotStatUnc
+    else:
+        plotStatUnc = False
+
+    # Below are settings for separate signal and background legends
+    if splitLeg:
+        legsize = 0.04*(len(hsignal))
+        legsize2 = 0.04*(len(hbackgrounds))
+        legCoord = [0.15, 0.60 - legsize, 0.50, 0.62]
+        leg2 = ROOT.TLegend(0.60, 0.60 - legsize2, 0.88, 0.62)
+        leg2.SetFillColor(0)
+        leg2.SetFillStyle(0)
+        leg2.SetLineColor(0)
+        leg2.SetShadowColor(10)
+        leg2.SetTextSize(0.035)
+        leg2.SetTextFont(42)
+    else:
+        legsize = 0.04*(len(hbackgrounds)+len(hsignal))
+        legCoord = [0.68, 0.86-legsize, 0.96, 0.88]
+        try:
+            legCoord = param.legendCoord
+        except AttributeError:
+            LOGGER.debug('No legCoord, using default one...')
+            legCoord = [0.68, 0.86-legsize, 0.96, 0.88]
+        leg2 = None
+
+    leg = ROOT.TLegend(legCoord[0], legCoord[1], legCoord[2], legCoord[3])
+    leg.SetFillColor(0)
+    leg.SetFillStyle(0)
+    leg.SetLineColor(0)
+    leg.SetShadowColor(10)
+    leg.SetTextSize(0.035)
+    leg.SetTextFont(42)
+
+    for b in hbackgrounds:
+        if splitLeg:
+            leg2.AddEntry(hbackgrounds[b][0], param.legend[b], "f")
+        else:
+            leg.AddEntry(hbackgrounds[b][0], param.legend[b], "f")
+    for s in hsignal:
+        leg.AddEntry(hsignal[s][0], param.legend[s], "l")
+
+    yields = {}
+    for s in hsignal:
+        yields[s] = [param.legend[s],
+                     hsignal[s][0].Integral(0, -1),
+                     hsignal[s][0].GetEntries()]
+    for b in hbackgrounds:
+        yields[b] = [param.legend[b],
+                     hbackgrounds[b][0].Integral(0, -1),
+                     hbackgrounds[b][0].GetEntries()]
+
+    histos = []
+    colors = []
+
+    nsig = len(hsignal)
+    nbkg = len(hbackgrounds)
+
+    for s in hsignal:
+        histos.append(hsignal[s][0])
+        colors.append(param.colors[s])
+
+    for b in hbackgrounds:
+        histos.append(hbackgrounds[b][0])
+        colors.append(param.colors[b])
+
+    xtitle = plotCfg['xtitle'] if 'xtitle' in plotCfg else ""
+    ytitle = plotCfg['ytitle'] if 'ytitle' in plotCfg else "Events"
+    xmin = plotCfg['xmin'] if 'xmin' in plotCfg else -1
+    xmax = plotCfg['xmax'] if 'xmax' in plotCfg else -1
+    ymin = plotCfg['ymin'] if 'ymin' in plotCfg else -1
+    ymax = plotCfg['ymax'] if 'ymax' in plotCfg else -1
+    stack = plotCfg['stack'] if 'stack' in plotCfg else False
+    logy = plotCfg['logy'] if 'logy' in plotCfg else False
+    extralab = plotCfg['extralab'] if 'extralab' in plotCfg else ""
+    scaleSig = plotCfg['scaleSig'] if 'scaleSig' in plotCfg else 1
+
+    intLumiab = param.intLumi/1e+06
+    intLumi = f'L = {intLumiab:.0f} ab^{{-1}}'
+    if hasattr(param, "intLumiLabel"):
+        intLumi = getattr(param, "intLumiLabel")
+
+    lt = 'FCCAnalyses: FCC-hh Simulation (Delphes)'
+    rt = f'#sqrt{{s}} = {param.energy:.1f} TeV, {intLumi}'
+
+    if 'ee' in param.collider:
+        lt = 'FCCAnalyses: FCC-ee Simulation (Delphes)'
+        rt = f'#sqrt{{s}} = {param.energy:.1f} GeV, {intLumi}'
+
+    customLabel = ""
+    try:
+        customLabel = param.customLabel
+    except AttributeError:
+        LOGGER.debug('No customLabel, using nothing...')
+
+    if stack:
+        if logy:
+            drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir,
+                      True, True, histos, colors, param.ana_tex, extralab,
+                      scaleSig, customLabel, nsig, nbkg, leg2, yields,
+                      plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax,
+                      xtitle=xtitle)
+        else:
+            drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir,
+                      False, True, histos, colors, param.ana_tex, extralab,
+                      scaleSig, customLabel, nsig, nbkg, leg2, yields,
+                      plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax,
+                      xtitle=xtitle)
+
+    else:
+        if logy:
+            drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir,
+                      True, False, histos, colors, param.ana_tex, extralab,
+                      scaleSig, customLabel, nsig, nbkg, leg2, yields,
+                      plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax,
+                      xtitle=xtitle)
+        else:
+            drawStack(output, ytitle, leg, lt, rt, param.formats, param.outdir,
+                      False, False, histos, colors, param.ana_tex, extralab,
+                      scaleSig, customLabel, nsig, nbkg, leg2, yields,
+                      plotStatUnc, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax,
+                      xtitle=xtitle)
+
+
+# _____________________________________________________________________________
+def drawStack(name, ylabel, legend, leftText, rightText, formats, directory,
+              logY, stacksig, histos, colors, ana_tex, extralab, scaleSig,
+              customLabel, nsig, nbkg, legend2=None, yields=None,
+              plotStatUnc=False, xmin=-1, xmax=-1, ymin=-1, ymax=-1,
+              xtitle=""):
+
+    canvas = ROOT.TCanvas(name, name, 800, 800)
+    canvas.SetLogy(logY)
+    canvas.SetTicks(1, 1)
+    canvas.SetLeftMargin(0.14)
+    canvas.SetRightMargin(0.08)
+
+    sumhistos = histos[0].Clone()
+    iterh = iter(histos)
+    next(iterh)
+
+    unit = 'GeV'
+    if 'TeV' in str(histos[0].GetXaxis().GetTitle()):
+        unit = 'TeV'
+
+    if unit in str(histos[0].GetXaxis().GetTitle()):
+        bwidth = sumhistos.GetBinWidth(1)
+        if bwidth.is_integer():
+            ylabel += f' / {int(bwidth)} {unit}'
+        else:
+            ylabel += f' / {bwidth:.2f} {unit}'
+
+    nbins = 1 if not isinstance(xtitle, list) else len(xtitle)
+    h_dummy = ROOT.TH1D("h_dummy", "",
nbins, 0, nbins) + if nbins == 1: + h_dummy.GetXaxis().SetTitle( + histos[0].GetXaxis().GetTitle() if xtitle == "" else xtitle) + h_dummy.GetYaxis().SetTitleOffset(1.95) + h_dummy.GetXaxis().SetTitleOffset( + 1.2*h_dummy.GetXaxis().GetTitleOffset()) + else: # for cutflow plots + for i, label in enumerate(xtitle): + h_dummy.GetXaxis().SetBinLabel(i+1, label) + h_dummy.GetXaxis().LabelsOption("u") + h_dummy.GetXaxis().SetLabelSize(1.1*h_dummy.GetXaxis().GetLabelSize()) + h_dummy.GetXaxis().SetLabelOffset( + 1.5*h_dummy.GetXaxis().GetLabelOffset()) + h_dummy.GetYaxis().SetTitle(ylabel) + + for h in iterh: + sumhistos.Add(h) + + if logY: + canvas.SetLogy(1) + + # define stacked histo + hStack = ROOT.THStack("hstack", "") + hStackBkg = ROOT.THStack("hstackbkg", "") + hStackSig = ROOT.THStack("hstacksig", "") + BgMCHistYieldsDic = {} + + # first plot backgrounds (sorted by the yields) + for i in range(nsig, nsig+nbkg): + h = histos[i] + h.SetLineWidth(1) + h.SetLineColor(ROOT.kBlack) + h.SetFillColor(colors[i]) + if h.Integral() > 0: + BgMCHistYieldsDic[h.Integral()] = h + else: + BgMCHistYieldsDic[-1*nbkg] = h + # sort stack by yields (smallest to largest) + BgMCHistYieldsDic = sorted_dict_values(BgMCHistYieldsDic) + for h in BgMCHistYieldsDic: + hStack.Add(h) + hStackBkg.Add(h) + + # add the signal histograms + for i in range(nsig): + h = histos[i] + h.SetLineWidth(3) + h.SetLineColor(colors[i]) + hStack.Add(h) + hStackSig.Add(h) + + if xmin != -1 and xmax != -1: + h_dummy.GetXaxis().SetLimits(xmin, xmax) + + h_dummy.Draw("HIST") + if stacksig: + hStack.Draw("HIST SAME") + if plotStatUnc: + # sig+bkg uncertainty + hUnc_sig_bkg = formatStatUncHist(hStack.GetHists(), "sig_bkg") + hUnc_sig_bkg.Draw("E2 SAME") + else: + hStackBkg.Draw("HIST SAME") + hStackSig.Draw("HIST SAME NOSTACK") + if plotStatUnc: + # bkg-only uncertainty + hUnc_bkg = formatStatUncHist(hStackBkg.GetHists(), "bkg_only") + hUnc_bkg.Draw("E2 SAME") + for sHist in hStackSig.GetHists(): + # sigs uncertainty + hUnc_sig = formatStatUncHist([sHist], "sig", 3245) + hUnc_sig.Draw("E2 SAME") + + # x limits + if xmin == -1: + h_tmp = hStack.GetStack().Last() + xmin = h_tmp.GetBinLowEdge(1) + if xmax == -1: + h_tmp = hStack.GetStack().Last() + xmax = h_tmp.GetBinLowEdge(h_tmp.GetNbinsX()+1) + h_dummy.GetXaxis().SetLimits(xmin, xmax) + + # y limits + def get_minmax_range(hists, xmin, xmax): + hist_tot = hists[0].Clone(name + "_unc") + for h in hists[1:]: + hist_tot.Add(h) + vals = [] + for i in range(0, hist_tot.GetNbinsX()+1): + if hist_tot.GetBinLowEdge(i) > xmin or \ + hist_tot.GetBinLowEdge(i+1) < xmax: + if hist_tot.GetBinContent(i) != 0: + vals.append(hist_tot.GetBinContent(i)) + if len(vals) == 0: + return 1e-5, 1 + return min(vals), max(vals) + + if stacksig: + ymin_, ymax_ = get_minmax_range(hStack.GetHists(), xmin, xmax) + else: + ymin_sig, ymax_sig = get_minmax_range(hStackSig.GetHists(), xmin, xmax) + ymin_bkg, ymax_bkg = get_minmax_range(hStackBkg.GetHists(), xmin, xmax) + ymin_ = min(ymin_sig, ymin_bkg) + ymax_ = max(ymax_sig, ymax_bkg) + if ymin == -1: + ymin = ymin_*0.1 if logY else 0 + if ymax == -1: + ymax = ymax_*1000. 
if logY else 1.4*ymax_ + if ymin <= 0 and logY: + LOGGER.error('Log scale can\'t start at: %i', ymin) + sys.exit(3) + h_dummy.SetMaximum(ymax) + h_dummy.SetMinimum(ymin) + + legend.Draw() + if legend2 is not None: + legend2.Draw() + + latex = ROOT.TLatex() + latex.SetNDC() + latex.SetTextAlign(31) + latex.SetTextSize(0.04) + + text = '#it{' + leftText + '}' + latex.DrawLatex(0.90, 0.94, text) + + text = '#it{'+customLabel+'}' + latex.SetTextAlign(12) + latex.SetNDC(ROOT.kTRUE) + latex.SetTextSize(0.04) + latex.DrawLatex(0.18, 0.85, text) + + rightText = re.split(",", rightText) + text = '#bf{#it{' + rightText[0] + '}}' + + latex.SetTextAlign(12) + latex.SetNDC(ROOT.kTRUE) + latex.SetTextSize(0.04) + latex.DrawLatex(0.18, 0.81, text) + + rightText[1] = rightText[1].replace(" ", "") + text = '#bf{#it{' + rightText[1] + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.18, 0.76, text) + + text = '#bf{#it{' + ana_tex + '}}' + latex.SetTextSize(0.04) + latex.DrawLatex(0.18, 0.71, text) + + text = '#bf{#it{' + extralab + '}}' + latex.SetTextSize(0.025) + latex.DrawLatex(0.18, 0.66, text) + + text = '#bf{#it{' + 'Signal scale=' + str(scaleSig)+'}}' + latex.SetTextSize(0.025) + if scaleSig != 1: + latex.DrawLatex(0.18, 0.63, text) + + canvas.RedrawAxis() + canvas.GetFrame().SetBorderSize(12) + canvas.Modified() + canvas.Update() + + if 'AAAyields' in name: + dummyh = ROOT.TH1F("", "", 1, 0, 1) + dummyh.SetStats(0) + dummyh.GetXaxis().SetLabelOffset(999) + dummyh.GetXaxis().SetLabelSize(0) + dummyh.GetYaxis().SetLabelOffset(999) + dummyh.GetYaxis().SetLabelSize(0) + dummyh.Draw("AH") + legend.Draw() + + latex.SetNDC() + latex.SetTextAlign(31) + latex.SetTextSize(0.04) + + text = '#it{' + leftText + '}' + latex.DrawLatex(0.90, 0.92, text) + + text = '#bf{#it{' + rightText[0] + '}}' + latex.SetTextAlign(12) + latex.SetNDC(ROOT.kTRUE) + latex.SetTextSize(0.04) + latex.DrawLatex(0.18, 0.83, text) + + text = '#bf{#it{' + rightText[1] + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.18, 0.78, text) + + text = '#bf{#it{' + ana_tex + '}}' + latex.SetTextSize(0.04) + latex.DrawLatex(0.18, 0.73, text) + + text = '#bf{#it{' + extralab + '}}' + latex.SetTextSize(0.025) + latex.DrawLatex(0.18, 0.68, text) + + text = '#bf{#it{' + 'Signal scale=' + str(scaleSig) + '}}' + latex.SetTextSize(0.04) + latex.DrawLatex(0.18, 0.55, text) + + dy = 0 + text = '#bf{#it{' + 'Process' + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.18, 0.45, text) + + text = '#bf{#it{' + 'Yields' + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.5, 0.45, text) + + text = '#bf{#it{' + 'Raw MC' + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.75, 0.45, text) + + for y in yields: + text = '#bf{#it{' + yields[y][0] + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.18, 0.4-dy*0.05, text) + + stry = str(yields[y][1]) + stry = stry.split('.', maxsplit=1)[0] + text = '#bf{#it{' + stry + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.5, 0.4-dy*0.05, text) + + stry = str(yields[y][2]) + stry = stry.split('.', maxsplit=1)[0] + text = '#bf{#it{' + stry + '}}' + latex.SetTextSize(0.035) + latex.DrawLatex(0.75, 0.4-dy*0.05, text) + + dy += 1 + # canvas.Modified() + # canvas.Update() + + print_canvas(canvas, name, formats, directory) + + +# _____________________________________________________________________________ +def print_canvas(canvas, name, formats, directory): + ''' + Saving canvas in multiple formats. 
+ ''' + + if not formats: + LOGGER.error('No output formats specified!\nAborting...') + sys.exit(3) + + if not os.path.exists(directory): + os.system("mkdir -p " + directory) + + for f in formats: + out_file = os.path.join(directory, name) + "." + f + canvas.SaveAs(out_file) + + +# _____________________________________________________________________________ +def run(script_path): + ''' + Run over all the plots. + ''' + ROOT.gROOT.SetBatch(True) + ROOT.gErrorIgnoreLevel = ROOT.kWarning + + module_path = os.path.abspath(script_path) + module_dir = os.path.dirname(module_path) + base_name = os.path.splitext(ntpath.basename(script_path))[0] + + sys.path.insert(0, module_dir) + param = importlib.import_module(base_name) + + split_leg = False + if hasattr(param, "splitLeg"): + split_leg = param.splitLeg + + plot_stat_unc = False + if hasattr(param, "plotStatUnc"): + plot_stat_unc = param.plotStatUnc + + if hasattr(param, "hists"): + for hist_name, plot_cfg in param.hists.items(): + runPlotsHistmaker(hist_name, param, plot_cfg) + sys.exit() + + counter = 0 + for var_index, var in enumerate(param.variables): + for label, sels in param.selections.items(): + for sel in sels: + rebin_tmp = 1 + if (hasattr(param, "rebin") and + len(param.rebin) == len(param.variables)): + rebin_tmp = param.rebin[var_index] + hsignal, hbackgrounds = mapHistos(var, label, sel, param, + rebin=rebin_tmp) + runPlots(var+"_"+label, sel, param, hsignal, hbackgrounds, + param.extralabel[sel], split_leg, plot_stat_unc) + if counter == 0: + runPlots("AAAyields_"+label, sel, param, hsignal, + hbackgrounds, param.extralabel[sel], split_leg, + plot_stat_unc) + counter += 1 + + +def do_plots(parser): + ''' + Run plots generation + ''' + + args, _ = parser.parse_known_args() + + if args.command != 'plots': + LOGGER.error('Wrong sub-command!\nAborting...') + + if not os.path.isfile(args.script_path): + LOGGER.error('Plots script "%s" not found!\nAborting...', + args.script_path) + sys.exit(3) + + run(args.script_path) diff --git a/python/init_analysis.py b/python/init_analysis.py new file mode 100644 index 0000000000..14794164d5 --- /dev/null +++ b/python/init_analysis.py @@ -0,0 +1,121 @@ +''' +Initialize local analysis +''' + +import logging +import os +import sys +from subprocess import getoutput + + +LOGGER = logging.getLogger('FCCAnalyses.init_analysis') + + +def find_author(): + ''' + Retrieve the author of the package from the git confioguration + ''' + return getoutput('git config --global --get user.name') + \ + ' <' + getoutput('git config --global --get user.email') + '>' + + +def replace_all(_input: str, repl: dict) -> str: + ''' + Replace all elements of repl in the provided string. + ''' + output = _input + for a, b in repl.items(): + output = output.replace(a, b) + return output + + +def create_file(dest_path: str, + template_path: str, + replacements: dict) -> bool: + ''' + Create file from a template. + TODO: exceptions + ''' + with open(template_path, 'r', encoding='utf-8') as template_file: + template: str = template_file.read() + + with open(dest_path, 'w', encoding='utf-8') as dest_file: + dest_file.write(replace_all(template, replacements)) + + return True + + +def setup_analysis(package: str, + author: str = '', + description: str = '', + name: str = '', + standalone: bool = False, + output_dir: str = ''): + ''' + Generates the analysis package. 
+ ''' + if not author: + author = find_author() + if not description: + description = '[...]' + elif '\n' in description: + raise RuntimeError('Multiline description is not supported. Please ' + 'edit the output analysis header file directly.') + fccanalyses_path = getoutput('git rev-parse --show-toplevel') + replacement_dict = { + '__pkgname__': package, + '__pkgdesc__': description, + '__name__': name, + '__author__': author, + '__fccpath__': fccanalyses_path + } + + if not output_dir: + path = f'{fccanalyses_path}/case-studies/{package}' + else: + path = output_dir + + for p in [path, f'{path}/src', f'{path}/include', f'{path}/scripts']: + try: + os.mkdir(p) + except FileExistsError: + LOGGER.warning('FCCAnalysis package "%s" already exists.', package) + try: + tmpl_dir = os.path.join(fccanalyses_path, 'templates') + create_file(f'{path}/src/classes.h', f'{tmpl_dir}/classes.h', + replacement_dict) + create_file(f'{path}/src/classes_def.xml', + f'{tmpl_dir}/classes_def.xml', + replacement_dict) + create_file(f'{path}/src/{name}.cc', f'{tmpl_dir}/Package.cc', + replacement_dict) + create_file(f'{path}/include/{name}.h', f'{tmpl_dir}/Package.h', + replacement_dict) + create_file(f'{path}/scripts/analysis_cfg.py', + f'{tmpl_dir}/analysis_cfg.py', + replacement_dict) + if standalone: + create_file(f'{path}/CMakeLists.txt', f'{tmpl_dir}/CMakeLists.txt', + replacement_dict) + except OSError as error: + LOGGER.error('FCCAnalysis package "%s" creation error:\n%s', + package, error) + sys.exit(3) + + +def init_analysis(mainparser): + ''' + Initialize analysis package + ''' + + args, _ = mainparser.parse_known_args() + + if args.command != 'init': + LOGGER.error('Wrong sub-command!\nAborting...') + + setup_analysis(package=args.package, + name=args.name, + author=args.author, + description=args.description, + standalone=args.standalone, + output_dir=args.output_dir) diff --git a/python/parsers.py b/python/parsers.py new file mode 100644 index 0000000000..ce10776ef2 --- /dev/null +++ b/python/parsers.py @@ -0,0 +1,174 @@ +''' +Parsers for the fccanalysis sub-commands +''' + +import argparse + + +def setup_init_parser(parser): + ''' + Arguments for the init sub-command + ''' + init_args = parser.add_argument_group('Init arguments') + + init_args.add_argument('package', + help='name of the analysis package to be built') + init_args.add_argument('--name', + default='DummyAnalysis', + help='name of the main analysis utility') + init_args.add_argument( + '--author', + help="author's \"name \" (will use git-config if not " + "specified)") + init_args.add_argument('--description', + help='analysis package description') + init_args.add_argument( + '--standalone', + action='store_true', + default=False, + help='also add CMake directive to build standalone package') + init_args.add_argument( + '--output-dir', + help='output directory where the analysis package will be written') + + +def setup_build_parser(parser): + ''' + Arguments for the build sub-command + ''' + build_args = parser.add_argument_group('Build arguments') + build_args.add_argument('-c', '--clean-build', + action='store_true', + default=False, + help='do a clean build') + build_args.add_argument( + '-j', '--build-threads', + type=int, + default=1, + help='number of threads when building (equivalent to `make -j`)' + ) + + +def setup_test_parser(parser): + ''' + Arguments for the test sub-command + ''' + test_args = parser.add_argument_group('Test arguments') + test_args.add_argument( + '-R', '--tests-regex', + type=str, + help='Run tests 
matching regular expression (e.g. run only unit tests ' + 'with "^UT")' + ) + test_args.add_argument( + '-E', '--exclude-regex', + type=str, + help='Exclude tests matching regular expression' + ) + test_args.add_argument( + '-j', '--parallel', + type=int, + default=-1, + help='number of tests running in parallel (equivalent to `ctest -j`)' + ) + + +def setup_pin_parser(parser): + ''' + Arguments for the pin sub-command + ''' + pin_args = parser.add_argument_group('Pin arguments') + pin_args.add_argument('-c', '--clear', + action='store_true', + default=False, + help='clear analysis pin') + pin_args.add_argument('-f', '--force', + action='store_true', + default=False, + help='force recreate analysis pin') + pin_args.add_argument('-s', '--show', + action='store_true', + default=False, + help='show pinned stack') + + +def setup_run_parser(parser): + ''' + Define command line arguments for the run sub-command. + ''' + parser.add_argument('anascript_path', + help='path to analysis script') + parser.add_argument('--files-list', default=[], nargs='+', + help='specify input file(s) to bypass the processList') + parser.add_argument( + '--output', + type=str, + default='output.root', + help='specify output file name to bypass the processList and or ' + 'outputList') + parser.add_argument('--nevents', type=int, default=-1, + help='specify max number of events to process') + parser.add_argument('--test', action='store_true', default=False, + help='run over the test input file') + parser.add_argument('--bench', action='store_true', default=False, + help='output benchmark results to a JSON file') + parser.add_argument('--ncpus', type=int, default=-1, + help='set number of threads') + + # Internal argument, not to be used by the users + parser.add_argument('--batch', action='store_true', default=False, + help=argparse.SUPPRESS) + + +def setup_run_parser_final(parser): + ''' + Define command line arguments for the final sub-command. + ''' + parser.add_argument('anascript_path', + help='path to analysis_final script') + + +def setup_run_parser_plots(parser): + ''' + Define command line arguments for the plots sub-command. 
+ ''' + parser.add_argument('script_path', help="path to the plots script") + + +# _____________________________________________________________________________ +def setup_subparsers(subparsers): + ''' + Sets all sub-parsers for all sub-commands + ''' + + # Create sub-parsers + parser_init = subparsers.add_parser( + 'init', + help="generate a RDataFrame based FCC analysis") + parser_build = subparsers.add_parser( + 'build', + help='build and install local analysis') + parser_test = subparsers.add_parser( + 'test', + help='test whole or a part of the analysis framework') + parser_pin = subparsers.add_parser( + 'pin', + help='pin fccanalyses to the current version of Key4hep stack') + parser_run = subparsers.add_parser( + 'run', + help="run a RDataFrame based FCC analysis") + parser_run_final = subparsers.add_parser( + 'final', + help="run a RDataFrame based FCC analysis final configuration") + parser_run_plots = subparsers.add_parser( + 'plots', + help="run a RDataFrame based FCC analysis plot configuration") + + # Register sub-parsers + setup_init_parser(parser_init) + setup_build_parser(parser_build) + setup_test_parser(parser_test) + setup_pin_parser(parser_pin) + setup_run_parser(parser_run) + setup_run_parser_final(parser_run_final) + setup_run_parser_plots(parser_run_plots) diff --git a/python/process.py b/python/process.py index 6b5d144e8d..cecbeab115 100644 --- a/python/process.py +++ b/python/process.py @@ -7,53 +7,57 @@ import json import glob import logging -import yaml -import ROOT +import urllib.request +import yaml # type: ignore +import ROOT # type: ignore ROOT.gROOT.SetBatch(True) -LOGGER = logging.getLogger('FCCAnalyses.process_info') +LOGGER: logging.Logger = logging.getLogger('FCCAnalyses.process_info') -def getEntries(f): - tf=ROOT.TFile.Open(f,"READ") - tf.cd() - tt=tf.Get("events") - nevents=tt.GetEntries() - tf.Close() + +def get_entries(inpath: str) -> int: + ''' + Get number of entries in the TTree named "events". + ''' + nevents = None + with ROOT.TFile(inpath, 'READ') as infile: + tt = infile.Get("events") + nevents = tt.GetEntries() return nevents -def getProcessInfo(process, prodTag, inputDir): +def get_process_info(process: str, + prod_tag: str, + input_dir: str) -> tuple[list[str], list[int]]: ''' Decide where to look for the filelist and eventlist. 
''' - if prodTag is None and inputDir is None: + if prod_tag is None and input_dir is None: LOGGER.error('The variable or is mandatory in ' 'your analysis script!\nAborting...') sys.exit(3) - elif prodTag is not None and inputDir is not None: + elif prod_tag is not None and input_dir is not None: LOGGER.error('The variables and can\'t be set ' 'both at the same time in your analysis script!\n' 'Aborting...') sys.exit(3) - if prodTag!=None: - return getProcessInfoYaml(process, prodTag) - elif inputDir!=None: - return getProcessInfoFiles(process, inputDir) - else: - LOGGER.error('Problem, why are you here???\nAborting...') - sys.exit(3) + if prod_tag is not None: + return get_process_info_yaml(process, prod_tag) + + return get_process_info_files(process, input_dir) -def getProcessInfoFiles(process, inputDir): +def get_process_info_files(process: str, input_dir: str) -> tuple[list[str], + list[int]]: ''' Get list of files and events from the specified location ''' - filelist=[] - eventlist=[] - filetest='{}/{}.root'.format(inputDir, process) - dirtest='{}/{}'.format(inputDir, process) + filelist = [] + eventlist = [] + filetest = f'{input_dir}/{process}.root' + dirtest = f'{input_dir}/{process}' if os.path.isfile(filetest) and os.path.isdir(dirtest): LOGGER.error('For process "%s" both a file %s and a directory %s ' @@ -69,19 +73,19 @@ def getProcessInfoFiles(process, inputDir): if os.path.isfile(filetest): filelist.append(filetest) - eventlist.append(getEntries(filetest)) + eventlist.append(get_entries(filetest)) if os.path.isdir(dirtest): - flist=glob.glob(dirtest+"/*.root") + flist = glob.glob(dirtest+"/*.root") for f in flist: filelist.append(f) - eventlist.append(getEntries(f)) - + eventlist.append(get_entries(f)) return filelist, eventlist -def getProcessInfoYaml(process, prodTag): +def get_process_info_yaml(process: str, prod_tag: str) -> tuple[list[str], + list[int]]: ''' Get list of files and events from the YAML file ''' @@ -89,7 +93,8 @@ def getProcessInfoYaml(process, prodTag): proc_dict_dirs = get_process_dict_dirs() yamlfilepath = None for path in proc_dict_dirs: - yamlfilepath = os.path.join(path, 'yaml', prodTag, process, 'merge.yaml') + yamlfilepath = os.path.join(path, 'yaml', prod_tag, process, + 'merge.yaml') if not os.path.isfile(yamlfilepath): continue if not yamlfilepath: @@ -97,7 +102,7 @@ def getProcessInfoYaml(process, prodTag): 'Aborting...') sys.exit(3) - with open(yamlfilepath) as ftmp: + with open(yamlfilepath, 'r', encoding='utf-8') as ftmp: try: doc = yaml.load(ftmp, Loader=yaml.FullLoader) except yaml.YAMLError as exc: @@ -113,24 +118,24 @@ def getProcessInfoYaml(process, prodTag): filelist = [doc['merge']['outdir']+f[0] for f in doc['merge']['outfiles']] eventlist = [f[1] for f in doc['merge']['outfiles']] + return filelist, eventlist -def get_process_dict(proc_dict_location): +def get_process_dict(proc_dict_location: str) -> dict: ''' Pick up the dictionary with process information ''' if 'http://' in proc_dict_location or 'https://' in proc_dict_location: LOGGER.info('Getting process dictionary from the web:\n%s', proc_dict_location) - import urllib.request - req = urllib.request.urlopen(proc_dict_location).read() - try: - proc_dict = json.loads(req.decode('utf-8')) - except json.decoder.JSONDecodeError: - LOGGER.error('Failed to parse process dictionary correctly!\n' - 'Aborting...') - sys.exit(3) + with urllib.request.urlopen(proc_dict_location).read() as response: + try: + proc_dict = json.loads(response.read()) + except json.decoder.JSONDecodeError: + 
LOGGER.error('Failed to parse process dictionary correctly!\n' + 'Aborting...') + sys.exit(3) else: proc_dict_dirs = get_process_dict_dirs() @@ -144,7 +149,7 @@ def get_process_dict(proc_dict_location): LOGGER.info('Loading process dictionary from:\n%s', proc_dict_path) - with open(proc_dict_path, 'r') as infile: + with open(proc_dict_path, 'r', encoding='utf-8') as infile: try: proc_dict = json.load(infile) except json.decoder.JSONDecodeError: @@ -152,6 +157,9 @@ def get_process_dict(proc_dict_location): 'correctly!\nAborting...') sys.exit(3) + if proc_dict: + break + if not proc_dict: LOGGER.error('Process dictionary not found!\nAborting...') sys.exit(3) @@ -159,17 +167,17 @@ def get_process_dict(proc_dict_location): return proc_dict -def get_process_dict_dirs(): +def get_process_dict_dirs() -> list[str]: ''' Get search directories for the process dictionaries ''' - dirs = os.getenv('FCCDICTSDIR') - if not dirs: + dirs_var = os.getenv('FCCDICTSDIR') + if dirs_var is None: LOGGER.error('Environment variable FCCDICTSDIR not defined!\n' 'Was the setup.sh file sourced properly?\n' 'Aborting...') sys.exit(3) - dirs = dirs.split(':') - dirs = [d for d in dirs if d] + dirs = dirs_var.split(':') + dirs[:] = [d for d in dirs if d] return dirs diff --git a/python/process.pyi b/python/process.pyi new file mode 100644 index 0000000000..b3b12a976b --- /dev/null +++ b/python/process.pyi @@ -0,0 +1,12 @@ +# generated with `stubgen process.py` + +import logging + +LOGGER: logging.Logger + +def get_entries(inpath: str) -> int: ... +def get_process_info(process: str, prod_tag: str, input_dir: str) -> tuple[list[str], list[int]]: ... +def get_process_info_files(process: str, input_dir: str) -> tuple[list[str], list[int]]: ... +def get_process_info_yaml(process: str, prod_tag: str) -> tuple[list[str], list[int]]: ... +def get_process_dict(proc_dict_location: str) -> dict: ... +def get_process_dict_dirs() -> list[str]: ... diff --git a/python/run_analysis.py b/python/run_analysis.py new file mode 100644 index 0000000000..9ac5355249 --- /dev/null +++ b/python/run_analysis.py @@ -0,0 +1,924 @@ +''' +Run regular stage of an analysis +''' + +import os +import sys +import time +import json +import shutil +import logging +import subprocess +import importlib.util +import datetime +import numpy as np + +import ROOT # type: ignore +from anascript import get_element, get_element_dict +from process import get_process_info, get_process_dict + +LOGGER = logging.getLogger('FCCAnalyses.run') + +ROOT.gROOT.SetBatch(True) + + +# _____________________________________________________________________________ +def determine_os(local_dir: str) -> str | None: + ''' + Determines platform on which FCCAnalyses was compiled + ''' + cmake_config_path = local_dir + '/build/CMakeFiles/CMakeConfigureLog.yaml' + if not os.path.isfile(cmake_config_path): + LOGGER.warning('CMake configuration file was not found!\n' + 'Was FCCAnalyses properly build?') + return None + + with open(cmake_config_path, 'r', encoding='utf-8') as cmake_config_file: + cmake_config = cmake_config_file.read() + if 'centos7' in cmake_config: + return 'centos7' + if 'almalinux9' in cmake_config: + return 'almalinux9' + + return None + + +# _____________________________________________________________________________ +def create_condor_config(log_dir: str, + process_name: str, + build_os: str | None, + rdf_module, + subjob_scripts: list[str]) -> str: + ''' + Creates contents of condor configuration file. 
+ ''' + cfg = 'executable = $(filename)\n' + + cfg += f'Log = {log_dir}/condor_job.{process_name}.' + cfg += '$(ClusterId).$(ProcId).log\n' + + cfg += f'Output = {log_dir}/condor_job.{process_name}.' + cfg += '$(ClusterId).$(ProcId).out\n' + + cfg += f'Error = {log_dir}/condor_job.{process_name}.' + cfg += '$(ClusterId).$(ProcId).error\n' + + cfg += 'getenv = False\n' + + cfg += 'environment = "LS_SUBCWD={log_dir}"\n' # not sure + + cfg += 'requirements = ( ' + if build_os == 'centos7': + cfg += '(OpSysAndVer =?= "CentOS7") && ' + if build_os == 'almalinux9': + cfg += '(OpSysAndVer =?= "AlmaLinux9") && ' + if build_os is None: + LOGGER.warning('Submitting jobs to default operating system. There ' + 'may be compatibility issues.') + cfg += '(Machine =!= LastRemoteHost) && (TARGET.has_avx2 =?= True) )\n' + + cfg += 'on_exit_remove = (ExitBySignal == False) && (ExitCode == 0)\n' + + cfg += 'max_retries = 3\n' + + cfg += '+JobFlavour = "%s"\n' % get_element(rdf_module, 'batchQueue') + + cfg += '+AccountingGroup = "%s"\n' % get_element(rdf_module, 'compGroup') + + cfg += 'RequestCpus = %i\n' % get_element(rdf_module, "nCPUS") + + cfg += 'queue filename matching files' + for script in subjob_scripts: + cfg += ' ' + script + cfg += '\n' + + return cfg + + +# _____________________________________________________________________________ +def create_subjob_script(local_dir: str, + rdf_module, + process_name: str, + chunk_num: int, + chunk_list: list[list[str]], + anapath: str) -> str: + ''' + Creates sub-job script to be run. + ''' + + output_dir = get_element(rdf_module, "outputDir") + output_dir_eos = get_element(rdf_module, "outputDirEos") + eos_type = get_element(rdf_module, "eosType") + user_batch_config = get_element(rdf_module, "userBatchConfig") + + scr = '#!/bin/bash\n\n' + scr += 'source ' + local_dir + '/setup.sh\n\n' + + # add userBatchConfig if any + if user_batch_config != '': + if not os.path.isfile(user_batch_config): + LOGGER.warning('userBatchConfig file can\'t be found! Will not ' + 'add it to the default config.') + else: + with open(user_batch_config, 'r', encoding='utf-8') as cfgfile: + for line in cfgfile: + scr += line + '\n' + scr += '\n\n' + + scr += f'mkdir job_{process_name}_chunk_{chunk_num}\n' + scr += f'cd job_{process_name}_chunk_{chunk_num}\n\n' + + if not os.path.isabs(output_dir): + output_path = os.path.join(output_dir, f'chunk_{chunk_num}.root') + else: + output_path = os.path.join(output_dir, process_name, + f'chunk_{chunk_num}.root') + + scr += local_dir + scr += f'/bin/fccanalysis run {anapath} --batch ' + scr += f'--output {output_path} ' + scr += '--files-list' + for file_path in chunk_list[chunk_num]: + scr += f' {file_path}' + scr += '\n\n' + + if not os.path.isabs(output_dir) and output_dir_eos == '': + final_dest = os.path.join(local_dir, output_dir, process_name, + f'chunk_{chunk_num}.root') + scr += f'cp {output_path} {final_dest}\n' + + if output_dir_eos != '': + final_dest = os.path.join(output_dir_eos, + process_name, + f'chunk_{chunk_num}.root') + final_dest = f'root://{eos_type}.cern.ch/' + final_dest + scr += f'xrdcp {output_path} {final_dest}\n' + + return scr + + +# _____________________________________________________________________________ +def get_subfile_list(in_file_list, event_list, fraction): + ''' + Obtain list of files roughly containing the requested fraction of events. 
+ ''' + nevts_total = sum(event_list) + nevts_target = int(nevts_total * fraction) + + if nevts_target <= 0: + LOGGER.error('The reduction fraction %f too stringent, no events ' + 'left!\nAborting...', fraction) + sys.exit(3) + + nevts_real = 0 + out_file_list = [] + for i in enumerate(event_list): + if nevts_real >= nevts_target: + break + nevts_real += event_list[i] + out_file_list.append(in_file_list[i]) + + info_msg = f'Reducing the input file list by fraction "{fraction}" of ' + info_msg += 'total events:\n\t' + info_msg += f'- total number of events: {nevts_total:,}\n\t' + info_msg += f'- targeted number of events: {nevts_target:,}\n\t' + info_msg += '- number of events in the resulting file list: ' + info_msg += f'{nevts_real:,}\n\t' + info_msg += '- number of files after reduction: ' + info_msg += str((len(out_file_list))) + LOGGER.info(info_msg) + + return out_file_list + + +# _____________________________________________________________________________ +def get_chunk_list(file_list: str, chunks: int): + ''' + Get list of input file paths arranged into chunks. + ''' + chunk_list = list(np.array_split(file_list, chunks)) + return [chunk for chunk in chunk_list if chunk.size > 0] + + +# _____________________________________________________________________________ +def save_benchmark(outfile, benchmark): + ''' + Save benchmark results to a JSON file. + ''' + benchmarks = [] + try: + with open(outfile, 'r', encoding='utf-8') as benchin: + benchmarks = json.load(benchin) + except OSError: + pass + + benchmarks = [b for b in benchmarks if b['name'] != benchmark['name']] + benchmarks.append(benchmark) + + with open(outfile, 'w', encoding='utf-8') as benchout: + json.dump(benchmarks, benchout, indent=2) + + +# _____________________________________________________________________________ +def submit_job(cmd: str, max_trials: int) -> bool: + ''' + Submit job to condor, retry `max_trials` times. + ''' + for i in range(max_trials): + with subprocess.Popen(cmd, shell=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + universal_newlines=True) as proc: + (stdout, stderr) = proc.communicate() + + if proc.returncode == 0 and len(stderr) == 0: + LOGGER.info(stdout) + LOGGER.info('GOOD SUBMISSION') + return True + + LOGGER.warning('Error while submitting, retrying...\n ' + 'Trial: %i / %i\n Error: %s', + i, max_trials, stderr) + time.sleep(10) + + LOGGER.error('Failed submitting after: %i trials!', max_trials) + return False + + +# _____________________________________________________________________________ +def initialize(args, rdf_module, anapath: str): + ''' + Common initialization steps. + ''' + + # for convenience and compatibility with user code + ROOT.gInterpreter.Declare("using namespace FCCAnalyses;") + geometry_file = get_element(rdf_module, "geometryFile") + readout_name = get_element(rdf_module, "readoutName") + if geometry_file != "" and readout_name != "": + ROOT.CaloNtupleizer.loadGeometry(geometry_file, readout_name) + + # set multithreading (no MT if number of events is specified) + ncpus = 1 + if args.nevents < 0: + if isinstance(args.ncpus, int) and args.ncpus >= 1: + ncpus = args.ncpus + else: + ncpus = get_element(rdf_module, "nCPUS") + if ncpus < 0: # use all available threads + ROOT.EnableImplicitMT() + ncpus = ROOT.GetThreadPoolSize() + ROOT.ROOT.EnableImplicitMT(ncpus) + ROOT.EnableThreadSafety() + + if ROOT.IsImplicitMTEnabled(): + LOGGER.info('Multithreading enabled. 
Running over %i threads', + ROOT.GetThreadPoolSize()) + else: + LOGGER.info('No multithreading enabled. Running in single thread...') + + # custom header files + include_paths = get_element(rdf_module, "includePaths") + if include_paths: + ROOT.gInterpreter.ProcessLine(".O3") + basepath = os.path.dirname(os.path.abspath(anapath)) + "/" + for path in include_paths: + LOGGER.info('Loading %s...', path) + ROOT.gInterpreter.Declare(f'#include "{basepath}/{path}"') + + # check if analyses plugins need to be loaded before anything + # still in use? + analyses_list = get_element(rdf_module, "analysesList") + if analyses_list and len(analyses_list) > 0: + _ana = [] + for analysis in analyses_list: + LOGGER.info('Load cxx analyzers from %s...', analysis) + if analysis.startswith('libFCCAnalysis_'): + ROOT.gSystem.Load(analysis) + else: + ROOT.gSystem.Load(f'libFCCAnalysis_{analysis}') + if not hasattr(ROOT, analysis): + ROOT.error('Analysis %s not properly loaded!\nAborting...', + analysis) + sys.exit(3) + _ana.append(getattr(ROOT, analysis).dictionary) + + +# _____________________________________________________________________________ +def run_rdf(rdf_module, + input_list: list[str], + out_file: str, + args) -> int: + ''' + Create RDataFrame and snapshot it. + ''' + df = ROOT.RDataFrame("events", input_list) + + # limit number of events processed + if args.nevents > 0: + df = df.Range(0, args.nevents) + + try: + df2 = get_element(rdf_module.RDFanalysis, "analysers")(df) + + branch_list = ROOT.vector('string')() + blist = get_element(rdf_module.RDFanalysis, "output")() + for bname in blist: + branch_list.push_back(bname) + + df2.Snapshot("events", out_file, branch_list) + except Exception as excp: + LOGGER.error('During the execution of the analysis file exception ' + 'occurred:\n%s', excp) + sys.exit(3) + + return df2.Count() + + +# _____________________________________________________________________________ +def send_to_batch(rdf_module, chunk_list, process, anapath: str): + ''' + Send jobs to HTCondor batch system. 
+ ''' + local_dir = os.environ['LOCAL_DIR'] + current_date = datetime.datetime.fromtimestamp( + datetime.datetime.now().timestamp()).strftime('%Y-%m-%d_%H-%M-%S') + log_dir = os.path.join(local_dir, 'BatchOutputs', current_date, process) + if not os.path.exists(log_dir): + os.system(f'mkdir -p {log_dir}') + + # Making sure the FCCAnalyses libraries are compiled and installed + try: + subprocess.check_output(['make', 'install'], + cwd=local_dir+'/build', + stderr=subprocess.DEVNULL + ) + except subprocess.CalledProcessError: + LOGGER.error('The FCCanalyses libraries are not properly build and ' + 'installed!\nAborting job submission...') + sys.exit(3) + + subjob_scripts = [] + for ch in range(len(chunk_list)): + subjob_script_path = os.path.join(log_dir, + f'job_{process}_chunk_{ch}.sh') + subjob_scripts.append(subjob_script_path) + + for i in range(3): + try: + with open(subjob_script_path, 'w', encoding='utf-8') as ofile: + subjob_script = create_subjob_script(local_dir, + rdf_module, + process, + ch, + chunk_list, + anapath) + ofile.write(subjob_script) + except IOError as e: + if i < 2: + LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) + else: + LOGGER.error('I/O error(%i): %s', e.errno, e.strerror) + sys.exit(3) + else: + break + time.sleep(10) + subprocess.getstatusoutput(f'chmod 777 {subjob_script_path}') + + LOGGER.debug('Sub-job scripts to be run:\n - %s', + '\n - '.join(subjob_scripts)) + + condor_config_path = f'{log_dir}/job_desc_{process}.cfg' + + for i in range(3): + try: + with open(condor_config_path, 'w', encoding='utf-8') as cfgfile: + condor_config = create_condor_config(log_dir, + process, + determine_os(local_dir), + rdf_module, + subjob_scripts) + cfgfile.write(condor_config) + except IOError as e: + LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) + if i == 2: + sys.exit(3) + else: + break + time.sleep(10) + subprocess.getstatusoutput(f'chmod 777 {condor_config_path}') + + batch_cmd = f'condor_submit {condor_config_path}' + LOGGER.info('Batch command:\n %s', batch_cmd) + success = submit_job(batch_cmd, 10) + if not success: + sys.exit(3) + + +# _____________________________________________________________________________ +def apply_filepath_rewrites(filepath: str) -> str: + ''' + Apply path rewrites if applicable. + ''' + # Stripping leading and trailing white spaces + filepath_stripped = filepath.strip() + # Stripping leading and trailing slashes + filepath_stripped = filepath_stripped.strip('/') + + # Splitting the path along slashes + filepath_splitted = filepath_stripped.split('/') + + if len(filepath_splitted) > 1 and filepath_splitted[0] == 'eos': + if filepath_splitted[1] == 'experiment': + filepath = 'root://eospublic.cern.ch//' + filepath_stripped + elif filepath_splitted[1] == 'user': + filepath = 'root://eosuser.cern.ch//' + filepath_stripped + elif 'home-' in filepath_splitted[1]: + filepath = 'root://eosuser.cern.ch//eos/user/' + \ + filepath_stripped.replace('eos/home-', '') + else: + LOGGER.warning('Unknown EOS path type!\nPlease check with the ' + 'developers as this might impact performance of ' + 'the analysis.') + return filepath + + +# _____________________________________________________________________________ +def run_local(rdf_module, infile_list, args): + ''' + Run analysis locally. 
+ ''' + # Create list of files to be processed + info_msg = 'Creating dataframe object from files:\n\t' + file_list = ROOT.vector('string')() + # Amount of events processed in previous stage (= 0 if it is the first + # stage) + nevents_orig = 0 + # The amount of events in the input file(s) + nevents_local = 0 + for filepath in infile_list: + + filepath = apply_filepath_rewrites(filepath) + + file_list.push_back(filepath) + info_msg += f'- {filepath}\t\n' + infile = ROOT.TFile.Open(filepath, 'READ') + try: + nevents_orig += infile.Get('eventsProcessed').GetVal() + except AttributeError: + pass + + try: + nevents_local += infile.Get("events").GetEntries() + except AttributeError: + LOGGER.error('Input file:\n%s\nis missing events TTree!\n' + 'Aborting...', filepath) + infile.Close() + sys.exit(3) + infile.Close() + + LOGGER.info(info_msg) + + # Adjust number of events in case --nevents was specified + if args.nevents > 0 and args.nevents < nevents_local: + nevents_local = args.nevents + + if nevents_orig > 0: + LOGGER.info('Number of events:\n\t- original: %s\n\t- local: %s', + f'{nevents_orig:,}', f'{nevents_local:,}') + else: + LOGGER.info('Number of local events: %s', f'{nevents_local:,}') + + output_dir = get_element(rdf_module, "outputDir") + if not args.batch: + if os.path.isabs(args.output): + LOGGER.warning('Provided output path is absolute, "outputDir" ' + 'from analysis script will be ignored!') + outfile_path = os.path.join(output_dir, args.output) + else: + outfile_path = args.output + LOGGER.info('Output file path:\n%s', outfile_path) + + # Run RDF + start_time = time.time() + outn = run_rdf(rdf_module, file_list, outfile_path, args) + outn = outn.GetValue() + + with ROOT.TFile(outfile_path, 'update') as outfile: + param = ROOT.TParameter(int)( + 'eventsProcessed', + nevents_orig if nevents_orig != 0 else nevents_local) + param.Write() + outfile.Write() + + elapsed_time = time.time() - start_time + info_msg = f"{' SUMMARY ':=^80}\n" + info_msg += 'Elapsed time (H:M:S): ' + info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time)) + info_msg += '\nEvents processed/second: ' + info_msg += f'{int(nevents_local/elapsed_time):,}' + info_msg += f'\nTotal events processed: {int(nevents_local):,}' + info_msg += f'\nNo. 
result events: {int(outn):,}' + if nevents_local > 0: + info_msg += f'\nReduction factor local: {outn/nevents_local}' + if nevents_orig > 0: + info_msg += f'\nReduction factor total: {outn/nevents_orig}' + info_msg += '\n' + info_msg += 80 * '=' + info_msg += '\n' + LOGGER.info(info_msg) + + if args.bench: + analysis_name = get_element(rdf_module, 'analysisName') + if not analysis_name: + analysis_name = args.anascript_path + + bench_time = {} + bench_time['name'] = 'Time spent running the analysis: ' + bench_time['name'] += analysis_name + bench_time['unit'] = 'Seconds' + bench_time['value'] = elapsed_time + bench_time['range'] = 10 + bench_time['extra'] = 'Analysis path: ' + args.anascript_path + save_benchmark('benchmarks_smaller_better.json', bench_time) + + bench_evt_per_sec = {} + bench_evt_per_sec['name'] = 'Events processed per second: ' + bench_evt_per_sec['name'] += analysis_name + bench_evt_per_sec['unit'] = 'Evt/s' + bench_evt_per_sec['value'] = nevents_local / elapsed_time + bench_time['range'] = 1000 + bench_time['extra'] = 'Analysis path: ' + args.anascript_path + save_benchmark('benchmarks_bigger_better.json', bench_evt_per_sec) + + +# _____________________________________________________________________________ +def run_stages(args, rdf_module, anapath): + ''' + Run regular stage. + ''' + + # Set ncpus, load header files, custom dicts, ... + initialize(args, rdf_module, anapath) + + # Check if outputDir exist and if not create it + output_dir = get_element(rdf_module, "outputDir") + if not os.path.exists(output_dir) and output_dir: + os.system(f'mkdir -p {output_dir}') + + # Check if outputDir exist and if not create it + output_dir_eos = get_element(rdf_module, "outputDirEos") + if not os.path.exists(output_dir_eos) and output_dir_eos: + os.system(f'mkdir -p {output_dir_eos}') + + # Check if test mode is specified, and if so run the analysis on it (this + # will exit after) + if args.test: + LOGGER.info('Running over test file...') + testfile_path = get_element(rdf_module, "testFile") + directory, _ = os.path.split(args.output) + if directory: + os.system(f'mkdir -p {directory}') + run_local(rdf_module, [testfile_path], args) + sys.exit(0) + + # Check if files are specified, and if so run the analysis on it/them (this + # will exit after) + if len(args.files_list) > 0: + LOGGER.info('Running over files provided in command line argument...') + directory, _ = os.path.split(args.output) + if directory: + os.system(f'mkdir -p {directory}') + run_local(rdf_module, args.files_list, args) + sys.exit(0) + + # Check if batch mode is available + run_batch = get_element(rdf_module, 'runBatch') + if run_batch and shutil.which('condor_q') is None: + LOGGER.error('HTCondor tools can\'t be found!\nAborting...') + sys.exit(3) + + # Check if the process list is specified + process_list = get_element(rdf_module, 'processList') + + for process_name in process_list: + file_list, event_list = get_process_info( + process_name, + get_element(rdf_module, "prodTag"), + get_element(rdf_module, "inputDir")) + + if len(file_list) <= 0: + LOGGER.error('No files to process!\nAborting...') + sys.exit(3) + + # Determine the fraction of the input to be processed + fraction = 1 + if get_element_dict(process_list[process_name], 'fraction'): + fraction = get_element_dict(process_list[process_name], 'fraction') + # Put together output path + output_stem = process_name + if get_element_dict(process_list[process_name], 'output'): + output_stem = get_element_dict(process_list[process_name], + 'output') + # 
Determine the number of chunks the output will be split into + chunks = 1 + if get_element_dict(process_list[process_name], 'chunks'): + chunks = get_element_dict(process_list[process_name], 'chunks') + + info_msg = f'Adding process "{process_name}" with:' + if fraction < 1: + info_msg += f'\n\t- fraction: {fraction}' + info_msg += f'\n\t- number of files: {len(file_list):,}' + info_msg += f'\n\t- output stem: {output_stem}' + if chunks > 1: + info_msg += f'\n\t- number of chunks: {chunks}' + + if fraction < 1: + file_list = get_subfile_list(file_list, event_list, fraction) + + chunk_list = [file_list] + if chunks > 1: + chunk_list = get_chunk_list(file_list, chunks) + LOGGER.info('Number of the output files: %s', f'{len(chunk_list):,}') + + # Create directory if more than 1 chunk + if chunks > 1: + output_directory = os.path.join(output_dir, output_stem) + + if not os.path.exists(output_directory): + os.system(f'mkdir -p {output_directory}') + + if run_batch: + # Sending to the batch system + LOGGER.info('Running on the batch...') + if len(chunk_list) == 1: + LOGGER.warning('\033[4m\033[1m\033[91mRunning on batch with ' + 'only one chunk might not be optimal\033[0m') + + send_to_batch(rdf_module, chunk_list, process_name, anapath) + + else: + # Running locally + LOGGER.info('Running locally...') + if len(chunk_list) == 1: + args.output = f'{output_stem}.root' + run_local(rdf_module, chunk_list[0], args) + else: + for index, chunk in enumerate(chunk_list): + args.output = f'{output_stem}/chunk{index}.root' + run_local(rdf_module, chunk, args) + + +def run_histmaker(args, rdf_module, anapath): + ''' + Run the analysis using histmaker (all stages integrated into one). + ''' + + # set ncpus, load header files, custom dicts, ... + initialize(args, rdf_module, anapath) + + # load process dictionary + proc_dict_location = get_element(rdf_module, "procDict", True) + if not proc_dict_location: + LOGGER.error('Location of the procDict not provided.\nAborting...') + sys.exit(3) + + proc_dict = get_process_dict(proc_dict_location) + + # check if outputDir exist and if not create it + output_dir = get_element(rdf_module, "outputDir") + if not os.path.exists(output_dir) and output_dir != '': + os.system(f'mkdir -p {output_dir}') + + do_scale = get_element(rdf_module, "doScale", True) + int_lumi = get_element(rdf_module, "intLumi", True) + + # check if the process list is specified, and create graphs for them + process_list = get_element(rdf_module, "processList") + graph_function = getattr(rdf_module, "build_graph") + results = [] # all the histograms + hweights = [] # all the weights + evtcounts = [] # event count of the input file + # number of events processed per process, in a potential previous step + events_processed_dict = {} + for process in process_list: + file_list, event_list = get_process_info( + process, + get_element(rdf_module, "prodTag"), + get_element(rdf_module, "inputDir")) + if len(file_list) == 0: + LOGGER.error('No files to process!\nAborting...') + sys.exit(3) + fraction = 1 + output = process + chunks = 1 + try: + if get_element_dict(process_list[process], 'fraction') is not None: + fraction = get_element_dict(process_list[process], 'fraction') + if get_element_dict(process_list[process], 'output') is not None: + output = get_element_dict(process_list[process], 'output') + if get_element_dict(process_list[process], 'chunks') is not None: + chunks = get_element_dict(process_list[process], 'chunks') + except TypeError: + LOGGER.warning('No values set for process %s will use 
default ' + 'values!', process) + if fraction < 1: + file_list = get_subfile_list(file_list, event_list, fraction) + + # get the number of events processed, in a potential previous step + file_list_root = ROOT.vector('string')() + # amount of events processed in previous stage (= 0 if it is the first + # stage) + nevents_meta = 0 + for file_name in file_list: + file_name = apply_filepath_rewrites(file_name) + file_list_root.push_back(file_name) + # Skip check for processed events in case of first stage + if get_element(rdf_module, "prodTag") is None: + infile = ROOT.TFile(str(file_name), 'READ') + for key in infile.GetListOfKeys(): + if 'eventsProcessed' == key.GetName(): + nevents_meta += infile.eventsProcessed.GetVal() + break + infile.Close() + if args.test: + break + events_processed_dict[process] = nevents_meta + info_msg = f'Add process "{process}" with:' + info_msg += f'\n\tfraction = {fraction}' + info_msg += f'\n\tnFiles = {len(file_list_root):,}' + info_msg += f'\n\toutput = {output}\n\tchunks = {chunks}' + LOGGER.info(info_msg) + + df = ROOT.ROOT.RDataFrame("events", file_list_root) + evtcount = df.Count() + res, hweight = graph_function(df, process) + results.append(res) + hweights.append(hweight) + evtcounts.append(evtcount) + + LOGGER.info('Starting the event loop...') + start_time = time.time() + ROOT.ROOT.RDF.RunGraphs(evtcounts) + LOGGER.info('Event loop done!') + elapsed_time = time.time() - start_time + + LOGGER.info('Writing out output files...') + nevents_tot = 0 + for process, res, hweight, evtcount in zip(process_list, + results, + hweights, + evtcounts): + # get the cross-sections etc. First try locally, then the procDict + if 'crossSection' in process_list[process]: + cross_section = process_list[process]['crossSection'] + elif process in proc_dict and 'crossSection' in proc_dict[process]: + cross_section = proc_dict[process]['crossSection'] + else: + LOGGER.warning('Can\'t find cross-section for process %s in ' + 'processList or procDict!\nUsing default value ' + 'of 1', process) + cross_section = 1 + + if 'kfactor' in process_list[process]: + kfactor = process_list[process]['kfactor'] + elif process in proc_dict and 'kfactor' in proc_dict[process]: + kfactor = proc_dict[process]['kfactor'] + else: + kfactor = 1 + + if 'matchingEfficiency' in process_list[process]: + matching_efficiency = process_list[process]['matchingEfficiency'] + elif process in proc_dict \ + and 'matchingEfficiency' in proc_dict[process]: + matching_efficiency = proc_dict[process]['matchingEfficiency'] + else: + matching_efficiency = 1 + + events_processed = events_processed_dict[process] \ + if events_processed_dict[process] != 0 else evtcount.GetValue() + scale = cross_section*kfactor*matching_efficiency/events_processed + + nevents_tot += evtcount.GetValue() + + hists_to_write = {} + for r in res: + hist = r.GetValue() + hname = hist.GetName() + # merge histograms in case histogram exists + if hist.GetName() in hists_to_write: + hists_to_write[hname].Add(hist) + else: + hists_to_write[hname] = hist + + LOGGER.info('Writing out process %s, nEvents processed %s', + process, f'{evtcount.GetValue():,}') + with ROOT.TFile(f'{output_dir}/{process}.root', 'RECREATE'): + for hist in hists_to_write.values(): + if do_scale: + hist.Scale(scale * int_lumi) + hist.Write() + + # write all meta info to the output file + p = ROOT.TParameter(int)("eventsProcessed", events_processed) + p.Write() + p = ROOT.TParameter(float)("sumOfWeights", hweight.GetValue()) + p.Write() + p = 
ROOT.TParameter(float)("intLumi", int_lumi) + p.Write() + p = ROOT.TParameter(float)("crossSection", cross_section) + p.Write() + p = ROOT.TParameter(float)("kfactor", kfactor) + p.Write() + p = ROOT.TParameter(float)("matchingEfficiency", + matching_efficiency) + p.Write() + + info_msg = f"{' SUMMARY ':=^80}\n" + info_msg += 'Elapsed time (H:M:S): ' + info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time)) + info_msg += '\nEvents processed/second: ' + info_msg += f'{int(nevents_tot/elapsed_time):,}' + info_msg += f'\nTotal events processed: {nevents_tot:,}' + info_msg += '\n' + info_msg += 80 * '=' + info_msg += '\n' + LOGGER.info(info_msg) + + +def run(parser): + ''' + Set things in motion. + ''' + + args, _ = parser.parse_known_args() + + if not hasattr(args, 'command'): + LOGGER.error('Error occurred during subcommand routing!\nAborting...') + sys.exit(3) + + if args.command != 'run': + LOGGER.error('Unknow sub-command "%s"!\nAborting...') + sys.exit(3) + + # Check that the analysis file exists + anapath = args.anascript_path + if not os.path.isfile(anapath): + LOGGER.error('Analysis script %s not found!\nAborting...', + anapath) + sys.exit(3) + + # Set verbosity level of the RDataFrame + if args.verbose: + # ROOT.Experimental.ELogLevel.kInfo verbosity level is more + # equivalent to DEBUG in other log systems + LOGGER.debug('Setting verbosity level "kInfo" for RDataFrame...') + verbosity = ROOT.Experimental.RLogScopedVerbosity( + ROOT.Detail.RDF.RDFLogChannel(), + ROOT.Experimental.ELogLevel.kInfo) + LOGGER.debug(verbosity) + if args.more_verbose: + LOGGER.debug('Setting verbosity level "kDebug" for RDataFrame...') + verbosity = ROOT.Experimental.RLogScopedVerbosity( + ROOT.Detail.RDF.RDFLogChannel(), + ROOT.Experimental.ELogLevel.kDebug) + LOGGER.debug(verbosity) + if args.most_verbose: + LOGGER.debug('Setting verbosity level "kDebug+10" for ' + 'RDataFrame...') + verbosity = ROOT.Experimental.RLogScopedVerbosity( + ROOT.Detail.RDF.RDFLogChannel(), + ROOT.Experimental.ELogLevel.kDebug+10) + LOGGER.debug(verbosity) + + # Load pre compiled analyzers + LOGGER.info('Loading analyzers from libFCCAnalyses...') + ROOT.gSystem.Load("libFCCAnalyses") + # Is this still needed?? 
01/04/2022 still to be the case + fcc_loaded = ROOT.dummyLoader() + if fcc_loaded: + LOGGER.debug('Succesfuly loaded main FCCanalyses analyzers.') + + # Load the analysis script as a module + anapath = os.path.abspath(anapath) + LOGGER.info('Loading analysis file:\n%s', anapath) + rdf_spec = importlib.util.spec_from_file_location("rdfanalysis", + anapath) + rdf_module = importlib.util.module_from_spec(rdf_spec) + rdf_spec.loader.exec_module(rdf_module) + + if hasattr(rdf_module, "build_graph") and \ + hasattr(rdf_module, "RDFanalysis"): + LOGGER.error('Analysis file ambiguous!\nBoth "RDFanalysis" ' + 'class and "build_graph" function are defined.') + sys.exit(3) + elif hasattr(rdf_module, "build_graph") and \ + not hasattr(rdf_module, "RDFanalysis"): + run_histmaker(args, rdf_module, anapath) + elif not hasattr(rdf_module, "build_graph") and \ + hasattr(rdf_module, "RDFanalysis"): + run_stages(args, rdf_module, anapath) + else: + LOGGER.error('Analysis file does not contain required ' + 'objects!\nProvide either "RDFanalysis" class or ' + '"build_graph" function.') + sys.exit(3) diff --git a/python/run_final_analysis.py b/python/run_final_analysis.py new file mode 100644 index 0000000000..d3874eeb7b --- /dev/null +++ b/python/run_final_analysis.py @@ -0,0 +1,496 @@ +''' +Run final stage of an analysis +''' + +import os +import sys +import time +import glob +import logging +import importlib.util + +import ROOT # type: ignore +from anascript import get_element, get_element_dict +from process import get_process_dict + +LOGGER = logging.getLogger('FCCAnalyses.run_final') + +ROOT.gROOT.SetBatch(True) + + +# _____________________________________________________________________________ +def get_entries(infilepath: str) -> tuple[int, int]: + ''' + Get number of original entries and number of actual entries in the file + ''' + events_processed = 0 + events_in_ttree = 0 + + with ROOT.TFile(infilepath, 'READ') as infile: + try: + events_processed = infile.Get('eventsProcessed').GetVal() + except AttributeError: + LOGGER.warning('Input file is missing information about ' + 'original number of events!') + + try: + events_in_ttree = infile.Get("events").GetEntries() + except AttributeError: + LOGGER.error('Input file is missing "events" TTree!\nAborting...') + sys.exit(3) + + return events_processed, events_in_ttree + + +# _____________________________________________________________________________ +def testfile(f: str) -> bool: + ''' + Test input file from previous stages + ''' + with ROOT.TFile(f, 'READ') as infile: + tt = None + try: + tt = infile.Get("events") + if tt is None: + LOGGER.warning('File does not contains events, selection was ' + 'too tight, skipping it: %s', f) + return False + except IOError as e: + LOGGER.warning('I/O error(%i): %s', e.errno, e.strerror) + return False + except ValueError: + LOGGER.warning('Could read the file') + return False + except: + LOGGER.warning('Unexpected error: %s\nfile ===%s=== must be ' + 'deleted', + sys.exc_info()[0], f) + return False + return True + + +# __________________________________________________________ +def run(rdf_module): + ''' + Main loop. 
+
+
+# __________________________________________________________
+def run(rdf_module):
+    '''
+    Main event loop of the final analysis stage.
+    '''
+    proc_dict_location = get_element(rdf_module, "procDict", True)
+    if not proc_dict_location:
+        LOGGER.error(
+            'Location of the process dictionary not provided!\nAborting...')
+        sys.exit(3)
+
+    process_dict = get_process_dict(proc_dict_location)
+
+    process_dict_additions = get_element(rdf_module, "procDictAdd", True)
+    for addition in process_dict_additions:
+        if get_element_dict(process_dict, addition) is None:
+            process_dict[addition] = process_dict_additions[addition]
+        else:
+            LOGGER.debug('Process "%s" already in the dictionary. '
+                         'Skipping it...', addition)
+
+    ROOT.ROOT.EnableImplicitMT(get_element(rdf_module, "nCPUS", True))
+
+    nevents_real = 0
+    start_time = time.time()
+
+    process_events = {}
+    events_ttree = {}
+    file_list = {}
+    save_tab = []
+    efficiency_list = []
+
+    input_dir = get_element(rdf_module, "inputDir", True)
+    if not input_dir:
+        LOGGER.error('The inputDir variable is mandatory for the final stage '
+                     'of the analysis!\nAborting...')
+        sys.exit(3)
+
+    if input_dir[-1] != "/":
+        input_dir += "/"
+
+    output_dir = get_element(rdf_module, "outputDir", True)
+    if output_dir != "":
+        if output_dir[-1] != "/":
+            output_dir += "/"
+
+    if not os.path.exists(output_dir) and output_dir != '':
+        os.system(f'mkdir -p {output_dir}')
+
+    cut_list = get_element(rdf_module, "cutList", True)
+    length_cuts_names = max(len(cut) for cut in cut_list)
+    cut_labels = get_element(rdf_module, "cutLabels", True)
+
+    # Save a table in a separate TeX file
+    save_tabular = get_element(rdf_module, "saveTabular", True)
+    if save_tabular:
+        # Option to rewrite the cuts in a nicer way for the table;
+        # otherwise take them from the cutList
+        if cut_labels:
+            cut_names = list(cut_labels.values())
+        else:
+            cut_names = list(cut_list)
+
+        cut_names.insert(0, ' ')
+        save_tab.append(cut_names)
+        efficiency_list.append(cut_names)
+
+    process_list = get_element(rdf_module, "processList", True)
+    for process_name in process_list:
+        process_events[process_name] = 0
+        events_ttree[process_name] = 0
+        file_list[process_name] = ROOT.vector('string')()
+
+        infilepath = input_dir + process_name + '.root'  # input file
+        if not os.path.isfile(infilepath):
+            LOGGER.debug('File %s does not exist!\nTrying if it is a '
+                         'directory as it might have been processed in batch.',
+                         infilepath)
+        else:
+            LOGGER.info('Open file:\n\t%s', infilepath)
+            process_events[process_name], events_ttree[process_name] = \
+                get_entries(infilepath)
+            file_list[process_name].push_back(infilepath)
+
+        indirpath = input_dir + process_name
+        if os.path.isdir(indirpath):
+            info_msg = f'Open directory {indirpath}'
+            flist = glob.glob(indirpath + '/chunk*.root')
+            for filepath in flist:
+                info_msg += '\n\t' + filepath
+                chunk_process_events, chunk_events_ttree = \
+                    get_entries(filepath)
+                process_events[process_name] += chunk_process_events
+                events_ttree[process_name] += chunk_events_ttree
+                file_list[process_name].push_back(filepath)
+            LOGGER.info(info_msg)
+
+    info_msg = 'Processed events:'
+    for process_name, n_events in process_events.items():
+        info_msg += f'\n\t- {process_name}: {n_events:,}'
+    LOGGER.info(info_msg)
+    info_msg = 'Events in the TTree:'
+    for process_name, n_events in events_ttree.items():
+        info_msg += f'\n\t- {process_name}: {n_events:,}'
+    LOGGER.info(info_msg)
+
+    histo_list = get_element(rdf_module, "histoList", True)
+    do_scale = get_element(rdf_module, "doScale", True)
+    int_lumi = get_element(rdf_module, "intLumi", True)
+
+    do_tree = get_element(rdf_module, "doTree", True)
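The loop below consumes cutList as a mapping from cut name to an RDataFrame filter expression, and histoList entries in one of two shapes: a 1D definition keyed by "name", or an n-dimensional one keyed by "cols"/"bins". A sketch of what these definitions look like in an analysis script; the column names and bin ranges are illustrative:

    # Cuts: name -> filter expression passed verbatim to df.Filter()
    cutList = {
        'sel0': 'Zcand_m > 70 && Zcand_m < 110',
        'sel1': 'Zcand_m > 80 && Zcand_m < 100 && Zcand_recoil_m > 120',
    }

    # Histograms: a 1D definition with "name"/"title"/"bin"/"xmin"/"xmax" ...
    histoList = {
        'mz': {'name': 'Zcand_m', 'title': 'm_{Z} [GeV]',
               'bin': 125, 'xmin': 0, 'xmax': 250},
        # ... or an n-dimensional one with "cols" and one bin config per column
        'mz_vs_recoil': {'cols': ['Zcand_m', 'Zcand_recoil_m'],
                         'bins': [(125, 0, 250), (125, 0, 250)]},
    }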
+    for process_name in process_list:
+        LOGGER.info('Running over process: %s', process_name)
+
+        if process_events[process_name] == 0:
+            LOGGER.error('Can\'t scale histograms, the number of processed '
+                         'events for the process "%s" seems to be zero!',
+                         process_name)
+            sys.exit(3)
+
+        df = ROOT.ROOT.RDataFrame("events", file_list[process_name])
+        define_list = get_element(rdf_module, "defineList", True)
+        if len(define_list) > 0:
+            LOGGER.info('Registering extra DataFrame defines...')
+            for define in define_list:
+                df = df.Define(define, define_list[define])
+
+        fout_list = []
+        histos_list = []
+        tdf_list = []
+        count_list = []
+        cuts_list = []
+        cuts_list.append(process_name)
+        eff_list = []
+        eff_list.append(process_name)
+
+        # Define all histos, snapshots, etc...
+        LOGGER.info('Defining snapshots and histograms')
+        for cut in cut_list:
+            # Output file for the tree
+            fout = output_dir + process_name + '_' + cut + '.root'
+            fout_list.append(fout)
+
+            df_cut = df.Filter(cut_list[cut])
+            count_list.append(df_cut.Count())
+
+            histos = []
+
+            for v in histo_list:
+                # Default 1D histogram
+                if "name" in histo_list[v]:
+                    model = ROOT.RDF.TH1DModel(
+                        v,
+                        f';{histo_list[v]["title"]};',
+                        histo_list[v]["bin"],
+                        histo_list[v]["xmin"],
+                        histo_list[v]["xmax"])
+                    histos.append(df_cut.Histo1D(model, histo_list[v]["name"]))
+                # Multi-dimensional histogram (1, 2 or 3D)
+                elif "cols" in histo_list[v]:
+                    cols = histo_list[v]['cols']
+                    bins = histo_list[v]['bins']
+                    bins_unpacked = tuple(i for sub in bins for i in sub)
+                    if len(bins) != len(cols):
+                        LOGGER.error('Number of columns must be equal to '
+                                     'the number of bin configurations!'
+                                     '\nAborting...')
+                        sys.exit(3)
+                    if len(cols) == 1:
+                        histos.append(df_cut.Histo1D((v, "", *bins_unpacked),
+                                                     cols[0]))
+                    elif len(cols) == 2:
+                        histos.append(df_cut.Histo2D((v, "", *bins_unpacked),
+                                                     cols[0],
+                                                     cols[1]))
+                    elif len(cols) == 3:
+                        histos.append(df_cut.Histo3D((v, "", *bins_unpacked),
+                                                     cols[0],
+                                                     cols[1],
+                                                     cols[2]))
+                    else:
+                        LOGGER.error('Only 1, 2 or 3D histograms supported.')
+                        sys.exit(3)
+                else:
+                    LOGGER.error('Error parsing the histogram config. Provide '
+                                 'either "name" or "cols".')
+                    sys.exit(3)
+            histos_list.append(histos)
+
+            if do_tree:
+                opts = ROOT.RDF.RSnapshotOptions()
+                opts.fLazy = True
+                try:
+                    snapshot_tdf = df_cut.Snapshot("events", fout, "", opts)
+                except Exception as excp:
+                    LOGGER.error('During the execution of the final stage '
+                                 'an exception occurred:\n%s', excp)
+                    sys.exit(3)
+
+                # Needed to avoid the Python garbage collector messing around
+                # with the snapshot
+                tdf_list.append(snapshot_tdf)
+
+        # Now perform the loop and evaluate everything at once.
+        LOGGER.info('Evaluating...')
+        all_events = df.Count().GetValue()
+        LOGGER.info('Done')
+
+        nevents_real += all_events
+        uncertainty = ROOT.Math.sqrt(all_events)
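When doScale is enabled, the block below normalises raw counts to the expected yield at the requested integrated luminosity, using the per-process metadata from the process dictionary: expected events = crossSection * kfactor * matchingEfficiency * intLumi * N_raw / N_generated. A worked example with made-up numbers, following the same formula:

    # Illustrative values only, not taken from any real process dictionary
    cross_section = 5.3           # pb
    kfactor = 1.0
    matching_efficiency = 1.0
    int_lumi = 5.0e6              # pb^-1, i.e. 5 ab^-1
    n_generated = 1_000_000       # events generated for this process
    n_raw = 250_000               # events surviving in the input files

    # Per-event weight, then the scaled yield
    weight = (cross_section * kfactor * matching_efficiency
              * int_lumi / n_generated)
    scaled_events = n_raw * weight
    print(f'{scaled_events:,.0f} expected events')  # 6,625,000 expected events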
+        if do_scale:
+            all_events = all_events * 1. * \
+                process_dict[process_name]["crossSection"] * \
+                process_dict[process_name]["kfactor"] * \
+                process_dict[process_name]["matchingEfficiency"] * \
+                int_lumi / process_events[process_name]
+            uncertainty = ROOT.Math.sqrt(all_events) * \
+                process_dict[process_name]["crossSection"] * \
+                process_dict[process_name]["kfactor"] * \
+                process_dict[process_name]["matchingEfficiency"] * \
+                int_lumi / process_events[process_name]
+            LOGGER.info('Printing scaled number of events!')
+
+        cfn_width = 16 + length_cuts_names  # Cutflow name width
+        info_msg = 'Cutflow:'
+        info_msg += f'\n\t{"All events":{cfn_width}} : {all_events:,}'
+
+        if save_tabular:
+            # Scientific notation - recommended for backgrounds
+            cuts_list.append(f'{all_events:.2e} $\\pm$ {uncertainty:.2e}')
+            # Float notation - recommended for signals with few events
+            # cuts_list.append(f'{all_events:.3f} $\\pm$ {uncertainty:.3f}')
+            # eff_list.append(1.)  # start with 100% efficiency
+
+        for i, cut in enumerate(cut_list):
+            nevents_this_cut = count_list[i].GetValue()
+            nevents_this_cut_raw = nevents_this_cut
+            uncertainty = ROOT.Math.sqrt(nevents_this_cut_raw)
+            if do_scale:
+                nevents_this_cut = \
+                    nevents_this_cut * 1. * \
+                    process_dict[process_name]["crossSection"] * \
+                    process_dict[process_name]["kfactor"] * \
+                    process_dict[process_name]["matchingEfficiency"] * \
+                    int_lumi / process_events[process_name]
+                uncertainty = \
+                    ROOT.Math.sqrt(nevents_this_cut_raw) * \
+                    process_dict[process_name]["crossSection"] * \
+                    process_dict[process_name]["kfactor"] * \
+                    process_dict[process_name]["matchingEfficiency"] * \
+                    int_lumi / process_events[process_name]
+            info_msg += f'\n\t{"After selection " + cut:{cfn_width}} : '
+            info_msg += f'{nevents_this_cut:,}'
+
+            # Saving the number of events, uncertainty and efficiency for the
+            # output file
+            if save_tabular and cut != 'selNone':
+                if nevents_this_cut != 0:
+                    # Scientific notation - recommended for backgrounds
+                    cuts_list.append(
+                        f'{nevents_this_cut:.2e} $\\pm$ {uncertainty:.2e}')
+                    # Float notation - recommended for signals with few events
+                    # cuts_list.append(
+                    #     f'{nevents_this_cut:.3f} $\\pm$ {uncertainty:.3f}')
+                    eff_list.append(f'{1.*nevents_this_cut/all_events:.3f}')
+                # If the number of events is zero, the previous uncertainty is
+                # saved instead:
+                elif '$\\pm$' in cuts_list[-1]:
+                    prev_entry = cuts_list[-1].split()
+                    cuts_list.append(f'$\\leq$ {prev_entry[2]}')
+                    eff_list.append('0.')
+                else:
+                    cuts_list.append(cuts_list[-1])
+                    eff_list.append('0.')
+
+        LOGGER.info(info_msg)
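The tabular entries assembled above are plain LaTeX fragments that end up in the tables written below. The two notation branches produce strings such as the following (the numbers are made up, the formatting is standard Python):

    all_events = 12_345.6
    uncertainty = 111.1

    row_sci = f'{all_events:.2e} $\\pm$ {uncertainty:.2e}'
    row_flt = f'{all_events:.3f} $\\pm$ {uncertainty:.3f}'
    print(row_sci)  # 1.23e+04 $\pm$ 1.11e+02
    print(row_flt)  # 12345.600 $\pm$ 111.100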
+        # And save everything
+        LOGGER.info('Saving the outputs...')
+        for i, cut in enumerate(cut_list):
+            # Output file for the histograms
+            fhisto = output_dir + process_name + '_' + cut + '_histo.root'
+            with ROOT.TFile(fhisto, 'RECREATE'):
+                for h in histos_list[i]:
+                    try:
+                        h.Scale(
+                            1. * process_dict[process_name]["crossSection"] *
+                            process_dict[process_name]["kfactor"] *
+                            process_dict[process_name]["matchingEfficiency"] /
+                            process_events[process_name])
+                    except KeyError:
+                        LOGGER.warning(
+                            'No value defined for process %s in dictionary!',
+                            process_name)
+                        if h.Integral(0, -1) > 0:
+                            h.Scale(1./h.Integral(0, -1))
+                    h.Write()
+
+            if do_tree:
+                # Test that the snapshot worked well
+                validfile = testfile(fout_list[i])
+                if not validfile:
+                    continue
+
+        if save_tabular and cut != 'selNone':
+            save_tab.append(cuts_list)
+            efficiency_list.append(eff_list)
+
+    if save_tabular:
+        tabular_path = output_dir + 'outputTabular.txt'
+        LOGGER.info('Saving tabular to:\n%s', tabular_path)
+        with open(tabular_path, 'w', encoding='utf-8') as outfile:
+            # Printing the number of events in format of a LaTeX table
+            outfile.write('\\begin{table}[H]\n'
+                          '    \\centering\n'
+                          '    \\resizebox{\\textwidth}{!}{\n'
+                          '    \\begin{tabular}{|l||')
+            outfile.write('c|' * (len(cuts_list)-1))
+            outfile.write('} \\hline\n')
+            for i, row in enumerate(save_tab):
+                outfile.write('        ')
+                outfile.write(' & '.join(row))
+                outfile.write(' \\\\\n')
+                if i == 0:
+                    outfile.write('        \\hline\n')
+            outfile.write('        \\hline \n'
+                          '    \\end{tabular}} \n'
+                          '    \\caption{Caption} \n'
+                          '    \\label{tab:my_label} \n'
+                          '\\end{table}\n')
+
+            # Efficiency:
+            outfile.write('\n\nEfficiency:\n')
+            outfile.write('\\begin{table}[H] \n'
+                          '    \\centering \n'
+                          '    \\resizebox{\\textwidth}{!}{ \n'
+                          '    \\begin{tabular}{|l||')
+            outfile.write('c|' * (len(cuts_list)-1))
+            outfile.write('} \\hline\n')
+            # Write the efficiency table transposed: every output row holds
+            # the values of one cut across all processes
+            for i in range(len(eff_list)):
+                outfile.write('        ')
+                column = [row[i] for row in efficiency_list]
+                outfile.write(' & '.join(column))
+                outfile.write(' \\\\\n')
+                if i == 0:
+                    outfile.write('        \\hline\n')
+            outfile.write('        \\hline \n'
+                          '    \\end{tabular}} \n'
+                          '    \\caption{Caption} \n'
+                          '    \\label{tab:my_label} \n'
+                          '\\end{table}\n')
+
+    elapsed_time = time.time() - start_time
+
+    info_msg = f"{' SUMMARY ':=^80}\n"
+    info_msg += 'Elapsed time (H:M:S): '
+    info_msg += time.strftime('%H:%M:%S', time.gmtime(elapsed_time))
+    info_msg += '\nEvents processed/second: '
+    info_msg += f'{int(nevents_real/elapsed_time):,}'
+    info_msg += f'\nTotal events processed: {nevents_real:,}'
+    info_msg += '\n'
+    info_msg += 80 * '='
+    info_msg += '\n'
+    LOGGER.info(info_msg)
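run_final() below is the entry point wired to the "final" sub-command, so the whole stage is typically launched from the command line as follows (the script name is illustrative):

    fccanalysis final analysis_final.py

It validates the arguments, loads the script as a Python module, and hands it to run() above.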
+
+
+def run_final(parser):
+    '''
+    Run final stage of the analysis.
+    '''
+
+    args, _ = parser.parse_known_args()
+
+    if args.command != 'final':
+        LOGGER.error('Unknown sub-command "%s"!\nAborting...', args.command)
+        sys.exit(3)
+
+    # Check that the analysis file exists
+    anapath = args.anascript_path
+    if not os.path.isfile(anapath):
+        LOGGER.error('Analysis script "%s" not found!\nAborting...',
+                     anapath)
+        sys.exit(3)
+
+    # Load pre-compiled analyzers
+    LOGGER.info('Loading analyzers from libFCCAnalyses...')
+    ROOT.gSystem.Load("libFCCAnalyses")
+    # Is this still needed?? As of 01/04/2022 it still seems to be the case
+    _fcc = ROOT.dummyLoader
+    LOGGER.debug(_fcc)
+
+    # Set verbosity level
+    if args.verbose:
+        # ROOT.Experimental.ELogLevel.kInfo verbosity level is more
+        # equivalent to DEBUG in other log systems
+        LOGGER.debug('Setting verbosity level "kInfo" for RDataFrame...')
+        verbosity = ROOT.Experimental.RLogScopedVerbosity(
+            ROOT.Detail.RDF.RDFLogChannel(),
+            ROOT.Experimental.ELogLevel.kInfo)
+        LOGGER.debug(verbosity)
+    if args.more_verbose:
+        LOGGER.debug('Setting verbosity level "kDebug" for RDataFrame...')
+        verbosity = ROOT.Experimental.RLogScopedVerbosity(
+            ROOT.Detail.RDF.RDFLogChannel(),
+            ROOT.Experimental.ELogLevel.kDebug)
+        LOGGER.debug(verbosity)
+    if args.most_verbose:
+        LOGGER.debug('Setting verbosity level "kDebug+10" for '
+                     'RDataFrame...')
+        verbosity = ROOT.Experimental.RLogScopedVerbosity(
+            ROOT.Detail.RDF.RDFLogChannel(),
+            ROOT.Experimental.ELogLevel.kDebug+10)
+        LOGGER.debug(verbosity)
+
+    # Load the analysis
+    anapath_abs = os.path.abspath(anapath)
+    LOGGER.info('Loading analysis script:\n%s', anapath_abs)
+    rdf_spec = importlib.util.spec_from_file_location('rdfanalysis',
+                                                      anapath_abs)
+    rdf_module = importlib.util.module_from_spec(rdf_spec)
+    rdf_spec.loader.exec_module(rdf_module)
+
+    run(rdf_module)
diff --git a/python/test.py b/python/test_fccanalyses.py
similarity index 100%
rename from python/test.py
rename to python/test_fccanalyses.py
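To make the configuration surface of run_final_analysis.py concrete, here is a minimal sketch of a final-stage analysis script exposing the attributes that run() reads via get_element(). All paths, process names, and cut expressions are illustrative, not part of this patch:

    # analysis_final.py -- illustrative final-stage configuration

    # Dictionary with cross-section metadata for the processes
    procDict = 'FCCee_procDict_winter2023_IDEA.json'

    # Processes to run over; one ROOT file or one directory of chunks each
    processList = {
        'p8_ee_ZZ_ecm240': {},
        'p8_ee_WW_ecm240': {},
    }

    # Input/output locations
    inputDir = 'outputs/stage2'
    outputDir = 'outputs/final'

    # Number of threads for RDataFrame
    nCPUS = 4

    # Scale histograms to the expected yield at this integrated luminosity
    doScale = True
    intLumi = 5.0e6  # pb^-1

    # Save filtered trees and the LaTeX cutflow tables
    doTree = True
    saveTabular = True

    # Cuts and histograms, in the formats parsed by run() above
    cutList = {
        'sel0': 'Zcand_m > 70 && Zcand_m < 110',
    }
    histoList = {
        'mz': {'name': 'Zcand_m', 'title': 'm_{Z} [GeV]',
               'bin': 125, 'xmin': 0, 'xmax': 250},
    }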