From 0547572d087d12b079cc59e88873670886035a6a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Fri, 6 Oct 2023 17:57:11 +0300 Subject: [PATCH] Timing the ONNX model, retrain CMS-GNNLSH-TF (#229) * timing code, retrain CMS-GNNLSH-TF, new multi-particle gun dataset, small hit-based datasets --- mlpf/data_cms/genjob.sh | 11 +- mlpf/data_cms/prepare_args.py | 46 ++-- mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py | 66 +++++ .../heptfds/clic_pf_edm4hep_hits/ttbar_10k.py | 66 +++++ .../heptfds/clic_pf_edm4hep_hits/utils_edm.py | 4 +- mlpf/heptfds/cms_pf/cms_utils.py | 2 + mlpf/heptfds/cms_pf/multiparticlegun.py | 62 +++++ mlpf/tfmodel/timing.py | 56 ---- mlpf/tfmodel/utils.py | 3 + mlpf/timing.py | 110 ++++++++ notebooks/cms/cms-simvalidation.ipynb | 133 +++++----- parameters/bench/clic-hits-bench.yaml | 248 ++++++++++++++++++ parameters/{ => bench}/delphes-bench.yaml | 0 parameters/cms-gen.yaml | 13 +- scripts/generate_tfds.sh | 14 +- scripts/tallinn/a100/clic-hits-train.sh | 6 +- scripts/tallinn/a100/clic-train-hvd.sh | 4 +- scripts/tallinn/a100/clic-train.sh | 4 +- scripts/tallinn/a100/cms-train.sh | 7 +- scripts/tallinn/rtx/clic-train.sh | 4 +- scripts/tallinn/rtx/delphes-train.sh | 4 +- scripts/tallinn/rtx/eval.sh | 9 +- timing.md | 36 +++ 23 files changed, 727 insertions(+), 181 deletions(-) create mode 100644 mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py create mode 100644 mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py create mode 100644 mlpf/heptfds/cms_pf/multiparticlegun.py delete mode 100644 mlpf/tfmodel/timing.py create mode 100644 mlpf/timing.py create mode 100644 parameters/bench/clic-hits-bench.yaml rename parameters/{ => bench}/delphes-bench.yaml (100%) create mode 100644 timing.md diff --git a/mlpf/data_cms/genjob.sh b/mlpf/data_cms/genjob.sh index 7ed70b797..d2940f855 100755 --- a/mlpf/data_cms/genjob.sh +++ b/mlpf/data_cms/genjob.sh @@ -13,12 +13,16 @@ MLPF_PATH=/home/joosep/particleflow/ SAMPLE=$1 SEED=$2 -WORKDIR=`pwd`/$SAMPLE/$SEED +WORKDIR=/scratch/local/joosep/$SAMPLE/$SEED +#WORKDIR=`pwd`/$SAMPLE/$SEED mkdir -p $WORKDIR +OUTDIR=/local/joosep/mlpf/cms/v2/$SAMPLE/raw +mkdir -p $OUTDIR + PILEUP=NoPileUp -N=1000 +N=100 env source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -73,4 +77,5 @@ cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table bzip2 -z pfntuple_${SEED}.pkl -#rm step*.root +cp *.pkl.bz2 $OUTDIR/ +rm -Rf $WORKDIR diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py index 92efdbb62..19b174cfc 100644 --- a/mlpf/data_cms/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -3,43 +3,41 @@ import os -outdir = "/local/joosep/mlpf/gen/v3/" +outdir = "/local/joosep/mlpf/cms/v2/" samples = [ - "SingleElectronFlatPt1To1000_pythia8_cfi", - "SingleGammaFlatPt1To1000_pythia8_cfi", - "SingleMuFlatPt1To1000_pythia8_cfi", - "SingleNeutronFlatPt0p7To1000_cfi", - "SinglePi0Pt1To1000_pythia8_cfi", - "SinglePiMinusFlatPt0p7To1000_cfi", - "SingleProtonMinusFlatPt0p7To1000_cfi", - "SingleTauFlatPt1To1000_cfi", + # "SingleElectronFlatPt1To1000_pythia8_cfi", + # "SingleGammaFlatPt1To1000_pythia8_cfi", + # "SingleMuFlatPt1To1000_pythia8_cfi", + # "SingleNeutronFlatPt0p7To1000_cfi", + # "SinglePi0Pt1To1000_pythia8_cfi", + # "SinglePiMinusFlatPt0p7To1000_cfi", + # "SingleProtonMinusFlatPt0p7To1000_cfi", + # "SingleTauFlatPt1To1000_cfi", + # "MultiParticlePFGun_cfi", + ("MultiParticlePFGun50_cfi", 
100000, 102050), ] samples_pu = [ - "TTbar_14TeV_TuneCUETP8M1_cfi", - "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", - "QCDForPF_14TeV_TuneCUETP8M1_cfi", - "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", - "SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", - "ZpTT_1500_14TeV_TuneCP5_cfi", + # "TTbar_14TeV_TuneCUETP8M1_cfi", + # "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", + # "QCDForPF_14TeV_TuneCUETP8M1_cfi", + # "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", + # "SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", + # "ZpTT_1500_14TeV_TuneCP5_cfi", ] -NUM_SAMPLES = 1050 -SEED = 1 - if __name__ == "__main__": - for s in samples_pu + samples: + for s, seed0, seed1 in samples_pu + samples: is_pu = s in samples_pu os.makedirs(outdir + "/" + s + "/raw", exist_ok=True) os.makedirs(outdir + "/" + s + "/root", exist_ok=True) - for nsamp in range(NUM_SAMPLES): - if not os.path.isfile(outdir + "/" + s + "/raw/pfntuple_{}.pkl.bz2".format(SEED)): + for seed in range(seed0, seed1): + if not os.path.isfile(outdir + "/" + s + "/raw/pfntuple_{}.pkl.bz2".format(seed)): if is_pu: - print("sbatch mlpf/tallinn/genjob_pu.sh {} {}".format(s, SEED)) + print("sbatch genjob_pu.sh {} {}".format(s, seed)) else: - print("sbatch mlpf/tallinn/genjob.sh {} {}".format(s, SEED)) - SEED += 1 + print("sbatch genjob.sh {} {}".format(s, seed)) diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py b/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py new file mode 100644 index 000000000..f208df289 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k.py @@ -0,0 +1,66 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CH, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds + +from qq import _DESCRIPTION, _CITATION + + +class ClicEdmQqHitsPf10k(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.5.0": "Regenerate with ARRAY_RECORD", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
+ + The processed tensorflow_dataset can also be downloaded from: + FIXME + """ + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmQqHitsPf10k, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CH)), + ), + dtype=tf.float32, + ), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=None, + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_calohit=X_FEATURES_CH, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + path = dl_manager.manual_dir + return split_sample(Path(path / "p8_ee_qq_ecm380/"), max_files=100) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py new file mode 100644 index 000000000..e9198ece8 --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k.py @@ -0,0 +1,66 @@ +from pathlib import Path + +import tensorflow as tf +from utils_edm import ( + X_FEATURES_CH, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +import tensorflow_datasets as tfds + +from ttbar import _DESCRIPTION, _CITATION + + +class ClicEdmTtbarHitsPf10k(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("1.5.0") + RELEASE_NOTES = { + "1.5.0": "Regenerate with ARRAY_RECORD", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. 
+
+ The processed tensorflow_dataset can also be downloaded from:
+ FIXME
+ """
+
+ def __init__(self, *args, **kwargs):
+ kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD
+ super(ClicEdmTtbarHitsPf10k, self).__init__(*args, **kwargs)
+
+ def _info(self) -> tfds.core.DatasetInfo:
+ """Returns the dataset metadata."""
+ return tfds.core.DatasetInfo(
+ builder=self,
+ description=_DESCRIPTION,
+ features=tfds.features.FeaturesDict(
+ {
+ "X": tfds.features.Tensor(
+ shape=(
+ None,
+ max(len(X_FEATURES_TRK), len(X_FEATURES_CH)),
+ ),
+ dtype=tf.float32,
+ ),
+ "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32),
+ "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32),
+ }
+ ),
+ supervised_keys=None,
+ homepage="",
+ citation=_CITATION,
+ metadata=tfds.core.MetadataDict(
+ x_features_track=X_FEATURES_TRK,
+ x_features_calohit=X_FEATURES_CH,
+ y_features=Y_FEATURES,
+ ),
+ )
+
+ def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+ path = dl_manager.manual_dir
+ return split_sample(Path(path / "p8_ee_tt_ecm380/"), max_files=100)
+
+ def _generate_examples(self, files):
+ return generate_examples(files)
diff --git a/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py
index 93e1acc73..7f8cc19c8 100644
--- a/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py
+++ b/mlpf/heptfds/clic_pf_edm4hep_hits/utils_edm.py
@@ -41,8 +41,10 @@
 labels = [0, 211, 130, 22, 11, 13]


-def split_sample(path, test_frac=0.8):
+def split_sample(path, test_frac=0.8, max_files=0):
 files = sorted(list(path.glob("*.parquet")))
+ if max_files > 0:
+ files = files[:max_files]
 print("Found {} files in {}".format(len(files), path))
 assert len(files) > 0
 idx_split = int(test_frac * len(files))
diff --git a/mlpf/heptfds/cms_pf/cms_utils.py b/mlpf/heptfds/cms_pf/cms_utils.py
index 286b54c7c..46dee72eb 100644
--- a/mlpf/heptfds/cms_pf/cms_utils.py
+++ b/mlpf/heptfds/cms_pf/cms_utils.py
@@ -270,6 +270,8 @@ def generate_examples(files):
 x = Xs[ii]
 yg = ygens[ii]
 yc = ycands[ii]
+
+ # tally the truth particle types in this event (the result is currently unused)
+ uniqs, counts = np.unique(yg[:, 0], return_counts=True)
 yield str(fi) + "_" + str(ii), {
 "X": x,
 "ygen": yg,
diff --git a/mlpf/heptfds/cms_pf/multiparticlegun.py b/mlpf/heptfds/cms_pf/multiparticlegun.py
new file mode 100644
index 000000000..7eb680832
--- /dev/null
+++ b/mlpf/heptfds/cms_pf/multiparticlegun.py
@@ -0,0 +1,62 @@
+"""CMS PF multi-particle gun dataset."""
+import cms_utils
+import tensorflow as tf
+
+import tensorflow_datasets as tfds
+
+X_FEATURES = cms_utils.X_FEATURES
+Y_FEATURES = cms_utils.Y_FEATURES
+
+_DESCRIPTION = """
+Dataset generated with CMSSW and full detector sim.
+
+Multi-particle gun events. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + + +class CmsPfMultiParticleGun(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf_multi_particle_gun dataset.""" + + VERSION = tfds.core.Version("1.6.0") + RELEASE_NOTES = { + "1.6.0": "Initial release", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/MultiParticlePFGun_cfi data/ + """ + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(CmsPfMultiParticleGun, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(None, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "MultiParticlePFGun50_cfi" + return cms_utils.split_sample(path / sample_dir / "raw") + + def _generate_examples(self, files): + return cms_utils.generate_examples(files) diff --git a/mlpf/tfmodel/timing.py b/mlpf/tfmodel/timing.py deleted file mode 100644 index 52492f40f..000000000 --- a/mlpf/tfmodel/timing.py +++ /dev/null @@ -1,56 +0,0 @@ -import sys -import time - -import numpy as np -import onnxruntime -import pynvml - -# pip install only onnxruntime_gpu, not onnxruntime! 
-
-if __name__ == "__main__":
- pynvml.nvmlInit()
- handle = pynvml.nvmlDeviceGetHandleByIndex(0)
-
- EP_list = ["CUDAExecutionProvider"]
-
- time.sleep(5)
-
- mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
- mem_initial = mem.used / 1000 / 1000
- print("mem_initial", mem_initial)
-
- onnx_sess = onnxruntime.InferenceSession(sys.argv[1], providers=EP_list)
- time.sleep(5)
-
- mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
- mem_onnx = mem.used / 1000 / 1000
- print("mem_onnx", mem_initial)
-
- for num_elems in range(1600, 25600, 320):
- times = []
- mem_used = []
-
- # average over 100 events
- for i in range(100):
-
- # allocate array in system RAM
- X = np.array(np.random.randn(1, num_elems, 25), np.float32)
-
- # transfer data to GPU, run model, transfer data back
- t0 = time.time()
- pred_onx = onnx_sess.run(None, {"x:0": X})
- t1 = time.time()
- dt = t1 - t0
- times.append(dt)
- mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
- mem_used.append(mem.used / 1000 / 1000)
-
- print(
- "Nelem={} mean_time={:.2f} ms stddev_time={:.2f} ms mem_used={:.0f} MB".format(
- num_elems,
- 1000.0 * np.mean(times),
- 1000.0 * np.std(times),
- np.max(mem_used),
- )
- )
- time.sleep(5)
diff --git a/mlpf/tfmodel/utils.py b/mlpf/tfmodel/utils.py
index 1874734f2..29f18298d 100644
--- a/mlpf/tfmodel/utils.py
+++ b/mlpf/tfmodel/utils.py
@@ -824,9 +824,12 @@ def model_scope(config, total_steps, weights=None, horovod_enabled=False):
 def model_weight_setting():
 grad_vars = model.trainable_weights
+ logging.info("grad_vars={}".format(len(grad_vars)))
 zero_grads = [tf.zeros_like(w) for w in grad_vars]
+ logging.info("applying zero gradients to initialize optimizer")
 opt.apply_gradients(zip(zero_grads, grad_vars))
 if loaded_opt:
+ logging.info("setting optimizer state")
 opt.set_weights(loaded_opt["weights"])
 logging.info("distributing optimizer state")
diff --git a/mlpf/timing.py b/mlpf/timing.py
new file mode 100644
index 000000000..d73a1b996
--- /dev/null
+++ b/mlpf/timing.py
@@ -0,0 +1,110 @@
+import sys
+import time
+
+import numpy as np
+import onnxruntime as rt
+import pynvml
+import resource
+import argparse
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ # positional argument: path of the ONNX model to benchmark
+ parser.add_argument("model")
+ parser.add_argument("--bin-size", type=int, default=256)
+ parser.add_argument("--num-features", type=int, default=17)
+ parser.add_argument("--batch-size", type=int, default=20)
+ parser.add_argument("--num-threads", type=int, default=1)
+ parser.add_argument("--use-gpu", action="store_true")
+ args = parser.parse_args()
+ return args
+
+
+# for GPU testing, you need to
+# pip install only onnxruntime_gpu, not onnxruntime!
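+#
+# a sketch of the intended invocation (the model path is hypothetical; "model"
+# is the positional argument defined in parse_args above):
+#
+#   python3 mlpf/timing.py experiments/mlpf.onnx --num-threads 12
+#   python3 mlpf/timing.py experiments/mlpf.onnx --use-gpu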
+args = parse_args()
+
+bin_size = args.bin_size
+num_features = args.num_features
+use_gpu = args.use_gpu
+batch_size = args.batch_size
+num_threads = args.num_threads
+
+if use_gpu:
+ pynvml.nvmlInit()
+ handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+
+
+def get_mem_cpu_mb():
+ return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000
+
+
+def get_mem_gpu_mb():
+ mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
+ return mem.used / 1000 / 1000
+
+
+def get_mem_mb(use_gpu):
+ if use_gpu:
+ return get_mem_gpu_mb()
+ else:
+ return get_mem_cpu_mb()
+
+
+if __name__ == "__main__":
+ print(
+ "batch_size={} bin_size={} num_features={} use_gpu={} num_threads={}".format(
+ batch_size, bin_size, num_features, use_gpu, num_threads
+ )
+ )
+
+ if use_gpu:
+ EP_list = ["CUDAExecutionProvider"]
+ else:
+ EP_list = ["CPUExecutionProvider"]
+
+ time.sleep(5)
+
+ mem_initial = get_mem_mb(use_gpu)
+ print("mem_initial", mem_initial)
+
+ sess_options = rt.SessionOptions()
+ sess_options.intra_op_num_threads = num_threads
+ sess_options.inter_op_num_threads = num_threads
+ sess_options.execution_mode = rt.ExecutionMode.ORT_PARALLEL
+ sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
+ sess_options.add_session_config_entry("session.intra_op.allow_spinning", "1")
+
+ onnx_sess = rt.InferenceSession(args.model, sess_options, providers=EP_list)
+ time.sleep(5)
+
+ mem_onnx = get_mem_mb(use_gpu)
+ print("mem_onnx", mem_onnx)
+
+ for num_elems in [bin_size, 2 * bin_size, 10 * bin_size, 20 * bin_size, 40 * bin_size]:
+ times = []
+ mem_used = []
+
+ # average over 10 events
+ for i in range(10):
+
+ # allocate array in system memory
+ X = np.array(np.random.randn(batch_size, num_elems, num_features), np.float32)
+
+ # transfer data to GPU, run model, transfer data back
+ t0 = time.time()
+ pred_onx = onnx_sess.run(None, {"x:0": X})
+ t1 = time.time()
+ dt = (t1 - t0) / batch_size
+ times.append(dt)
+
+ mem_used.append(get_mem_mb(use_gpu))
+
+ print(
+ "Nelem={} mean_time={:.2f} ms stddev_time={:.2f} ms mem_used={:.0f} MB".format(
+ num_elems,
+ 1000.0 * np.mean(times),
+ 1000.0 * np.std(times),
+ np.max(mem_used),
+ )
+ )
+ time.sleep(5)
diff --git a/notebooks/cms/cms-simvalidation.ipynb b/notebooks/cms/cms-simvalidation.ipynb
index 44a3421d5..34772a119 100644
--- a/notebooks/cms/cms-simvalidation.ipynb
+++ b/notebooks/cms/cms-simvalidation.ipynb
@@ -63,7 +63,7 @@
 "source": [
 "import sys\n",
 "\n",
- "sys.path += [\"../mlpf/plotting/\"]\n",
+ "sys.path += [\"../../mlpf/plotting/\"]\n",
 "\n",
 "from plot_utils import ELEM_LABELS_CMS, ELEM_NAMES_CMS\n",
 "from plot_utils import CLASS_LABELS_CMS, CLASS_NAMES_CMS\n",
@@ -78,7 +78,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "!ls -lrt /local/joosep/mlpf/gen/v2"
+ "!ls -lrt /home/joosep/particleflow/mlpf/data_cms/MultiParticlePFGun_cfi/1"
 ]
 },
 {
@@ -88,7 +88,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "sample = \"SingleNeutronFlatPt0p7To1000_cfi\"\n",
+ "sample = \"MultiParticlePFGun_cfi\"\n",
 "\n",
 "maxfiles = 50\n",
 "if sample.startswith(\"Single\"):\n",
@@ -109,7 +109,7 @@
 "pickle_data = sum(\n",
 " [\n",
 " pickle.load(bz2.BZ2File(f, \"r\"))\n",
- " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/gen/v2/{}/raw/*.pkl.bz2\".format(sample)))[:maxfiles])\n",
+ " for f in tqdm.tqdm(list(glob.glob(\"/home/joosep/particleflow/mlpf/data_cms/{}/*/*.pkl.bz2\".format(sample)))[:maxfiles])\n",
 " ],\n",
 " [],\n",
 ")\n",
@@ -186,7 +186,7 @@
 " plt.legend(loc=6)\n",
 " cms_label(ax)\n",
 " plt.ylim(1, 1e5)\n",
- " sample_label(ax, 
sample)\n",
+ " #sample_label(ax, sample)\n",
 " plt.savefig(plot_outpath + \"all_pt.pdf\", bbox_inches=\"tight\")"
 ]
 },
@@ -210,7 +210,7 @@
 " plt.ylabel(\"Number of events\")\n",
 " plt.legend(loc=6)\n",
 " cms_label(ax)\n",
- " sample_label(ax, sample)\n",
+ " #sample_label(ax, sample)\n",
 " plt.ylim(1, 1e3)\n",
 " plt.savefig(plot_outpath + \"all_sume.pdf\", bbox_inches=\"tight\")"
 ]
@@ -250,23 +250,23 @@
 "source": [
 "plt.figure(figsize=(12, 10))\n",
 "ax = plt.axes()\n",
- "b = np.logspace(1, 6, 101)\n",
+ "b = np.logspace(3, 5, 101)\n",
 "plt.hist2d(\n",
- " awkward.sum(arrs_awk[\"ygen\"][\"e\"], axis=1),\n",
- " awkward.sum(arrs_awk[\"ycand\"][\"e\"], axis=1),\n",
+ " awkward.to_numpy(awkward.sum(arrs_awk[\"ygen\"][\"e\"], axis=1)),\n",
+ " awkward.to_numpy(awkward.sum(arrs_awk[\"ycand\"][\"e\"], axis=1)),\n",
 " bins=(b, b),\n",
 " cmap=\"hot_r\",\n",
 " norm=matplotlib.colors.Normalize(vmin=0),\n",
 ")\n",
- "plt.plot([1e1, 1e6], [1e1, 1e6], color=\"black\", ls=\"--\")\n",
+ "plt.plot([1e3, 1e5], [1e3, 1e5], color=\"black\", ls=\"--\")\n",
 "plt.colorbar()\n",
 "plt.xscale(\"log\")\n",
 "plt.yscale(\"log\")\n",
 "plt.xlabel(\"MLPF truth event $\\sum E$ [GeV]\")\n",
 "plt.ylabel(\"PF event $\\sum E$ [GeV]\")\n",
 "\n",
- "cms_label(ax)\n",
- "sample_label(ax, sample)\n",
+ "#cms_label(ax)\n",
+ "#sample_label(ax, sample)\n",
 "plt.savefig(plot_outpath + \"pf_vs_truth_sume.pdf\", bbox_inches=\"tight\")"
 ]
 },
@@ -295,8 +295,8 @@
 "ax = plt.axes()\n",
 "b = np.logspace(1, 6, 100)\n",
 "plt.hist2d(\n",
- " met(arrs_awk[\"ygen\"][\"pt\"], arrs_awk[\"ygen\"][\"phi\"]),\n",
- " met(arrs_awk[\"ycand\"][\"pt\"], arrs_awk[\"ycand\"][\"phi\"]),\n",
+ " awkward.to_numpy(met(arrs_awk[\"ygen\"][\"pt\"], arrs_awk[\"ygen\"][\"phi\"])),\n",
+ " awkward.to_numpy(met(arrs_awk[\"ycand\"][\"pt\"], arrs_awk[\"ycand\"][\"phi\"])),\n",
 " bins=(b, b),\n",
 " cmap=\"hot_r\",\n",
 " norm=matplotlib.colors.Normalize(vmin=0),\n",
@@ -308,8 +308,8 @@
 "plt.xlabel(\"MLPF truth MET [GeV]\")\n",
 "plt.ylabel(\"PF MET [GeV]\")\n",
 "\n",
- "cms_label(ax)\n",
- "sample_label(ax, sample)\n",
+ "#cms_label(ax)\n",
+ "#sample_label(ax, sample)\n",
 "plt.savefig(plot_outpath + \"pf_vs_truth_met.pdf\", bbox_inches=\"tight\")"
 ]
 },
@@ -317,9 +317,7 @@
 "cell_type": "code",
 "execution_count": null,
 "id": "e40bb409",
- "metadata": {
- "scrolled": false
- },
+ "metadata": {},
 "outputs": [],
 "source": [
 "for pid in [\n",
@@ -329,8 +327,8 @@
 " msk = arrs_flat[\"ygen\"][\"typ\"] != pid\n",
 " else:\n",
 " msk = arrs_flat[\"ygen\"][\"typ\"] == pid\n",
- " data1 = awkward.flatten(arrs_flat[\"Xelem\"][\"eta\"][msk])\n",
- " data2 = awkward.flatten(arrs_flat[\"ygen\"][\"eta\"][msk])\n",
+ " data1 = awkward.to_numpy(awkward.flatten(arrs_flat[\"Xelem\"][\"eta\"][msk]))\n",
+ " data2 = awkward.to_numpy(awkward.flatten(arrs_flat[\"ygen\"][\"eta\"][msk]))\n",
 "\n",
 " plt.figure(figsize=(12, 10))\n",
 " ax = plt.axes()\n",
@@ -349,16 +347,16 @@
 " cbar.formatter.set_useMathText(True)\n",
 "\n",
 " cms_label(ax)\n",
- " if pid == 0:\n",
- " sample_label(ax, sample)\n",
- " else:\n",
- " sample_label(ax, sample, \", \" + CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
+ " # if pid == 0:\n",
+ " # sample_label(ax, sample)\n",
+ " # else:\n",
+ " # sample_label(ax, sample, \", \" + CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
 " plt.xlabel(\"Truth $\\eta$\")\n",
 " plt.ylabel(\"PFElement $\\eta$\")\n",
 " plt.savefig(plot_outpath + \"truth_vs_pfelement_eta_{}.pdf\".format(pid), bbox_inches=\"tight\")\n",
 "\n",
- " data1 = 
awkward.flatten(arrs_flat[\"Xelem\"][\"phi\"][msk])\n", - " data2 = awkward.flatten(arrs_flat[\"ygen\"][\"phi\"][msk])\n", + " data1 = awkward.to_numpy(awkward.flatten(arrs_flat[\"Xelem\"][\"phi\"][msk]))\n", + " data2 = awkward.to_numpy(awkward.flatten(arrs_flat[\"ygen\"][\"phi\"][msk]))\n", " plt.figure(figsize=(12, 10))\n", " ax = plt.axes()\n", " plt.hist2d(\n", @@ -376,10 +374,10 @@ " cbar.formatter.set_useMathText(True)\n", "\n", " cms_label(ax)\n", - " if pid == 0:\n", - " sample_label(ax, sample)\n", - " else:\n", - " sample_label(ax, sample, \", \" + CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", + " # if pid == 0:\n", + " # sample_label(ax, sample)\n", + " # else:\n", + " # sample_label(ax, sample, \", \" + CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", " plt.xlabel(\"MLPF truth $\\phi$\")\n", " plt.ylabel(\"PFElement $\\phi$\")\n", " plt.savefig(plot_outpath + \"truth_vs_pfelement_phi_{}.pdf\".format(pid), bbox_inches=\"tight\")\n", @@ -424,9 +422,7 @@ "cell_type": "code", "execution_count": null, "id": "789aceea", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(15, 10))\n", @@ -471,17 +467,17 @@ "ax = plt.axes()\n", "\n", "b = np.logspace(-2, 4, 101)\n", - "h = np.histogram(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"]), bins=b)\n", + "h = np.histogram(awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"])), bins=b)\n", "mplhep.histplot(h, histtype=\"step\", label=\"PF\")\n", "\n", - "h = np.histogram(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"]), bins=b)\n", + "h = np.histogram(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"])), bins=b)\n", "mplhep.histplot(h, histtype=\"step\", label=\"MLPF truth\")\n", "\n", "plt.xscale(\"log\")\n", "plt.legend(ncol=1, loc=(0.6, 0.5))\n", "\n", "cms_label(ax)\n", - "sample_label(ax, sample)\n", + "#sample_label(ax, sample)\n", "\n", "plt.xlabel(\"$p_T$ [GeV]\")\n", "plt.ylabel(\"Number of particles\")\n", @@ -500,16 +496,16 @@ "\n", "b = np.linspace(-6, 6, 101)\n", "\n", - "h = np.histogram(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"]), bins=b)\n", + "h = np.histogram(awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"])), bins=b)\n", "mplhep.histplot(h, histtype=\"step\", label=\"PF\")\n", "\n", - "h = np.histogram(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"]), bins=b)\n", + "h = np.histogram(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"])), bins=b)\n", "mplhep.histplot(h, histtype=\"step\", label=\"MLPF truth\")\n", "\n", "plt.legend(ncol=1, loc=(0.68, 0.75))\n", "\n", "cms_label(ax)\n", - "sample_label(ax, sample)\n", + "#sample_label(ax, sample)\n", "\n", "plt.xlabel(\"particle $\\eta$\")\n", "plt.ylabel(\"Number of particles\")\n", @@ -531,7 +527,7 @@ "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])\n", + " pt_pid = awkward.to_numpy(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -544,12 +540,12 @@ "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n", "ax.yaxis.major.formatter._useMathText = True\n", "\n", - "plt.legend(ncol=1, loc=(0.7, 0.4))\n", + "plt.legend(ncol=1, loc=(0.1, 0.4))\n", "plt.xlabel(\"particle $p_T$ 
[GeV]\")\n", "plt.ylabel(\"Number of particles / bin\")\n", "# plt.title(\"{}\\nMLPF truth\".format(sample))\n", "cms_label(ax)\n", - "sample_label(ax, sample, \", MLPF truth\")\n", + "#sample_label(ax, sample, \", MLPF truth\")\n", "plt.xlim(10**-2, 10**4)\n", "plt.savefig(plot_outpath + \"truth_pt.pdf\", bbox_inches=\"tight\")" ] @@ -569,7 +565,7 @@ "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"typ\"] == pid])\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"typ\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -586,7 +582,7 @@ "plt.ylabel(\"Number of particles / bin\")\n", "# plt.title(\"{}\\nMLPF truth\".format(sample))\n", "cms_label(ax)\n", - "sample_label(ax, sample, \", MLPF truth\")\n", + "#sample_label(ax, sample, \", MLPF truth\")\n", "plt.xlim(-6, 6)\n", "plt.savefig(plot_outpath + \"truth_eta.pdf\", bbox_inches=\"tight\")" ] @@ -606,7 +602,7 @@ "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid])\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -623,7 +619,7 @@ "plt.ylabel(\"Number of particles / bin\")\n", "# plt.title(\"{}\\nMLPF truth\".format(sample))\n", "cms_label(ax)\n", - "sample_label(ax, sample, \", PF\")\n", + "#sample_label(ax, sample, \", PF\")\n", "plt.xlim(10**-2, 10**4)\n", "plt.savefig(plot_outpath + \"pf_pt.pdf\", bbox_inches=\"tight\")" ] @@ -643,7 +639,7 @@ "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"typ\"] == pid])\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"typ\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -660,7 +656,7 @@ "plt.ylabel(\"Number of particles / bin\")\n", "# plt.title(\"{}\\nMLPF truth\".format(sample))\n", "cms_label(ax)\n", - "sample_label(ax, sample, \", PF\")\n", + "#sample_label(ax, sample, \", PF\")\n", "plt.xlim(-6, 6)\n", "plt.savefig(plot_outpath + \"pf_eta.pdf\", bbox_inches=\"tight\")" ] @@ -681,7 +677,7 @@ " colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", " labels = []\n", " for pid in pids[::-1]:\n", - " pt_pid = awkward.flatten(arrs_flat[\"pythia\"][\"pt\"][arrs_flat[\"pythia\"][\"typ\"] == pid])\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"pt\"][arrs_flat[\"pythia\"][\"typ\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(int(pid))\n", " mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -692,7 +688,7 @@ " plt.ylabel(\"Number of particles\")\n", " # plt.title(\"{}\\nMLPF truth\".format(sample))\n", " 
cms_label(ax)\n", - " sample_label(ax, sample, \", Pythia\")\n", + " #sample_label(ax, sample, \", Pythia\")\n", " plt.savefig(plot_outpath + \"pythia_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -712,7 +708,7 @@ " colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", " labels = []\n", " for pid in pids[::-1]:\n", - " pt_pid = awkward.flatten(arrs_flat[\"pythia\"][\"eta\"][arrs_flat[\"pythia\"][\"typ\"] == pid])\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"eta\"][arrs_flat[\"pythia\"][\"typ\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(int(pid))\n", " mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -723,7 +719,7 @@ " plt.ylabel(\"Number of particles\")\n", " # plt.title(\"{}\\nMLPF truth\".format(sample))\n", " cms_label(ax)\n", - " sample_label(ax, sample, \", Pythia\")\n", + " #sample_label(ax, sample, \", Pythia\")\n", " plt.savefig(plot_outpath + \"pythia_eta.pdf\", bbox_inches=\"tight\")" ] }, @@ -731,9 +727,7 @@ "cell_type": "code", "execution_count": null, "id": "58723bd8", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "b = np.logspace(-2, 4, 100)\n", @@ -741,10 +735,11 @@ " plt.figure()\n", " ax = plt.axes()\n", " plt.hist(\n", - " awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid]), bins=b, histtype=\"step\", lw=2, label=\"PF\"\n", + " awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid])),\n", + " bins=b, histtype=\"step\", lw=2, label=\"PF\"\n", " )\n", " plt.hist(\n", - " awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid]),\n", + " awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])),\n", " bins=b,\n", " histtype=\"step\",\n", " lw=2,\n", @@ -756,7 +751,7 @@ " plt.legend(ncol=1, loc=(0.68, 0.8))\n", " plt.xlabel(\"$p_T$ [GeV]\")\n", " cms_label(ax)\n", - " sample_label(ax, sample)\n", + " #sample_label(ax, sample)\n", " plt.savefig(plot_outpath + \"pid{}_pt.pdf\".format(pid), bbox_inches=\"tight\")" ] }, @@ -764,9 +759,7 @@ "cell_type": "code", "execution_count": null, "id": "268143b5", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "b = np.linspace(-6, 6, 100)\n", @@ -814,8 +807,8 @@ "\n", " b = np.logspace(1, 6, 100)\n", " plt.hist2d(\n", - " awkward.sum(arrs_flat[\"pythia\"][\"e\"], axis=1),\n", - " awkward.sum(arrs_flat[\"ygen\"][\"e\"], axis=1),\n", + " awkward.to_numpy(awkward.sum(arrs_flat[\"pythia\"][\"e\"], axis=1)),\n", + " awkward.to_numpy(awkward.sum(arrs_flat[\"ygen\"][\"e\"], axis=1)),\n", " bins=(b, b),\n", " cmap=\"hot_r\",\n", " norm=matplotlib.colors.Normalize(vmin=0),\n", @@ -823,7 +816,7 @@ " plt.plot([1e1, 1e6], [1e1, 1e6], color=\"black\", ls=\"--\")\n", " plt.colorbar(label=\"events / bin\")\n", " cms_label(ax)\n", - " sample_label(ax, sample)\n", + " #sample_label(ax, sample)\n", " plt.xscale(\"log\")\n", " plt.yscale(\"log\")\n", " plt.xlabel(\"Pythia $\\sum E$ [GeV]\")\n", @@ -844,8 +837,8 @@ "\n", " b = np.logspace(1, 6, 100)\n", " plt.hist2d(\n", - " awkward.sum(arrs_flat[\"pythia\"][\"e\"], axis=1),\n", - " awkward.sum(arrs_flat[\"ycand\"][\"e\"], axis=1),\n", + " awkward.to_numpy(awkward.sum(arrs_flat[\"pythia\"][\"e\"], axis=1)),\n", + " awkward.to_numpy(awkward.sum(arrs_flat[\"ycand\"][\"e\"], axis=1)),\n", " bins=(b, b),\n", " cmap=\"hot_r\",\n", " norm=matplotlib.colors.Normalize(vmin=0),\n", @@ -853,7 +846,7 @@ " 
plt.plot([1e1, 1e6], [1e1, 1e6], color=\"black\", ls=\"--\")\n", " plt.colorbar(label=\"events / bin\")\n", " cms_label(ax)\n", - " sample_label(ax, sample)\n", + " #sample_label(ax, sample)\n", " plt.xscale(\"log\")\n", " plt.yscale(\"log\")\n", " plt.xlabel(\"Pythia $\\sum E$ [GeV]\")\n", @@ -872,7 +865,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -886,7 +879,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/parameters/bench/clic-hits-bench.yaml b/parameters/bench/clic-hits-bench.yaml new file mode 100644 index 000000000..0a7eb8ae6 --- /dev/null +++ b/parameters/bench/clic-hits-bench.yaml @@ -0,0 +1,248 @@ +backend: tensorflow + +cache: caches/clic_hits + +dataset: + schema: clic + target_particles: gen + num_input_features: 15 + #(none=0, track=1, hit=2) + num_input_classes: 3 + #(none=0, ch.had=1, n.had=2, gamma=3, e=4, mu=5) + num_output_classes: 6 + cls_weight_by_pt: no + reg_weight_by_pt: no + enable_tfds_caching: no + +loss: + classification_loss_coef: 100.0 + charge_loss_coef: 1.0 + pt_loss_coef: 10.0 + eta_loss_coef: 10.0 + sin_phi_loss_coef: 10.0 + cos_phi_loss_coef: 10.0 + energy_loss_coef: 10.0 + cls_loss: + type: SigmoidFocalCrossEntropy + from_logits: yes + gamma: 2.0 + charge_loss: + type: CategoricalCrossentropy + from_logits: yes + energy_loss: + type: Huber + pt_loss: + type: Huber + sin_phi_loss: + type: Huber + cos_phi_loss: + type: Huber + eta_loss: + type: Huber + event_loss: none #none, sliced_wasserstein, gen_jet_logcosh, gen_jet_mse, hist_2d + event_loss_coef: 0.0 + met_loss: none + met_loss_coef: 1.0 + +tensorflow: + eager: no + +setup: + train: yes + weights: + weights_config: + lr: 0.0005 + num_epochs: 20 + dtype: float32 + trainable: + lr_schedule: cosinedecay # cosinedecay, exponentialdecay, onecycle, none + optimizer: adam # adam, adamw, sgd + horovod_enabled: no + cls_output_as_logits: yes + small_graph_opt: no + use_normalizer: yes + +batching: + # if enabled, use dynamic batching instead of the fixed-size batches configured in batch_per_gpu + bucket_by_sequence_length: yes + bucket_batch_sizes: auto + batch_multiplier: 1 + +optimizer: + adam: + amsgrad: no + adamw: + amsgrad: yes + weight_decay: 0.001 + sgd: + nesterov: no + momentum: 0.9 + +# LR Schedules +exponentialdecay: + decay_steps: 2000 + decay_rate: 0.99 + staircase: yes +onecycle: + mom_min: 0.85 + mom_max: 0.95 + warmup_ratio: 0.3 + div_factor: 25.0 + final_div: 100000.0 + +parameters: + model: gnn_dense + input_encoding: clic + node_update_mode: additive + do_node_encoding: yes + node_encoding_hidden_dim: 512 + + combined_graph_layer: + bin_size: 256 + max_num_bins: 200 + distance_dim: 128 + layernorm: yes + dropout: 0.0 + dist_activation: elu + ffn_dist_num_layers: 2 + ffn_dist_hidden_dim: 128 + + # MPNN + #kernel: + # type: NodePairTrainableKernel + # activation: elu + #num_node_messages: 1 + #node_message: + # type: NodeMessageLearnable + # output_dim: 64 + # hidden_dim: 128 + # num_layers: 2 + # activation: elu + #activation: elu + + # GCN + kernel: + type: NodePairGaussianKernel + dist_mult: 0.1 + clip_value_low: 0.0 + dist_norm: l2 + num_node_messages: 2 + node_message: + type: GHConvDense + output_dim: 512 + activation: elu + #if this is enabled, it will break float16 training + normalize_degrees: no + activation: elu + + num_graph_layers_id: 6 + 
num_graph_layers_reg: 6 + output_decoding: + activation: elu + regression_use_classification: yes + dropout: 0.1 + + pt_as_correction: no + + id_dim_decrease: yes + charge_dim_decrease: yes + pt_dim_decrease: yes + eta_dim_decrease: yes + phi_dim_decrease: yes + energy_dim_decrease: yes + + id_hidden_dim: 256 + charge_hidden_dim: 256 + pt_hidden_dim: 256 + eta_hidden_dim: 256 + phi_hidden_dim: 256 + energy_hidden_dim: 256 + + id_num_layers: 2 + charge_num_layers: 2 + pt_num_layers: 2 + eta_num_layers: 2 + phi_num_layers: 2 + energy_num_layers: 2 + layernorm: yes + mask_reg_cls0: yes + + skip_connection: no + debug: no + +timing: + num_ev: 100 + num_iter: 3 + +callbacks: + checkpoint: + monitor: "val_loss" + plot_freq: 1 + tensorboard: + dump_history: yes + hist_freq: 1 + +hypertune: + algorithm: hyperband # random, bayesian, hyperband + random: + objective: val_loss + max_trials: 100 + bayesian: + objective: val_loss + max_trials: 100 + num_initial_points: 2 + hyperband: + objective: val_loss + max_epochs: 10 + factor: 3 + iterations: 1 + executions_per_trial: 1 + +raytune: + local_dir: # Note: please specify an absolute path + sched: asha # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_test_datasets: + physical: + batch_per_gpu: 1 + event_pad_size: 15360 + datasets: + - clic_edm_ttbar_hits_pf10k + - clic_edm_qq_hits_pf10k + +validation_dataset: clic_edm_ttbar_hits_pf10k +validation_batch_size: 20 +validation_num_events: 2000 + +evaluation_datasets: + clic_edm_ttbar_hits_pf: + batch_size: 10 + num_events: 10000 + +evaluation_jet_algo: ee_genkt_algorithm + +datasets: + clic_edm_ttbar_hits_pf10k: + version: 1.5.0 + data_dir: + manual_dir: + clic_edm_qq_hits_pf10k: + version: 1.5.0 + data_dir: + manual_dir: diff --git a/parameters/delphes-bench.yaml b/parameters/bench/delphes-bench.yaml similarity index 100% rename from parameters/delphes-bench.yaml rename to parameters/bench/delphes-bench.yaml diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 7b5e4a24d..394be230e 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -42,9 +42,9 @@ loss: type: CategoricalCrossentropy from_logits: yes energy_loss: - type: MeanSquaredLogarithmicError + type: Huber pt_loss: - type: MeanSquaredLogarithmicError + type: Huber sin_phi_loss: type: Huber delta: 0.1 @@ -223,6 +223,11 @@ raytune: n_random_steps: 10 train_test_datasets: + multiparticlegun: + batch_per_gpu: 1 + event_pad_size: -1 + datasets: + - cms_pf_multi_particle_gun physical: batch_per_gpu: 1 event_pad_size: -1 @@ -307,3 +312,7 @@ datasets: version: 1.6.0 data_dir: manual_dir: + cms_pf_multi_particle_gun: + version: 1.6.0 + data_dir: + manual_dir: diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index dab0921ce..af31eabd3 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -3,7 +3,7 @@ # Tallinn export MANUAL_DIR=/local/joosep/mlpf/cms/v2 export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets -export IMG=/home/software/singularity/tf-2.13.0.simg +export IMG=/home/software/singularity/tf-2.14.0.simg export PYTHONPATH=`pwd`/mlpf export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$PYTHONPATH $IMG tfds build " @@ -26,6 +26,7 @@ export 
CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # $CMD mlpf/heptfds/cms_pf/singlepi --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singlepi.log & # $CMD mlpf/heptfds/cms_pf/singleproton --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singleproton.log & # $CMD mlpf/heptfds/cms_pf/singletau --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singletau.log & +# $CMD mlpf/heptfds/cms_pf/multiparticlegun --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite # wait # CLIC cluster-based @@ -39,10 +40,11 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # wait # CLIC hit-based -# export MANUAL_DIR=/local/joosep/mlpf_hits/clic_edm4hep/ -# export MANUAL_DIR=/media/joosep/data/mlpf_hits/clic_edm4hep/ +# export MANUAL_DIR=/local/joosep/mlpf/clic_edm4hep_hits/ # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/qq_10k --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qq_hits_10k.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar_hits.log & +# $CMD mlpf/heptfds/clic_pf_edm4hep_hits/ttbar_10k --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ttbar_hits_10k.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_kaon0L --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_kaon0L_hits.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_ele --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_ele_hits.log & # $CMD mlpf/heptfds/clic_pf_edm4hep_hits/single_pi0 --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_single_pi0_hits.log & @@ -53,7 +55,7 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # wait # Delphes -export MANUAL_DIR=/local/joosep/mlpf/delphes/ +# export MANUAL_DIR=/local/joosep/mlpf/delphes/ # $CMD mlpf/heptfds/delphes_pf/delphes_ttbar_pf --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_delphes_ttbar.log & -$CMD mlpf/heptfds/delphes_pf/delphes_qcd_pf --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_delphes_qcd.log & -wait +# $CMD mlpf/heptfds/delphes_pf/delphes_qcd_pf --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_delphes_qcd.log & +# wait diff --git a/scripts/tallinn/a100/clic-hits-train.sh b/scripts/tallinn/a100/clic-hits-train.sh index 399a74ee0..fc36ed61f 100755 --- a/scripts/tallinn/a100/clic-hits-train.sh +++ b/scripts/tallinn/a100/clic-hits-train.sh @@ -1,15 +1,15 @@ #!/bin/bash #SBATCH --partition gpu -#SBATCH --gres gpu:a100:2 +#SBATCH --gres gpu:a100:1 #SBATCH --mem-per-gpu 100G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.13.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #TF training singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/clic-hits.yaml \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/bench/clic-hits-bench.yaml \ --plot-freq 1 --num-cpus 32 --batch-multiplier 1 diff --git a/scripts/tallinn/a100/clic-train-hvd.sh b/scripts/tallinn/a100/clic-train-hvd.sh index 64fec3df2..773db3f97 100755 --- a/scripts/tallinn/a100/clic-train-hvd.sh +++ 
b/scripts/tallinn/a100/clic-train-hvd.sh @@ -4,14 +4,14 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.13.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #TF training singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG horovodrun -np 2 -H localhost:2 python mlpf/pipeline.py train -c parameters/clic-test.yaml \ + $IMG horovodrun -np 2 -H localhost:2 python3.10 mlpf/pipeline.py train -c parameters/clic-test.yaml \ --plot-freq 0 --num-cpus 32 --batch-multiplier 5 \ --horovod-enabled --ntrain 50000 --ntest 50000 --nepochs 11 --benchmark_dir exp_dir diff --git a/scripts/tallinn/a100/clic-train.sh b/scripts/tallinn/a100/clic-train.sh index 5b86808b1..3d096f02d 100755 --- a/scripts/tallinn/a100/clic-train.sh +++ b/scripts/tallinn/a100/clic-train.sh @@ -4,12 +4,12 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.13.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #TF training singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/clic.yaml \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/clic.yaml \ --plot-freq 1 --num-cpus 32 --batch-multiplier 5 diff --git a/scripts/tallinn/a100/cms-train.sh b/scripts/tallinn/a100/cms-train.sh index 733551f93..7bd0efe63 100755 --- a/scripts/tallinn/a100/cms-train.sh +++ b/scripts/tallinn/a100/cms-train.sh @@ -1,14 +1,15 @@ #!/bin/bash #SBATCH -p gpu -#SBATCH --gres gpu:a100:2 +#SBATCH --gres gpu:a100:1 #SBATCH --mem-per-gpu=40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.13.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #TF training singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/cms-gen.yaml --plot-freq 1 --num-cpus 32 --batch-multiplier 2 + $IMG python3.10 mlpf/pipeline.py train -c parameters/cms-gen.yaml --plot-freq 1 --num-cpus 32 --batch-multiplier 2 \ + --weights experiments/cms-gen_20231003_164730_341214.gpu1.local/weights/weights-37-1.192368.hdf5 diff --git a/scripts/tallinn/rtx/clic-train.sh b/scripts/tallinn/rtx/clic-train.sh index 7fe10b7a4..f4eba1ed7 100755 --- a/scripts/tallinn/rtx/clic-train.sh +++ b/scripts/tallinn/rtx/clic-train.sh @@ -4,13 +4,13 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.13.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #TF training singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/clic.yaml \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/clic.yaml \ --plot-freq 1 \ --batch-multiplier 0.5 diff --git a/scripts/tallinn/rtx/delphes-train.sh b/scripts/tallinn/rtx/delphes-train.sh index 982d87faa..67a987722 100755 --- a/scripts/tallinn/rtx/delphes-train.sh +++ b/scripts/tallinn/rtx/delphes-train.sh @@ -4,13 +4,13 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.13.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #TF training singularity 
exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py train -c parameters/delphes.yaml \ + $IMG python3.10 mlpf/pipeline.py train -c parameters/delphes.yaml \ --plot-freq 1 \ --batch-multiplier 0.5 diff --git a/scripts/tallinn/rtx/eval.sh b/scripts/tallinn/rtx/eval.sh index 9f1593f52..8542093d7 100755 --- a/scripts/tallinn/rtx/eval.sh +++ b/scripts/tallinn/rtx/eval.sh @@ -4,13 +4,12 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -#IMG=/home/software/singularity/tf-2.13.0.simg -IMG=/home/joosep/singularity/tf-2.14.0.simg +IMG=/home/software/singularity/tf-2.14.0.simg cd ~/particleflow #change these -EXPDIR=experiments/clic_20230927_171955_073632.gpu0.local -WEIGHTS=../test/particleflow/models/mlpf-clic-2023-results/clusters_best_tuned_gnn_clic_v130/weights/weights-96-5.346523.hdf5 +EXPDIR=experiments/cms-gen_20230926_205923_762855.gpu1.local +WEIGHTS=experiments/cms-gen_20230926_205923_762855.gpu1.local/weights/weights-31-0.417710.hdf5 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ @@ -21,5 +20,5 @@ singularity exec -B /scratch/persistent --nv \ singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python mlpf/pipeline.py plots \ + $IMG python3.10 mlpf/pipeline.py plots \ --train-dir $EXPDIR diff --git a/timing.md b/timing.md new file mode 100644 index 000000000..c6d2b9ec2 --- /dev/null +++ b/timing.md @@ -0,0 +1,36 @@ +# CLIC v1.6 + +## On CPU + +Tested on Intel(R) Xeon(R) Silver 4214R CPU @ 2.40GHz, single thread. + +``` +batch_size=20 bin_size=256 num_features=17 use_gpu=False num_threads=1 +Nelem=256 mean_time=171.83 ms stddev_time=0.71 ms mem_used=193 MB +Nelem=512 mean_time=343.16 ms stddev_time=3.18 ms mem_used=282 MB +Nelem=2560 mean_time=1689.36 ms stddev_time=6.76 ms mem_used=1056 MB +Nelem=5120 mean_time=3339.05 ms stddev_time=6.48 ms mem_used=2038 MB +Nelem=10240 mean_time=6707.42 ms stddev_time=5.38 ms mem_used=3997 MB +``` + +On 12 threads +``` +batch_size=20 bin_size=256 num_features=17 use_gpu=False num_threads=12 +Nelem=256 mean_time=42.46 ms stddev_time=1.02 ms mem_used=169 MB +Nelem=512 mean_time=78.22 ms stddev_time=0.65 ms mem_used=213 MB +Nelem=2560 mean_time=377.50 ms stddev_time=4.07 ms mem_used=612 MB +Nelem=5120 mean_time=740.40 ms stddev_time=4.85 ms mem_used=1181 MB +Nelem=10240 mean_time=1458.50 ms stddev_time=11.97 ms mem_used=2319 MB +``` + +## On GPU + +Tested on RTX2070S 8192MiB. +``` +batch_size=20 bin_size=256 num_features=17 use_gpu=True num_threads=1 +Nelem=256 mean_time=1.96 ms stddev_time=0.52 ms mem_used=782 MB +Nelem=512 mean_time=3.21 ms stddev_time=0.15 ms mem_used=916 MB +Nelem=2560 mean_time=21.48 ms stddev_time=0.14 ms mem_used=1721 MB +Nelem=5120 mean_time=50.09 ms stddev_time=0.19 ms mem_used=2795 MB +Nelem=10240 mean_time=106.15 ms stddev_time=0.63 ms mem_used=4943 MB +```
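
## Reproducing the numbers

A minimal sketch of the invocations behind the tables above, using `mlpf/timing.py` with a placeholder model path `model.onnx`; the flags follow the argument parser in that script:

```
# CPU, single thread
python3 mlpf/timing.py model.onnx --num-threads 1

# CPU, 12 threads
python3 mlpf/timing.py model.onnx --num-threads 12

# GPU (install onnxruntime_gpu instead of onnxruntime)
python3 mlpf/timing.py model.onnx --use-gpu
```

In each row, `mean_time` is the average per-event latency (the per-batch time divided by the batch size), and `mem_used` is the peak resident set size on CPU (via `resource.getrusage`) or the peak device memory reported by `pynvml` on GPU.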