From 5262cdb2a4823bf32af07789fdfd6a087ca11e02 Mon Sep 17 00:00:00 2001
From: Joosep Pata
Date: Wed, 4 Sep 2024 16:11:46 +0300
Subject: [PATCH] try to improve val loss stability (#342)

* log correction for pt and e
---
 mlpf/data_cms/genjob_pu55to75.sh      |  7 +++----
 mlpf/data_cms/postprocessing_jobs.py  |  3 ++-
 mlpf/data_cms/prepare_args.py         | 10 +++++-----
 mlpf/pyg/PFDataset.py                 | 14 ++++++++++++++
 mlpf/pyg/inference.py                 |  8 ++++++++
 mlpf/pyg/mlpf.py                      | 16 ++++++++--------
 mlpf/pyg/training.py                  | 27 +++++++++++++++++++++------
 parameters/pytorch/pyg-cms.yaml       |  6 +++---
 scripts/tallinn/a100/pytorch-small.sh |  9 +++++----
 scripts/tallinn/a100/pytorch.sh       |  5 +++--
 10 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/mlpf/data_cms/genjob_pu55to75.sh b/mlpf/data_cms/genjob_pu55to75.sh
index 54ed7c166..75a5866b7 100755
--- a/mlpf/data_cms/genjob_pu55to75.sh
+++ b/mlpf/data_cms/genjob_pu55to75.sh
@@ -75,11 +75,10 @@ ls -lrt
 echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py
 cmsRun step2_phase1_new.py > /dev/null
 cmsRun step3_phase1_new.py > /dev/null
-#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
 mv pfntuple.root pfntuple_${SEED}.root
-# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./
-# bzip2 -z pfntuple_${SEED}.pkl
-# cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/
+python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./
+bzip2 -z pfntuple_${SEED}.pkl
+cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/
 
 #copy ROOT outputs
 #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root
diff --git a/mlpf/data_cms/postprocessing_jobs.py b/mlpf/data_cms/postprocessing_jobs.py
index 685470a3d..43836aa96 100644
--- a/mlpf/data_cms/postprocessing_jobs.py
+++ b/mlpf/data_cms/postprocessing_jobs.py
@@ -34,13 +34,14 @@ def write_script(infiles, outfiles):
 
 samples = [
     "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi",
+    "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi",
 ]
 
 ichunk = 1
 for sample in samples:
     infiles = list(glob.glob(f"{sample}/root/pfntuple*.root"))
     for infiles_chunk in chunks(infiles, 10):
-        outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw2/") for inf in infiles_chunk]
+        outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw/") for inf in infiles_chunk]
         os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True)
         scr = write_script(infiles_chunk, outfiles_chunk)
         ofname = f"jobscripts/postproc_{ichunk}.sh"
diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py
index a07892e56..328a510b6 100644
--- a/mlpf/data_cms/prepare_args.py
+++ b/mlpf/data_cms/prepare_args.py
@@ -6,15 +6,15 @@
 outdir = "/local/joosep/mlpf/cms/20240823_simcluster"
 
 samples = [
-#    ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
 #    ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
-    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305000, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 305000, 310010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
 #    ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
 #    ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
-#    ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+#    ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 705010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
 #    ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"),
-#    ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"),
+    ("MultiParticlePFGun50_cfi", 800000, 805000, "genjob_nopu.sh", outdir + "/nopu"),
 #    ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"),
 #    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"),
@@ -36,6 +36,6 @@
         os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True)
 
     for seed in range(seed0, seed1):
-        p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed)
+        p = this_outdir + "/" + samp + "/root/pfntuple_{}.root".format(seed)
         if not os.path.isfile(p):
            print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}")
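Note on the PFDataset hunk just below: for HO elements (type 10) the input pT is filled in from the element energy and pseudorapidity. Since pz = E * tanh(eta) for an energy-like quantity, the patched expression sqrt(E^2 - (tanh(eta) * E)^2) reduces to E / cosh(eta). A minimal numpy check of that identity (standalone sketch with illustrative values, not part of the patch):

import numpy as np

eta = np.array([0.0, 1.2, -2.5])
e = np.array([10.0, 50.0, 120.0])

# expression used in the PFDataset hunk below
pt = np.sqrt(e**2 - (np.tanh(eta) * e) ** 2)

# equivalent closed form, since 1 - tanh(eta)^2 = 1 / cosh(eta)^2
assert np.allclose(pt, e / np.cosh(eta))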
"genjob_pu55to75.sh", outdir + "/pu55to75"), -# ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"), +# ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 705010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"), -# ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"), + ("MultiParticlePFGun50_cfi", 800000, 805000, "genjob_nopu.sh", outdir + "/nopu"), # ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"), # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), @@ -36,6 +36,6 @@ os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True) for seed in range(seed0, seed1): - p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed) + p = this_outdir + "/" + samp + "/root/pfntuple_{}.root".format(seed) if not os.path.isfile(p): print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}") diff --git a/mlpf/pyg/PFDataset.py b/mlpf/pyg/PFDataset.py index 22bcec6a4..da148ed14 100644 --- a/mlpf/pyg/PFDataset.py +++ b/mlpf/pyg/PFDataset.py @@ -70,6 +70,20 @@ def __getitem__(self, item): ret["ygen"][:, 0][(ret["X"][:, 0] == 10) & (ret["ygen"][:, 0] == 7)] = 2 ret["ygen"][:, 0][(ret["X"][:, 0] == 11) & (ret["ygen"][:, 0] == 7)] = 2 + # set pt for HO which would otherwise be 0 + msk_ho = ret["X"][:, 0] == 10 + eta = ret["X"][:, 2][msk_ho] + e = ret["X"][:, 5][msk_ho] + ret["X"][:, 1][msk_ho] = np.sqrt(e**2 - (np.tanh(eta) * e) ** 2) + + # transform pt -> log(pt / elem pt), same for energy + ret["ygen"][:, 6] = np.log(ret["ygen"][:, 6] / ret["X"][:, 5]) + ret["ygen"][:, 6][np.isnan(ret["ygen"][:, 6])] = 0.0 + ret["ygen"][:, 6][np.isinf(ret["ygen"][:, 6])] = 0.0 + ret["ygen"][:, 2] = np.log(ret["ygen"][:, 2] / ret["X"][:, 1]) + ret["ygen"][:, 2][np.isnan(ret["ygen"][:, 2])] = 0.0 + ret["ygen"][:, 2][np.isinf(ret["ygen"][:, 2])] = 0.0 + return ret def __len__(self): diff --git a/mlpf/pyg/inference.py b/mlpf/pyg/inference.py index 8d874c025..5816d07c7 100644 --- a/mlpf/pyg/inference.py +++ b/mlpf/pyg/inference.py @@ -42,6 +42,14 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m batch = batch.to(rank) ypred = model(batch.X, batch.mask) + # transform log (pt/elempt) -> pt + ypred[2][..., 0] = torch.exp(ypred[2][..., 0]) * batch.X[..., 1] + batch.ygen[..., 2] = torch.exp(batch.ygen[..., 2]) * batch.X[..., 1] + + # transform log (E/elemE) -> E + ypred[2][..., 4] = torch.exp(ypred[2][..., 4]) * batch.X[..., 5] + batch.ygen[..., 6] = torch.exp(batch.ygen[..., 6]) * batch.X[..., 1] + # convert all outputs to float32 in case running in float16 or bfloat16 ypred = tuple([y.to(torch.float32) for y in ypred]) diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index ccf4523a5..f5f908de1 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -220,11 +220,11 @@ def __init__( layernorm=True, conv_type="attention", input_encoding="joint", - pt_mode="additive-elemtype", - eta_mode="additive-elemtype", - sin_phi_mode="additive-elemtype", - cos_phi_mode="additive-elemtype", - energy_mode="additive-elemtype", + pt_mode="linear", + eta_mode="linear", + sin_phi_mode="linear", + cos_phi_mode="linear", + energy_mode="linear", # element types which actually exist in the dataset elemtypes_nonzero=[1, 4, 5, 6, 8, 9, 10, 11], # should the conv layer outputs be concatted (concat) or take 
diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py
index 0100a9a72..092f3e790 100644
--- a/mlpf/pyg/training.py
+++ b/mlpf/pyg/training.py
@@ -15,6 +15,7 @@
 import sklearn
 import sklearn.metrics
 import numpy as np
+import pandas
 
 # comet needs to be imported before torch
 from comet_ml import OfflineExperiment, Experiment  # noqa: F401, isort:skip
@@ -98,7 +99,7 @@ def mlpf_loss(y, ypred, batch):
     loss_pid_classification[y["cls_id"] == 0] *= 0
 
     # compare particle momentum, only for cases where there was a true particle
-    loss_regression = 10 * torch.nn.functional.huber_loss(ypred["momentum"], y["momentum"], reduction="none")
+    loss_regression = 10 * torch.nn.functional.mse_loss(ypred["momentum"], y["momentum"], reduction="none")
     loss_regression[y["cls_id"] == 0] *= 0
 
     # set the loss to 0 on padded elements in the batch
@@ -111,11 +112,12 @@ def mlpf_loss(y, ypred, batch):
     loss["Classification"] = loss_pid_classification.sum() / nelem
 
     # normalize loss with stddev to stabilize across batches with very different pt, E distributions
-    mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
+    # mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
     reg_losses = loss_regression[y["cls_id"] != 0]
 
     # average over all true particles
-    loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
+    # loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
+    loss["Regression"] = reg_losses.sum() / npart
 
     # in case we are using the 3D-padded mode, we can compute a few additional event-level monitoring losses
     msk_pred_particle = torch.unsqueeze(torch.argmax(ypred["cls_binary"].detach(), axis=1) != 0, axis=-1)
@@ -298,12 +300,12 @@ def train_and_valid(
 
             with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
                 if is_train:
-                    ypred = model(batch.X, batch.mask)
+                    ypred_raw = model(batch.X, batch.mask)
                 else:
                     with torch.no_grad():
-                        ypred = model(batch.X, batch.mask)
+                        ypred_raw = model(batch.X, batch.mask)
 
-            ypred = unpack_predictions(ypred)
+            ypred = unpack_predictions(ypred_raw)
 
             if not is_train:
                 cm_X_gen += sklearn.metrics.confusion_matrix(
@@ -315,6 +317,19 @@
                 cm_id += sklearn.metrics.confusion_matrix(
                     ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13)
                 )
+                # save the events of the first validation batch for quick checks
+                if itrain == 0:
+                    arr = (
+                        torch.concatenate(
+                            [batch.X[batch.mask], batch.ygen[batch.mask], ypred_raw[0][batch.mask], ypred_raw[1][batch.mask], ypred_raw[2][batch.mask]],
+                            axis=-1,
+                        )
+                        .detach()
+                        .cpu()
+                        .numpy()
+                    )
+                    df = pandas.DataFrame(arr)
+                    df.to_parquet(f"{outdir}/batch0_epoch{epoch}.parquet")
 
             with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
                 if is_train:
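The training hunk above writes the first validation batch to {outdir}/batch0_epoch{epoch}.parquet every epoch, so raw predictions can be compared against targets across epochs offline. A quick inspection sketch; the run directory is hypothetical, and the unnamed positional columns follow the torch.concatenate order above (X, ygen, then the three raw output heads):

import glob
import pandas

for fn in sorted(glob.glob("experiments/mlpf_run/batch0_epoch*.parquet")):  # hypothetical run dir
    df = pandas.read_parquet(fn)
    # one row per unpadded element in the first validation batch
    print(fn, df.shape)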
diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml
index 68e160693..c45d59fdb 100644
--- a/parameters/pytorch/pyg-cms.yaml
+++ b/parameters/pytorch/pyg-cms.yaml
@@ -8,7 +8,7 @@ gpu_batch_multiplier: 1
 load:
 num_epochs: 100
 patience: 20
-lr: 0.0001
+lr: 0.00001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
 conv_type: attention
 ntrain:
@@ -30,11 +30,11 @@ model:
   learned_representation_mode: last #last, concat
   input_encoding: joint #split, joint
-  pt_mode: linear
+  pt_mode: direct
   eta_mode: linear
   sin_phi_mode: linear
   cos_phi_mode: linear
-  energy_mode: linear
+  energy_mode: direct
 
   gnn_lsh:
     conv_type: gnn_lsh
 
diff --git a/scripts/tallinn/a100/pytorch-small.sh b/scripts/tallinn/a100/pytorch-small.sh
index 76feb888a..732b6d371 100755
--- a/scripts/tallinn/a100/pytorch-small.sh
+++ b/scripts/tallinn/a100/pytorch-small.sh
@@ -4,14 +4,15 @@
 #SBATCH --mem-per-gpu 60G
 #SBATCH -o logs/slurm-%x-%j-%N.out
 
-IMG=/home/software/singularity/pytorch.simg:2024-07-08
+IMG=/home/software/singularity/pytorch.simg:2024-08-18
 cd ~/particleflow
 
 env
+ulimit -n 10000
 
 singularity exec -B /scratch/persistent --nv \
     --env PYTHONPATH=hep_tfds \
     --env KERAS_BACKEND=torch \
-    $IMG python3.10 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \
-    --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \
-    --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 40 --num-workers 1 --prefetch-factor 50 --dtype bfloat16 --checkpoint-freq 1
+    $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \
+    --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \
+    --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 10 --dtype bfloat16 --checkpoint-freq 1 --ntrain 1000 --nvalid 1000 --ntest 1000 --comet
diff --git a/scripts/tallinn/a100/pytorch.sh b/scripts/tallinn/a100/pytorch.sh
index 8a0535da2..628524119 100755
--- a/scripts/tallinn/a100/pytorch.sh
+++ b/scripts/tallinn/a100/pytorch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --partition gpu
 #SBATCH --gres gpu:a100:1
-#SBATCH --mem-per-gpu 100G
+#SBATCH --mem-per-gpu 200G
 #SBATCH -o logs/slurm-%x-%j-%N.out
 
 IMG=/home/software/singularity/pytorch.simg:2024-08-18
@@ -14,4 +14,5 @@ singularity exec -B /scratch/persistent --nv \
     $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \
     --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \
     --train --test --make-plots --num-epochs 100 --conv-type attention \
-    --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet
+    --num-epochs 10 \
+    --gpu-batch-multiplier 32 --checkpoint-freq 1 --num-workers 16 --prefetch-factor 10 --comet --ntrain 1000 --ntest 1000 --nvalid 1000
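On the config side, lr drops from 1e-4 to 1e-5 while lr_schedule stays cosinedecay, so the learning rate starts at 1e-5 and decays along a cosine; pt_mode and energy_mode switch from linear to direct, consistent with the regression heads now predicting the log-ratio targets defined in PFDataset above. A minimal sketch of the cosine-decay shape using torch's built-in scheduler (the pipeline's own scheduler setup may differ):

import torch

model = torch.nn.Linear(4, 4)
opt = torch.optim.AdamW(model.parameters(), lr=1e-5)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=100)  # decay over 100 epochs

for epoch in range(100):
    # ... one training epoch would go here ...
    opt.step()
    sched.step()
    if epoch % 20 == 0:
        print(epoch, sched.get_last_lr())  # cosine decay from 1e-5 towards 0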