try to improve val loss stability (jpata#342)
* log correction for pt and e
jpata authored Sep 4, 2024
1 parent d0c45d7 commit 5262cdb
Showing 10 changed files with 72 additions and 33 deletions.
7 changes: 3 additions & 4 deletions mlpf/data_cms/genjob_pu55to75.sh
@@ -75,11 +75,10 @@ ls -lrt
 echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py
 cmsRun step2_phase1_new.py > /dev/null
 cmsRun step3_phase1_new.py > /dev/null
-#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
 mv pfntuple.root pfntuple_${SEED}.root
-# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./
-# bzip2 -z pfntuple_${SEED}.pkl
-# cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/
+python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./
+bzip2 -z pfntuple_${SEED}.pkl
+cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/

 #copy ROOT outputs
 #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root
3 changes: 2 additions & 1 deletion mlpf/data_cms/postprocessing_jobs.py
@@ -34,13 +34,14 @@ def write_script(infiles, outfiles):

 samples = [
     "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi",
+    "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi",
 ]

 ichunk = 1
 for sample in samples:
     infiles = list(glob.glob(f"{sample}/root/pfntuple*.root"))
     for infiles_chunk in chunks(infiles, 10):
-        outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw2/") for inf in infiles_chunk]
+        outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw/") for inf in infiles_chunk]
         os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True)
         scr = write_script(infiles_chunk, outfiles_chunk)
         ofname = f"jobscripts/postproc_{ichunk}.sh"
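For context, chunks above slices the input file list into fixed-size batches of 10 before each job script is written. A minimal sketch of such a helper, assuming the real definition (which lives elsewhere in the repo) follows the standard recipe:

def chunks(lst, n):
    # yield successive n-sized slices of lst; the last slice may be shorter
    for i in range(0, len(lst), n):
        yield lst[i : i + n]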
10 changes: 5 additions & 5 deletions mlpf/data_cms/prepare_args.py
@@ -6,15 +6,15 @@
 outdir = "/local/joosep/mlpf/cms/20240823_simcluster"

 samples = [
-    # ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
     # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
-    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305000, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 305000, 310010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
     # ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
     # ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
-    # ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    # ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 705010, "genjob_pu55to75.sh", outdir + "/pu55to75"),

     # ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"),
-    # ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"),
+    ("MultiParticlePFGun50_cfi", 800000, 805000, "genjob_nopu.sh", outdir + "/nopu"),
     # ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"),
     # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"),

@@ -36,6 +36,6 @@
         os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True)

     for seed in range(seed0, seed1):
-        p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed)
+        p = this_outdir + "/" + samp + "/root/pfntuple_{}.root".format(seed)
         if not os.path.isfile(p):
             print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}")
14 changes: 14 additions & 0 deletions mlpf/pyg/PFDataset.py
@@ -70,6 +70,20 @@ def __getitem__(self, item):
         ret["ygen"][:, 0][(ret["X"][:, 0] == 10) & (ret["ygen"][:, 0] == 7)] = 2
         ret["ygen"][:, 0][(ret["X"][:, 0] == 11) & (ret["ygen"][:, 0] == 7)] = 2

+        # set pt for HO (elemtype 10), otherwise 0: pt = sqrt(E^2 - (E*tanh(eta))^2) = E / cosh(eta)
+        msk_ho = ret["X"][:, 0] == 10
+        eta = ret["X"][:, 2][msk_ho]
+        e = ret["X"][:, 5][msk_ho]
+        ret["X"][:, 1][msk_ho] = np.sqrt(e**2 - (np.tanh(eta) * e) ** 2)
+
+        # transform the regression targets pt -> log(pt / elem pt) and E -> log(E / elem E)
+        ret["ygen"][:, 6] = np.log(ret["ygen"][:, 6] / ret["X"][:, 5])
+        ret["ygen"][:, 6][np.isnan(ret["ygen"][:, 6])] = 0.0
+        ret["ygen"][:, 6][np.isinf(ret["ygen"][:, 6])] = 0.0
+        ret["ygen"][:, 2] = np.log(ret["ygen"][:, 2] / ret["X"][:, 1])
+        ret["ygen"][:, 2][np.isnan(ret["ygen"][:, 2])] = 0.0
+        ret["ygen"][:, 2][np.isinf(ret["ygen"][:, 2])] = 0.0
+
         return ret

     def __len__(self):
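The regression targets are thus dimensionless log-ratios between the generated particle and its seed element, which keeps the regression well conditioned across orders of magnitude in pt and E. A minimal standalone numpy sketch of the transform and its inverse (illustration only, not the repo's API):

import numpy as np

def to_logratio(target, elem):
    # encode target as log(target / elem); zero out ill-defined entries (elem or target <= 0)
    r = np.log(target / elem)
    r[~np.isfinite(r)] = 0.0
    return r

def from_logratio(encoded, elem):
    # decode: target = exp(encoded) * elem
    return np.exp(encoded) * elem

pt_elem = np.array([1.0, 50.0, 200.0])
pt_gen = np.array([2.0, 45.0, 210.0])
enc = to_logratio(pt_gen, pt_elem)  # [0.693, -0.105, 0.049]
assert np.allclose(from_logratio(enc, pt_elem), pt_gen)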
8 changes: 8 additions & 0 deletions mlpf/pyg/inference.py
@@ -42,6 +42,14 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m
     batch = batch.to(rank)
     ypred = model(batch.X, batch.mask)

+    # transform log(pt / elem pt) -> pt
+    ypred[2][..., 0] = torch.exp(ypred[2][..., 0]) * batch.X[..., 1]
+    batch.ygen[..., 2] = torch.exp(batch.ygen[..., 2]) * batch.X[..., 1]
+
+    # transform log(E / elem E) -> E
+    ypred[2][..., 4] = torch.exp(ypred[2][..., 4]) * batch.X[..., 5]
+    batch.ygen[..., 6] = torch.exp(batch.ygen[..., 6]) * batch.X[..., 5]
+
     # convert all outputs to float32 in case running in float16 or bfloat16
     ypred = tuple([y.to(torch.float32) for y in ypred])

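A quick self-consistency check that this inverse matches the dataset-side encoding (a sketch with made-up tensors; the column conventions, elem pt in X[..., 1] and elem E in X[..., 5], follow PFDataset above):

import torch

elem_pt = torch.tensor([1.5, 20.0, 300.0])
gen_pt = torch.tensor([1.0, 25.0, 280.0])

encoded = torch.log(gen_pt / elem_pt)   # what PFDataset stores in ygen[..., 2]
decoded = torch.exp(encoded) * elem_pt  # what predict_one_batch reconstructs
assert torch.allclose(decoded, gen_pt)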
16 changes: 8 additions & 8 deletions mlpf/pyg/mlpf.py
@@ -220,11 +220,11 @@ def __init__(
         layernorm=True,
         conv_type="attention",
         input_encoding="joint",
-        pt_mode="additive-elemtype",
-        eta_mode="additive-elemtype",
-        sin_phi_mode="additive-elemtype",
-        cos_phi_mode="additive-elemtype",
-        energy_mode="additive-elemtype",
+        pt_mode="linear",
+        eta_mode="linear",
+        sin_phi_mode="linear",
+        cos_phi_mode="linear",
+        energy_mode="linear",
         # element types which actually exist in the dataset
         elemtypes_nonzero=[1, 4, 5, 6, 8, 9, 10, 11],
         # should the conv layer outputs be concatted (concat) or take the last (last)
@@ -348,7 +348,7 @@ def __init__(
         self.nn_pid = ffn(decoding_dim, num_classes, width, self.act, dropout_ff)

         # elementwise DNN for node momentum regression
-        embed_dim = decoding_dim + 2 + num_classes
+        embed_dim = decoding_dim
         self.nn_pt = RegressionOutput(pt_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)
         self.nn_eta = RegressionOutput(eta_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)
         self.nn_sin_phi = RegressionOutput(sin_phi_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)
@@ -402,9 +402,9 @@ def forward(self, X_features, mask):

         # regression input
         if self.learned_representation_mode == "concat":
-            final_embedding_reg = torch.cat([Xfeat_normed] + embeddings_reg + [preds_binary_particle.detach(), preds_pid.detach()], axis=-1)
+            final_embedding_reg = torch.cat([Xfeat_normed] + embeddings_reg, axis=-1)
         elif self.learned_representation_mode == "last":
-            final_embedding_reg = torch.cat([Xfeat_normed] + [embeddings_reg[-1]] + [preds_binary_particle.detach(), preds_pid.detach()], axis=-1)
+            final_embedding_reg = torch.cat([Xfeat_normed] + [embeddings_reg[-1]], axis=-1)

         if self.use_pre_layernorm:
             final_embedding_reg = self.final_norm_reg(final_embedding_reg)
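With the detached classification outputs no longer appended to the regression input, the regression heads see only the learned representation, so embed_dim drops from decoding_dim + 2 + num_classes back to decoding_dim. A toy shape check to make the arithmetic concrete (dimensions are hypothetical):

import torch

nelem, decoding_dim, num_classes = 4, 256, 13
rep = torch.randn(nelem, decoding_dim)     # stand-in for the learned representation
cls_binary = torch.randn(nelem, 2)         # stand-in for preds_binary_particle
cls_pid = torch.randn(nelem, num_classes)  # stand-in for preds_pid

old_input = torch.cat([rep, cls_binary.detach(), cls_pid.detach()], axis=-1)
new_input = rep
print(old_input.shape[-1], new_input.shape[-1])  # 271 vs 256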
27 changes: 21 additions & 6 deletions mlpf/pyg/training.py
@@ -15,6 +15,7 @@
 import sklearn
 import sklearn.metrics
 import numpy as np
+import pandas

 # comet needs to be imported before torch
 from comet_ml import OfflineExperiment, Experiment  # noqa: F401, isort:skip
@@ -98,7 +99,7 @@ def mlpf_loss(y, ypred, batch):
     loss_pid_classification[y["cls_id"] == 0] *= 0

     # compare particle momentum, only for cases where there was a true particle
-    loss_regression = 10 * torch.nn.functional.huber_loss(ypred["momentum"], y["momentum"], reduction="none")
+    loss_regression = 10 * torch.nn.functional.mse_loss(ypred["momentum"], y["momentum"], reduction="none")
     loss_regression[y["cls_id"] == 0] *= 0

     # set the loss to 0 on padded elements in the batch
@@ -111,11 +112,12 @@
     loss["Classification"] = loss_pid_classification.sum() / nelem

     # normalize loss with stddev to stabilize across batches with very different pt, E distributions
-    mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
+    # mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
     reg_losses = loss_regression[y["cls_id"] != 0]

     # average over all true particles
-    loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
+    # loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
+    loss["Regression"] = reg_losses.sum() / npart

     # in case we are using the 3D-padded mode, we can compute a few additional event-level monitoring losses
     msk_pred_particle = torch.unsqueeze(torch.argmax(ypred["cls_binary"].detach(), axis=1) != 0, axis=-1)
@@ -298,12 +300,12 @@ def train_and_valid(

         with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
             if is_train:
-                ypred = model(batch.X, batch.mask)
+                ypred_raw = model(batch.X, batch.mask)
             else:
                 with torch.no_grad():
-                    ypred = model(batch.X, batch.mask)
+                    ypred_raw = model(batch.X, batch.mask)

-        ypred = unpack_predictions(ypred)
+        ypred = unpack_predictions(ypred_raw)

         if not is_train:
             cm_X_gen += sklearn.metrics.confusion_matrix(
@@ -315,6 +317,19 @@
                 cm_id += sklearn.metrics.confusion_matrix(
                     ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13)
                 )
+                # save the events of the first validation batch for quick checks
+                if itrain == 0:
+                    arr = (
+                        torch.concatenate(
+                            [batch.X[batch.mask], batch.ygen[batch.mask], ypred_raw[0][batch.mask], ypred_raw[1][batch.mask], ypred_raw[2][batch.mask]],
+                            axis=-1,
+                        )
+                        .detach()
+                        .cpu()
+                        .numpy()
+                    )
+                    df = pandas.DataFrame(arr)
+                    df.to_parquet(f"{outdir}/batch0_epoch{epoch}.parquet")

         with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
             if is_train:
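The regression objective is therefore a plain (scaled) MSE over the momentum components, zeroed for elements without a matched true particle and averaged over the number of true particles, instead of being renormalized by a per-batch stddev, which the commit title suggests was a source of validation-loss instability. A minimal sketch of that reduction (names and shapes are illustrative, not the repo's exact API):

import torch

def regression_term(pred_mom, true_mom, cls_id):
    # pred_mom, true_mom: [nelem, nfeat]; cls_id: [nelem], 0 means no matched true particle
    loss = 10 * torch.nn.functional.mse_loss(pred_mom, true_mom, reduction="none")
    loss[cls_id == 0] *= 0  # drop unmatched elements
    npart = (cls_id != 0).sum().clamp(min=1)
    return loss[cls_id != 0].sum() / npart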
6 changes: 3 additions & 3 deletions parameters/pytorch/pyg-cms.yaml
@@ -8,7 +8,7 @@ gpu_batch_multiplier: 1
 load:
 num_epochs: 100
 patience: 20
-lr: 0.0001
+lr: 0.00001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
 conv_type: attention
 ntrain:
@@ -30,11 +30,11 @@ model:

   learned_representation_mode: last #last, concat
   input_encoding: joint #split, joint
-  pt_mode: linear
+  pt_mode: direct
   eta_mode: linear
   sin_phi_mode: linear
   cos_phi_mode: linear
-  energy_mode: linear
+  energy_mode: direct

   gnn_lsh:
     conv_type: gnn_lsh
9 changes: 5 additions & 4 deletions scripts/tallinn/a100/pytorch-small.sh
@@ -4,14 +4,15 @@
 #SBATCH --mem-per-gpu 60G
 #SBATCH -o logs/slurm-%x-%j-%N.out

-IMG=/home/software/singularity/pytorch.simg:2024-07-08
+IMG=/home/software/singularity/pytorch.simg:2024-08-18
 cd ~/particleflow

 env

 ulimit -n 10000
 singularity exec -B /scratch/persistent --nv \
     --env PYTHONPATH=hep_tfds \
-    $IMG python3.10 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \
-    --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \
-    --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 40 --num-workers 1 --prefetch-factor 50 --dtype bfloat16 --checkpoint-freq 1
+    --env KERAS_BACKEND=torch \
+    $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \
+    --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \
+    --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 10 --dtype bfloat16 --checkpoint-freq 1 --ntrain 1000 --nvalid 1000 --ntest 1000 --comet
5 changes: 3 additions & 2 deletions scripts/tallinn/a100/pytorch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --partition gpu
 #SBATCH --gres gpu:a100:1
-#SBATCH --mem-per-gpu 100G
+#SBATCH --mem-per-gpu 200G
 #SBATCH -o logs/slurm-%x-%j-%N.out

 IMG=/home/software/singularity/pytorch.simg:2024-08-18
@@ -14,4 +14,5 @@ singularity exec -B /scratch/persistent --nv \
     $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \
     --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \
     --train --test --make-plots --num-epochs 100 --conv-type attention \
-    --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet
+    --num-epochs 10 \
+    --gpu-batch-multiplier 32 --checkpoint-freq 1 --num-workers 16 --prefetch-factor 10 --comet --ntrain 1000 --ntest 1000 --nvalid 1000
