try to improve val loss stability (jpata#342)
* log correction for pt and e
jpata authored Sep 4, 2024
1 parent d0c45d7 commit 5262cdb
Showing 10 changed files with 72 additions and 33 deletions.
7 changes: 3 additions & 4 deletions mlpf/data_cms/genjob_pu55to75.sh
@@ -75,11 +75,10 @@ ls -lrt
 echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py
 cmsRun step2_phase1_new.py > /dev/null
 cmsRun step3_phase1_new.py > /dev/null
-#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
 mv pfntuple.root pfntuple_${SEED}.root
-# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./
-# bzip2 -z pfntuple_${SEED}.pkl
-# cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/
+python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./
+bzip2 -z pfntuple_${SEED}.pkl
+cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/

 #copy ROOT outputs
 #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root
3 changes: 2 additions & 1 deletion mlpf/data_cms/postprocessing_jobs.py
@@ -34,13 +34,14 @@ def write_script(infiles, outfiles):

 samples = [
     "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi",
+    "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi",
 ]

 ichunk = 1
 for sample in samples:
     infiles = list(glob.glob(f"{sample}/root/pfntuple*.root"))
     for infiles_chunk in chunks(infiles, 10):
-        outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw2/") for inf in infiles_chunk]
+        outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw/") for inf in infiles_chunk]
         os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True)
         scr = write_script(infiles_chunk, outfiles_chunk)
         ofname = f"jobscripts/postproc_{ichunk}.sh"
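For context, chunks above slices the input file list into fixed-size batches of 10 before each job script is written. A minimal sketch of such a helper, assuming the real definition (which lives elsewhere in the repo) follows the standard recipe:

def chunks(lst, n):
    # yield successive n-sized slices of lst; the last slice may be shorter
    for i in range(0, len(lst), n):
        yield lst[i : i + n]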
10 changes: 5 additions & 5 deletions mlpf/data_cms/prepare_args.py
@@ -6,15 +6,15 @@
 outdir = "/local/joosep/mlpf/cms/20240823_simcluster"

 samples = [
-    # ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
     # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
-    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305000, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 305000, 310010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
     # ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
     # ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
-    # ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
+    # ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 705010, "genjob_pu55to75.sh", outdir + "/pu55to75"),

     # ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"),
-    # ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"),
+    ("MultiParticlePFGun50_cfi", 800000, 805000, "genjob_nopu.sh", outdir + "/nopu"),
     # ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"),
     # ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"),

@@ -36,6 +36,6 @@
         os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True)

     for seed in range(seed0, seed1):
-        p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed)
+        p = this_outdir + "/" + samp + "/root/pfntuple_{}.root".format(seed)
         if not os.path.isfile(p):
             print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}")
14 changes: 14 additions & 0 deletions mlpf/pyg/PFDataset.py
@@ -70,6 +70,20 @@ def __getitem__(self, item):
         ret["ygen"][:, 0][(ret["X"][:, 0] == 10) & (ret["ygen"][:, 0] == 7)] = 2
         ret["ygen"][:, 0][(ret["X"][:, 0] == 11) & (ret["ygen"][:, 0] == 7)] = 2

+        # set pt for HO (elemtype 10), otherwise 0: pt = sqrt(E^2 - (E*tanh(eta))^2) = E / cosh(eta)
+        msk_ho = ret["X"][:, 0] == 10
+        eta = ret["X"][:, 2][msk_ho]
+        e = ret["X"][:, 5][msk_ho]
+        ret["X"][:, 1][msk_ho] = np.sqrt(e**2 - (np.tanh(eta) * e) ** 2)
+
+        # transform the regression targets pt -> log(pt / elem pt) and E -> log(E / elem E)
+        ret["ygen"][:, 6] = np.log(ret["ygen"][:, 6] / ret["X"][:, 5])
+        ret["ygen"][:, 6][np.isnan(ret["ygen"][:, 6])] = 0.0
+        ret["ygen"][:, 6][np.isinf(ret["ygen"][:, 6])] = 0.0
+        ret["ygen"][:, 2] = np.log(ret["ygen"][:, 2] / ret["X"][:, 1])
+        ret["ygen"][:, 2][np.isnan(ret["ygen"][:, 2])] = 0.0
+        ret["ygen"][:, 2][np.isinf(ret["ygen"][:, 2])] = 0.0
+
         return ret

     def __len__(self):
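The regression targets are thus dimensionless log-ratios between the generated particle and its seed element, which keeps the regression well conditioned across orders of magnitude in pt and E. A minimal standalone numpy sketch of the transform and its inverse (illustration only, not the repo's API):

import numpy as np

def to_logratio(target, elem):
    # encode target as log(target / elem); zero out ill-defined entries (elem or target <= 0)
    r = np.log(target / elem)
    r[~np.isfinite(r)] = 0.0
    return r

def from_logratio(encoded, elem):
    # decode: target = exp(encoded) * elem
    return np.exp(encoded) * elem

pt_elem = np.array([1.0, 50.0, 200.0])
pt_gen = np.array([2.0, 45.0, 210.0])
enc = to_logratio(pt_gen, pt_elem)  # [0.693, -0.105, 0.049]
assert np.allclose(from_logratio(enc, pt_elem), pt_gen)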
8 changes: 8 additions & 0 deletions mlpf/pyg/inference.py
@@ -42,6 +42,14 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_m
     batch = batch.to(rank)
     ypred = model(batch.X, batch.mask)

+    # transform log(pt / elem pt) -> pt
+    ypred[2][..., 0] = torch.exp(ypred[2][..., 0]) * batch.X[..., 1]
+    batch.ygen[..., 2] = torch.exp(batch.ygen[..., 2]) * batch.X[..., 1]
+
+    # transform log(E / elem E) -> E
+    ypred[2][..., 4] = torch.exp(ypred[2][..., 4]) * batch.X[..., 5]
+    batch.ygen[..., 6] = torch.exp(batch.ygen[..., 6]) * batch.X[..., 5]
+
     # convert all outputs to float32 in case running in float16 or bfloat16
     ypred = tuple([y.to(torch.float32) for y in ypred])

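A quick self-consistency check that this inverse matches the dataset-side encoding (a sketch with made-up tensors; the column conventions, elem pt in X[..., 1] and elem E in X[..., 5], follow PFDataset above):

import torch

elem_pt = torch.tensor([1.5, 20.0, 300.0])
gen_pt = torch.tensor([1.0, 25.0, 280.0])

encoded = torch.log(gen_pt / elem_pt)   # what PFDataset stores in ygen[..., 2]
decoded = torch.exp(encoded) * elem_pt  # what predict_one_batch reconstructs
assert torch.allclose(decoded, gen_pt)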
16 changes: 8 additions & 8 deletions mlpf/pyg/mlpf.py
@@ -220,11 +220,11 @@ def __init__(
         layernorm=True,
         conv_type="attention",
         input_encoding="joint",
-        pt_mode="additive-elemtype",
-        eta_mode="additive-elemtype",
-        sin_phi_mode="additive-elemtype",
-        cos_phi_mode="additive-elemtype",
-        energy_mode="additive-elemtype",
+        pt_mode="linear",
+        eta_mode="linear",
+        sin_phi_mode="linear",
+        cos_phi_mode="linear",
+        energy_mode="linear",
         # element types which actually exist in the dataset
         elemtypes_nonzero=[1, 4, 5, 6, 8, 9, 10, 11],
         # should the conv layer outputs be concatted (concat) or take the last (last)
@@ -348,7 +348,7 @@ def __init__(
         self.nn_pid = ffn(decoding_dim, num_classes, width, self.act, dropout_ff)

         # elementwise DNN for node momentum regression
-        embed_dim = decoding_dim + 2 + num_classes
+        embed_dim = decoding_dim
         self.nn_pt = RegressionOutput(pt_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)
         self.nn_eta = RegressionOutput(eta_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)
         self.nn_sin_phi = RegressionOutput(sin_phi_mode, embed_dim, width, self.act, dropout_ff, self.elemtypes_nonzero)
@@ -402,9 +402,9 @@ def forward(self, X_features, mask):

         # regression input
         if self.learned_representation_mode == "concat":
-            final_embedding_reg = torch.cat([Xfeat_normed] + embeddings_reg + [preds_binary_particle.detach(), preds_pid.detach()], axis=-1)
+            final_embedding_reg = torch.cat([Xfeat_normed] + embeddings_reg, axis=-1)
         elif self.learned_representation_mode == "last":
-            final_embedding_reg = torch.cat([Xfeat_normed] + [embeddings_reg[-1]] + [preds_binary_particle.detach(), preds_pid.detach()], axis=-1)
+            final_embedding_reg = torch.cat([Xfeat_normed] + [embeddings_reg[-1]], axis=-1)

         if self.use_pre_layernorm:
             final_embedding_reg = self.final_norm_reg(final_embedding_reg)
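With the detached classification outputs no longer appended to the regression input, the regression heads see only the learned representation, so embed_dim drops from decoding_dim + 2 + num_classes back to decoding_dim. A toy shape check to make the arithmetic concrete (dimensions are hypothetical):

import torch

nelem, decoding_dim, num_classes = 4, 256, 13
rep = torch.randn(nelem, decoding_dim)     # stand-in for the learned representation
cls_binary = torch.randn(nelem, 2)         # stand-in for preds_binary_particle
cls_pid = torch.randn(nelem, num_classes)  # stand-in for preds_pid

old_input = torch.cat([rep, cls_binary.detach(), cls_pid.detach()], axis=-1)
new_input = rep
print(old_input.shape[-1], new_input.shape[-1])  # 271 vs 256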
27 changes: 21 additions & 6 deletions mlpf/pyg/training.py
@@ -15,6 +15,7 @@
 import sklearn
 import sklearn.metrics
 import numpy as np
+import pandas

 # comet needs to be imported before torch
 from comet_ml import OfflineExperiment, Experiment  # noqa: F401, isort:skip
@@ -98,7 +99,7 @@ def mlpf_loss(y, ypred, batch):
     loss_pid_classification[y["cls_id"] == 0] *= 0

     # compare particle momentum, only for cases where there was a true particle
-    loss_regression = 10 * torch.nn.functional.huber_loss(ypred["momentum"], y["momentum"], reduction="none")
+    loss_regression = 10 * torch.nn.functional.mse_loss(ypred["momentum"], y["momentum"], reduction="none")
     loss_regression[y["cls_id"] == 0] *= 0

     # set the loss to 0 on padded elements in the batch
@@ -111,11 +112,12 @@
     loss["Classification"] = loss_pid_classification.sum() / nelem

     # normalize loss with stddev to stabilize across batches with very different pt, E distributions
-    mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
+    # mom_normalizer = y["momentum"][y["cls_id"] != 0].std(axis=0)
     reg_losses = loss_regression[y["cls_id"] != 0]

     # average over all true particles
-    loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
+    # loss["Regression"] = (reg_losses / mom_normalizer).sum() / npart
+    loss["Regression"] = reg_losses.sum() / npart

     # in case we are using the 3D-padded mode, we can compute a few additional event-level monitoring losses
     msk_pred_particle = torch.unsqueeze(torch.argmax(ypred["cls_binary"].detach(), axis=1) != 0, axis=-1)
@@ -298,12 +300,12 @@ def train_and_valid(

         with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
             if is_train:
-                ypred = model(batch.X, batch.mask)
+                ypred_raw = model(batch.X, batch.mask)
             else:
                 with torch.no_grad():
-                    ypred = model(batch.X, batch.mask)
+                    ypred_raw = model(batch.X, batch.mask)

-        ypred = unpack_predictions(ypred)
+        ypred = unpack_predictions(ypred_raw)

         if not is_train:
             cm_X_gen += sklearn.metrics.confusion_matrix(
@@ -315,6 +317,19 @@
                 cm_id += sklearn.metrics.confusion_matrix(
                     ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13)
                 )
+                # save the events of the first validation batch for quick checks
+                if itrain == 0:
+                    arr = (
+                        torch.concatenate(
+                            [batch.X[batch.mask], batch.ygen[batch.mask], ypred_raw[0][batch.mask], ypred_raw[1][batch.mask], ypred_raw[2][batch.mask]],
+                            axis=-1,
+                        )
+                        .detach()
+                        .cpu()
+                        .numpy()
+                    )
+                    df = pandas.DataFrame(arr)
+                    df.to_parquet(f"{outdir}/batch0_epoch{epoch}.parquet")

         with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"):
             if is_train:
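The regression objective is therefore a plain (scaled) MSE over the momentum components, zeroed for elements without a matched true particle and averaged over the number of true particles, instead of being renormalized by a per-batch stddev, which the commit title suggests was a source of validation-loss instability. A minimal sketch of that reduction (names and shapes are illustrative, not the repo's exact API):

import torch

def regression_term(pred_mom, true_mom, cls_id):
    # pred_mom, true_mom: [nelem, nfeat]; cls_id: [nelem], 0 means no matched true particle
    loss = 10 * torch.nn.functional.mse_loss(pred_mom, true_mom, reduction="none")
    loss[cls_id == 0] *= 0  # drop unmatched elements
    npart = (cls_id != 0).sum().clamp(min=1)
    return loss[cls_id != 0].sum() / npart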
6 changes: 3 additions & 3 deletions parameters/pytorch/pyg-cms.yaml
@@ -8,7 +8,7 @@ gpu_batch_multiplier: 1
 load:
 num_epochs: 100
 patience: 20
-lr: 0.0001
+lr: 0.00001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
 conv_type: attention
 ntrain:
@@ -30,11 +30,11 @@ model:

   learned_representation_mode: last #last, concat
   input_encoding: joint #split, joint
-  pt_mode: linear
+  pt_mode: direct
   eta_mode: linear
   sin_phi_mode: linear
   cos_phi_mode: linear
-  energy_mode: linear
+  energy_mode: direct

   gnn_lsh:
     conv_type: gnn_lsh
9 changes: 5 additions & 4 deletions scripts/tallinn/a100/pytorch-small.sh
@@ -4,14 +4,15 @@
 #SBATCH --mem-per-gpu 60G
 #SBATCH -o logs/slurm-%x-%j-%N.out

-IMG=/home/software/singularity/pytorch.simg:2024-07-08
+IMG=/home/software/singularity/pytorch.simg:2024-08-18
 cd ~/particleflow

 env

 ulimit -n 10000
 singularity exec -B /scratch/persistent --nv \
     --env PYTHONPATH=hep_tfds \
-    $IMG python3.10 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \
-    --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \
-    --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 40 --num-workers 1 --prefetch-factor 50 --dtype bfloat16 --checkpoint-freq 1
+    --env KERAS_BACKEND=torch \
+    $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \
+    --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \
+    --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 10 --dtype bfloat16 --checkpoint-freq 1 --ntrain 1000 --nvalid 1000 --ntest 1000 --comet
5 changes: 3 additions & 2 deletions scripts/tallinn/a100/pytorch.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #SBATCH --partition gpu
 #SBATCH --gres gpu:a100:1
-#SBATCH --mem-per-gpu 100G
+#SBATCH --mem-per-gpu 200G
 #SBATCH -o logs/slurm-%x-%j-%N.out

 IMG=/home/software/singularity/pytorch.simg:2024-08-18
@@ -14,4 +14,5 @@ singularity exec -B /scratch/persistent --nv \
     $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \
     --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \
     --train --test --make-plots --num-epochs 100 --conv-type attention \
-    --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet
+    --num-epochs 10 \
+    --gpu-batch-multiplier 32 --checkpoint-freq 1 --num-workers 16 --prefetch-factor 10 --comet --ntrain 1000 --ntest 1000 --nvalid 1000
