diff --git a/mlpf/data_cms/genjob_pu.sh b/mlpf/data_cms/genjob_pu.sh index 8a2fc0fdc..f9b5bf827 100755 --- a/mlpf/data_cms/genjob_pu.sh +++ b/mlpf/data_cms/genjob_pu.sh @@ -67,4 +67,5 @@ cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table bzip2 -z pfntuple_${SEED}.pkl -#rm step*.root +cp *.pkl.bz2 $OUTDIR/ +rm -Rf $WORKDIR diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py index 19b174cfc..9d6cd5ed1 100644 --- a/mlpf/data_cms/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -23,7 +23,7 @@ # "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", # "QCDForPF_14TeV_TuneCUETP8M1_cfi", # "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi", - # "SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", + ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 200000, 202050), # "ZpTT_1500_14TeV_TuneCP5_cfi", ] diff --git a/mlpf/pipeline.py b/mlpf/pipeline.py index cc977fbaa..2d7ece032 100644 --- a/mlpf/pipeline.py +++ b/mlpf/pipeline.py @@ -1290,6 +1290,8 @@ def plots(train_dir, max_files): load_loss_history, loss_plot, plot_jet_response_binned, + plot_jet_response_binned_separate, + plot_jet_response_binned_eta, plot_met_response_binned, get_class_names, plot_rocs, @@ -1387,6 +1389,8 @@ def plots(train_dir, max_files): plot_particles(yvals, cp_dir=cp_dir, title=_title) plot_jet_response_binned(yvals, cp_dir=cp_dir, title=_title) + plot_jet_response_binned_eta(yvals, cp_dir=cp_dir, title=_title) + plot_jet_response_binned_separate(yvals, cp_dir=cp_dir, title=_title) plot_met_response_binned(met_data, cp_dir=cp_dir, title=_title) mom_data = compute_3dmomentum_and_ratio(yvals) diff --git a/mlpf/plotting/plot_utils.py b/mlpf/plotting/plot_utils.py index f058f1d67..dfa3ca70c 100644 --- a/mlpf/plotting/plot_utils.py +++ b/mlpf/plotting/plot_utils.py @@ -82,6 +82,7 @@ "gen_met": 
"$p_{\mathrm{T,gen}}^\text{miss}$ [GeV]", "gen_mom": "$p_{\mathrm{gen}}$ [GeV]", "gen_jet": "jet $p_{\mathrm{T,gen}}$ [GeV]", + "gen_jet_eta": "jet $\eta_{\mathrm{gen}}$", "reco_met": "$p_{\mathrm{T,reco}}^\text{miss}$ [GeV]", "reco_gen_met_ratio": "$p_{\mathrm{T,reco}}^\mathrm{miss} / p_{\\mathrm{T,gen}}^\mathrm{miss}$", "reco_gen_mom_ratio": "$p_{\mathrm{reco}} / p_{\\mathrm{gen}}$", @@ -89,6 +90,7 @@ "gen_met_range": "${} \less p_{{\mathrm{{T,gen}}}}^\mathrm{{miss}}\leq {}$", "gen_mom_range": "${} \less p_{{\mathrm{{gen}}}}\leq {}$", "gen_jet_range": "${} \less p_{{\mathrm{{T,gen}}}} \leq {}$", + "gen_jet_range_eta": "${} \less \eta_{{\mathrm{{gen}}}} \leq {}$", } @@ -296,6 +298,12 @@ def compute_jet_ratio(data, yvals): axis=1, ) ) + ret["jet_gen_to_pred_geneta"] = awkward.to_numpy( + awkward.flatten( + vector.awk(data["jets"]["gen"][data["matched_jets"]["gen_to_pred"]["gen"]]).eta, + axis=1, + ) + ) ret["jet_gen_to_pred_predpt"] = awkward.to_numpy( awkward.flatten( vector.awk(data["jets"]["pred"][data["matched_jets"]["gen_to_pred"]["pred"]]).pt, axis=1, ) ) @@ -308,6 +316,12 @@ def compute_jet_ratio(data, yvals): axis=1, ) ) + ret["jet_gen_to_cand_geneta"] = awkward.to_numpy( + awkward.flatten( + vector.awk(data["jets"]["gen"][data["matched_jets"]["gen_to_cand"]["gen"]]).eta, + axis=1, + ) + ) ret["jet_gen_to_cand_candpt"] = awkward.to_numpy( + awkward.flatten( + vector.awk(data["jets"]["cand"][data["matched_jets"]["gen_to_cand"]["cand"]]).pt, axis=1, ) ) @@ -978,6 +992,70 @@ def plot_particles(yvals, epoch=None, cp_dir=None, comet_experiment=None, title= ) + +def plot_jet_response_binned_separate(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None): + pf_genjet_pt = yvals["jet_gen_to_cand_genpt"] + mlpf_genjet_pt = yvals["jet_gen_to_pred_genpt"] + + pf_response = yvals["jet_ratio_cand"] + mlpf_response = yvals["jet_ratio_pred"] + + genjet_bins = [10, 20, 40, 60, 80, 100, 200] + + x_vals = [] + pf_vals = [] + mlpf_vals = [] + b = np.linspace(0, 2, 100) + + for ibin in 
range(len(genjet_bins) - 1): + lim_low = genjet_bins[ibin] + lim_hi = genjet_bins[ibin + 1] + x_vals.append(np.mean([lim_low, lim_hi])) + + mask_genjet = (pf_genjet_pt > lim_low) & (pf_genjet_pt <= lim_hi) + pf_subsample = pf_response[mask_genjet] + if len(pf_subsample) > 0: + pf_p25 = np.percentile(pf_subsample, 25) + pf_p50 = np.percentile(pf_subsample, 50) + pf_p75 = np.percentile(pf_subsample, 75) + else: + pf_p25 = 0 + pf_p50 = 0 + pf_p75 = 0 + pf_vals.append([pf_p25, pf_p50, pf_p75]) + + mask_genjet = (mlpf_genjet_pt > lim_low) & (mlpf_genjet_pt <= lim_hi) + mlpf_subsample = mlpf_response[mask_genjet] + + if len(mlpf_subsample) > 0: + mlpf_p25 = np.percentile(mlpf_subsample, 25) + mlpf_p50 = np.percentile(mlpf_subsample, 50) + mlpf_p75 = np.percentile(mlpf_subsample, 75) + else: + mlpf_p25 = 0 + mlpf_p50 = 0 + mlpf_p75 = 0 + mlpf_vals.append([mlpf_p25, mlpf_p50, mlpf_p75]) + + plt.figure() + plt.hist(pf_subsample, bins=b, histtype="step", lw=2, label="PF") + plt.hist(mlpf_subsample, bins=b, histtype="step", lw=2, label="MLPF") + plt.xlim(0, 2) + plt.xticks([0, 0.5, 1, 1.5, 2]) + plt.ylabel("Matched jets / bin") + plt.xlabel(labels["reco_gen_jet_ratio"]) + plt.axvline(1.0, ymax=0.7, color="black", ls="--") + plt.legend(loc=1, fontsize=16) + plt.title(labels["gen_jet_range"].format(lim_low, lim_hi)) + plt.yscale("log") + + save_img( + "jet_response_binned_pt{}.png".format(lim_low), + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) + + def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None): pf_genjet_pt = yvals["jet_gen_to_cand_genpt"] mlpf_genjet_pt = yvals["jet_gen_to_pred_genpt"] @@ -1049,19 +1127,10 @@ def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No mlpf_vals = np.array(mlpf_vals) # Plot median and IQR as a function of gen pt - fig, axs = plt.subplots(2, 1, sharex=True) - plt.sca(axs[0]) - plt.plot(x_vals, pf_vals[:, 1], marker="o", label="PF") - plt.plot(x_vals, 
mlpf_vals[:, 1], marker="o", label="MLPF") - plt.ylim(0.75, 1.25) - plt.axhline(1.0, color="black", ls="--") - plt.ylabel("Response median") - plt.legend(title=title) - - plt.sca(axs[1]) - plt.plot(x_vals, pf_vals[:, 2] - pf_vals[:, 0], marker="o", label="PF") - plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") - plt.ylabel("Response IQR") + plt.figure() + plt.plot(x_vals, (pf_vals[:, 2] - pf_vals[:, 0]) / pf_vals[:, 1], marker="o", label="PF") + plt.plot(x_vals, (mlpf_vals[:, 2] - mlpf_vals[:, 0]) / mlpf_vals[:, 1], marker="o", label="MLPF") + plt.ylabel("Response IQR / median") plt.xlabel(labels["gen_jet"]) plt.tight_layout() @@ -1073,6 +1142,91 @@ def plot_jet_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No ) +def plot_jet_response_binned_eta(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None): + pf_genjet_eta = yvals["jet_gen_to_cand_geneta"] + mlpf_genjet_eta = yvals["jet_gen_to_pred_geneta"] + + pf_response = yvals["jet_ratio_cand"] + mlpf_response = yvals["jet_ratio_pred"] + + genjet_bins = [-4, -3, -2, -1, 0, 1, 2, 3, 4] + + x_vals = [] + pf_vals = [] + mlpf_vals = [] + b = np.linspace(0, 2, 100) + + fig, axs = plt.subplots(3, 3, figsize=(3 * 5, 3 * 5)) + axs = axs.flatten() + for ibin in range(len(genjet_bins) - 1): + lim_low = genjet_bins[ibin] + lim_hi = genjet_bins[ibin + 1] + x_vals.append(np.mean([lim_low, lim_hi])) + + mask_genjet = (pf_genjet_eta > lim_low) & (pf_genjet_eta <= lim_hi) + pf_subsample = pf_response[mask_genjet] + if len(pf_subsample) > 0: + pf_p25 = np.percentile(pf_subsample, 25) + pf_p50 = np.percentile(pf_subsample, 50) + pf_p75 = np.percentile(pf_subsample, 75) + else: + pf_p25 = 0 + pf_p50 = 0 + pf_p75 = 0 + pf_vals.append([pf_p25, pf_p50, pf_p75]) + + mask_genjet = (mlpf_genjet_eta > lim_low) & (mlpf_genjet_eta <= lim_hi) + mlpf_subsample = mlpf_response[mask_genjet] + + if len(mlpf_subsample) > 0: + mlpf_p25 = np.percentile(mlpf_subsample, 25) + mlpf_p50 = 
np.percentile(mlpf_subsample, 50) + mlpf_p75 = np.percentile(mlpf_subsample, 75) + else: + mlpf_p25 = 0 + mlpf_p50 = 0 + mlpf_p75 = 0 + mlpf_vals.append([mlpf_p25, mlpf_p50, mlpf_p75]) + + plt.sca(axs[ibin]) + plt.hist(pf_subsample, bins=b, histtype="step", lw=2, label="PF") + plt.hist(mlpf_subsample, bins=b, histtype="step", lw=2, label="MLPF") + plt.xlim(0, 2) + plt.xticks([0, 0.5, 1, 1.5, 2]) + plt.ylabel("Matched jets / bin") + plt.xlabel(labels["reco_gen_jet_ratio"]) + plt.axvline(1.0, ymax=0.7, color="black", ls="--") + plt.legend(loc=1, fontsize=16) + plt.title(labels["gen_jet_range_eta"].format(lim_low, lim_hi)) + plt.yscale("log") + + plt.tight_layout() + save_img( + "jet_response_binned_eta.png", + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) + + x_vals = np.array(x_vals) + pf_vals = np.array(pf_vals) + mlpf_vals = np.array(mlpf_vals) + + # Plot median and IQR as a function of gen pt + plt.figure() + plt.plot(x_vals, (pf_vals[:, 2] - pf_vals[:, 0]) / pf_vals[:, 1], marker="o", label="PF") + plt.plot(x_vals, (mlpf_vals[:, 2] - mlpf_vals[:, 0]) / mlpf_vals[:, 1], marker="o", label="MLPF") + plt.ylabel("Response IQR / median") + plt.xlabel(labels["gen_jet_eta"]) + plt.tight_layout() + save_img( + "jet_response_med_iqr_eta.png", + epoch, + cp_dir=cp_dir, + comet_experiment=comet_experiment, + ) + + def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=None, title=None): genmet = yvals["gen_met"] @@ -1141,21 +1295,10 @@ def plot_met_response_binned(yvals, epoch=None, cp_dir=None, comet_experiment=No mlpf_vals = np.array(mlpf_vals) # Plot median and IQR as a function of gen pt - fig, axs = plt.subplots(2, 1, sharex=True) - plt.sca(axs[0]) - plt.plot(x_vals, pf_vals[:, 1], marker="o", label="PF") - plt.plot(x_vals, mlpf_vals[:, 1], marker="o", label="MLPF") - plt.ylim(0.75, 1.25) - plt.axhline(1.0, color="black", ls="--") - plt.ylabel("Response median") - if title: - plt.title(title) - plt.legend() - - plt.sca(axs[1]) 
- plt.plot(x_vals, pf_vals[:, 2] - pf_vals[:, 0], marker="o", label="PF") - plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") - plt.ylabel("Response IQR") + plt.figure() + plt.plot(x_vals, (pf_vals[:, 2] - pf_vals[:, 0]) / pf_vals[:, 1], marker="o", label="PF") + plt.plot(x_vals, (mlpf_vals[:, 2] - mlpf_vals[:, 0]) / mlpf_vals[:, 1], marker="o", label="MLPF") + plt.ylabel("Response IQR / median") plt.legend() if title: plt.title(title) @@ -1238,18 +1381,9 @@ def plot_3dmomentum_response_binned(yvals, epoch=None, cp_dir=None, comet_experi mlpf_vals = np.array(mlpf_vals) # Plot median and IQR as a function of gen pt - fig, axs = plt.subplots(2, 1, sharex=True) - plt.sca(axs[0]) - plt.plot(x_vals, pf_vals[:, 1], marker="o", label="PF") - plt.plot(x_vals, mlpf_vals[:, 1], marker="o", label="MLPF") - plt.ylim(0.75, 1.25) - plt.axhline(1.0, color="black", ls="--") - plt.ylabel("Response median") - plt.legend(title=title) - - plt.sca(axs[1]) - plt.plot(x_vals, pf_vals[:, 2] - pf_vals[:, 0], marker="o", label="PF") - plt.plot(x_vals, mlpf_vals[:, 2] - mlpf_vals[:, 0], marker="o", label="MLPF") + plt.figure() + plt.plot(x_vals, (pf_vals[:, 2] - pf_vals[:, 0]) / pf_vals[:, 1], marker="o", label="PF") + plt.plot(x_vals, (mlpf_vals[:, 2] - mlpf_vals[:, 0]) / mlpf_vals[:, 1], marker="o", label="MLPF") plt.ylabel("Response IQR") plt.xlabel(labels["gen_mom"]) diff --git a/mlpf/pyg/inference.py b/mlpf/pyg/inference.py index ab8f8e5a2..9524786b4 100644 --- a/mlpf/pyg/inference.py +++ b/mlpf/pyg/inference.py @@ -45,6 +45,7 @@ def particle_array_to_awkward(batch_ids, arr_id, arr_p4): return ret +@torch.no_grad() def run_predictions(rank, mlpf, loader, sample, outpath): """Runs inference on the given sample and stores the output as .parquet files.""" @@ -53,8 +54,9 @@ def run_predictions(rank, mlpf, loader, sample, outpath): ti = time.time() - for i, batch in tqdm.tqdm(enumerate(loader)): - event = batch.to(rank) + for i, event in 
tqdm.tqdm(enumerate(loader), total=len(loader)): + event.X = event.X.to(rank) + event.batch = event.batch.to(rank) # recall target ~ ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "jet_idx"] target_ids = event.ygen[:, 0].long() @@ -65,10 +67,13 @@ def run_predictions(rank, mlpf, loader, sample, outpath): # make mlpf forward pass pred_ids_one_hot, pred_momentum, pred_charge = mlpf(event) + pred_ids_one_hot = pred_ids_one_hot.detach().cpu() + pred_momentum = pred_momentum.detach().cpu() + pred_charge = pred_charge.detach().cpu() - pred_ids = torch.argmax(pred_ids_one_hot.detach(), axis=-1) - pred_charge = torch.argmax(pred_charge.detach(), axis=1, keepdim=True) - 1 - pred_p4 = torch.cat([pred_charge, pred_momentum.detach()], axis=-1) + pred_ids = torch.argmax(pred_ids_one_hot, axis=-1) + pred_charge = torch.argmax(pred_charge, axis=1, keepdim=True) - 1 + pred_p4 = torch.cat([pred_charge, pred_momentum], axis=-1) batch_ids = event.batch.cpu().numpy() awkvals = { @@ -80,22 +85,22 @@ def run_predictions(rank, mlpf, loader, sample, outpath): gen_p4, cand_p4, pred_p4 = [], [], [] gen_cls, cand_cls, pred_cls = [], [], [] Xs = [] - for _ibatch in np.unique(event.batch.cpu().numpy()): - msk_batch = event.batch == _ibatch - msk_gen = target_ids[msk_batch] != 0 - msk_cand = cand_ids[msk_batch] != 0 - msk_pred = pred_ids[msk_batch] != 0 + for _ibatch in np.unique(batch_ids): + msk_batch = batch_ids == _ibatch + msk_gen = (target_ids[msk_batch] != 0).numpy() + msk_cand = (cand_ids[msk_batch] != 0).numpy() + msk_pred = (pred_ids[msk_batch] != 0).numpy() Xs.append(event.X[msk_batch].cpu().numpy()) - gen_p4.append(event.ygen[msk_batch, 1:][msk_gen]) - gen_cls.append(target_ids[msk_batch][msk_gen]) + gen_p4.append(event.ygen[msk_batch, 1:][msk_gen].numpy()) + gen_cls.append(target_ids[msk_batch][msk_gen].numpy()) - cand_p4.append(event.ycand[msk_batch, 1:][msk_cand]) - cand_cls.append(cand_ids[msk_batch][msk_cand]) + cand_p4.append(event.ycand[msk_batch, 
1:][msk_cand].numpy()) + cand_cls.append(cand_ids[msk_batch][msk_cand].numpy()) - pred_p4.append(pred_momentum[msk_batch, :][msk_pred]) - pred_cls.append(pred_ids[msk_batch][msk_pred]) + pred_p4.append(pred_momentum[msk_batch, :][msk_pred].numpy()) + pred_cls.append(pred_ids[msk_batch][msk_pred].numpy()) Xs = awkward.from_iter(Xs) gen_p4 = awkward.from_iter(gen_p4) @@ -158,7 +163,7 @@ def run_predictions(rank, mlpf, loader, sample, outpath): ) _logger.info(f"Saved predictions at {outpath}/preds/{sample}/pred_{rank}_{i}.parquet") - if i == 2: + if i == 100: break _logger.info(f"Time taken to make predictions on device {rank} is: {((time.time() - ti) / 60):.2f} min") diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 1ade81faa..157e4373e 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -103,13 +103,13 @@ def __init__( for i in range(num_convs): gnn_conf = { "inout_dim": embedding_dim, - "bin_size": 256, + "bin_size": 640, "max_num_bins": 200, "distance_dim": 128, "layernorm": True, "num_node_messages": 2, "dropout": 0.0, - "ffn_dist_hidden_dim": 64, + "ffn_dist_hidden_dim": 128, } self.conv_id.append(CombinedGraphLayer(**gnn_conf)) self.conv_reg.append(CombinedGraphLayer(**gnn_conf)) diff --git a/mlpf/pyg/model.py b/mlpf/pyg/model.py index 5798821b2..aae5b7a95 100644 --- a/mlpf/pyg/model.py +++ b/mlpf/pyg/model.py @@ -39,7 +39,17 @@ def index_dim(a, b): def split_indices_to_bins_batch(cmul, nbins, bin_size, msk): - bin_idx = torch.argmax(cmul, axis=-1) + torch.where(~msk, nbins - 1, 0).to(torch.int64) + a = torch.argmax(cmul, axis=-1) + + # This gives a CUDA error for some reason + # b = torch.where(~msk, nbins - 1, 0) + # b = b.to(torch.int64) + + b = torch.zeros(msk.shape, dtype=torch.int64, device=cmul.device) + # JP: check if this should be ~msk or msk (both here and in the TF implementation) + b[~msk] = nbins - 1 + + bin_idx = a + b bins_split = torch.reshape(torch.argsort(bin_idx, stable=True), (cmul.shape[0], nbins, bin_size)) return bins_split diff 
--git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index c1967b8c6..0432c4a90 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -12,6 +12,8 @@ from torch.nn import functional as F from torch.utils.tensorboard import SummaryWriter +# from torch.profiler import profile, record_function, ProfilerActivity + from .logger import _logger # Ignore divide by 0 errors @@ -136,9 +138,7 @@ def train(rank, model, train_loader, valid_loader, optimizer, tensorboard_writer losses = {"Total": 0.0, "Classification": 0.0, "Regression": 0.0, "Charge": 0.0} num_iterations = 0 - print("loader index:", loader.cur_index) - for i, batch in tqdm.tqdm(enumerate(loader)): - print("loader index inside:", loader.cur_index) + for i, batch in tqdm.tqdm(enumerate(loader), total=len(loader)): num_iterations += 1 if tensorboard_writer: @@ -152,13 +152,13 @@ def train(rank, model, train_loader, valid_loader, optimizer, tensorboard_writer # recall target ~ ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "jet_idx"] target_ids = event.ygen[:, 0].long() - target_charge = (event.ygen[:, 1] + 1).to(dtype=torch.float32) # -1, 0, 1 -> 0, 1, 2 + target_charge = torch.clamp((event.ygen[:, 1] + 1).to(dtype=torch.float32), 0, 2) # -1, 0, 1 -> 0, 1, 2 target_momentum = event.ygen[:, 2:-1].to(dtype=torch.float32) # make mlpf forward pass - t0 = time.time() + # t0 = time.time() pred_ids_one_hot, pred_momentum, pred_charge = model(event) - print(f"{event}: {(time.time() - t0):.2f}s") + # print(f"{event}: {(time.time() - t0):.2f}s") for icls in range(pred_ids_one_hot.shape[1]): if tensorboard_writer: @@ -168,7 +168,8 @@ def train(rank, model, train_loader, valid_loader, optimizer, tensorboard_writer ISTEP_GLOBAL_TRAIN if is_train else ISTEP_GLOBAL_VALID, ) - assert np.all(target_charge.unique().cpu().numpy() == [0, 1, 2]) + # JP: need to debug this + # assert np.all(target_charge.unique().cpu().numpy() == [0, 1, 2]) loss_ = {} # for CLASSIFYING PID @@ -185,6 +186,12 @@ def train(rank, 
model, train_loader, valid_loader, optimizer, tensorboard_writer # TOTAL LOSS loss_["Total"] = loss_["Classification"] + loss_["Regression"] + loss_["Charge"] + if tensorboard_writer: + tensorboard_writer.add_scalar( + "step_{}/loss".format(step_type), + loss_["Total"], + ISTEP_GLOBAL_TRAIN if is_train else ISTEP_GLOBAL_VALID, + ) if is_train: for param in model.parameters(): param.grad = None @@ -202,9 +209,6 @@ def train(rank, model, train_loader, valid_loader, optimizer, tensorboard_writer else: ISTEP_GLOBAL_VALID += 1 - if i == 10: - break - for loss in losses: losses[loss] = losses[loss] / num_iterations @@ -254,6 +258,10 @@ def train_mlpf(rank, world_size, model, train_loader, valid_loader, n_epochs, pa # training step losses_t = train(rank, model, train_loader, valid_loader, optimizer, tensorboard_writer) + # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof: + # with record_function("model_train"): + # print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20)) + for k, v in losses_t.items(): tensorboard_writer.add_scalar(f"epoch/train_loss_rank_{rank}_" + k, v, epoch) for loss in losses_of_interest: diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index ffd7f96fb..c24250ab0 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -2,15 +2,14 @@ import os import os.path as osp import pickle as pkl -from typing import List, Optional, Sequence, Union +from typing import List, Optional import tensorflow_datasets as tfds import torch import torch.utils.data from torch import Tensor -from torch_geometric.data import Batch, Data, Dataset +from torch_geometric.data import Batch, Data from torch_geometric.data.data import BaseData -from torch_geometric.data.datapipes import DatasetAdapter # https://github.com/ahlinist/cmssw/blob/1df62491f48ef964d198f574cdfcccfd17c70425/DataFormats/ParticleFlowReco/interface/PFBlockElement.h#L33 # 
https://github.com/cms-sw/cmssw/blob/master/DataFormats/ParticleFlowCandidate/src/PFCandidate.cc#L254 @@ -145,67 +144,10 @@ def save_mlpf(args, mlpf, model_kwargs): json.dump({**{"Num of mlpf parameters": num_mlpf_parameters}, **vars(args)}, fp) -class DataLoader(torch.utils.data.DataLoader): - """ - Copied from https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/loader/dataloader.html#DataLoader - because we need to implement our own Collater class to load the tensorflow_datasets (see below). - """ - - def __init__( - self, - dataset: Union[Dataset, Sequence[BaseData], DatasetAdapter], - batch_size: int = 1, - shuffle: bool = False, - follow_batch: Optional[List[str]] = None, - exclude_keys: Optional[List[str]] = None, - **kwargs, - ): - # Remove for PyTorch Lightning: - kwargs.pop("collate_fn", None) - - # Save for PyTorch Lightning < 1.6: - self.follow_batch = follow_batch - self.exclude_keys = exclude_keys - - super().__init__( - dataset, - batch_size, - shuffle, - collate_fn=Collater(follow_batch, exclude_keys), - **kwargs, - ) - - -class Collater: - """Based on the Collater found on torch_geometric docs we build our own.""" - - def __init__(self, follow_batch=None, exclude_keys=None): - self.follow_batch = follow_batch - self.exclude_keys = exclude_keys - - def __call__(self, inputs): - num_samples_in_batch = len(inputs) - elem_keys = list(inputs[0].keys()) - - batch = [] - for ev in range(num_samples_in_batch): - batch.append(Data()) - for elem_key in elem_keys: - batch[ev][elem_key] = Tensor(inputs[ev][elem_key]) - batch[ev]["batch"] = torch.tensor([ev] * len(inputs[ev][elem_key])) - - elem = batch[0] - - if isinstance(elem, BaseData): - return Batch.from_data_list(batch, self.follow_batch, self.exclude_keys) - - raise TypeError(f"DataLoader found invalid type: {type(elem)}") - - -class Dataset: +class PFDataset: """Builds a DataSource from tensorflow datasets.""" - def __init__(self, data_dir, name, split): + def __init__(self, 
data_dir, name, split, keys_to_get): """ Args dataset: "cms", "clic", or "delphes" @@ -221,6 +163,10 @@ def __init__(self, data_dir, name, split): # to prevent a warning from tfds about accessing sequences of indices self.ds.__class__.__getitems__ = my_getitem + self.keys_to_get = keys_to_get + + # self.ds = torch.utils.data.Subset(self.ds, range(100)) + def get_sampler(self): sampler = torch.utils.data.RandomSampler(self.ds) return sampler @@ -234,14 +180,14 @@ def get_loader(self, batch_size, world_size, num_workers=0, prefetch_factor=4): return DataLoader( self.ds, batch_size=batch_size, - collate_fn=Collater(), + collate_fn=Collater(self.keys_to_get), sampler=self.get_distributed_sampler(), ) elif num_workers: return DataLoader( self.ds, batch_size=batch_size, - collate_fn=Collater(), + collate_fn=Collater(self.keys_to_get), sampler=self.get_sampler(), num_workers=num_workers, prefetch_factor=prefetch_factor, @@ -250,7 +196,7 @@ def get_loader(self, batch_size, world_size, num_workers=0, prefetch_factor=4): return DataLoader( self.ds, batch_size=batch_size, - collate_fn=Collater(), + collate_fn=Collater(self.keys_to_get), sampler=self.get_sampler(), ) @@ -261,7 +207,68 @@ def __repr__(self): return self.ds.__repr__() +class DataLoader(torch.utils.data.DataLoader): + """ + Copied from https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/loader/dataloader.html#DataLoader + because we need to implement our own Collater class to load the tensorflow_datasets (see below). 
+ """ + + def __init__( + self, + dataset: PFDataset, + batch_size: int = 1, + shuffle: bool = False, + follow_batch: Optional[List[str]] = None, + exclude_keys: Optional[List[str]] = None, + **kwargs, + ): + # Remove for PyTorch Lightning: + collate_fn = kwargs.pop("collate_fn", None) + + # Save for PyTorch Lightning < 1.6: + self.follow_batch = follow_batch + self.exclude_keys = exclude_keys + + super().__init__( + dataset, + batch_size, + shuffle, + collate_fn=collate_fn, + **kwargs, + ) + + +class Collater: + """Based on the Collater found on torch_geometric docs we build our own.""" + + def __init__(self, keys_to_get, follow_batch=None, exclude_keys=None): + self.follow_batch = follow_batch + self.exclude_keys = exclude_keys + self.keys_to_get = keys_to_get + + def __call__(self, inputs): + num_samples_in_batch = len(inputs) + elem_keys = self.keys_to_get + + batch = [] + for ev in range(num_samples_in_batch): + batch.append(Data()) + for elem_key in elem_keys: + batch[ev][elem_key] = Tensor(inputs[ev][elem_key]) + batch[ev]["batch"] = torch.tensor([ev] * len(inputs[ev][elem_key])) + + elem = batch[0] + + if isinstance(elem, BaseData): + return Batch.from_data_list(batch, self.follow_batch, self.exclude_keys) + + raise TypeError(f"DataLoader found invalid type: {type(elem)}") + + def my_getitem(self, vals): + # print( + # "reading dataset {}:{} from disk in slice {}, total={}".format(self.dataset_info.name, self.split, vals, len(self)) + # ) records = self.data_source.__getitems__(vals) return [self.dataset_info.features.deserialize_example_np(record, decoders=self.decoders) for record in records] diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index c417ba029..d97dc6468 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -20,7 +20,7 @@ from pyg.logger import _logger from pyg.mlpf import MLPF from pyg.training import train_mlpf -from pyg.utils import CLASS_LABELS, X_FEATURES, Dataset, InterleavedIterator, save_mlpf +from pyg.utils import 
CLASS_LABELS, X_FEATURES, PFDataset, InterleavedIterator, save_mlpf logging.basicConfig(level=logging.INFO) @@ -31,6 +31,9 @@ parser.add_argument("--overwrite", dest="overwrite", action="store_true", help="overwrites the model if True") parser.add_argument("--data_dir", type=str, default="/pfvol/tensorflow_datasets/", help="path to `tensorflow_datasets/`") parser.add_argument("--gpus", type=str, default="0", help="to use CPU set to empty string; else e.g., `0,1`") +parser.add_argument( + "--gpu-batch-multiplier", type=int, default=1, help="Increase batch size per GPU by this constant factor" +) parser.add_argument("--dataset", type=str, choices=["clic", "cms", "delphes"], required=True, help="which dataset?") parser.add_argument("--load", action="store_true", help="load the model (no training)") parser.add_argument("--train", action="store_true", help="initiates a training") @@ -89,23 +92,24 @@ def run(rank, world_size, args): train_loaders, valid_loaders = [], [] for sample in config["train_dataset"][args.dataset]: version = config["train_dataset"][args.dataset][sample]["version"] - batch_size = config["train_dataset"][args.dataset][sample]["batch_size"] + batch_size = config["train_dataset"][args.dataset][sample]["batch_size"] * args.gpu_batch_multiplier - ds = Dataset(args.data_dir, f"{sample}:{version}", "train") + ds = PFDataset(args.data_dir, f"{sample}:{version}", "train", ["X", "ygen"]) _logger.info(f"train_dataset: {ds}, {len(ds)}", color="blue") train_loaders.append(ds.get_loader(batch_size=batch_size, world_size=world_size)) if (rank == 0) or (rank == "cpu"): # validation only on a single machine version = config["train_dataset"][args.dataset][sample]["version"] - batch_size = config["train_dataset"][args.dataset][sample]["batch_size"] + batch_size = config["train_dataset"][args.dataset][sample]["batch_size"] * args.gpu_batch_multiplier - ds = Dataset(args.data_dir, f"{sample}:{version}", "test") + ds = PFDataset(args.data_dir, f"{sample}:{version}", 
"test", ["X", "ygen", "ycand"]) _logger.info(f"valid_dataset: {ds}, {len(ds)}", color="blue") valid_loaders.append(ds.get_loader(batch_size=batch_size, world_size=1)) train_loader = InterleavedIterator(train_loaders) + valid_loader = None if (rank == 0) or (rank == "cpu"): # validation only on a single machine valid_loader = InterleavedIterator(valid_loaders) @@ -125,9 +129,9 @@ def run(rank, world_size, args): test_loaders = {} for sample in config["test_dataset"][args.dataset]: version = config["test_dataset"][args.dataset][sample]["version"] - batch_size = config["test_dataset"][args.dataset][sample]["batch_size"] + batch_size = config["test_dataset"][args.dataset][sample]["batch_size"] * args.gpu_batch_multiplier - ds = Dataset(args.data_dir, f"{sample}:{version}", "test") + ds = PFDataset(args.data_dir, f"{sample}:{version}", "test", ["X", "ygen", "ycand"]) _logger.info(f"test_dataset: {ds}, {len(ds)}", color="blue") test_loaders[sample] = InterleavedIterator([ds.get_loader(batch_size=batch_size, world_size=world_size)]) @@ -140,6 +144,7 @@ def run(rank, world_size, args): for sample in test_loaders: _logger.info(f"Running predictions on {sample}") + torch.cuda.empty_cache() run_predictions(rank, model, test_loaders[sample], sample, args.model_prefix) if (rank == 0) or (rank == "cpu"): # make plots and export to onnx only on a single machine diff --git a/parameters/cms-gen.yaml b/parameters/cms-gen.yaml index 394be230e..ad0f5a539 100644 --- a/parameters/cms-gen.yaml +++ b/parameters/cms-gen.yaml @@ -236,6 +236,7 @@ train_test_datasets: - cms_pf_ztt - cms_pf_qcd - cms_pf_qcd_high_pt + - cms_pf_sms_t1tttt gun: batch_per_gpu: 50 event_pad_size: -1 @@ -316,3 +317,7 @@ datasets: version: 1.6.0 data_dir: manual_dir: + cms_pf_sms_t1tttt: + version: 1.6.0 + data_dir: + manual_dir: diff --git a/parameters/pyg-config-cms.yaml b/parameters/pyg-config-cms.yaml deleted file mode 100644 index fd2d24e2e..000000000 --- a/parameters/pyg-config-cms.yaml +++ /dev/null @@ -1,110 
+0,0 @@ -backend: pytorch - -model: - gnn-lsh: - conv_type: gnn-lsh - embedding_dim: 36 - width: 128 - num_convs: 2 - k: 16 - propagate_dimensions: 22 - space_dimensions: 4 - dropout: True - - gravnet: - conv_type: gravnet - embedding_dim: 36 - width: 128 - num_convs: 2 - k: 16 - propagate_dimensions: 22 - space_dimensions: 4 - dropout: True - - attention: - conv_type: attention - embedding_dim: 128 - width: 128 - num_convs: 2 - k: 16 - propagate_dimensions: 22 - space_dimensions: 4 - dropout: True - -train_dataset: - cms: - cms_pf_ttbar: - version: 1.6.0 - batch_size: 5 - # cms_pf_qcd: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_ztt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_qcd_high_pt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_electron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_gamma: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi0: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_neutron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_tau: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_mu: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_proton: - # version: 1.6.0 - # batch_size: 5 - -test_dataset: - cms: - # cms_pf_ttbar: - # version: 1.6.0 - # batch_size: 5 - cms_pf_qcd: - version: 1.6.0 - batch_size: 5 - # cms_pf_ztt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_qcd_high_pt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_electron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_gamma: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi0: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_neutron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_tau: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_mu: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_proton: - # version: 1.6.0 - # batch_size: 5 diff --git 
a/parameters/pyg-config-test.yaml b/parameters/pyg-config-test.yaml new file mode 100644 index 000000000..f3d75defa --- /dev/null +++ b/parameters/pyg-config-test.yaml @@ -0,0 +1,43 @@ +backend: pytorch + +model: + gnn-lsh: + conv_type: gnn-lsh + embedding_dim: 36 + width: 128 + num_convs: 2 + k: 16 + propagate_dimensions: 22 + space_dimensions: 4 + dropout: True + + gravnet: + conv_type: gravnet + embedding_dim: 36 + width: 128 + num_convs: 2 + k: 16 + propagate_dimensions: 22 + space_dimensions: 4 + dropout: True + + attention: + conv_type: attention + embedding_dim: 128 + width: 128 + num_convs: 2 + k: 16 + propagate_dimensions: 22 + space_dimensions: 4 + dropout: True + +train_dataset: + cms: + cms_pf_ttbar: + version: 1.6.0 + batch_size: 2 +test_dataset: + cms: + cms_pf_ttbar: + version: 1.6.0 + batch_size: 2 diff --git a/parameters/pyg-config.yaml b/parameters/pyg-config.yaml index 1080e7d56..affe7134e 100644 --- a/parameters/pyg-config.yaml +++ b/parameters/pyg-config.yaml @@ -3,13 +3,10 @@ backend: pytorch model: gnn-lsh: conv_type: gnn-lsh - embedding_dim: 36 - width: 128 - num_convs: 2 - k: 16 - propagate_dimensions: 22 - space_dimensions: 4 - dropout: True + embedding_dim: 512 + width: 512 + num_convs: 3 + dropout: 0.0 gravnet: conv_type: gravnet @@ -19,56 +16,59 @@ model: k: 16 propagate_dimensions: 22 space_dimensions: 4 - dropout: True + dropout: 0.0 attention: conv_type: attention embedding_dim: 128 width: 128 num_convs: 2 - k: 16 - propagate_dimensions: 22 - space_dimensions: 4 - dropout: True + dropout: 0.0 train_dataset: cms: cms_pf_ttbar: + version: 1.6.0 + batch_size: 1 + cms_pf_qcd: + version: 1.6.0 + batch_size: 1 + cms_pf_ztt: + version: 1.6.0 + batch_size: 1 + cms_pf_qcd_high_pt: + version: 1.6.0 + batch_size: 1 + cms_pf_sms_t1tttt: + version: 1.6.0 + batch_size: 1 + cms_pf_single_electron: + version: 1.6.0 + batch_size: 20 + cms_pf_single_gamma: + version: 1.6.0 + batch_size: 20 + cms_pf_single_pi0: + version: 1.6.0 + batch_size: 20 + 
cms_pf_single_neutron: + version: 1.6.0 + batch_size: 20 + cms_pf_single_pi: + version: 1.6.0 + batch_size: 20 + cms_pf_single_tau: + version: 1.6.0 + batch_size: 20 + cms_pf_single_mu: + version: 1.6.0 + batch_size: 20 + cms_pf_single_proton: + version: 1.6.0 + batch_size: 20 + cms_pf_multi_particle_gun: version: 1.6.0 batch_size: 5 - # cms_pf_qcd: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_ztt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_qcd_high_pt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_electron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_gamma: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi0: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_neutron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_tau: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_mu: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_proton: - # version: 1.6.0 - # batch_size: 5 clic: clic_edm_qq_pf: version: 1.5.0 @@ -96,41 +96,47 @@ train_dataset: test_dataset: cms: cms_pf_ttbar: + version: 1.6.0 + batch_size: 1 + cms_pf_qcd: + version: 1.6.0 + batch_size: 1 + cms_pf_ztt: + version: 1.6.0 + batch_size: 1 + cms_pf_qcd_high_pt: + version: 1.6.0 + batch_size: 1 + cms_pf_sms_t1tttt: + version: 1.6.0 + batch_size: 1 + cms_pf_single_electron: + version: 1.6.0 + batch_size: 20 + cms_pf_single_gamma: + version: 1.6.0 + batch_size: 20 + cms_pf_single_pi0: + version: 1.6.0 + batch_size: 20 + cms_pf_single_neutron: + version: 1.6.0 + batch_size: 20 + cms_pf_single_pi: + version: 1.6.0 + batch_size: 20 + cms_pf_single_tau: + version: 1.6.0 + batch_size: 20 + cms_pf_single_mu: + version: 1.6.0 + batch_size: 20 + cms_pf_single_proton: + version: 1.6.0 + batch_size: 20 + cms_pf_multi_particle_gun: version: 1.6.0 batch_size: 5 - # cms_pf_qcd: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_ztt: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_qcd_high_pt: - # version: 1.6.0 - # 
batch_size: 5 - # cms_pf_single_electron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_gamma: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi0: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_neutron: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_pi: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_tau: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_mu: - # version: 1.6.0 - # batch_size: 5 - # cms_pf_single_proton: - # version: 1.6.0 - # batch_size: 5 clic: clic_edm_qq_pf: version: 1.5.0 diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh index af31eabd3..3dedbb39a 100755 --- a/scripts/generate_tfds.sh +++ b/scripts/generate_tfds.sh @@ -18,6 +18,7 @@ export CMD="singularity exec -B /local -B /scratch/persistent --env PYTHONPATH=$ # $CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qcd.log & # $CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_ztt.log & # $CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_qcd_high_pt.log & +# $CMD mlpf/heptfds/cms_pf/smst1tttt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_smst1tttt.log & # $CMD mlpf/heptfds/cms_pf/singlepi0 --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singlepi0.log & # $CMD mlpf/heptfds/cms_pf/singleneutron --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singleneutron.log & # $CMD mlpf/heptfds/cms_pf/singleele --data_dir $DATA_DIR --manual_dir $MANUAL_DIR --overwrite &> logs/tfds_singleele.log & diff --git a/scripts/local_test_pyg.sh b/scripts/local_test_pyg.sh index 1441e878d..91585aed1 100755 --- a/scripts/local_test_pyg.sh +++ b/scripts/local_test_pyg.sh @@ -27,4 +27,4 @@ mkdir -p experiments tfds build mlpf/heptfds/cms_pf/ttbar --manual_dir ./local_test_data -python mlpf/pyg_pipeline.py --dataset cms --data_dir 
./tensorflow_datasets/ --model-prefix ./experiments/MLPF_test --gpus "" --train --test --make-plots +python mlpf/pyg_pipeline.py --config parameters/pyg-config-test.yaml --dataset cms --data_dir ./tensorflow_datasets/ --model-prefix ./experiments/MLPF_test --gpus "" --train --test --make-plots diff --git a/scripts/tallinn/a100/cms-train.sh b/scripts/tallinn/a100/cms-train.sh index 7bd0efe63..96f774c26 100755 --- a/scripts/tallinn/a100/cms-train.sh +++ b/scripts/tallinn/a100/cms-train.sh @@ -12,4 +12,4 @@ singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ $IMG python3.10 mlpf/pipeline.py train -c parameters/cms-gen.yaml --plot-freq 1 --num-cpus 32 --batch-multiplier 2 \ - --weights experiments/cms-gen_20231003_164730_341214.gpu1.local/weights/weights-37-1.192368.hdf5 + --weights experiments/cms-gen_20231005_111723_141101.gpu1.local/weights/weights-49-1.280692.hdf5