From 7ce69c8c430d3b1432c05b1713d3cf779380cd50 Mon Sep 17 00:00:00 2001 From: Farouk Mokhtar Date: Thu, 12 Dec 2024 10:35:02 +0100 Subject: [PATCH] Unify key4hep (#374) * up first version of unified postprocessing * must remove ECALother from fcc * include MCParticles.PDG for fcc * remove MCParticles#1.index from fcc * remove "index": prop_data["MCParticles#1.index"][iev] * fix daus index * rename dir to key4hep * add option to postprocess a directory at once * up tfds script for fcc * copy utils script * add BUILDER_CONFIGS * add NUM_SPLITS * add gamma to heptfds * must fix the track state * up radiusOfInnermostHit * add proper event splitting for ntrain and nvalid * linting --- mlpf/data/key4hep/postprocessing.py | 1306 ++++++++++++++++++++++ mlpf/heptfds/cld_pf_edm4hep/ttbar.py | 18 +- mlpf/heptfds/cld_pf_edm4hep/utils_edm.py | 222 ++++ mlpf/heptfds/clic_pf_edm4hep/gamma.py | 82 ++ mlpf/model/PFDataset.py | 11 +- 5 files changed, 1629 insertions(+), 10 deletions(-) create mode 100644 mlpf/data/key4hep/postprocessing.py create mode 100644 mlpf/heptfds/cld_pf_edm4hep/utils_edm.py create mode 100644 mlpf/heptfds/clic_pf_edm4hep/gamma.py diff --git a/mlpf/data/key4hep/postprocessing.py b/mlpf/data/key4hep/postprocessing.py new file mode 100644 index 000000000..e4b9fc9a1 --- /dev/null +++ b/mlpf/data/key4hep/postprocessing.py @@ -0,0 +1,1306 @@ +import os + +# noqa: to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" +os.environ["VECLIB_MAXIMUM_THREADS"] = "1" +os.environ["NUMEXPR_NUM_THREADS"] = "1" + +import glob +import math + +import awkward +import fastjet +import numpy as np +import tqdm +import uproot +import vector +from scipy.sparse import coo_matrix + +jetdef = fastjet.JetDefinition(fastjet.ee_genkt_algorithm, 0.4, -1.0) +jet_ptcut = 5 + +track_coll = "SiTracks_Refitted" +mc_coll = "MCParticles" + +# the feature matrices will be saved in this order +particle_feature_order = [ + "PDG", + "charge", + "pt", + "eta", + "sin_phi", + "cos_phi", + "energy", + "ispu", + "generatorStatus", + "simulatorStatus", + "gp_to_track", + "gp_to_cluster", + "jet_idx", +] + +# arrange track and cluster features such that pt (et), eta, phi, p (energy) are in the same spot +# so we can easily use them in skip connections +track_feature_order = [ + "elemtype", + "pt", + "eta", + "sin_phi", + "cos_phi", + "p", + "chi2", + "ndf", + "dEdx", + "dEdxError", + "radiusOfInnermostHit", + "tanLambda", + "D0", + "omega", + "Z0", + "time", +] +cluster_feature_order = [ + "elemtype", + "et", + "eta", + "sin_phi", + "cos_phi", + "energy", + "position.x", + "position.y", + "position.z", + "iTheta", + "energy_ecal", + "energy_hcal", + "energy_other", + "num_hits", + "sigma_x", + "sigma_y", + "sigma_z", +] +hit_feature_order = [ + "elemtype", + "et", + "eta", + "sin_phi", + "cos_phi", + "energy", + "position.x", + "position.y", + "position.z", + "time", + "subdetector", + "type", +] + + +def deltaphi(phi1, phi2): + diff = phi1 - phi2 + return np.arctan2(np.sin(diff), np.cos(diff)) + + +def deltar(eta1, phi1, eta2, phi2): + deta = eta1 - eta2 + dphi = deltaphi(phi1, phi2) + return np.sqrt(deta**2 + dphi**2) + + +# https://stackoverflow.com/questions/2413522/weighted-standard-deviation-in-numpy +def weighted_avg_and_std(values, weights): + """ + Return the weighted average and standard deviation. 
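+
+    Illustrative example (made-up numbers):
+    weighted_avg_and_std(np.array([1.0, 2.0, 4.0]), np.array([1.0, 1.0, 2.0]))
+    returns (2.75, ~1.3); the last value carries half of the total weight.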
+ + They weights are in effect first normalized so that they + sum to 1 (and so they must not all be 0). + + values, weights -- NumPy ndarrays with the same shape. + """ + average = np.average(values, weights=weights) + # Fast and numerically precise: + variance = np.average((values - average) ** 2, weights=weights) + return (average, math.sqrt(variance)) + + +def track_pt(omega): + a = 3 * 10**-4 + b = 4 # B-field in tesla, from clicRec_e4h_input + + return a * np.abs(b / omega) + + +def map_pdgid_to_candid(pdgid, charge): + if pdgid == 0: + return 0 + + # photon, electron, muon + if pdgid in [22, 11, 13]: + return pdgid + + # charged hadron + if abs(charge) > 0: + return 211 + + # neutral hadron + return 130 + + +def map_charged_to_neutral(pdg): + if pdg == 0: + return 0 + if pdg == 11 or pdg == 22: + return 22 + return 130 + + +def map_neutral_to_charged(pdg): + if pdg == 130 or pdg == 22: + return 211 + return pdg + + +def sanitize(arr): + arr[np.isnan(arr)] = 0.0 + arr[np.isinf(arr)] = 0.0 + + +class EventData: + def __init__( + self, + gen_features, + hit_features, + cluster_features, + track_features, + genparticle_to_hit, + genparticle_to_track, + hit_to_cluster, + gp_merges, + ): + self.gen_features = gen_features # feature matrix of the genparticles + self.hit_features = hit_features # feature matrix of the calo hits + self.cluster_features = cluster_features # feature matrix of the calo clusters + self.track_features = track_features # feature matrix of the tracks + self.genparticle_to_hit = genparticle_to_hit # sparse COO matrix of genparticles to hits (idx_gp, idx_hit, weight) + self.genparticle_to_track = genparticle_to_track # sparse COO matrix of genparticles to tracks (idx_gp, idx_track, weight) + self.hit_to_cluster = hit_to_cluster # sparse COO matrix of hits to clusters (idx_hit, idx_cluster, weight) + self.gp_merges = gp_merges # sparse COO matrix of any merged genparticles + + self.genparticle_to_hit = ( + np.array(self.genparticle_to_hit[0]), + np.array(self.genparticle_to_hit[1]), + np.array(self.genparticle_to_hit[2]), + ) + self.genparticle_to_track = ( + np.array(self.genparticle_to_track[0]), + np.array(self.genparticle_to_track[1]), + np.array(self.genparticle_to_track[2]), + ) + self.hit_to_cluster = ( + np.array(self.hit_to_cluster[0]), + np.array(self.hit_to_cluster[1]), + np.array(self.hit_to_cluster[2]), + ) + self.gp_merges = np.array(self.gp_merges[0]), np.array(self.gp_merges[1]) + + +def hits_to_features(hit_data, iev, coll, feats): + feat_arr = {f: hit_data[coll + "." 
+ f][iev] for f in feats} + + # set the subdetector type + sdcoll = "subdetector" + feat_arr[sdcoll] = np.zeros(len(feat_arr["type"]), dtype=np.int32) + if coll.startswith("ECAL"): + feat_arr[sdcoll][:] = 0 + elif coll.startswith("HCAL"): + feat_arr[sdcoll][:] = 1 + else: + feat_arr[sdcoll][:] = 2 + + # hit elemtype is always 2 + feat_arr["elemtype"] = 2 * np.ones(len(feat_arr["type"]), dtype=np.int32) + + # precompute some approximate et, eta, phi + pos_mag = np.sqrt(feat_arr["position.x"] ** 2 + feat_arr["position.y"] ** 2 + feat_arr["position.z"] ** 2) + px = (feat_arr["position.x"] / pos_mag) * feat_arr["energy"] + py = (feat_arr["position.y"] / pos_mag) * feat_arr["energy"] + pz = (feat_arr["position.z"] / pos_mag) * feat_arr["energy"] + feat_arr["et"] = np.sqrt(px**2 + py**2) + feat_arr["eta"] = 0.5 * np.log((feat_arr["energy"] + pz) / (feat_arr["energy"] - pz)) + feat_arr["sin_phi"] = py / feat_arr["energy"] + feat_arr["cos_phi"] = px / feat_arr["energy"] + + return awkward.Record(feat_arr) + + +def get_calohit_matrix_and_genadj(dataset, hit_data, calohit_links, iev, collectionIDs): + feats = ["type", "cellID", "energy", "energyError", "time", "position.x", "position.y", "position.z"] + + hit_idx_global = 0 + hit_idx_global_to_local = {} + hit_feature_matrix = [] + for col in sorted(hit_data.keys()): + icol = collectionIDs[col] + hit_features = hits_to_features(hit_data[col], iev, col, feats) + hit_feature_matrix.append(hit_features) + for ihit in range(len(hit_data[col][col + ".energy"][iev])): + hit_idx_global_to_local[hit_idx_global] = (icol, ihit) + hit_idx_global += 1 + hit_idx_local_to_global = {v: k for k, v in hit_idx_global_to_local.items()} + hit_feature_matrix = awkward.Record( + {k: awkward.concatenate([hit_feature_matrix[i][k] for i in range(len(hit_feature_matrix))]) for k in hit_feature_matrix[0].fields} + ) + + # add all edges from genparticle to calohit + calohit_to_gen_weight = calohit_links["CalohitMCTruthLink.weight"][iev] + if dataset == "clic": + calohit_to_gen_calo_colid = calohit_links["CalohitMCTruthLink#0.collectionID"][iev] + calohit_to_gen_gen_colid = calohit_links["CalohitMCTruthLink#1.collectionID"][iev] + calohit_to_gen_calo_idx = calohit_links["CalohitMCTruthLink#0.index"][iev] + calohit_to_gen_gen_idx = calohit_links["CalohitMCTruthLink#1.index"][iev] + elif dataset == "fcc": + calohit_to_gen_calo_colid = calohit_links["_CalohitMCTruthLink_from/_CalohitMCTruthLink_from.collectionID"][iev] + calohit_to_gen_gen_colid = calohit_links["_CalohitMCTruthLink_to/_CalohitMCTruthLink_to.collectionID"][iev] + calohit_to_gen_calo_idx = calohit_links["_CalohitMCTruthLink_from/_CalohitMCTruthLink_from.index"][iev] + calohit_to_gen_gen_idx = calohit_links["_CalohitMCTruthLink_to/_CalohitMCTruthLink_to.index"][iev] + else: + raise Exception("--dataset provided is not supported. 
Only 'fcc' or 'clic' are supported atm.") + + genparticle_to_hit_matrix_coo0 = [] + genparticle_to_hit_matrix_coo1 = [] + genparticle_to_hit_matrix_w = [] + for calo_colid, calo_idx, gen_colid, gen_idx, w in zip( + calohit_to_gen_calo_colid, + calohit_to_gen_calo_idx, + calohit_to_gen_gen_colid, + calohit_to_gen_gen_idx, + calohit_to_gen_weight, + ): + genparticle_to_hit_matrix_coo0.append(gen_idx) + genparticle_to_hit_matrix_coo1.append(hit_idx_local_to_global[(calo_colid, calo_idx)]) + genparticle_to_hit_matrix_w.append(w) + + return ( + hit_feature_matrix, + ( + np.array(genparticle_to_hit_matrix_coo0), + np.array(genparticle_to_hit_matrix_coo1), + np.array(genparticle_to_hit_matrix_w), + ), + hit_idx_local_to_global, + ) + + +def hit_cluster_adj(dataset, prop_data, hit_idx_local_to_global, iev): + + if dataset == "clic": + coll_arr = prop_data["PandoraClusters#1"]["PandoraClusters#1.collectionID"][iev] + idx_arr = prop_data["PandoraClusters#1"]["PandoraClusters#1.index"][iev] + hits_begin = prop_data["PandoraClusters"]["PandoraClusters.hits_begin"][iev] + hits_end = prop_data["PandoraClusters"]["PandoraClusters.hits_end"][iev] + elif dataset == "fcc": + coll_arr = prop_data["_PandoraClusters_hits/_PandoraClusters_hits.collectionID"][iev] + idx_arr = prop_data["_PandoraClusters_hits/_PandoraClusters_hits.index"][iev] + hits_begin = prop_data["PandoraClusters"]["PandoraClusters.hits_begin"][iev] + hits_end = prop_data["PandoraClusters"]["PandoraClusters.hits_end"][iev] + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + # index in the array of all hits + hit_to_cluster_matrix_coo0 = [] + # index in the cluster array + hit_to_cluster_matrix_coo1 = [] + + # weight + hit_to_cluster_matrix_w = [] + + # loop over all clusters + for icluster in range(len(hits_begin)): + + # get the slice in the hit array corresponding to this cluster + hbeg = hits_begin[icluster] + hend = hits_end[icluster] + idx_range = idx_arr[hbeg:hend] + coll_range = coll_arr[hbeg:hend] + + # add edges from hit to cluster + for icol, idx in zip(coll_range, idx_range): + hit_to_cluster_matrix_coo0.append(hit_idx_local_to_global[(icol, idx)]) + hit_to_cluster_matrix_coo1.append(icluster) + hit_to_cluster_matrix_w.append(1.0) + return hit_to_cluster_matrix_coo0, hit_to_cluster_matrix_coo1, hit_to_cluster_matrix_w + + +def gen_to_features(dataset, prop_data, iev): + + if dataset == "clic": + gen_arr = prop_data[iev] + elif dataset == "fcc": + gen_arr = prop_data[mc_coll][iev] + else: + raise Exception("--dataset provided is not supported. 
Only 'fcc' or 'clic' are supported atm.") + + gen_arr = {k.replace(mc_coll + ".", ""): gen_arr[k] for k in gen_arr.fields} + + MCParticles_p4 = vector.awk( + awkward.zip({"mass": gen_arr["mass"], "x": gen_arr["momentum.x"], "y": gen_arr["momentum.y"], "z": gen_arr["momentum.z"]}) + ) + gen_arr["pt"] = MCParticles_p4.pt + gen_arr["eta"] = MCParticles_p4.eta + gen_arr["phi"] = MCParticles_p4.phi + gen_arr["energy"] = MCParticles_p4.energy + gen_arr["sin_phi"] = np.sin(gen_arr["phi"]) + gen_arr["cos_phi"] = np.cos(gen_arr["phi"]) + + # placeholder flag + gen_arr["ispu"] = np.zeros_like(gen_arr["phi"]) + + ret = { + "PDG": gen_arr["PDG"], + "generatorStatus": gen_arr["generatorStatus"], + "charge": gen_arr["charge"], + "pt": gen_arr["pt"], + "eta": gen_arr["eta"], + "phi": gen_arr["phi"], + "sin_phi": gen_arr["sin_phi"], + "cos_phi": gen_arr["cos_phi"], + "energy": gen_arr["energy"], + "ispu": gen_arr["ispu"], + "simulatorStatus": gen_arr["simulatorStatus"], + "gp_to_track": np.zeros(len(gen_arr["PDG"]), dtype=np.float64), + "gp_to_cluster": np.zeros(len(gen_arr["PDG"]), dtype=np.float64), + "jet_idx": np.zeros(len(gen_arr["PDG"]), dtype=np.int64), + "daughters_begin": gen_arr["daughters_begin"], + "daughters_end": gen_arr["daughters_end"], + } + + if dataset == "clic": + ret["index"] = prop_data["MCParticles#1.index"][iev] + elif dataset == "fcc": + ret["index"] = prop_data["_MCParticles_daughters/_MCParticles_daughters.index"][iev] + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + return ret + + +def genparticle_track_adj(dataset, sitrack_links, iev): + + if dataset == "clic": + trk_to_gen_trkidx = sitrack_links["SiTracksMCTruthLink#0.index"][iev] + trk_to_gen_genidx = sitrack_links["SiTracksMCTruthLink#1.index"][iev] + elif dataset == "fcc": + trk_to_gen_trkidx = sitrack_links["_SiTracksMCTruthLink_from/_SiTracksMCTruthLink_from.index"][iev] + trk_to_gen_genidx = sitrack_links["_SiTracksMCTruthLink_to/_SiTracksMCTruthLink_to.index"][iev] + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + trk_to_gen_w = sitrack_links["SiTracksMCTruthLink.weight"][iev] + + genparticle_to_track_matrix_coo0 = awkward.to_numpy(trk_to_gen_genidx) + genparticle_to_track_matrix_coo1 = awkward.to_numpy(trk_to_gen_trkidx) + genparticle_to_track_matrix_w = awkward.to_numpy(trk_to_gen_w) + + return genparticle_to_track_matrix_coo0, genparticle_to_track_matrix_coo1, genparticle_to_track_matrix_w + + +def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): + cluster_arr = prop_data["PandoraClusters"][iev] + feats = ["type", "position.x", "position.y", "position.z", "iTheta", "phi", "energy"] + ret = {feat: cluster_arr["PandoraClusters." 
+ feat] for feat in feats} + + hit_idx = np.array(hit_to_cluster[0]) + cluster_idx = np.array(hit_to_cluster[1]) + cl_energy_ecal = [] + cl_energy_hcal = [] + cl_energy_other = [] + num_hits = [] + cl_sigma_x = [] + cl_sigma_y = [] + cl_sigma_z = [] + + n_cl = len(ret["energy"]) + for cl in range(n_cl): + msk_cl = cluster_idx == cl + hits = hit_idx[msk_cl] + + num_hits.append(len(hits)) + + subdets = hit_features["subdetector"][hits] + + hits_energy = hit_features["energy"][hits] + + hits_posx = hit_features["position.x"][hits] + hits_posy = hit_features["position.y"][hits] + hits_posz = hit_features["position.z"][hits] + + energy_ecal = np.sum(hits_energy[subdets == 0]) + energy_hcal = np.sum(hits_energy[subdets == 1]) + energy_other = np.sum(hits_energy[subdets == 2]) + + cl_energy_ecal.append(energy_ecal) + cl_energy_hcal.append(energy_hcal) + cl_energy_other.append(energy_other) + + # weighted standard deviation of cluster hits + sigma_x = weighted_avg_and_std(hits_posx, hits_energy)[1] + sigma_y = weighted_avg_and_std(hits_posy, hits_energy)[1] + sigma_z = weighted_avg_and_std(hits_posz, hits_energy)[1] + cl_sigma_x.append(sigma_x) + cl_sigma_y.append(sigma_y) + cl_sigma_z.append(sigma_z) + + ret["energy_ecal"] = np.array(cl_energy_ecal) + ret["energy_hcal"] = np.array(cl_energy_hcal) + ret["energy_other"] = np.array(cl_energy_other) + ret["num_hits"] = np.array(num_hits) + ret["sigma_x"] = np.array(cl_sigma_x) + ret["sigma_y"] = np.array(cl_sigma_y) + ret["sigma_z"] = np.array(cl_sigma_z) + + tt = awkward.to_numpy(np.tan(ret["iTheta"] / 2.0)) + eta = awkward.to_numpy(-np.log(tt, where=tt > 0)) + eta[tt <= 0] = 0.0 + ret["eta"] = eta + + costheta = np.cos(ret["iTheta"]) + ez = ret["energy"] * costheta + ret["et"] = np.sqrt(ret["energy"] ** 2 - ez**2) + + # cluster is always type 2 + ret["elemtype"] = 2 * np.ones(n_cl, dtype=np.float32) + + ret["sin_phi"] = np.sin(ret["phi"]) + ret["cos_phi"] = np.cos(ret["phi"]) + + return awkward.Record(ret) + + +def track_to_features(dataset, prop_data, iev): + if dataset == "clic": + track_arr = prop_data[track_coll][iev] + feats_from_track = ["type", "chi2", "ndf", "dEdx", "dEdxError", "radiusOfInnermostHit"] + ret = {feat: track_arr[track_coll + "." + feat] for feat in feats_from_track} + + elif dataset == "fcc": + track_arr = prop_data[track_coll][iev] + # the following are needed since they are no longer defined under SiTracks_Refitted + track_arr_dQdx = prop_data["SiTracks_Refitted_dQdx"][iev] + track_arr_trackStates = prop_data["_SiTracks_Refitted_trackStates"][iev] + + feats_from_track = ["type", "chi2", "ndf"] + ret = {feat: track_arr[track_coll + "." + feat] for feat in feats_from_track} + + ret["dEdx"] = track_arr_dQdx["SiTracks_Refitted_dQdx.dQdx.value"] + ret["dEdxError"] = track_arr_dQdx["SiTracks_Refitted_dQdx.dQdx.error"] + + # build the radiusOfInnermostHit variable + num_tracks = len(ret["dEdx"]) + innermost_radius = [] + for itrack in range(num_tracks): + + # select the track states corresponding to itrack + # pick the state AtFirstHit + # https://github.com/key4hep/EDM4hep/blob/fe5a54046a91a7e648d0b588960db7841aebc670/edm4hep.yaml#L220 + ibegin = track_arr[track_coll + "." + "trackStates_begin"][itrack] + iend = track_arr[track_coll + "." + "trackStates_end"][itrack] + + refX = track_arr_trackStates["_SiTracks_Refitted_trackStates" + "." + "referencePoint.x"][ibegin:iend] + refY = track_arr_trackStates["_SiTracks_Refitted_trackStates" + "." 
+ "referencePoint.y"][ibegin:iend] + location = track_arr_trackStates["_SiTracks_Refitted_trackStates" + "." + "location"][ibegin:iend] + + istate = np.argmax(location == 2) # 2 refers to AtFirstHit + + innermost_radius.append(math.sqrt(refX[istate] ** 2 + refY[istate] ** 2)) + + ret["radiusOfInnermostHit"] = np.array(innermost_radius) + + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + n_tr = len(ret["type"]) + + # get the index of the first track state + trackstate_idx = prop_data[track_coll][track_coll + ".trackStates_begin"][iev] + # get the properties of the track at the first track state (at the origin) + for k in ["tanLambda", "D0", "phi", "omega", "Z0", "time"]: + + if dataset == "clic": + ret[k] = awkward.to_numpy(prop_data["SiTracks_1"]["SiTracks_1." + k][iev][trackstate_idx]) + elif dataset == "fcc": + ret[k] = awkward.to_numpy(prop_data["_SiTracks_Refitted_trackStates"]["_SiTracks_Refitted_trackStates." + k][iev][trackstate_idx]) + + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + ret["pt"] = awkward.to_numpy(track_pt(ret["omega"])) + ret["px"] = awkward.to_numpy(np.cos(ret["phi"])) * ret["pt"] + ret["py"] = awkward.to_numpy(np.sin(ret["phi"])) * ret["pt"] + ret["pz"] = awkward.to_numpy(ret["tanLambda"]) * ret["pt"] + ret["p"] = np.sqrt(ret["px"] ** 2 + ret["py"] ** 2 + ret["pz"] ** 2) + cos_theta = np.divide(ret["pz"], ret["p"], where=ret["p"] > 0) + theta = np.arccos(cos_theta) + tt = np.tan(theta / 2.0) + eta = awkward.to_numpy(-np.log(tt, where=tt > 0)) + eta[tt <= 0] = 0.0 + ret["eta"] = eta + + ret["sin_phi"] = np.sin(ret["phi"]) + ret["cos_phi"] = np.cos(ret["phi"]) + + # track is always type 1 + ret["elemtype"] = 1 * np.ones(n_tr, dtype=np.float32) + + return awkward.Record(ret) + + +def filter_adj(adj, all_to_filtered): + i0s_new = [] + i1s_new = [] + ws_new = [] + for i0, i1, w in zip(*adj): + if i0 in all_to_filtered: + i0_new = all_to_filtered[i0] + i0s_new.append(i0_new) + i1s_new.append(i1) + ws_new.append(w) + return np.array(i0s_new), np.array(i1s_new), np.array(ws_new) + + +# loop over status 1 particles and collect the hits of their immediate daughters +# genparticle_to_hit: tuple with 3 arrays (genparticle_indices, hit_indices, weights) +# genparticle_to_trk: tuple with 3 arrays (genparticle_indices, track_indices, weights) +def add_daughters_to_status1(gen_features, genparticle_to_hit, genparticle_to_trk): + mask_status1 = gen_features["generatorStatus"] == 1 + dau_beg = gen_features["daughters_begin"] + dau_end = gen_features["daughters_end"] + dau_ind = gen_features["index"] + genparticle_to_hit_additional_gp = [] + genparticle_to_hit_additional_hit = [] + genparticle_to_hit_additional_w = [] + genparticle_to_trk_additional_gp = [] + genparticle_to_trk_additional_trk = [] + genparticle_to_trk_additional_w = [] + for idx_st1 in np.where(mask_status1)[0]: + pdg = abs(gen_features["PDG"][idx_st1]) + if pdg not in [12, 14, 16]: + db = dau_beg[idx_st1] + de = dau_end[idx_st1] + daus = dau_ind[db:de] + for dau in daus: + dau_hit_idx = genparticle_to_hit[1][genparticle_to_hit[0] == dau] + dau_hit_w = genparticle_to_hit[2][genparticle_to_hit[0] == dau] + for dh_idx, dh_w in zip(dau_hit_idx, dau_hit_w): + genparticle_to_hit_additional_gp.append(idx_st1) + genparticle_to_hit_additional_hit.append(dh_idx) + genparticle_to_hit_additional_w.append(dh_w) + + dau_trk_idx = genparticle_to_trk[1][genparticle_to_trk[0] == dau] + dau_trk_w = 
genparticle_to_trk[2][genparticle_to_trk[0] == dau] + for dt_idx, dt_w in zip(dau_trk_idx, dau_trk_w): + genparticle_to_trk_additional_gp.append(idx_st1) + genparticle_to_trk_additional_trk.append(dt_idx) + genparticle_to_trk_additional_w.append(dt_w) + + genparticle_to_hit = ( + np.concatenate([genparticle_to_hit[0], genparticle_to_hit_additional_gp]), + np.concatenate([genparticle_to_hit[1], genparticle_to_hit_additional_hit]), + np.concatenate([genparticle_to_hit[2], genparticle_to_hit_additional_w]), + ) + genparticle_to_trk = ( + np.concatenate([genparticle_to_trk[0], genparticle_to_trk_additional_gp]), + np.concatenate([genparticle_to_trk[1], genparticle_to_trk_additional_trk]), + np.concatenate([genparticle_to_trk[2], genparticle_to_trk_additional_w]), + ) + return genparticle_to_hit, genparticle_to_trk + + +def get_genparticles_and_adjacencies(dataset, prop_data, hit_data, calohit_links, sitrack_links, iev, collectionIDs): + gen_features = gen_to_features(dataset, prop_data, iev) + hit_features, genparticle_to_hit, hit_idx_local_to_global = get_calohit_matrix_and_genadj(dataset, hit_data, calohit_links, iev, collectionIDs) + hit_to_cluster = hit_cluster_adj(dataset, prop_data, hit_idx_local_to_global, iev) + cluster_features = cluster_to_features(prop_data, hit_features, hit_to_cluster, iev) + track_features = track_to_features(dataset, prop_data, iev) + genparticle_to_trk = genparticle_track_adj(dataset, sitrack_links, iev) + + # collect hits of st=1 daughters to the st=1 particles + mask_status1 = gen_features["generatorStatus"] == 1 + + if gen_features["index"] is not None: # if there are even daughters + genparticle_to_hit, genparticle_to_trk = add_daughters_to_status1(gen_features, genparticle_to_hit, genparticle_to_trk) + + n_gp = awkward.count(gen_features["PDG"]) + n_track = awkward.count(track_features["type"]) + n_hit = awkward.count(hit_features["type"]) + n_cluster = awkward.count(cluster_features["type"]) + + if len(genparticle_to_trk[0]) > 0: + gp_to_track = coo_matrix((genparticle_to_trk[2], (genparticle_to_trk[0], genparticle_to_trk[1])), shape=(n_gp, n_track)).max(axis=1).todense() + else: + gp_to_track = np.zeros((n_gp, 1)) + + gp_to_calohit = coo_matrix((genparticle_to_hit[2], (genparticle_to_hit[0], genparticle_to_hit[1])), shape=(n_gp, n_hit)) + calohit_to_cluster = coo_matrix((hit_to_cluster[2], (hit_to_cluster[0], hit_to_cluster[1])), shape=(n_hit, n_cluster)) + gp_to_cluster = (gp_to_calohit * calohit_to_cluster).sum(axis=1) + + # 20% of the hits of a track must come from the genparticle + gp_in_tracker = np.array(gp_to_track >= 0.2)[:, 0] + + # at least 5% of the energy of the genparticle should be matched to a calorimeter cluster + gp_in_calo = (np.array(gp_to_cluster)[:, 0] / gen_features["energy"]) > 0.05 + + gp_interacted_with_detector = gp_in_tracker | gp_in_calo + + gen_features["gp_to_track"] = np.asarray(gp_to_track)[:, 0] + gen_features["gp_to_cluster"] = np.asarray(gp_to_cluster)[:, 0] + + mask_visible = awkward.to_numpy(mask_status1 & gp_interacted_with_detector) + + idx_all_masked = np.where(mask_visible)[0] + genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} + + if np.array(mask_visible).sum() == 0: + print("event does not have even one 'visible' particle. 
will skip event") + return None + + if len(np.array(mask_visible)) == 1: + # event has only one particle (then index will be empty because no daughters) + gen_features = awkward.Record({feat: (gen_features[feat][mask_visible] if feat != "index" else None) for feat in gen_features.keys()}) + else: + gen_features = awkward.Record({feat: gen_features[feat][mask_visible] for feat in gen_features.keys()}) + + genparticle_to_hit = filter_adj(genparticle_to_hit, genpart_idx_all_to_filtered) + genparticle_to_trk = filter_adj(genparticle_to_trk, genpart_idx_all_to_filtered) + + return EventData( + gen_features, + hit_features, + cluster_features, + track_features, + genparticle_to_hit, + genparticle_to_trk, + hit_to_cluster, + ([], []), + ) + + +def assign_genparticles_to_obj_and_merge(gpdata): + + n_gp = awkward.count(gpdata.gen_features["PDG"]) + n_track = awkward.count(gpdata.track_features["type"]) + n_hit = awkward.count(gpdata.hit_features["type"]) + n_cluster = awkward.count(gpdata.cluster_features["type"]) + + gp_to_track = np.array( + coo_matrix( + (gpdata.genparticle_to_track[2], (gpdata.genparticle_to_track[0], gpdata.genparticle_to_track[1])), + shape=(n_gp, n_track), + ).todense() + ) + + gp_to_calohit = coo_matrix((gpdata.genparticle_to_hit[2], (gpdata.genparticle_to_hit[0], gpdata.genparticle_to_hit[1])), shape=(n_gp, n_hit)) + calohit_to_cluster = coo_matrix((gpdata.hit_to_cluster[2], (gpdata.hit_to_cluster[0], gpdata.hit_to_cluster[1])), shape=(n_hit, n_cluster)) + + gp_to_cluster = np.array((gp_to_calohit * calohit_to_cluster).todense()) + + # map each genparticle to a track or a cluster + gp_to_obj = -1 * np.ones((n_gp, 2), dtype=np.int32) + set_used_tracks = set([]) + set_used_clusters = set([]) + gps_sorted_energy = sorted(range(n_gp), key=lambda x: gpdata.gen_features["energy"][x], reverse=True) + + for igp in gps_sorted_energy: + + # first check if we can match the genparticle to a track + matched_tracks = gp_to_track[igp] + trks = np.where(matched_tracks)[0] + trks = sorted(trks, key=lambda x: matched_tracks[x], reverse=True) + for trk in trks: + # if the track was not already used for something else + if trk not in set_used_tracks: + gp_to_obj[igp, 0] = trk + set_used_tracks.add(trk) + break + + # if there was no matched track, try a cluster + if gp_to_obj[igp, 0] == -1: + matched_clusters = gp_to_cluster[igp] + clusters = np.where(matched_clusters)[0] + clusters = sorted(clusters, key=lambda x: matched_clusters[x], reverse=True) + for cl in clusters: + if cl not in set_used_clusters: + gp_to_obj[igp, 1] = cl + set_used_clusters.add(cl) + break + + # the genparticles that could not be matched to a track or cluster are merged to the closest genparticle + unmatched = np.where((gp_to_obj[:, 0] == -1) & (gp_to_obj[:, 1] == -1))[0] + mask_gp_unmatched = np.ones(n_gp, dtype=bool) + + pt_arr = np.array(awkward.to_numpy(gpdata.gen_features["pt"])) + eta_arr = np.array(awkward.to_numpy(gpdata.gen_features["eta"])) + phi_arr = np.array(awkward.to_numpy(gpdata.gen_features["phi"])) + energy_arr = np.array(awkward.to_numpy(gpdata.gen_features["energy"])) + + # now merge unmatched genparticles to their closest genparticle + gp_merges_gp0 = [] + gp_merges_gp1 = [] + for igp_unmatched in unmatched: + mask_gp_unmatched[igp_unmatched] = False + + # find closest cluster that this particle is matched to + idx_best_cluster = np.argmax(gp_to_cluster[igp_unmatched]) + # get the first genparticle matched to that cluster + idx_gp_bestcluster = np.where(gp_to_obj[:, 1] == idx_best_cluster)[0] + + # 
if the genparticle is not matched to any cluster, then it left a few hits to some other track + # this is rare, happens only for low-pT particles and we don't want to try to reconstruct it + if len(idx_gp_bestcluster) != 1: + print("unmatched pt=", pt_arr[igp_unmatched]) + continue + + idx_gp_bestcluster = idx_gp_bestcluster[0] + + gp_merges_gp0.append(idx_gp_bestcluster) + gp_merges_gp1.append(igp_unmatched) + + vec0 = vector.obj( + pt=gpdata.gen_features["pt"][igp_unmatched], + eta=gpdata.gen_features["eta"][igp_unmatched], + phi=gpdata.gen_features["phi"][igp_unmatched], + e=gpdata.gen_features["energy"][igp_unmatched], + ) + vec1 = vector.obj( + pt=gpdata.gen_features["pt"][idx_gp_bestcluster], + eta=gpdata.gen_features["eta"][idx_gp_bestcluster], + phi=gpdata.gen_features["phi"][idx_gp_bestcluster], + e=gpdata.gen_features["energy"][idx_gp_bestcluster], + ) + vec = vec0 + vec1 + pt_arr[idx_gp_bestcluster] = vec.pt + eta_arr[idx_gp_bestcluster] = vec.eta + phi_arr[idx_gp_bestcluster] = vec.phi + energy_arr[idx_gp_bestcluster] = vec.energy + + gen_features_new = { + "PDG": np.abs(gpdata.gen_features["PDG"][mask_gp_unmatched]), + "charge": gpdata.gen_features["charge"][mask_gp_unmatched], + "pt": pt_arr[mask_gp_unmatched], + "eta": eta_arr[mask_gp_unmatched], + "sin_phi": np.sin(phi_arr[mask_gp_unmatched]), + "cos_phi": np.cos(phi_arr[mask_gp_unmatched]), + "energy": energy_arr[mask_gp_unmatched], + "ispu": gpdata.gen_features["ispu"][mask_gp_unmatched], + "generatorStatus": gpdata.gen_features["generatorStatus"][mask_gp_unmatched], + "simulatorStatus": gpdata.gen_features["simulatorStatus"][mask_gp_unmatched], + "gp_to_track": gpdata.gen_features["gp_to_track"][mask_gp_unmatched], + "gp_to_cluster": gpdata.gen_features["gp_to_cluster"][mask_gp_unmatched], + "jet_idx": gpdata.gen_features["jet_idx"][mask_gp_unmatched], + } + assert (np.sum(gen_features_new["energy"]) - np.sum(gpdata.gen_features["energy"])) < 1e-2 + + idx_all_masked = np.where(mask_gp_unmatched)[0] + genpart_idx_all_to_filtered = {idx_all: idx_filtered for idx_filtered, idx_all in enumerate(idx_all_masked)} + genparticle_to_hit = filter_adj(gpdata.genparticle_to_hit, genpart_idx_all_to_filtered) + genparticle_to_track = filter_adj(gpdata.genparticle_to_track, genpart_idx_all_to_filtered) + gp_to_obj = gp_to_obj[mask_gp_unmatched] + + return ( + EventData( + gen_features_new, + gpdata.hit_features, + gpdata.cluster_features, + gpdata.track_features, + genparticle_to_hit, + genparticle_to_track, + gpdata.hit_to_cluster, + (gp_merges_gp0, gp_merges_gp1), + ), + gp_to_obj, + ) + + +# for each PF element (track, cluster), get the index of the best-matched particle (gen or reco) +# if the PF element has no best-matched particle, returns -1 +def assign_to_recoobj(n_obj, obj_to_ptcl, used_particles): + obj_to_ptcl_all = -1 * np.ones(n_obj, dtype=np.int64) + for iobj in range(n_obj): + if iobj in obj_to_ptcl: + iptcl = obj_to_ptcl[iobj] + obj_to_ptcl_all[iobj] = iptcl + assert used_particles[iptcl] == 0 + used_particles[iptcl] = 1 + return obj_to_ptcl_all + + +def get_recoptcl_to_obj(n_rps, reco_arr, idx_rp_to_track, idx_rp_to_cluster): + track_to_rp = {} + cluster_to_rp = {} + + # loop over the reco particles + for irp in range(n_rps): + assigned = False + + # find and loop over tracks associated to the reco particle + trks_begin = reco_arr["tracks_begin"][irp] + trks_end = reco_arr["tracks_end"][irp] + for itrk in range(trks_begin, trks_end): + + # get the index of the track in the track collection + itrk_real = 
idx_rp_to_track[itrk] + assert itrk_real not in track_to_rp + track_to_rp[itrk_real] = irp + assigned = True + + # only look for clusters if tracks were not found + if not assigned: + + # find and loop over clusters associated to the reco particle + cls_begin = reco_arr["clusters_begin"][irp] + cls_end = reco_arr["clusters_end"][irp] + for icls in range(cls_begin, cls_end): + + # get the index of the cluster in the cluster collection + icls_real = idx_rp_to_cluster[icls] + assert icls_real not in cluster_to_rp + cluster_to_rp[icls_real] = irp + return track_to_rp, cluster_to_rp + + +def get_reco_properties(dataset, prop_data, iev): + + if dataset == "clic": + reco_arr = prop_data["MergedRecoParticles"][iev] + reco_arr = {k.replace("MergedRecoParticles.", ""): reco_arr[k] for k in reco_arr.fields} + elif dataset == "fcc": + reco_arr = prop_data["PandoraPFOs"][iev] + reco_arr = {k.replace("PandoraPFOs.", ""): reco_arr[k] for k in reco_arr.fields} + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + reco_p4 = vector.awk( + awkward.zip({"mass": reco_arr["mass"], "x": reco_arr["momentum.x"], "y": reco_arr["momentum.y"], "z": reco_arr["momentum.z"]}) + ) + reco_arr["pt"] = reco_p4.pt + reco_arr["eta"] = reco_p4.eta + reco_arr["phi"] = reco_p4.phi + reco_arr["energy"] = reco_p4.energy + + if dataset == "clic": + msk = reco_arr["type"] != 0 + elif dataset == "fcc": + msk = reco_arr["PDG"] != 0 + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + reco_arr = awkward.Record({k: reco_arr[k][msk] for k in reco_arr.keys()}) + return reco_arr + + +def get_particle_feature_matrix(pfelem_to_particle, feature_dict, features): + feats = [] + for feat in features: + feat_arr = feature_dict[feat] + if len(feat_arr) == 0: + feat_arr_reordered = feat_arr + else: + feat_arr_reordered = awkward.to_numpy(feat_arr[pfelem_to_particle]) + feat_arr_reordered[pfelem_to_particle == -1] = 0.0 + feats.append(feat_arr_reordered) + feats = np.array(feats) + return feats.T + + +def get_feature_matrix(feature_dict, features): + feats = [] + for feat in features: + feat_arr = awkward.to_numpy(feature_dict[feat]) + feats.append(feat_arr) + feats = np.array(feats) + return feats.T + + +def get_p4(part, prefix="MCParticles"): + p4_x = part[prefix + ".momentum.x"] + p4_y = part[prefix + ".momentum.y"] + p4_z = part[prefix + ".momentum.z"] + p4_mass = part[prefix + ".mass"] + + p4 = vector.awk( + awkward.zip( + { + "mass": p4_mass, + "px": p4_x, + "py": p4_y, + "pz": p4_z, + } + ) + ) + + return p4 + + +def compute_met(p4): + sum_px = awkward.sum(p4.px, axis=1) + sum_py = awkward.sum(p4.py, axis=1) + met = np.sqrt(sum_px**2 + sum_py**2) + return met + + +def compute_jets(particles_p4, min_pt=jet_ptcut, with_indices=False): + cluster = fastjet.ClusterSequence(particles_p4, jetdef) + jets = vector.awk(cluster.inclusive_jets(min_pt=min_pt)) + jets = vector.awk(awkward.zip({"energy": jets["t"], "px": jets["x"], "py": jets["y"], "pz": jets["z"]})) + jets = awkward.Array({"pt": jets.pt, "eta": jets.eta, "phi": jets.phi, "energy": jets.energy}) + ret = jets + if with_indices: + indices = cluster.constituent_index(min_pt=min_pt) + ret = jets, indices + return ret + + +def process_one_file(fn, ofn, dataset): + + # output exists, do not recreate + if os.path.isfile(ofn): + print("{} exists".format(ofn)) + return + + print("loading {}".format(fn)) + fi = uproot.open(fn) + arrs = fi["events"] + + if dataset == "clic": + 
collectionIDs = { + k: v + for k, v in zip( + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_names"][0], + fi.get("metadata").arrays("CollectionIDs")["CollectionIDs"]["m_collectionIDs"][0], + ) + } + prop_data = arrs.arrays( + [ + "MCParticles.PDG", + "MCParticles.momentum.x", + "MCParticles.momentum.y", + "MCParticles.momentum.z", + "MCParticles.mass", + "MCParticles.charge", + "MCParticles.generatorStatus", + "MCParticles.simulatorStatus", + "MCParticles.daughters_begin", + "MCParticles.daughters_end", + "MCParticles#1.index", + track_coll, + "SiTracks_1", + "PandoraClusters", + "PandoraClusters#1", + "PandoraClusters#0", + "MergedRecoParticles", + ] + ) + calohit_links = arrs.arrays( + [ + "CalohitMCTruthLink.weight", + "CalohitMCTruthLink#0.index", + "CalohitMCTruthLink#0.collectionID", + "CalohitMCTruthLink#1.index", + "CalohitMCTruthLink#1.collectionID", + ] + ) + sitrack_links = arrs.arrays( + [ + "SiTracksMCTruthLink.weight", + "SiTracksMCTruthLink#0.index", + "SiTracksMCTruthLink#0.collectionID", + "SiTracksMCTruthLink#1.index", + "SiTracksMCTruthLink#1.collectionID", + ] + ) + # maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) + # to the index in the track/cluster collection + idx_rp_to_cluster = arrs["MergedRecoParticles#0/MergedRecoParticles#0.index"].array() + idx_rp_to_track = arrs["MergedRecoParticles#1/MergedRecoParticles#1.index"].array() + + hit_data = { + "ECALBarrel": arrs["ECALBarrel"].array(), + "ECALEndcap": arrs["ECALEndcap"].array(), + "ECALOther": arrs["ECALOther"].array(), + "HCALBarrel": arrs["HCALBarrel"].array(), + "HCALEndcap": arrs["HCALEndcap"].array(), + "HCALOther": arrs["HCALOther"].array(), + "MUON": arrs["MUON"].array(), + } + elif dataset == "fcc": + collectionIDs = { + k: v + for k, v in zip( + fi.get("podio_metadata").arrays("events___idTable/m_names")["events___idTable/m_names"][0], + fi.get("podio_metadata").arrays("events___idTable/m_collectionIDs")["events___idTable/m_collectionIDs"][0], + ) + } + prop_data = arrs.arrays( + [ + mc_coll, + "MCParticles.PDG", + "MCParticles.momentum.x", + "MCParticles.momentum.y", + "MCParticles.momentum.z", + "MCParticles.mass", + "MCParticles.charge", + "MCParticles.generatorStatus", + "MCParticles.simulatorStatus", + "MCParticles.daughters_begin", + "MCParticles.daughters_end", + "_MCParticles_daughters/_MCParticles_daughters.index", # similar to "MCParticles#1.index" in clic + track_coll, + "_SiTracks_Refitted_trackStates", + "PandoraClusters", + "_PandoraClusters_hits/_PandoraClusters_hits.index", + "_PandoraClusters_hits/_PandoraClusters_hits.collectionID", + "PandoraPFOs", + "SiTracks_Refitted_dQdx", + ] + ) + calohit_links = arrs.arrays( + [ + "CalohitMCTruthLink.weight", + "_CalohitMCTruthLink_to/_CalohitMCTruthLink_to.collectionID", + "_CalohitMCTruthLink_to/_CalohitMCTruthLink_to.index", + "_CalohitMCTruthLink_from/_CalohitMCTruthLink_from.collectionID", + "_CalohitMCTruthLink_from/_CalohitMCTruthLink_from.index", + ] + ) + sitrack_links = arrs.arrays( + [ + "SiTracksMCTruthLink.weight", + "_SiTracksMCTruthLink_to/_SiTracksMCTruthLink_to.collectionID", + "_SiTracksMCTruthLink_to/_SiTracksMCTruthLink_to.index", + "_SiTracksMCTruthLink_from/_SiTracksMCTruthLink_from.collectionID", + "_SiTracksMCTruthLink_from/_SiTracksMCTruthLink_from.index", + ] + ) + + # maps the recoparticle track/cluster index (in tracks_begin,end and clusters_begin,end) + # to the index in the track/cluster collection + idx_rp_to_cluster = 
arrs["_PandoraPFOs_clusters/_PandoraPFOs_clusters.index"].array() + idx_rp_to_track = arrs["_PandoraPFOs_tracks/_PandoraPFOs_tracks.index"].array() + + hit_data = { + "ECALBarrel": arrs["ECALBarrel"].array(), + "ECALEndcap": arrs["ECALEndcap"].array(), + "HCALBarrel": arrs["HCALBarrel"].array(), + "HCALEndcap": arrs["HCALEndcap"].array(), + "HCALOther": arrs["HCALOther"].array(), + "MUON": arrs["MUON"].array(), + } + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + # Compute truth MET and jets from status=1 pythia particles + mc_pdg = np.abs(prop_data["MCParticles.PDG"]) + mc_st1_mask = (prop_data["MCParticles.generatorStatus"] == 1) & (mc_pdg != 12) & (mc_pdg != 14) & (mc_pdg != 16) + mc_st1_p4 = vector.awk( + awkward.zip( + { + "px": prop_data["MCParticles.momentum.x"][mc_st1_mask], + "py": prop_data["MCParticles.momentum.y"][mc_st1_mask], + "pz": prop_data["MCParticles.momentum.z"][mc_st1_mask], + "mass": prop_data["MCParticles.mass"][mc_st1_mask], + } + ) + ) + met_st1 = compute_met(mc_st1_p4) + genjets_st1 = compute_jets(mc_st1_p4) + + ret = [] + for iev in tqdm.tqdm(range(arrs.num_entries), total=arrs.num_entries): + + # get the reco particles + reco_arr = get_reco_properties(dataset, prop_data, iev) + + if dataset == "clic": + reco_type = np.abs(reco_arr["type"]) + elif dataset == "fcc": + reco_type = np.abs(reco_arr["PDG"]) + else: + raise Exception("--dataset provided is not supported. Only 'fcc' or 'clic' are supported atm.") + + n_rps = len(reco_type) + reco_features = awkward.Record( + { + "PDG": np.abs(reco_type), + "charge": reco_arr["charge"], + "pt": reco_arr["pt"], + "eta": reco_arr["eta"], + "sin_phi": np.sin(reco_arr["phi"]), + "cos_phi": np.cos(reco_arr["phi"]), + "energy": reco_arr["energy"], + "ispu": np.zeros(len(reco_type)), + "generatorStatus": np.zeros(len(reco_type)), + "simulatorStatus": np.zeros(len(reco_type)), + "gp_to_track": np.zeros(len(reco_type)), + "gp_to_cluster": np.zeros(len(reco_type)), + "jet_idx": np.zeros(len(reco_type)), + } + ) + + # get the genparticles and the links between genparticles and tracks/clusters + gpdata = get_genparticles_and_adjacencies( + dataset, + prop_data, + hit_data, + calohit_links, + sitrack_links, + iev, + collectionIDs, + ) + if gpdata is None: + continue + + # find the reconstructable genparticles and associate them to the best track/cluster + gpdata_cleaned, gp_to_obj = assign_genparticles_to_obj_and_merge(gpdata) + + n_tracks = len(gpdata_cleaned.track_features["type"]) + n_clusters = len(gpdata_cleaned.cluster_features["type"]) + n_gps = len(gpdata_cleaned.gen_features["PDG"]) + + assert len(gp_to_obj) == len(gpdata_cleaned.gen_features["PDG"]) + assert gp_to_obj.shape[1] == 2 + + # for each reco particle, find the tracks and clusters associated with it + # construct track/cluster -> recoparticle maps + track_to_rp, cluster_to_rp = get_recoptcl_to_obj(n_rps, reco_arr, idx_rp_to_track[iev], idx_rp_to_cluster[iev]) + + # get the track/cluster -> genparticle map + track_to_gp = {itrk: igp for igp, itrk in enumerate(gp_to_obj[:, 0]) if itrk != -1} + cluster_to_gp = {icl: igp for igp, icl in enumerate(gp_to_obj[:, 1]) if icl != -1} + + used_gps = np.zeros(n_gps, dtype=np.int64) + track_to_gp_all = assign_to_recoobj(n_tracks, track_to_gp, used_gps) + cluster_to_gp_all = assign_to_recoobj(n_clusters, cluster_to_gp, used_gps) + # all genparticles must be assigned to some PFElement + assert np.all(used_gps == 1) + + used_rps = np.zeros(n_rps, dtype=np.int64) + 
track_to_rp_all = assign_to_recoobj(n_tracks, track_to_rp, used_rps) + cluster_to_rp_all = assign_to_recoobj(n_clusters, cluster_to_rp, used_rps) + # all reco particles must be assigned to some PFElement + assert np.all(used_rps == 1) + + gps_track = get_particle_feature_matrix(track_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) + gps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(gps_track[:, 0], gps_track[:, 1])]) + gps_cluster = get_particle_feature_matrix(cluster_to_gp_all, gpdata_cleaned.gen_features, particle_feature_order) + gps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(gps_cluster[:, 0], gps_cluster[:, 1])]) + gps_cluster[:, 1] = 0 + + rps_track = get_particle_feature_matrix(track_to_rp_all, reco_features, particle_feature_order) + rps_track[:, 0] = np.array([map_neutral_to_charged(map_pdgid_to_candid(p, c)) for p, c in zip(rps_track[:, 0], rps_track[:, 1])]) + rps_cluster = get_particle_feature_matrix(cluster_to_rp_all, reco_features, particle_feature_order) + rps_cluster[:, 0] = np.array([map_charged_to_neutral(map_pdgid_to_candid(p, c)) for p, c in zip(rps_cluster[:, 0], rps_cluster[:, 1])]) + rps_cluster[:, 1] = 0 + + # all initial gen/reco particle energy must be reconstructable + assert abs(np.sum(gps_track[:, 6]) + np.sum(gps_cluster[:, 6]) - np.sum(gpdata_cleaned.gen_features["energy"])) < 1e-2 + + assert abs(np.sum(rps_track[:, 6]) + np.sum(rps_cluster[:, 6]) - np.sum(reco_features["energy"])) < 1e-2 + + # we don't want to try to reconstruct charged particles from primary clusters, make sure the charge is 0 + assert np.all(gps_cluster[:, 1] == 0) + assert np.all(rps_cluster[:, 1] == 0) + + X_track = get_feature_matrix(gpdata_cleaned.track_features, track_feature_order) + X_cluster = get_feature_matrix(gpdata_cleaned.cluster_features, cluster_feature_order) + ytarget_track = gps_track + ytarget_cluster = gps_cluster + ycand_track = rps_track + ycand_cluster = rps_cluster + + sanitize(X_track) + sanitize(X_cluster) + sanitize(ytarget_track) + sanitize(ytarget_cluster) + sanitize(ycand_track) + sanitize(ycand_cluster) + + # cluster target particles to jets, save per-particle jet index + ytarget = np.concatenate([ytarget_track, ytarget_cluster], axis=0) + ytarget_constituents = -1 * np.ones(len(ytarget), dtype=np.int64) + valid = ytarget[:, 0] != 0 + # save mapping of index after masking -> index before masking as numpy array + # inspired from: + # https://stackoverflow.com/questions/432112/1044443#comment54747416_1044443 + cumsum = np.cumsum(valid) - 1 + _, index_mapping = np.unique(cumsum, return_index=True) + ytarget = ytarget[valid] + ytarget_p4 = ytarget[:, 2:7] + ytarget_p4 = vector.awk( + awkward.zip( + { + "pt": ytarget_p4[:, 0], + "eta": ytarget_p4[:, 1], + "phi": np.arctan2(ytarget_p4[:, 2], ytarget_p4[:, 3]), + "energy": ytarget_p4[:, 4], + } + ) + ) + target_jets, target_jets_indices = compute_jets(ytarget_p4, with_indices=True) + sorted_jet_idx = awkward.argsort(target_jets.pt, axis=-1, ascending=False).to_list() + target_jets_indices = target_jets_indices.to_list() + for jet_idx in sorted_jet_idx: + jet_constituents = [index_mapping[idx] for idx in target_jets_indices[jet_idx]] # map back to constituent index *before* masking + ytarget_constituents[jet_constituents] = jet_idx + ytarget_track_constituents = ytarget_constituents[: len(ytarget_track)] + ytarget_cluster_constituents = ytarget_constituents[len(ytarget_track) :] + ytarget_track[:, 
particle_feature_order.index("jet_idx")] = ytarget_track_constituents + ytarget_cluster[:, particle_feature_order.index("jet_idx")] = ytarget_cluster_constituents + + this_ev = awkward.Record( + { + "X_track": X_track, + "X_cluster": X_cluster, + "ytarget_track": ytarget_track, + "ytarget_cluster": ytarget_cluster, + "ycand_track": ycand_track, + "ycand_cluster": ycand_cluster, + "genmet": met_st1[iev], + "genjet": get_feature_matrix(genjets_st1[iev], ["pt", "eta", "phi", "energy"]), + "targetjet": get_feature_matrix(target_jets, ["pt", "eta", "phi", "energy"]), + } + ) + ret.append(this_ev) + + ret = awkward.Record({k: awkward.from_iter([r[k] for r in ret]) for k in ret[0].fields}) + awkward.to_parquet(ret, ofn) + + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, help="Input file ROOT file", required=True) + parser.add_argument("--outpath", type=str, default="raw", help="output path") + parser.add_argument("--dataset", type=str, help="Which detector dataset?", required=True, choices=["clic", "fcc"]) + + args = parser.parse_args() + return args + + +def process(args): + + if os.path.isdir(args.input) is True: + print("Will process all files in " + args.input) + + flist = glob.glob(args.input + "/*.root") + for infile in flist: + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile, args.dataset) + else: + infile = args.input + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile, args.dataset) + + +if __name__ == "__main__": + args = parse_args() + process(args) diff --git a/mlpf/heptfds/cld_pf_edm4hep/ttbar.py b/mlpf/heptfds/cld_pf_edm4hep/ttbar.py index 64a781838..fd0dc776c 100644 --- a/mlpf/heptfds/cld_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/cld_pf_edm4hep/ttbar.py @@ -1,7 +1,9 @@ from pathlib import Path import tensorflow as tf -from mlpf.heptfds.clic_pf_edm4hep.utils_edm import ( +import tensorflow_datasets as tfds +from utils_edm import ( + NUM_SPLITS, X_FEATURES_CL, X_FEATURES_TRK, Y_FEATURES, @@ -9,8 +11,6 @@ split_sample, ) -import tensorflow_datasets as tfds - _DESCRIPTION = """ CLD EDM4HEP dataset with ee -> ttbar at 365 GeV. - X: reconstructed tracks and clusters, variable number N per event @@ -24,9 +24,11 @@ class CldEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.0.0") + VERSION = tfds.core.Version("2.5.0") RELEASE_NOTES = { "2.0.0": "Initial release", + "2.3.0": "Fix target/truth momentum, st=1 more inclusive: PR352", + "2.5.0": "Use 10 splits, skip 2.4.0 to unify with CMS datasets", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. @@ -35,6 +37,9 @@ class CldEdmTtbarPf(tfds.core.GeneratorBasedBuilder): rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cld_edm4hep/ ./ """ + # create configs 1 ... 
NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, NUM_SPLITS + 1)] + def __init__(self, *args, **kwargs): kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD super(CldEdmTtbarPf, self).__init__(*args, **kwargs) @@ -53,10 +58,11 @@ def _info(self) -> tfds.core.DatasetInfo: ), dtype=tf.float32, ), - "ygen": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), "genmet": tfds.features.Scalar(dtype=tf.float32), "genjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), + "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), } ), supervised_keys=None, @@ -71,7 +77,7 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): path = dl_manager.manual_dir - return split_sample(Path(path / "p8_ee_tt_ecm365")) + return split_sample(Path(path / "p8_ee_tt_ecm365"), self.builder_config, num_splits=NUM_SPLITS) def _generate_examples(self, files): return generate_examples(files) diff --git a/mlpf/heptfds/cld_pf_edm4hep/utils_edm.py b/mlpf/heptfds/cld_pf_edm4hep/utils_edm.py new file mode 100644 index 000000000..9cf0828e1 --- /dev/null +++ b/mlpf/heptfds/cld_pf_edm4hep/utils_edm.py @@ -0,0 +1,222 @@ +import awkward as ak +import numpy as np +import random + +NUM_SPLITS = 10 + +# from fcc/postprocessing.py +X_FEATURES_TRK = [ + "elemtype", + "pt", + "eta", + "sin_phi", + "cos_phi", + "p", + "chi2", + "ndf", + "dEdx", + "dEdxError", + "radiusOfInnermostHit", + "tanLambda", + "D0", + "omega", + "Z0", + "time", +] +X_FEATURES_CL = [ + "elemtype", + "et", + "eta", + "sin_phi", + "cos_phi", + "energy", + "position.x", + "position.y", + "position.z", + "iTheta", + "energy_ecal", + "energy_hcal", + "energy_other", + "num_hits", + "sigma_x", + "sigma_y", + "sigma_z", +] + +Y_FEATURES = [ + "PDG", + "charge", + "pt", + "eta", + "sin_phi", + "cos_phi", + "energy", + "ispu", + "generatorStatus", + "simulatorStatus", + "gp_to_track", + "gp_to_cluster", + "jet_idx", +] +labels = [0, 211, 130, 22, 11, 13] + +N_X_FEATURES = max(len(X_FEATURES_CL), len(X_FEATURES_TRK)) +N_Y_FEATURES = len(Y_FEATURES) + + +def split_list(lst, x): + # Calculate the size of each sublist (except potentially the last) + sublist_size = len(lst) // x + + # Create x-1 sublists of equal size + result = [lst[i * sublist_size : (i + 1) * sublist_size] for i in range(x - 1)] + + # Add the remaining elements to the last sublist + result.append(lst[(x - 1) * sublist_size :]) + + return result + + +def split_sample(path, builder_config, num_splits=NUM_SPLITS, test_frac=0.9): + files = sorted(list(path.glob("*.parquet"))) + print("Found {} files in {}".format(len(files), path)) + assert len(files) > 0 + idx_split = int(test_frac * len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert len(files_train) > 0 + assert len(files_test) > 0 + + split_index = int(builder_config.name) - 1 + files_train_split = split_list(files_train, num_splits) + files_test_split = split_list(files_test, num_splits) + + return { + "train": generate_examples(files_train_split[split_index]), + "test": generate_examples(files_test_split[split_index]), + } + + +# merge and shuffle several samples (e.g. 
e+, e-), split into test/train +def split_sample_several(paths, builder_config, num_splits=NUM_SPLITS, test_frac=0.9): + files = sum([list(path.glob("*.parquet")) for path in paths], []) + random.shuffle(files) + print("Found {} files".format(len(files))) + assert len(files) > 0 + idx_split = int(test_frac * len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert len(files_train) > 0 + assert len(files_test) > 0 + + split_index = int(builder_config.name) - 1 + files_train_split = split_list(files_train, num_splits) + files_test_split = split_list(files_test, num_splits) + + return { + "train": generate_examples(files_train_split[split_index]), + "test": generate_examples(files_test_split[split_index]), + } + + +def prepare_data_clic(fn): + ret = ak.from_parquet(fn) + X_track = ret["X_track"] + X_cluster = ret["X_cluster"] + + assert len(X_track) == len(X_cluster) + nev = len(X_track) + + Xs = [] + ytargets = [] + ycands = [] + genmets = [] + genjets = [] + targetjets = [] + for iev in range(nev): + + X1 = ak.to_numpy(X_track[iev]) + X2 = ak.to_numpy(X_cluster[iev]) + + if len(X1) == 0 and len(X2) == 0: + continue + + if len(X1) == 0: + X1 = np.zeros((0, N_X_FEATURES)) + if len(X2) == 0: + X2 = np.zeros((0, N_X_FEATURES)) + + ytarget_track = ak.to_numpy(ret["ytarget_track"][iev]) + ytarget_cluster = ak.to_numpy(ret["ytarget_cluster"][iev]) + ycand_track = ak.to_numpy(ret["ycand_track"][iev]) + ycand_cluster = ak.to_numpy(ret["ycand_cluster"][iev]) + + genmet = ak.to_numpy(ret["genmet"][iev]) + genjet = ak.to_numpy(ret["genjet"][iev]) + targetjet = ak.to_numpy(ret["targetjet"][iev]) + + if len(ytarget_track) == 0 and len(ytarget_cluster) == 0: + continue + + if len(genjet) == 0: + genjet = np.zeros((0, 4), dtype=np.float32) + if len(targetjet) == 0: + targetjet = np.zeros((0, 4), dtype=np.float32) + + # in case the event had no track or cluster, create the right shapes + if len(ytarget_track) == 0: + ytarget_track = np.zeros((0, N_Y_FEATURES)) + if len(ytarget_cluster) == 0: + ytarget_cluster = np.zeros((0, N_Y_FEATURES)) + if len(ycand_track) == 0: + ycand_track = np.zeros((0, N_Y_FEATURES)) + if len(ycand_cluster) == 0: + ycand_cluster = np.zeros((0, N_Y_FEATURES)) + + # pad feature dim between tracks and clusters to the same size + if X1.shape[1] < N_X_FEATURES: + X1 = np.pad(X1, [[0, 0], [0, N_X_FEATURES - X1.shape[1]]]) + if X2.shape[1] < N_X_FEATURES: + X2 = np.pad(X2, [[0, 0], [0, N_X_FEATURES - X2.shape[1]]]) + + # concatenate tracks and clusters in features and targets + X = np.concatenate([X1, X2]) + ytarget = np.concatenate([ytarget_track, ytarget_cluster]) + ycand = np.concatenate([ycand_track, ycand_cluster]) + + # this should not happen + if (ytarget.shape[0] != X.shape[0]) or (ycand.shape[0] != X.shape[0]): + print("Shape mismatch:", X.shape, ytarget.shape, ycand.shape) + continue + # raise Exception("Shape mismatch") + + # replace PID with index in labels array + arr = np.array([labels.index(p) for p in ytarget[:, 0]]) + ytarget[:, 0][:] = arr[:] + arr = np.array([labels.index(p) for p in ycand[:, 0]]) + ycand[:, 0][:] = arr[:] + + Xs.append(X) + ytargets.append(ytarget) + ycands.append(ycand) + genmets.append(genmet) + genjets.append(genjet) + targetjets.append(targetjet) + return Xs, ytargets, ycands, genmets, genjets, targetjets + + +def generate_examples(files): + for fi in files: + Xs, ytargets, ycands, genmets, genjets, targetjets = prepare_data_clic(fi) + for iev in range(len(Xs)): + gm = genmets[iev][0] + gj = genjets[iev] + tj = 
targetjets[iev] + yield str(fi) + "_" + str(iev), { + "X": Xs[iev].astype(np.float32), + "ytarget": ytargets[iev].astype(np.float32), + "ycand": ycands[iev].astype(np.float32), + "genmet": gm, + "genjets": gj.astype(np.float32), + "targetjets": tj.astype(np.float32), + } diff --git a/mlpf/heptfds/clic_pf_edm4hep/gamma.py b/mlpf/heptfds/clic_pf_edm4hep/gamma.py new file mode 100644 index 000000000..5d7e8360e --- /dev/null +++ b/mlpf/heptfds/clic_pf_edm4hep/gamma.py @@ -0,0 +1,82 @@ +from pathlib import Path + +import tensorflow as tf +import tensorflow_datasets as tfds +from utils_edm import ( + NUM_SPLITS, + X_FEATURES_CL, + X_FEATURES_TRK, + Y_FEATURES, + generate_examples, + split_sample, +) + +_DESCRIPTION = """ +CLIC EDM4HEP dataset with single photon gun samples. + - X: reconstructed tracks and clusters, variable number N per event + - ygen: stable generator particles, zero-padded to N per event + - ycand: baseline particle flow particles, zero-padded to N per event +""" + +_CITATION = """ +Pata, Joosep, Wulff, Eric, Duarte, Javier, Mokhtar, Farouk, Zhang, Mengke, Girone, Maria, & Southwick, David. (2023). +Simulated datasets for detector and particle flow reconstruction: CLIC detector (1.1) [Data set]. +Zenodo. https://doi.org/10.5281/zenodo.8260741 +""" + + +class ClicEdmGamma(tfds.core.GeneratorBasedBuilder): + VERSION = tfds.core.Version("2.5.0") + RELEASE_NOTES = { + "2.5.0": "Use 10 splits, skip 2.4.0 to unify with CMS datasets", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + For the raw input files in ROOT EDM4HEP format, please see the citation above. + + The processed tensorflow_dataset can also be downloaded from: + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/clic_edm4hep/ ./ + """ + + # create configs 1 ... NUM_SPLITS + 1 that allow to parallelize the dataset building + BUILDER_CONFIGS = [tfds.core.BuilderConfig(name=str(group)) for group in range(1, NUM_SPLITS + 1)] + + def __init__(self, *args, **kwargs): + kwargs["file_format"] = tfds.core.FileFormat.ARRAY_RECORD + super(ClicEdmGamma, self).__init__(*args, **kwargs) + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor( + shape=( + None, + max(len(X_FEATURES_TRK), len(X_FEATURES_CL)), + ), + dtype=tf.float32, + ), + "ytarget": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(None, len(Y_FEATURES)), dtype=tf.float32), + "genmet": tfds.features.Scalar(dtype=tf.float32), + "genjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), + "targetjets": tfds.features.Tensor(shape=(None, 4), dtype=tf.float32), + } + ), + homepage="https://github.com/jpata/particleflow", + citation=_CITATION, + metadata=tfds.core.MetadataDict( + x_features_track=X_FEATURES_TRK, + x_features_cluster=X_FEATURES_CL, + y_features=Y_FEATURES, + ), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + path = dl_manager.manual_dir + return split_sample(Path(path / "gamma//"), self.builder_config, num_splits=NUM_SPLITS) + + def _generate_examples(self, files): + return generate_examples(files) diff --git a/mlpf/model/PFDataset.py b/mlpf/model/PFDataset.py index 57802023f..ea7d5f2fc 100644 --- a/mlpf/model/PFDataset.py +++ b/mlpf/model/PFDataset.py @@ -1,14 +1,13 @@ +import sys from types import SimpleNamespace +import numpy as np import tensorflow_datasets as tfds import torch 
 import torch.utils.data

 from mlpf.model.logger import _logger

-import numpy as np
-import sys
-

 class TFDSDataSource:
     def __init__(self, ds, sort):
@@ -229,12 +228,16 @@ def get_interleaved_dataloaders(world_size, rank, config, use_cuda, use_ray):
                 split_configs = config[f"{split}_dataset"][config["dataset"]][type_]["samples"][sample]["splits"]
                 print("split_configs", split_configs)

+                nevents = None
+                if config[f"n{split}"] is not None:
+                    nevents = config[f"n{split}"] // len(split_configs)
+
                 for split_config in split_configs:
                     ds = PFDataset(
                         config["data_dir"],
                         f"{sample}/{split_config}:{version}",
                         split,
-                        num_samples=config[f"n{split}"],
+                        num_samples=nevents,
                         sort=config["sort_data"],
                     ).ds
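For reference, the omega-to-pT conversion used in track_pt() above can be checked by
hand (a minimal sketch; the curvature value is illustrative, assuming omega is given
in 1/mm and B = 4 T as in clicRec_e4h_input):

    import numpy as np

    def track_pt(omega, b_tesla=4.0):
        # 3e-4 is 0.3 GeV/(T*m), converted to millimeters
        return 3e-4 * np.abs(b_tesla / omega)

    print(track_pt(1.2e-4))  # 10.0 GeV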
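The PDG coarse-graining helpers in postprocessing.py reduce the truth labels to the
handful of classes used downstream; a usage sketch, assuming the module is importable:

    # anything charged that is not an e/mu/gamma becomes a charged hadron
    assert map_pdgid_to_candid(321, +1) == 211   # K+ -> charged hadron
    assert map_pdgid_to_candid(2112, 0) == 130   # neutron -> neutral hadron
    # track-matched targets are forced charged, cluster-matched ones neutral
    assert map_neutral_to_charged(130) == 211
    assert map_charged_to_neutral(11) == 22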
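The genparticle-to-cluster association is a composition of two sparse matrices, as in
get_genparticles_and_adjacencies(); a self-contained sketch with made-up indices and
weights:

    import numpy as np
    from scipy.sparse import coo_matrix

    n_gp, n_hit, n_cl = 2, 3, 2
    # genparticle 0 deposits 1.0 and 2.0 in hits 0 and 1; genparticle 1 puts 0.5 in hit 2
    gp_to_hit = coo_matrix(([1.0, 2.0, 0.5], ([0, 0, 1], [0, 1, 2])), shape=(n_gp, n_hit))
    # hits 0 and 1 belong to cluster 0, hit 2 to cluster 1 (unit weights)
    hit_to_cl = coo_matrix(([1.0, 1.0, 1.0], ([0, 1, 2], [0, 0, 1])), shape=(n_hit, n_cl))
    # per-genparticle energy collected in each cluster
    print((gp_to_hit @ hit_to_cl).todense())  # [[3.0, 0.0], [0.0, 0.5]]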
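The radiusOfInnermostHit reconstruction for fcc picks, among the track states of one
track, the state with location == 2 (AtFirstHit in EDM4hep) and takes the transverse
radius of its reference point; a sketch with illustrative numbers:

    import numpy as np

    location = np.array([1, 2, 3, 4])   # e.g. AtIP, AtFirstHit, AtLastHit, AtCalorimeter
    refX = np.array([0.0, 31.0, 800.0, 2000.0])
    refY = np.array([0.0, 4.0, 100.0, 300.0])
    istate = np.argmax(location == 2)   # first state flagged AtFirstHit
    print(np.hypot(refX[istate], refY[istate]))  # ~31.26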
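Jet clustering in compute_jets() uses the generalized e+e- kT algorithm with R = 0.4
and p = -1 (anti-kT-like), keeping jets above 5 GeV; a minimal sketch with made-up
momenta, assuming the scikit-hep fastjet, awkward and vector packages:

    import awkward, fastjet, vector

    particles_p4 = vector.awk(
        awkward.zip(
            {
                "px": [[10.0, 9.0, -8.0]],
                "py": [[0.0, 1.0, 0.0]],
                "pz": [[0.0, 0.5, 1.0]],
                "energy": [[10.0, 9.1, 8.1]],
            }
        )
    )
    jetdef = fastjet.JetDefinition(fastjet.ee_genkt_algorithm, 0.4, -1.0)
    cluster = fastjet.ClusterSequence(particles_p4, jetdef)
    jets = cluster.inclusive_jets(min_pt=5)        # two jets expected here
    indices = cluster.constituent_index(min_pt=5)  # used to fill jet_idx per particle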
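The BUILDER_CONFIGS in the heptfds builders parallelize dataset building: config "k"
(1-based) processes the k-th shard of the train and test file lists produced by
split_list(); note that all remainder files land in the last shard. A sketch with
hypothetical file names:

    files = ["chunk{}.parquet".format(i) for i in range(25)]
    num_splits = 10
    sub = len(files) // num_splits  # 2
    shards = [files[i * sub:(i + 1) * sub] for i in range(num_splits - 1)]
    shards.append(files[(num_splits - 1) * sub:])
    assert [len(s) for s in shards] == [2] * 9 + [7]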
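Finally, the PFDataset change divides a requested event budget evenly across the
split configs of a sample, so the per-shard caps sum to the configured total; e.g.
with a hypothetical ntrain of 1000 and 10 splits:

    ntrain = 1000  # stands in for config["ntrain"]
    split_configs = [str(i) for i in range(1, 11)]
    nevents = ntrain // len(split_configs)
    assert nevents == 100  # each PFDataset shard is capped at 100 events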