From 6a433c73a1975215fc1769365fad79c06c207209 Mon Sep 17 00:00:00 2001 From: rkansal47 Date: Fri, 31 May 2024 17:30:16 -0500 Subject: [PATCH] john arc results --- paper/latex_tables.ipynb | 70 ++++- paper/limit_plots.ipynb | 259 ++++++++++++++++ paper/tables/nonres_lpsfs.tex | 9 + .../postprocessing/InferenceAnalysis.ipynb | 90 +++--- src/HHbbVV/postprocessing/PostProcess.ipynb | 279 ++++++++---------- src/HHbbVV/postprocessing/TrainBDT.py | 3 + src/HHbbVV/postprocessing/plotting.py | 32 +- src/HHbbVV/postprocessing/postprocessing.py | 2 + 8 files changed, 541 insertions(+), 203 deletions(-) create mode 100644 paper/limit_plots.ipynb create mode 100644 paper/tables/nonres_lpsfs.tex diff --git a/paper/latex_tables.ipynb b/paper/latex_tables.ipynb index e4f5a55d..875c2b14 100644 --- a/paper/latex_tables.ipynb +++ b/paper/latex_tables.ipynb @@ -2,14 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import yaml\n", "import json\n", "import numpy as np\n", - "from copy import deepcopy" + "import pandas as pd\n", + "from copy import deepcopy\n", + "from pathlib import Path" ] }, { @@ -343,12 +345,72 @@ " f.writelines(lines)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lund plane SFs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nonresonant" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "templates_dir = Path(\"../src/HHbbVV/postprocessing/templates/24Apr26NonresBDT995AllSigs\")\n", + "dfs = {\n", + " \"ggF\": pd.read_csv(templates_dir / \"lpsfs_passggf.csv\").to_numpy(),\n", + " \"VBF\": pd.read_csv(templates_dir / \"lpsfs_passvbf.csv\").to_numpy(),\n", + "}\n", + "\n", + "sig_map = {\n", + " \"HHbbVV\": r\"SM ggF \\HH\",\n", + " \"VBFHHbbVV\": r\"SM VBF \\HH\",\n", + " \"qqHH_CV_1_C2V_0_kl_1_HHbbVV\": r\"VBF \\HH ($\\kapvv = 0$)\",\n", + " \"qqHH_CV_1_C2V_2_kl_1_HHbbVV\": r\"VBF \\HH ($\\kapvv = 2$)\",\n", + "}\n", + "\n", + "df = dfs[\"ggF\"]\n", + "\n", + "lines = []\n", + "\n", + "for j, (region, df) in enumerate(dfs.items()):\n", + " sigs = df[:, 0]\n", + " for i, (sig, siglabel) in enumerate(sig_map.items()):\n", + " if i == 0:\n", + " region_label = rf\"\\multirow{{{len(sig_map)}}}{{*}}{{{region}}}\"\n", + " else:\n", + " region_label = \"\"\n", + "\n", + " line = [region_label, siglabel]\n", + " sigidx = np.where(sigs == sig)[0][0]\n", + " row = df[sigidx, 1:]\n", + "\n", + " sf = row[0].split(\" ± \")\n", + " line.append(rf\"${sf[0]} \\pm {sf[1]}$\")\n", + "\n", + " for i in range(1, len(row)):\n", + " line.append(f\"{row[i]:.2f}\")\n", + "\n", + " lines.append(\" & \".join(line) + r\" \\\\\" + \"\\n\")\n", + "\n", + " if j != len(dfs) - 1:\n", + " lines.append(r\"\\hline\" + \"\\n\")\n", + "\n", + "# remove trailing \"\\\\\" and \"\\n\"\n", + "lines[-1] = lines[-1][:-4]\n", + "\n", + "with Path(\"tables/nonres_lpsfs.tex\").open(\"w\") as f:\n", + " f.writelines(lines)" + ] } ], "metadata": { diff --git a/paper/limit_plots.ipynb b/paper/limit_plots.ipynb new file mode 100644 index 00000000..036621a5 --- /dev/null +++ b/paper/limit_plots.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as mticker\n", + "import mplhep as hep\n", + "\n", + "plt.style.use(hep.style.CMS)\n", + "hep.style.use(\"CMS\")\n", + "formatter = mticker.ScalarFormatter(useMathText=True)\n", + "formatter.set_powerlimits((-3, 3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as patches\n", + "import mplhep as hep\n", + "\n", + "# Apply the CMS style\n", + "plt.style.use(hep.style.CMS)\n", + "\n", + "# Define the observed and expected data for the first set (from the first PDF)\n", + "observed_value_1 = 142\n", + "expected_median_1 = 69\n", + "expected_68_low_1, expected_68_high_1 = 50, 80 # 68% confidence interval\n", + "expected_95_low_1, expected_95_high_1 = 40, 100 # 95% confidence interval\n", + "\n", + "# Define the observed and expected data for the second set (from the second PDF)\n", + "observed_value_2 = 1.1\n", + "expected_median_2 = 0.9\n", + "expected_68_low_2, expected_68_high_2 = 0.7, 1.1 # 68% confidence interval\n", + "expected_95_low_2, expected_95_high_2 = 0.5, 1.3 # 95% confidence interval\n", + "\n", + "# Create the figure with custom dimensions\n", + "fig, ax = plt.subplots(figsize=(10, 6)) # Adjust the dimensions as needed\n", + "\n", + "# Plot for the first set\n", + "# Add rectangles for the confidence intervals\n", + "rect_95_1 = patches.Rectangle(\n", + " (expected_95_low_1, 0.7),\n", + " expected_95_high_1 - expected_95_low_1,\n", + " 0.2,\n", + " linewidth=0,\n", + " edgecolor=\"none\",\n", + " facecolor=\"yellow\",\n", + " alpha=0.5,\n", + ")\n", + "ax.add_patch(rect_95_1)\n", + "rect_68_1 = patches.Rectangle(\n", + " (expected_68_low_1, 0.7),\n", + " expected_68_high_1 - expected_68_low_1,\n", + " 0.2,\n", + " linewidth=0,\n", + " edgecolor=\"none\",\n", + " facecolor=\"green\",\n", + " alpha=0.5,\n", + ")\n", + "ax.add_patch(rect_68_1)\n", + "ax.plot([expected_median_1, expected_median_1], [0.7, 0.9], \"k--\")\n", + "ax.plot([observed_value_1, observed_value_1], [0.7, 0.9], \"k-\", linewidth=2)\n", + "\n", + "# Plot for the second set\n", + "# Add rectangles for the confidence intervals\n", + "rect_95_2 = patches.Rectangle(\n", + " (expected_95_low_2, 0.4),\n", + " expected_95_high_2 - expected_95_low_2,\n", + " 0.2,\n", + " linewidth=0,\n", + " edgecolor=\"none\",\n", + " facecolor=\"yellow\",\n", + " alpha=0.5,\n", + ")\n", + "ax.add_patch(rect_95_2)\n", + "rect_68_2 = patches.Rectangle(\n", + " (expected_68_low_2, 0.4),\n", + " expected_68_high_2 - expected_68_low_2,\n", + " 0.2,\n", + " linewidth=0,\n", + " edgecolor=\"none\",\n", + " facecolor=\"green\",\n", + " alpha=0.5,\n", + ")\n", + "ax.add_patch(rect_68_2)\n", + "ax.plot([expected_median_2, expected_median_2], [0.4, 0.6], \"k--\")\n", + "ax.plot([observed_value_2, observed_value_2], [0.4, 0.6], \"k-\", linewidth=2)\n", + "\n", + "# Set the x and y axis labels\n", + "ax.set_xlabel(r\"95% CL limit on $\\sigma(pp \\rightarrow HH) / \\sigma$\")\n", + "ax.set_yticks([0.8, 0.5])\n", + "ax.set_yticklabels(\n", + " [\n", + " r\"$\\kappa_{\\lambda} = 1, \\kappa_{t} = 2, \\kappa_{V} = 1$\",\n", + " r\"$\\kappa_{\\lambda} = 1, \\kappa_{t} = 1, \\kappa_{V} = 0$\",\n", + " ]\n", + ")\n", + "\n", + "# Set the title\n", + "ax.set_title(\"CMS Work in Progress\")\n", + "\n", + "# Set x-axis to logarithmic scale\n", + "ax.set_xscale(\"log\")\n", + "\n", + "# Add a legend in the top right without the limit values\n", + "legend_elements = [\n", + " patches.Patch(color=\"green\", alpha=0.5, label=\"68% expected\"),\n", + " patches.Patch(color=\"yellow\", alpha=0.5, label=\"95% expected\"),\n", + " plt.Line2D([0], [0], color=\"k\", linestyle=\"--\", label=\"Median expected\"),\n", + " plt.Line2D([0], [0], color=\"k\", linewidth=2, label=\"Observed\"),\n", + "]\n", + "ax.legend(handles=legend_elements, loc=\"upper right\")\n", + "\n", + "# Set x-axis limits\n", + "ax.set_xlim(0.1, 200)\n", + "\n", + "# Use scientific notation for the x-axis\n", + "ax.xaxis.set_major_formatter(plt.ScalarFormatter(useMathText=True))\n", + "ax.ticklabel_format(style=\"sci\", axis=\"x\", scilimits=(0, 0))\n", + "\n", + "# Show grid\n", + "ax.grid(True, axis=\"x\")\n", + "\n", + "# Apply CMS label with `mplhep`\n", + "hep.cms.label(ax=ax, data=True, lumi=138, com=13)\n", + "\n", + "# Adjust layout\n", + "plt.tight_layout()\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as patches\n", + "\n", + "# Define the observed and expected data\n", + "observed_value = 142\n", + "expected_median = 69\n", + "expected_68_low, expected_68_high = 50, 80 # 68% confidence interval\n", + "expected_95_low, expected_95_high = 40, 100 # 95% confidence interval\n", + "\n", + "# Create the plot with custom dimensions\n", + "fig, ax = plt.subplots(figsize=(8, 2)) # Adjust the dimensions as needed\n", + "\n", + "# Add rectangles for the confidence intervals\n", + "# 95% confidence interval\n", + "rect_95 = patches.Rectangle(\n", + " (expected_95_low, -0.1),\n", + " expected_95_high - expected_95_low,\n", + " 0.2,\n", + " linewidth=0,\n", + " edgecolor=\"none\",\n", + " facecolor=\"yellow\",\n", + " alpha=0.5,\n", + " label=\"95% expected\",\n", + ")\n", + "ax.add_patch(rect_95)\n", + "\n", + "# 68% confidence interval\n", + "rect_68 = patches.Rectangle(\n", + " (expected_68_low, -0.1),\n", + " expected_68_high - expected_68_low,\n", + " 0.2,\n", + " linewidth=0,\n", + " edgecolor=\"none\",\n", + " facecolor=\"green\",\n", + " alpha=0.5,\n", + " label=\"68% expected\",\n", + ")\n", + "ax.add_patch(rect_68)\n", + "\n", + "# Plot the expected median line\n", + "ax.plot([expected_median, expected_median], [-0.1, 0.1], \"k--\", label=\"Median expected: 69\")\n", + "\n", + "# Plot the observed value as a solid black line\n", + "ax.plot([observed_value, observed_value], [-0.1, 0.1], \"k-\", linewidth=2, label=\"Observed: 142\")\n", + "\n", + "# Set the x and y axis labels\n", + "ax.set_xlabel(r\"95% CL limit on $\\sigma(pp \\rightarrow HH) / \\sigma$\")\n", + "ax.set_yticks([]) # Remove y-axis ticks\n", + "\n", + "# Set the title\n", + "ax.set_title(\"CMS Work in Progress\")\n", + "\n", + "# Add a legend\n", + "ax.legend()\n", + "\n", + "# Set x-axis limits\n", + "ax.set_xlim(0, 150) # Adjust based on the range of your data\n", + "ax.set_ylim(-0.2, 0.3) # Adjust y-axis to provide space for the kappa values\n", + "\n", + "# Use scientific notation for the x-axis\n", + "ax.xaxis.set_major_formatter(plt.ScalarFormatter(useMathText=True))\n", + "ax.ticklabel_format(style=\"sci\", axis=\"x\", scilimits=(0, 0))\n", + "\n", + "# Add kappa values above the observed limit within the figure box\n", + "kappa_values = r\"$\\kappa_{\\lambda} = 1, \\kappa_{t} = 2, \\kappa_{V} = 1$\"\n", + "plt.text(\n", + " observed_value,\n", + " 0.2,\n", + " kappa_values,\n", + " fontsize=12,\n", + " horizontalalignment=\"center\",\n", + " verticalalignment=\"bottom\",\n", + ")\n", + "\n", + "# Show grid\n", + "ax.grid(True, axis=\"x\")\n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python310", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/paper/tables/nonres_lpsfs.tex b/paper/tables/nonres_lpsfs.tex new file mode 100644 index 00000000..71e813dd --- /dev/null +++ b/paper/tables/nonres_lpsfs.tex @@ -0,0 +1,9 @@ +\multirow{4}{*}{ggF} & SM ggF \HH & $1.05 \pm 0.24$ & 0.16 & 0.05 & 0.00 & 0.16 \\ + & SM VBF \HH & $1.17 \pm 0.45$ & 0.35 & 0.05 & 0.00 & 0.16 \\ + & VBF \HH ($\kapvv = 0$) & $1.09 \pm 0.18$ & 0.02 & 0.04 & 0.01 & 0.15 \\ + & VBF \HH ($\kapvv = 2$) & $1.10 \pm 0.18$ & 0.02 & 0.05 & 0.01 & 0.15 \\ +\midrule +\multirow{4}{*}{VBF} & SM ggF \HH & $0.95 \pm 0.28$ & 0.26 & 0.08 & 0.01 & 0.12 \\ + & SM VBF \HH & $1.08 \pm 0.46$ & 0.38 & 0.05 & 0.01 & 0.19 \\ + & VBF \HH ($\kapvv = 0$) & $0.93 \pm 0.27$ & 0.16 & 0.06 & 0.02 & 0.23 \\ + & VBF \HH ($\kapvv = 2$) & $0.94 \pm 0.27$ & 0.16 & 0.05 & 0.02 & 0.23 diff --git a/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb b/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb index 8b8918ff..5b874918 100644 --- a/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb +++ b/src/HHbbVV/postprocessing/InferenceAnalysis.ipynb @@ -58,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_dir = MAIN_DIR / \"plots/TaggerAnalysis/24May20\"\n", + "plot_dir = MAIN_DIR / \"plots/TaggerAnalysis/24May29\"\n", "# plot_dir = MAIN_DIR / \"plots/BDT/24Apr9\"\n", "plot_dir.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -315,12 +315,14 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.metrics import roc_curve, auc, integrate\n", + "from sklearn.metrics import roc_curve, auc\n", + "from scipy import integrate\n", "\n", "rocs = {}\n", "# sig_key = \"HHbbVV\"\n", - "bg_keys = [\"TT\", \"QCD\"]\n", + "tot_bg_keys = [\"TT\", \"QCD\"]\n", "bg_skip = 1\n", + "weight_key = \"finalWeight\"\n", "\n", "\n", "for cutstr in cut_labels:\n", @@ -329,42 +331,47 @@ " # for sig_key in tqdm(nonres_sig_keys + res_sig_keys):\n", " for sig_key in tqdm(nonres_sig_keys):\n", " rocs[cutstr][sig_key] = {}\n", - " sig_cut = cuts_dict[sig_key][cutstr]\n", - " bg_cuts = [cuts_dict[bg_key][cutstr] for bg_key in bg_keys]\n", - "\n", - " y_true = np.concatenate(\n", - " [\n", - " np.ones(np.sum(sig_cut)),\n", - " np.zeros(int(np.ceil(np.sum(np.concatenate(bg_cuts)) / bg_skip))),\n", - " ]\n", - " )\n", - " # print(y_true[np.sum(sig_cut):])\n", - "\n", - " weights = np.concatenate(\n", - " [events_dict[sig_key][\"weight\"][sig_cut]]\n", - " + [\n", - " events_dict[bg_key][\"weight\"][bg_cut][::bg_skip]\n", - " for bg_key, bg_cut in zip(bg_keys, bg_cuts)\n", - " ],\n", - " )\n", + " # sig_cut = cuts_dict[sig_key][cutstr]\n", + " for bg_label, bg_keys in (\n", + " {\"Combined\": tot_bg_keys} | {bg_key: [bg_key] for bg_key in tot_bg_keys}\n", + " ).items():\n", + " rocs[cutstr][sig_key][bg_label] = {}\n", + " # bg_cuts = [cuts_dict[bg_key][cutstr] for bg_key in bg_keys]\n", + "\n", + " y_true = np.concatenate(\n", + " [\n", + " np.ones(len(events_dict[sig_key])),\n", + " np.zeros(\n", + " int(\n", + " np.ceil(\n", + " np.sum([len(events_dict[bg_key]) for bg_key in bg_keys]) / bg_skip\n", + " )\n", + " )\n", + " ),\n", + " ]\n", + " )\n", + " # print(y_true[np.sum(sig_cut):])\n", "\n", - " for t, pvars in plot_vars.items():\n", - " score_label = pvars[\"score_label\"]\n", - " scores = np.concatenate(\n", - " [events_dict[sig_key][score_label][sig_cut]]\n", - " + [\n", - " events_dict[bg_key][score_label][bg_cut][::bg_skip]\n", - " for bg_key, bg_cut in zip(bg_keys, bg_cuts)\n", - " ],\n", + " weights = np.concatenate(\n", + " [events_dict[sig_key][weight_key]]\n", + " + [events_dict[bg_key][weight_key][::bg_skip] for bg_key in bg_keys],\n", " )\n", - " # print(scores[np.sum(sig_cut):])\n", - " fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)\n", - " rocs[cutstr][sig_key][t] = {\n", - " \"fpr\": fpr,\n", - " \"tpr\": tpr,\n", - " \"thresholds\": thresholds,\n", - " # \"auc\": auc(fpr, tpr),\n", - " }" + "\n", + " for t, pvars in plot_vars.items():\n", + " score_label = pvars[\"score_label\"]\n", + " scores = np.concatenate(\n", + " [events_dict[sig_key][score_label]]\n", + " + [events_dict[bg_key][score_label][::bg_skip] for bg_key in bg_keys],\n", + " )\n", + " # print(scores[np.sum(sig_cut):])\n", + " fpr, tpr, thresholds = roc_curve(y_true, scores, sample_weight=weights)\n", + " rocs[cutstr][sig_key][bg_label][t] = {\n", + " \"fpr\": fpr,\n", + " \"tpr\": tpr,\n", + " \"thresholds\": thresholds,\n", + " \"auc\": integrate.trapz(tpr, fpr),\n", + " \"label\": bg_label,\n", + " }" ] }, { @@ -417,10 +424,11 @@ "metadata": {}, "outputs": [], "source": [ - "roc = rocs[cutstr][sig_key][t]\n", - "roc_auc = integrate.trapz(y=roc[\"tpr\"], x=roc[\"fpr\"])\n", - "print(\"AUC:\", roc_auc)\n", - "plotting.rocCurve(roc[\"fpr\"], roc[\"tpr\"], show=True, plot_dir=plot_dir, name=\"THVV\", auc=roc_auc)" + "roc = rocs[cutstr][sig_key][\"Combined\"][t]\n", + "plotting.rocCurve(roc[\"fpr\"], roc[\"tpr\"], show=True, plot_dir=plot_dir, name=\"THVV\", auc=roc[\"auc\"])\n", + "\n", + "# bg_rocs = {key: val[t] for key, val in rocs[cutstr][sig_key].items()}\n", + "# plotting.multiROCCurveGrey({\"all\": bg_rocs}, [], xlim=[0, 0.8], show=True, plot_dir=plot_dir, name=\"THVV_sep_bgs\")" ] }, { diff --git a/src/HHbbVV/postprocessing/PostProcess.ipynb b/src/HHbbVV/postprocessing/PostProcess.ipynb index a34b3d17..b8cda9bb 100644 --- a/src/HHbbVV/postprocessing/PostProcess.ipynb +++ b/src/HHbbVV/postprocessing/PostProcess.ipynb @@ -84,8 +84,8 @@ "year = \"2018\"\n", "bdt_preds_dir = samples_dir / \"24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/inferences\"\n", "\n", - "date = \"24May16\"\n", - "plot_dir = MAIN_DIR / f\"plots/PostProcessing/{date}/\"\n", + "date = \"24May31\"\n", + "plot_dir = MAIN_DIR / f\"plots/PostProcessing/{date}LPSFs\"\n", "templates_dir = f\"templates/{date}\"\n", "_ = os.system(f\"mkdir -p {plot_dir}\")\n", "_ = os.system(f\"mkdir -p {plot_dir}/cutflows/\")\n", @@ -116,7 +116,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -182,23 +181,6 @@ "## Control plots" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sel, _ = utils.make_selection(\n", - " {\n", - " \"bbFatJetPt\": [450, CUT_MAX_VAL],\n", - " \"VVFatJetPt\": [450, CUT_MAX_VAL],\n", - " \"vbf_Mass_jj\": [500, CUT_MAX_VAL],\n", - " },\n", - " events_dict,\n", - " bb_masks,\n", - ")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -240,14 +222,14 @@ " # ShapeVar(var=\"nGoodMuons\", label=r\"# of Muons\", bins=[3, 0, 3]),\n", " # ShapeVar(var=\"nGoodElectrons\", label=r\"# of Electrons\", bins=[3, 0, 3]),\n", " # ShapeVar(var=\"nGoodJets\", label=r\"# of AK4 B-Jets\", bins=[5, 0, 5]),\n", - " ShapeVar(var=\"VBFJetPt0\", label=r\"Leading VBF-tagged Jet $p_T$\", bins=[20, 20, 300]),\n", - " ShapeVar(var=\"VBFJetPt1\", label=r\"Sub-leading VBF-tagged Jet $p_T$\", bins=[20, 20, 300]),\n", - " ShapeVar(var=\"VBFJetEta0\", label=r\"Leading VBF-tagged Jet $\\eta$\", bins=[9, -4.5, 4.5]),\n", - " ShapeVar(var=\"VBFJetEta1\", label=r\"Sub-leading VBF-tagged Jet $\\eta$\", bins=[9, -4.5, 4.5]),\n", - " ShapeVar(var=\"VBFJetPhi0\", label=r\"Leading VBF-tagged Jet $\\varphi$\", bins=[10, -3, 3]),\n", - " ShapeVar(var=\"VBFJetPhi1\", label=r\"Sub-leading VBF-tagged Jet $\\varphi$\", bins=[10, -3, 3]),\n", - " ShapeVar(var=\"vbf_Mass_jj\", label=r\"$m_{jj}^{VBF}$\", bins=[20, 0, 1000]),\n", - " ShapeVar(var=\"vbf_dEta_jj\", label=r\"$|\\Delta\\eta_{jj}^{VBF}|$\", bins=[20, 0, 6]),\n", + " # ShapeVar(var=\"VBFJetPt0\", label=r\"Leading VBF-tagged Jet $p_T$\", bins=[20, 20, 300]),\n", + " # ShapeVar(var=\"VBFJetPt1\", label=r\"Sub-leading VBF-tagged Jet $p_T$\", bins=[20, 20, 300]),\n", + " # ShapeVar(var=\"VBFJetEta0\", label=r\"Leading VBF-tagged Jet $\\eta$\", bins=[9, -4.5, 4.5]),\n", + " # ShapeVar(var=\"VBFJetEta1\", label=r\"Sub-leading VBF-tagged Jet $\\eta$\", bins=[9, -4.5, 4.5]),\n", + " # ShapeVar(var=\"VBFJetPhi0\", label=r\"Leading VBF-tagged Jet $\\varphi$\", bins=[10, -3, 3]),\n", + " # ShapeVar(var=\"VBFJetPhi1\", label=r\"Sub-leading VBF-tagged Jet $\\varphi$\", bins=[10, -3, 3]),\n", + " # ShapeVar(var=\"vbf_Mass_jj\", label=r\"$m_{jj}^{VBF}$\", bins=[20, 0, 1000]),\n", + " # ShapeVar(var=\"vbf_dEta_jj\", label=r\"$|\\Delta\\eta_{jj}^{VBF}|$\", bins=[20, 0, 6]),\n", " # ShapeVar(var=\"BDTScore\", label=r\"BDT Score\", bins=[50, 0, 1]),\n", "]\n", "\n", @@ -366,100 +348,58 @@ "postprocessing.plot_bdt_sculpting(events_dict, bb_masks, plot_dir, year, show=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check tagger mass sculpting" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "cuts = [0, 0.1, 0.5, 0.9, 0.95]\n", - "# bdtvars = [\"\", \"TT\", \"VJets\"]\n", - "bdtvars = [\"\"]\n", - "plot_keys = [\"TT\"]\n", + "weight_key = \"finalWeight\"\n", + "show = True\n", "\n", - "shape_var = ShapeVar(\n", - " var=\"bbFatJetParticleNetMass\", label=r\"$m^{bb}_{reg}$ (GeV)\", bins=[20, 50, 250]\n", - ")\n", + "cuts = {\n", + " \"bb\": [0.8, 0.9, 0.95],\n", + " \"VV\": [0, 0.1, 0.5, 0.8],\n", + "}\n", + "plot_keys = [data_key, \"QCD\", \"TT\", \"Z+Jets\", \"HHbbVV\", \"qqHH_CV_1_C2V_0_kl_1_HHbbVV\"]\n", + "# plot_keys = [\"QCD\"]\n", + "\n", + "taggers = {\n", + " \"bb\": (\"bbFatJetParticleNetMD_Txbb\", r\"$T^{bb}_{Xbb}$\"),\n", + " \"VV\": (\"VVFatJetParTMD_THWWvsT\", r\"$T^{VV}_{HWW}$\"),\n", + "}\n", + "\n", + "for jet in [\"bb\", \"VV\"]:\n", + " shape_var = ShapeVar(\n", + " var=f\"{jet}FatJetParticleNetMass\", label=rf\"$m^{{{jet}}}_{{reg}}$ (GeV)\", bins=[20, 50, 250]\n", + " )\n", + "\n", + " cut_var, cut_var_label = taggers[jet]\n", "\n", - "for var in bdtvars:\n", " for key in plot_keys:\n", " ed_key = {key: events_dict[key]}\n", " bbm_key = {key: bb_masks[key]}\n", "\n", - " fig, ax = plt.subplots(1, 1, figsize=(12, 12))\n", - " plt.rcParams.update({\"font.size\": 24})\n", - "\n", - " for i, cut in enumerate(cuts):\n", - " sel, _ = utils.make_selection({f\"BDTScore{var}\": [cut, CUT_MAX_VAL]}, ed_key, bbm_key)\n", - " h = utils.singleVarHist(ed_key, shape_var, bbm_key, selection=sel)\n", - "\n", - " hep.histplot(\n", - " h[key, ...] / np.sum(h[key, ...].values()),\n", - " yerr=True,\n", - " label=f\"BDTScore >= {cut}\",\n", - " # density=True,\n", - " ax=ax,\n", - " linewidth=2,\n", - " alpha=0.8,\n", - " )\n", - "\n", - " ax.set_xlabel(shape_var.label)\n", - " ax.set_ylabel(\"Fraction of Events\")\n", - " ax.legend()\n", - "\n", - " hep.cms.label(ax=ax, data=False, year=year, lumi=round(LUMI[year] / 1e3))\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cuts = [0.01, 0.1, 0.5, 0.9, 0.99]\n", - "# bdtvars = [\"\", \"TT\", \"VJets\"]\n", - "bdtvars = [\"\"]\n", - "sig_scales = [1e5, 5e4, 2e4, 1e4, 2e3]\n", - "\n", - "# for ttcut in [0.01, 0.1, 0.5, 0.9, 0.99]:\n", - "# ttsel, _ = utils.make_selection({\"BDTScoreTT\": [ttcut, CUT_MAX_VAL]}, events_dict, bb_masks)\n", - "# cutstr = f\"tt{ttcut}\"\n", - "\n", - "# hists = postprocessing.control_plots(\n", - "# events_dict,\n", - "# bb_masks,\n", - "# nonres_sig_keys,\n", - "# control_plot_vars,\n", - "# f\"{plot_dir}/ControlPlots/{year}/\",\n", - "# year,\n", - "# hists={},\n", - "# bg_keys=[\"QCD\", \"TT\", \"ST\", \"V+Jets\", \"Diboson\"],\n", - "# selection=ttsel,\n", - "# cutstr=cutstr,\n", - "# show=True,\n", - "# )\n", - "\n", - "for var in bdtvars:\n", - " for i, cut in enumerate(cuts):\n", - " sel, _ = utils.make_selection({f\"BDTScore{var}\": [cut, CUT_MAX_VAL]}, events_dict, bb_masks)\n", - " cutstr = f\"bdt{var}{cut}\"\n", - " sig_scale = sig_scales[i]\n", - "\n", - " hists = postprocessing.control_plots(\n", - " events_dict,\n", - " bb_masks,\n", - " nonres_sig_keys,\n", - " control_plot_vars,\n", - " f\"{plot_dir}/ControlPlots/{year}/\",\n", + " plotting.cutsLinePlot(\n", + " ed_key,\n", + " shape_var,\n", + " key,\n", + " cut_var,\n", + " cut_var_label,\n", + " cuts[jet],\n", " year,\n", - " hists={},\n", - " bg_keys=[\"QCD\", \"TT\", \"ST\", \"V+Jets\", \"Diboson\"],\n", - " selection=sel,\n", - " cutstr=cutstr,\n", - " sig_scale_dict={\"HHbbVV\": sig_scale},\n", - " combine_pdf=False,\n", - " show=True,\n", + " weight_key,\n", + " bb_masks=bbm_key,\n", + " plot_dir=plot_dir,\n", + " name=f\"{year}_{cut_var}Cuts_{shape_var.var}_{key}\",\n", + " show=show,\n", " )" ] }, @@ -468,16 +408,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Overall BDT SF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "nonres_sig_keys" + "## Get Lund plane weights" ] }, { @@ -487,50 +418,96 @@ "outputs": [], "source": [ "postprocessing.lpsfs(\n", + " list(nonres_samples.keys()),\n", + " selection_regions[\"lpsf_passggf\"],\n", + " systematics,\n", " events_dict,\n", " bb_masks,\n", - " nonres_sig_keys,\n", - " nonres_samples,\n", - " cutflow,\n", - " selection_regions[\"lpsf\"],\n", - " systematics,\n", " all_years=False,\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check distributions with and without LP weights" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "events = events_dict[\"HHbbVV\"]\n", - "bb_mask = bb_masks[\"HHbbVV\"]\n", - "weight = events[\"finalWeight\"].values.squeeze()\n", - "weight_lp = weight * events[\"VV_lp_sf_nom\"].values.squeeze()\n", - "weight_lp_sys_up = weight * events[\"VV_lp_sf_sys_up\"].values.squeeze()\n", - "weight_lp_sys_down = weight * events[\"VV_lp_sf_sys_down\"].values.squeeze()\n", - "\n", - "plt.hist(\n", - " utils.get_feat(events, \"bbFatJetPt\", bb_mask),\n", - " np.linspace(250, 1200, 31),\n", - " weights=weight,\n", - " histtype=\"step\",\n", - " label=\"Pre-LP\",\n", - ")\n", - "plt.hist(\n", - " utils.get_feat(events, \"bbFatJetPt\", bb_mask),\n", - " np.linspace(250, 1200, 31),\n", - " weights=weight_lp,\n", - " histtype=\"step\",\n", - " label=\"Post-LP\",\n", - ")\n", - "plt.title(\"2018 HHbbVV\")\n", - "plt.xlabel(r\"$p_T^{VV}$ (GeV)\")\n", - "plt.ylabel(\"Events\")\n", - "# plt.hist(utils.get_feat(events, \"VVFatJetPt\", bb_mask), np.linspace(250, 2000, 31), weights=weight_lp_sys_up, histtype=\"step\", label=\"Post-LP Sys Up\")\n", - "# plt.hist(utils.get_feat(events, \"VVFatJetPt\", bb_mask), np.linspace(250, 2000, 31), weights=weight_lp_sys_down, histtype=\"step\", label=\"Post-LP Sys Down\")\n", - "plt.legend()" + "plt.rcParams.update({\"font.size\": 24})\n", + "\n", + "control_plot_vars = [\n", + " # ShapeVar(\n", + " # var=\"bbFatJetPt\", label=r\"$p^{bb}_T$ (GeV)\", bins=[20, 300, 2300], significance_dir=\"right\"\n", + " # ),\n", + " # ShapeVar(var=\"bbFatJetParticleNetMass\", label=r\"$m^{bb}_{reg}$ (GeV)\", bins=[20, 50, 250]),\n", + " ShapeVar(var=\"BDTScore\", label=r\"BDT Score\", bins=[20, 0, 1]),\n", + "]\n", + "\n", + "\n", + "for sig_key in nonres_samples.keys():\n", + " events = events_dict[sig_key]\n", + " bb_mask = bb_masks[sig_key]\n", + " weight = events[\"finalWeight\"].values.squeeze()\n", + " weight_lp = weight * events[\"VV_lp_sf_nom\"].values.squeeze()\n", + " weight_lp_sys_up = weight * events[\"VV_lp_sf_sys_up\"].values.squeeze()\n", + " weight_lp_sys_down = weight * events[\"VV_lp_sf_sys_down\"].values.squeeze()\n", + "\n", + " for shape_var in control_plot_vars:\n", + " h = Hist(\n", + " hist.axis.StrCategory([\"Pre-LP\", \"Post-LP\"], name=\"lptype\"),\n", + " shape_var.axis,\n", + " storage=\"weight\",\n", + " )\n", + "\n", + " h.fill(\n", + " **{\n", + " \"lptype\": \"Pre-LP\",\n", + " shape_var.var: utils.get_feat(events, shape_var.var, bb_mask),\n", + " \"weight\": weight,\n", + " }\n", + " )\n", + "\n", + " h.fill(\n", + " **{\n", + " \"lptype\": \"Post-LP\",\n", + " shape_var.var: utils.get_feat(events, shape_var.var, bb_mask),\n", + " \"weight\": weight_lp,\n", + " }\n", + " )\n", + "\n", + " for norm in [True, False]:\n", + " fig, ax = plt.subplots(figsize=(10, 10))\n", + "\n", + " for l in [\"Pre-LP\", \"Post-LP\"]:\n", + " plot_hist = (h[l, ...] / h[l, ...].values().sum()) if norm else h[l, ...]\n", + " hep.histplot(\n", + " plot_hist,\n", + " ax=ax,\n", + " histtype=\"step\",\n", + " label=l,\n", + " )\n", + "\n", + " plt.title(plotting.sample_label_map[sig_key], y=1.08)\n", + " plt.xlabel(shape_var.label)\n", + "\n", + " ylabel = \"Normalized Events\" if norm else \"Events\"\n", + " plt.ylabel(ylabel)\n", + " plt.legend()\n", + " hep.cms.label(label=\"Preliminary\", data=False, com=13)\n", + "\n", + " norm_str = \"_norm\" if norm else \"\"\n", + " plt.savefig(\n", + " plot_dir / f\"{year}_{shape_var.var}_{sig_key}_lpsf{norm_str}.pdf\",\n", + " bbox_inches=\"tight\",\n", + " )\n", + " plt.show()" ] }, { diff --git a/src/HHbbVV/postprocessing/TrainBDT.py b/src/HHbbVV/postprocessing/TrainBDT.py index c5ecc9d2..3a5dfd52 100644 --- a/src/HHbbVV/postprocessing/TrainBDT.py +++ b/src/HHbbVV/postprocessing/TrainBDT.py @@ -584,6 +584,7 @@ def plot_mass_shapes(train, test, sig_keys, model_dir, training_keys): shape_var, key, f"BDTScore{sig_key}", + "BDTScore", cuts, year, weight_key, @@ -604,6 +605,7 @@ def plot_mass_shapes(train, test, sig_keys, model_dir, training_keys): shape_var, key, f"BDTScore{sig_key}", + "BDTScore", cuts, "all", weight_key, @@ -618,6 +620,7 @@ def plot_mass_shapes(train, test, sig_keys, model_dir, training_keys): shape_var, key, f"BDTScore{sig_key}", + "BDTScore", cuts, "all", weight_key, diff --git a/src/HHbbVV/postprocessing/plotting.py b/src/HHbbVV/postprocessing/plotting.py index e60d2dd8..35af0256 100644 --- a/src/HHbbVV/postprocessing/plotting.py +++ b/src/HHbbVV/postprocessing/plotting.py @@ -835,7 +835,7 @@ def rocCurve( plt.xlim(*xlim) plt.ylim(*ylim) - hep.cms.label(data=False, rlabel="(13 TeV)") + hep.cms.label(data=False, label="Preliminary", rlabel="(13 TeV)") if len(name): plt.savefig(plot_dir / f"{name}.pdf", bbox_inches="tight") @@ -853,19 +853,35 @@ def _find_nearest(array, value): def multiROCCurveGrey( - rocs: dict, sig_effs: list[float], plot_dir: Path, name: str = "", show: bool = False + rocs: dict, + sig_effs: list[float], + plot_dir: Path, + xlim=None, + ylim=None, + name: str = "", + show: bool = False, ): - xlim = [0, 1] - ylim = [1e-6, 1] + """_summary_ + + Args: + rocs (dict): {label: {sig_key1: roc, sig_key2: roc, ...}, ...} where label is e.g Test or Train + sig_effs (list[float]): plot signal efficiency lines + """ + if ylim is None: + ylim = [1e-06, 1] + if xlim is None: + xlim = [0, 1] line_style = {"colors": "lightgrey", "linestyles": "dashed"} plt.figure(figsize=(12, 12)) for roc_sigs in rocs.values(): for roc in roc_sigs.values(): + auc_label = f" (AUC: {roc['auc']:.2f})" if "auc" in roc else "" + plt.plot( roc["tpr"], roc["fpr"], - label=roc["label"], + label=roc["label"] + auc_label, linewidth=2, ) @@ -874,13 +890,14 @@ def multiROCCurveGrey( plt.hlines(y=y, xmin=0, xmax=sig_eff, **line_style) plt.vlines(x=sig_eff, ymin=0, ymax=y, **line_style) - hep.cms.label(data=False, rlabel="") + hep.cms.label(data=False, label="Preliminary", rlabel="(13 TeV)") plt.yscale("log") plt.xlabel("Signal efficiency") plt.ylabel("Background efficiency") plt.xlim(*xlim) plt.ylim(*ylim) plt.legend(loc="upper left") + plt.grid(which="major") if len(name): plt.savefig(plot_dir / f"{name}.pdf", bbox_inches="tight") @@ -1111,6 +1128,7 @@ def cutsLinePlot( shape_var: utils.ShapeVar, plot_key: str, cut_var: str, + cut_var_label: str, cuts: list[float], year: str, weight_key: str, @@ -1154,7 +1172,7 @@ def cutsLinePlot( hep.histplot( hists[cut], yerr=True, - label=f"BDTScore >= {cut}", + label=f"{cut_var_label} >= {cut}", ax=ax, linewidth=2, alpha=0.8, diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index b9774902..5275dbf5 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -1298,6 +1298,7 @@ def lpsfs( assert bb_masks is not None, "Need bb_masks to calculate LP SFs for single year" events_dict[sig_key] = postprocess_lpsfs(events_dict[sig_key]) + continue for lp_region in lp_selection_regions: rlabel = lp_region.lpsf_region @@ -1533,6 +1534,7 @@ def plot_bdt_sculpting( shape_var, key, f"BDTScore{var}", + r"$BDT_{ggF}$" if var == "" else r"$BDT_{VBF}$", cuts, year, weight_key,