Merge pull request #2241 from NNPDF/new_CMS_Z0J_8TEV
Re-implementation of CMS_Z0J_8TEV in the new format
RoyStegeman authored Jan 7, 2025
2 parents 77eccdd + 84fdd3b commit fb5a75d
Showing 20 changed files with 17,077 additions and 3,383 deletions.
2 changes: 1 addition & 1 deletion conda-recipe/run_test.sh
@@ -5,7 +5,7 @@ set -e

# Python tests for the installed validphys package
# Note that the default tolerance in the conda test is higher than the pip test
-pytest --pyargs --mpl validphys --mpl-default-tolerance 22
+pytest --pyargs --mpl validphys --mpl-default-tolerance 24

platformstr=`uname`

@@ -5,9 +5,9 @@ data_central:
 - 460.5
 - 222.6
 - 109.8
-- 61.84
+- 6.18400000e+01
 - 30.19
-- 13.55
+- 1.35500000e+01
 - 0.6181
 - 9862.0
 - 2863.0
@@ -18,16 +18,16 @@ data_central:
 - 58.13
 - 29.85
 - 13.76
-- 0.6122000000000001
+- 6.12200000e-01
 - 9205.0
 - 2588.0
 - 935.5
 - 416.3
 - 199.0
 - 103.1
-- 54.06
+- 5.40600000e+01
 - 28.45
-- 13.64
+- 1.36400000e+01
 - 0.5521
 - 6893.0
 - 1933.0
@@ -37,10 +37,10 @@
 - 84.62
 - 47.57
 - 24.13
-- 11.3
+- 1.13000000e+01
 - 0.4657
 - 3709.0
-- 1021.0
+- 1.02100000e+03
 - 381.6
 - 176.6
 - 90.49
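The switch of some entries to scientific notation comes from the prettify_float YAML representer registered in the new filter_utils.py below. A rough sketch of such a representer, assuming a rule consistent with this diff; the actual implementation lives in nnpdf_data.filter_utils.utils and may differ in its exact threshold and precision:

import yaml

def prettify_float_sketch(dumper, value):
    # Keep short default representations; rewrite long ones (typically
    # floating-point round-off artifacts) in 8-digit scientific notation.
    if len(repr(value)) > 8:
        return dumper.represent_scalar('tag:yaml.org,2002:float', f'{value:.8e}')
    return dumper.represent_float(value)

yaml.add_representer(float, prettify_float_sketch)
print(yaml.dump([460.5, 0.6122000000000001]), end='')
# - 460.5
# - 6.12200000e-01

Under this reading, entries like 61.84 turning into 6.18400000e+01 simply indicate that the recomputed value carries floating-point round-off that the representer renders at fixed precision.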
17 changes: 17 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_Z0J_8TEV/filter.py
@@ -0,0 +1,17 @@
'''
Filter script for CMS_Z0J_8TEV
'''

import logging
import os

from filter_utils import Extractor

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

current_dir = os.path.dirname(os.path.abspath(__file__))

if __name__ == "__main__":
CMS_Z0J_8TEV = Extractor(f"{current_dir}/metadata.yaml", "PT-Y", mult_factor=1000)
CMS_Z0J_8TEV.generate_data(variant='default')
CMS_Z0J_8TEV.generate_data(variant='sys_10')
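For orientation, here is a minimal sketch of the metadata fields that the Extractor imported above (defined in filter_utils.py below) actually reads. The values shown are hypothetical; the real metadata.yaml carries many more fields:

# Hypothetical minimal metadata; only the keys Extractor reads are shown.
minimal_metadata = {
    'implemented_observables': [
        {
            'observable_name': 'PT-Y',
            'ndata': 50,       # hypothetical; must match the generated bins
            'tables': [1, 2],  # [cross sections, covariance matrix]
            'kinematic_coverage': ['pT', 'abs_eta', 'm_Z2'],
            'kinematics': {'file': 'kinematics.yaml'},
            'data_central': 'data.yaml',
            'data_uncertainties': ['uncertainties.yaml'],
            'variants': {
                'sys_10': {'data_uncertainties': ['uncertainties_sys_10.yaml']}
            },
        }
    ]
}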
276 changes: 276 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/CMS_Z0J_8TEV/filter_utils.py
@@ -0,0 +1,276 @@
import functools
import logging
import os

import numpy as np
import yaml

from nnpdf_data.filter_utils.utils import prettify_float

yaml.add_representer(float, prettify_float)

current_dir = os.path.dirname(os.path.abspath(__file__))

MZ2_LOW = 81.0**2 # GeV2
MZ2_HIGH = 101.0**2 # GeV2
MZ2_MID = (MZ2_LOW + MZ2_HIGH) / 2 # GeV2
CMSLUMI12 = 2.6 # %

ABS_RAP_BINS = [
{'low': 0.0, 'high': 0.4},
{'low': 0.4, 'high': 0.8},
{'low': 0.8, 'high': 1.2},
{'low': 1.2, 'high': 1.6},
{'low': 1.6, 'high': 2.0},
]

STAT_ART_LABEL = 'art_corr_unc'
TABLE_TOKEN = 'Table'


class Extractor:
"""
Extracts kinematics, central data, and uncertainties for a given dataset
"""

def __init__(self, metadata_file, observable, mult_factor=1.0):
"""
Parameters
----------
metadata_file: str
Path to the metadata file
observable: str
The name of the observable for which the data is extracted. The name
must be listed in the metadata file.
mult_factor: float
            Multiplication factor applied to the central data points. This is
            useful to convert the values read from the HepData tables to the
            desired units.
"""

# Open metadata and select process
with open(metadata_file) as file:
metadata = yaml.safe_load(file)
self.metadata = next(
(
md
for md in metadata["implemented_observables"]
if md['observable_name'] == observable
),
None,
)
if self.metadata is None:
raise ValueError(f"{observable} is not listed in the metadata file.")

self.observable = observable
self.mult_factor = mult_factor

@functools.cache
def _retrieve_table(self, table_id):
"""
        Load the HepData table specified by `table_id` from the rawdata folder.

        Parameters
        ----------
        table_id: int
            Index that specifies the table.

        Returns
        -------
        The table specified by `table_id`.
"""
with open(f'{current_dir}/rawdata/{TABLE_TOKEN}{table_id}.yaml') as tab:
tab_dict = yaml.safe_load(tab)
return tab_dict

def _generate_kinematics(self):
"""
        Generate the kinematics from the table of measured cross sections.
        The kinematics are returned as a list of dictionaries with the keys
        'pT', 'abs_eta', and 'M_Z2', taken from the `kinematic_coverage` in
        the metadata. The value of each key is a dictionary with the keys
        'min', 'mid', and 'max'.
"""
logging.info(f"Generating kinematics for CMS_{self.observable}...")

table = self.metadata["tables"][0]
tab_dict = self._retrieve_table(table)

data = tab_dict['independent_variables'][0]
label = self.metadata['kinematic_coverage']
kinematics = []
for rap_bin in ABS_RAP_BINS:
for pT_bin in data['values']:
pT_min = pT_bin['low']
pT_max = pT_bin['high']
abs_eta_low = rap_bin['low']
abs_eta_high = rap_bin['high']
kin_bin = {
label[0]: {'min': pT_min, 'mid': (pT_max + pT_min) / 2, 'max': pT_max},
label[1]: {
'min': abs_eta_low,
'mid': (abs_eta_low + abs_eta_high) / 2,
'max': abs_eta_high,
},
label[2]: {'min': MZ2_LOW, 'mid': MZ2_MID, 'max': MZ2_HIGH},
}
kinematics.append(kin_bin)

        # Check that the number of data points agrees with the metadata
ndata = len(kinematics)
if not self.metadata['ndata'] == ndata:
raise ValueError(
f"Mismatch in 'ndata': expected {self.metadata['ndata']}, but got {ndata}"
)

return kinematics

def _generate_data_and_unc(self):
"""
Returns a list with central data points and a list with corresponding uncertainties.
"""
logging.info(f"Generating central data for CMS_{self.observable}...")
table = self.metadata['tables'][0]
tab_dict = self._retrieve_table(table)
tab_dict = tab_dict['dependent_variables']

# Loop over kinematic bins
dat_central = []
dat_unc = []
for rap_bin in tab_dict:
for pt_bin in rap_bin['values']:
dat_central.append(pt_bin['value'] * self.mult_factor)
dat_unc.append(pt_bin['errors'][0]['symerror'] * self.mult_factor)

return dat_central, dat_unc

def _build_covmat(self):
'''
        Construct the covariance matrix from the list of entries provided in HepData.
'''
ndata = self.metadata['ndata']
table_id = self.metadata['tables'][1]
raw_dict = self._retrieve_table(table_id)

matlist = [val['value'] for val in raw_dict['dependent_variables'][0]['values']]
covmat = np.array(matlist).reshape(ndata, ndata)

if not np.allclose(covmat, covmat.T):
raise ValueError('Covariance matrix is not symmetric.')

return covmat

def _build_unc_definitions(self, variant):
'''
Build the dictionary containing the definitions of the uncertainties to
be used in the uncertainty data file.
Parameters
----------
variant: str
Name of the variant to be implemented.

        Returns
        -------
Dict of dicts containing the specifications of each of the
uncertainties. Each sub-dictionary contains the name of the uncertainty,
its description, the type, and the treatment. The format is the one used
in the commondata.
'''
unc_definitions = {}

# Statistical uncertainties are always the same
for idx in range(self.metadata['ndata']):
unc_definitions[STAT_ART_LABEL + f'_{idx + 1}'] = {
'description': f'Artificial uncertainty {idx + 1}, corresponding to a covmat in eigenvector basis',
'treatment': 'ADD',
'type': 'CORR',
}

# Add lumi uncertainty
unc_definitions['corr_lumi_unc'] = {
'description': 'Luminosity uncertainty 2.6%',
'treatment': 'MULT',
'type': 'CMSLUMI12',
}

if variant == 'sys_10':
unc_definitions['uncorr_mc_unc'] = {
'description': 'MC uncertainty',
'treatment': 'MULT',
'type': 'UNCORR',
}

return unc_definitions

def generate_data(self, variant='default'):
'''
Collect central data, kinematics, and uncertainties and combine them
in the format used in the commondata.
Parameters
        ----------
variant: str
Name of the dataset variant to generate.
'''

# Check if the variant is one of two supported options
if variant not in ['default', 'sys_10']:
raise ValueError(f'The variant {variant} is not implemented.')

# Get central data and kinematics
central_data, _ = self._generate_data_and_unc()
kinematics = self._generate_kinematics()

# Uncertainty definitions
unc_definitions = self._build_unc_definitions(variant=variant)

        # Get statistical uncertainties. They are represented as a covmat
        # rotated into its eigenvector basis, hence the name "artificial
        # uncertainties". The original covmat can be reconstructed as
        # covmat = art_stat @ art_stat.T
covmat = self._build_covmat()
eigvals, eigvecs = np.linalg.eig(covmat)
art_stat = np.sqrt(eigvals) * eigvecs * self.mult_factor

unc_vals = [] # Initialize vector of uncertainties
for data_idx, data in enumerate(central_data):
unc_dict = {}
for unc_idx, unc_type in enumerate(unc_definitions.keys()):
if STAT_ART_LABEL in unc_type:
# Add statistical uncertainties
unc_dict[unc_type] = float(art_stat[data_idx, unc_idx])
elif unc_type == 'corr_lumi_unc':
unc_dict[unc_type] = data * CMSLUMI12 * 0.01
elif unc_type == 'uncorr_mc_unc':
unc_dict[unc_type] = data * 0.01
else:
raise ValueError(f'Uncertainty type {unc_type} is not known.')
unc_vals.append(unc_dict)

# Save kinematics into file
logging.info("Dumping kinematics to file...")
kinematics_yaml = {'bins': kinematics}
kins_file_name = self.metadata['kinematics']['file']
with open(current_dir + '/' + kins_file_name, 'w') as file:
yaml.dump(kinematics_yaml, file, sort_keys=False)
logging.info("Done!")

# Save central data into file
logging.info("Dumping central data to file...")
dat_central_yaml = {'data_central': central_data}
data_file_name = self.metadata['data_central']
with open(current_dir + '/' + data_file_name, 'w') as file:
yaml.dump(dat_central_yaml, file, sort_keys=False)
logging.info("Done!")

        # Save uncertainties to file
logging.info("Dumping uncertainties to file...")
uncertainties_yaml = {'definitions': unc_definitions, 'bins': unc_vals}
unc_file_name = (
self.metadata['data_uncertainties'][0]
if variant == 'default'
else self.metadata['variants'][variant]['data_uncertainties'][0]
)
with open(current_dir + '/' + unc_file_name, 'w') as file:
yaml.dump(uncertainties_yaml, file, sort_keys=False)
logging.info("Done!")
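As a sanity check on the artificial-uncertainty construction in generate_data, a minimal NumPy sketch (a toy matrix, not the CMS covmat, and with mult_factor omitted) showing that the eigendecomposition round-trips:

import numpy as np

# Toy symmetric positive semi-definite covariance matrix.
rng = np.random.default_rng(0)
A = rng.normal(size=(4, 4))
covmat = A @ A.T

# Same construction as in generate_data:
# column j of eigvecs is scaled by sqrt(eigval_j).
eigvals, eigvecs = np.linalg.eig(covmat)
art_stat = np.sqrt(eigvals) * eigvecs

# The original covmat is recovered as art_stat @ art_stat.T.
assert np.allclose(art_stat @ art_stat.T, covmat)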