Skip to content

Commit

Permalink
Add dataset name checking script
Browse files Browse the repository at this point in the history
  • Loading branch information
lviliani committed Nov 19, 2024
1 parent 084d9f3 commit f97c0bc
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 0 deletions.
119 changes: 119 additions & 0 deletions bin/utils/check_dataset_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import re

def validate_block(name, pattern, block_name):
    """Validate one underscore-separated block of a dataset name.

    The whole block must match ``pattern``; a match of only a prefix of
    ``name`` is rejected.

    Args:
        name: Text of the block to check.
        pattern: Regular expression describing the block. For the "ME-PS"
            block, a pattern with at least three capturing groups is read as
            group 1 = whole block, group 2 = matrix-element generator (ME),
            group 3 = parton shower (PS).
        block_name: Label used in the feedback message. The value "ME-PS"
            selects the ME-PS specific messages and the ME/PS comparison.

    Returns:
        A ``(is_valid, message)`` tuple.
    """
    # fullmatch (not match) so trailing garbage such as "TuneCP5xyz" is
    # rejected for every block type, not only for ME-PS.
    match = re.fullmatch(pattern, name)

    if block_name == "ME-PS":
        if not match:
            return False, "Invalid ME-PS format"

        # Groups 2 and 3 are only populated for "<ME>-<PS>" combinations;
        # for a standalone generator both are None and the check is skipped.
        # Patterns with fewer groups simply have nothing to compare.
        if match.re.groups >= 3:
            me = match.group(2)
            ps = match.group(3)
            if ps and me == ps:
                return False, "ME and PS cannot be the same"

        return True, "Valid ME-PS block"

    if match:
        return True, f"{block_name} block is valid."
    return False, f"Invalid {block_name} block."

def validate_dataset_name(dataset_name):
    """Check a dataset name block by block and collect feedback.

    The name is split on ``_`` and the blocks are expected in this order::

        PROCESS[_Bin-...][_Fil-...][_Par-...]_TUNE_BEAME_ME-PS

    PROCESS, TUNE, BEAME and ME-PS are mandatory; the ``Bin-``, ``Fil-`` and
    ``Par-`` blocks are optional and are recognised by their prefix. Nothing
    may follow the ME-PS block.

    Args:
        dataset_name: The full dataset name to validate.

    Returns:
        A ``(is_valid, summary, feedback)`` tuple where ``summary`` is
        "Valid dataset name" or "Invalid dataset name" and ``feedback`` is a
        list with one message per inspected block, in block order.
    """
    # Define regex patterns for different blocks
    process_pattern = r".*"  # PROCESS is mandatory
    binning_pattern = r"Bin-[\w-]+"  # BINNING is optional
    filter_pattern = r"Fil-[\w-]+"  # FILTER is optional
    param_pattern = r"Par-[\w-]+"  # PARAMETERS is optional
    tune_pattern = r"TuneCP[1-5]"  # TUNE is mandatory (TuneCP1 to TuneCP5)
    beame_pattern = r"13p6TeV|\d+TeV|\d+GeV"  # BEAME is mandatory
    me_ps_pattern = (
        r"(pythia6|pythia8|pythia8-evtgen|herwig6|herwigpp|herwig7|sherpa|"
        r"(madgraph|madgraphMLM|amcatnloFXFX|madgraph-madspin|madgraphMLM-madspin|"
        r"amcatnloFXFX-madspin|amcatnlo|amcatnlo-madspin|alpgen|mcatnlo|powheg|"
        r"powheg-madspin|powheg-JHUGenV\d*|powheg-minlo|powheg-minnlo|powheg-minlo-JHUGenV\d*|"
        r"powheg-minnlo-JHUGen\d*|JHUGen|hardcol|bcvegpy2)"
        r"-(pythia6|pythia8|herwig6|herwigpp|herwig7))"
    )
    blocks = dataset_name.split('_')

    feedback = []
    valid = True

    # PROCESS (mandatory, always the first block). str.split never returns an
    # empty list, so "missing" has to be detected as an empty first block.
    if blocks[0]:
        block_ok, msg = validate_block(blocks[0], process_pattern, "PROCESS")
        feedback.append(msg)
        valid = valid and block_ok
    else:
        feedback.append("Missing PROCESS block.")
        valid = False

    current_index = 1

    # Optional blocks: consumed only when the next block carries the prefix,
    # otherwise the same block is offered to the following stage.
    optional_blocks = (
        ("Bin-", binning_pattern, "BINNING"),
        ("Fil-", filter_pattern, "FILTER"),
        ("Par-", param_pattern, "PARAMETERS"),
    )
    for prefix, pattern, label in optional_blocks:
        if len(blocks) > current_index and blocks[current_index].startswith(prefix):
            block_ok, msg = validate_block(blocks[current_index], pattern, label)
            feedback.append(msg)
            valid = valid and block_ok
            current_index += 1
        else:
            feedback.append(f"{label} block is missing or optional.")

    # Mandatory blocks: each one consumes the next position if there is one.
    mandatory_blocks = (
        (tune_pattern, "TUNE"),
        (beame_pattern, "BEAME"),
        (me_ps_pattern, "ME-PS"),
    )
    for pattern, label in mandatory_blocks:
        if len(blocks) > current_index:
            block_ok, msg = validate_block(blocks[current_index], pattern, label)
            feedback.append(msg)
            valid = valid and block_ok
            current_index += 1
        else:
            feedback.append(f"Missing {label} block.")
            valid = False

    # ME-PS must be the last block; anything left over would otherwise be
    # silently accepted.
    extra_blocks = blocks[current_index:]
    if extra_blocks:
        feedback.append(
            f"Unexpected extra block(s) after ME-PS: {'_'.join(extra_blocks)}"
        )
        valid = False

    # Return feedback and validity status
    if valid:
        return True, "Valid dataset name", feedback
    return False, "Invalid dataset name", feedback

29 changes: 29 additions & 0 deletions bin/utils/test_dataset_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from check_dataset_names import *

# Sample dataset names fed to the validator; one entry is deliberately
# malformed (its binning block lacks the "Bin-" prefix).
test_cases = [
    "DYto2L-4Jets_Bin-MLL-4to10_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-MLL-10to50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-2Jets_Bin-MLL-4to10_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-2Jets_Bin-MLL-10to50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-4Jets_Bin-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-2Jets_Bin-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-4Jets_Bin-0J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-1J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-2J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-3J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-4J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-2Jets_Bin-0J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-2Jets_Bin-1J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-2Jets_2J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",  # THIS IS NOT A VALID NAME
    "DYto2L-2Jets_Bin-2J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8"
]

# Print a one-line verdict for every name; only rejected names also get the
# per-block feedback listed underneath.
for dataset in test_cases:
    is_valid, summary, details = validate_dataset_name(dataset)
    print(f"Dataset: {dataset} -> {summary}")

    report_lines = [] if is_valid else details
    for line in report_lines:
        print(f" - {line}")

    # Blank separator between datasets.
    print("\n")

0 comments on commit f97c0bc

Please sign in to comment.