Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update get siblings #255

Open
wants to merge 10 commits into
base: update_CMSSW_13
Choose a base branch
from
Binary file modified Configuration/data/electronFiducialMap_2022F_data.root
Binary file not shown.
Binary file modified Configuration/data/muonFiducialMap_2022F_data.root
Binary file not shown.
2 changes: 1 addition & 1 deletion Configuration/python/configurationOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1811,7 +1811,7 @@

'EGamma_2022A' : 6,
'EGamma_2022B' : 112,
'EGamma_2022C' : 200,
'EGamma_2022C' : 3354,
'EGamma_2022D' : 33,
'EGamma_2022E' : 800,
'EGamma_2022F' : 1000,
Expand Down
2 changes: 1 addition & 1 deletion Configuration/python/histogramUtilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def getHistIntegral(sample,condor_dir,channel,hist,xlo,xhi):
# print "xhi is outside the range of the histogram, will include all the overflow instead"
xhiBin = histogram.GetXaxis().FindBin(float(xhi))
#intError = Double (0.0)
intError = 0.0
intError = c_double(0.0)
integral = histogram.IntegralAndError(xloBin, xhiBin, intError)

inputFile.Close()
Expand Down
4 changes: 2 additions & 2 deletions DBTools/python/condorSubArgumentsSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
CondorSubArgumentsSet = {
1 : {'Executable' : ''},
2 : {'Universe' : 'vanilla'},
3 : {'request_memory ' : '2048MB'},
3 : {'request_memory ' : '4092MB'},
4 : {'Arguments' : ''},
5 : {'Output' : 'condor_$(Process).out'},
6 : {'Error' : 'condor_$(Process).err'},
Expand All @@ -23,7 +23,7 @@
1 : {'Executable' : ''},
2 : {'Universe' : 'vanilla'},
3 : {'Getenv' : 'True'},
4 : {'request_memory ' : '2048MB'},
4 : {'request_memory ' : '4092MB'},
5 : {'Arguments' : ''},
6 : {'Output' : 'condor_$(Process).out'},
7 : {'Error' : 'condor_$(Process).err'},
Expand Down
57 changes: 57 additions & 0 deletions DBTools/python/createEventLists.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this file the same as the one in DBTools/scripts/createEventLists.py? If so, maybe this could be removed? I assume the other one to be the correct file, but correct me if I am wrong.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch — there should only be the one in python.

Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python3

# Build a per-file event list (run:lumi:event strings) for one file of a
# secondary dataset, selected by job index from a json file map, and save it
# as a compressed numpy archive for later sibling matching.
#
# Usage: createEventLists.py <secondary json> <job index> [output dir]

import json
import ROOT as r
import numpy as np
import sys
import os

# Force the ROOT interpreter to load the getEvents() helper from OSUT3Analysis
# so that r.getEventsInFile is callable from Python.
r.gInterpreter.Declare(
'''
#include "OSUT3Analysis/DBTools/interface/getEvents.h"

void dummyWrapper(){
	getEvents();
}
'''
)
r.gSystem.Load('libOSUT3AnalysisDBTools.so')

if __name__ == "__main__":

    # Two positional arguments are required: the json file and the job index.
    if len(sys.argv) < 3:
        print("Error: Need to provide the input json file and the job number")
        sys.exit(1)

    outputDir = '/data/users/mcarrigan/condor/EventLists/'
    if len(sys.argv) >= 4:
        outputDir = sys.argv[3]

    f_json = sys.argv[1]

    job = int(sys.argv[2])

    # Dataset name is the part of the json filename before the first '-'.
    dataset = f_json.split('/')[-1].split('-')[0]

    outputPath = outputDir + dataset + '/'

    # makedirs with exist_ok avoids a race when several condor jobs start at
    # once, and also creates any missing parent directories.
    os.makedirs(outputPath, exist_ok=True)

    with open(f_json) as secondary_fin:
        secondary_dict = json.load(secondary_fin)
    # This job processes exactly one file: the job-th key of the json map.
    filename = list(secondary_dict.keys())[job]

    # Prepend an xrootd redirector to bare LFNs so ROOT can open them remotely.
    if not filename.startswith('root://'): filename = 'root://cmsxrootd.fnal.gov:/' + filename
    print("Getting events for", filename)
    events = r.getEventsInFile(filename)
    events = sorted(events, key=lambda x: (x.runNum, x.lumiBlock, x.event))
    tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
    print(tmpEvents, len(tmpEvents))
    fileStr = filename.split('/')[-1]
    outputFile = outputPath + fileStr
    np.savez(outputFile, eventList=tmpEvents)
    print("saved file {} with {} events".format(outputFile, len(tmpEvents)))
53 changes: 40 additions & 13 deletions DBTools/python/getSiblings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(self, inputJSON, secondaryJSON, label):
self.nJobs = -1
self.jobNumber = -1
self.eventsPerJob = -1
self.redirector = None

self.local=False

Expand All @@ -59,6 +60,7 @@ def getSiblings(self):
self.getFilesFromList(args.jobNumber, args.totalJobs)
self.findMatches()
else:
print("Running default option...")
self.getFilesFromList(args.jobNumber, args.totalJobs)
self.findMatches()

Expand All @@ -72,12 +74,14 @@ def getDASInfo(dataset, jsonName=None):

test = ast.literal_eval(miniaod)
files = [x['file'][0]['name'] for x in test]
events = [len(x['events'][0]['number']) if 'events' in list(x.keys()) else 0 for x in test]
lumis = [[x['lumi'][0]['number']] if isinstance(x['lumi'][0]['number'], int) else x['lumi'][0]['number'] for x in test]
runs = [[x['run'][0]['run_number']] if isinstance(x['run'][0]['run_number'], int) else x['run'][0]['run_number'] for x in test]

miniDict = {}

for f, l, r in zip(files, lumis, runs):
for f, l, r, e in zip(files, lumis, runs, events):
if e == 0: continue
miniDict[f] = {'lumis': l, 'runs': r}

if jsonName:
Expand Down Expand Up @@ -125,20 +129,32 @@ def findMatches(self, jsonName='default.json'):
with open(self.secondaryJSON) as secondary_fin:
secondary_dict = json.load(secondary_fin)

secondary_dataset = self.secondaryJSON.split('/')[-1].split('-')[0]
eventsDir = '/data/users/mcarrigan/condor/EventLists/' + secondary_dataset

for inputFile in self.inputFiles:
if inputFile not in primary_dict.keys():
continue
p_file = inputFile
if 'root://' in inputFile:
self.redirector = 'root://'+inputFile.split('://')[1]
p_file = '/'+inputFile.split('://')[-1]
print("saving redirector as", self.redirector)
#print("looking for file", p_file)
if p_file not in primary_dict.keys():
continue
#print("found file ", p_file)
primary_info = primary_dict[p_file]
if p_file not in self.inputFiles: continue
if p_file not in self.inputFiles and inputFile not in self.inputFiles: continue
sibs = []
#print("looking for siblings")
for s_file, secondary_info in secondary_dict.items():
if len(np.intersect1d(primary_info['runs'], secondary_info['runs'])) == 0: continue
if len(np.intersect1d(primary_info['lumis'], secondary_info['lumis'])) != 0:
sibs.append(s_file)


#print("There are {} siblings".format(len(sibs)))
siblings[p_file] = sibs
self.getEventList(p_file, sibs)

self.getEventList(inputFile, sibs, eventsDir)

if self.eventsPerJob != -1 and len(self.sharedEvents) > self.eventsPerJob:
break
Expand All @@ -152,24 +168,33 @@ def findMatches(self, jsonName='default.json'):

self.siblingDict = siblings

def getEventList(self, primaryFile, siblings):
def getEventList(self, primaryFile, siblings, eventsDir):

primaryEvents = np.array([])
if not primaryFile in self.inputFiles:
print("File is missing from input file list")
return
if not primaryFile.startswith("root://") and not self.local:
primaryFile = 'root://cms-xrd-global.cern.ch:/' + primaryFile
print("getting primary file events")
events = r.getEventsInFile(primaryFile)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
primaryEvents = np.concatenate((primaryEvents, tmpEvents))

secondaryEvents = np.array([])
for filename in siblings:
if not filename.startswith("root://") and not self.local:
filename = 'root://cms-xrd-global.cern.ch:/' + filename
events = r.getEventsInFile(filename)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
for ifile, filename in enumerate(siblings):
fileStr = filename.split('/')[-1]
filePath = '/'.join([eventsDir,fileStr+'.npz'])
print("Looking for event list for file", filePath, filename)
if os.path.exists(filePath):
print("found event list!")
tmpEvents = np.load(filePath)['eventList']
else:
print("getting secondary file events", filename)
if not filename.startswith("root://") and not self.local:
filename = 'root://cms-xrd-global.cern.ch:/' + filename
events = r.getEventsInFile(filename)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
secondaryEvents = np.concatenate((secondaryEvents, tmpEvents))

this_sharedEvents = np.intersect1d(primaryEvents, secondaryEvents)
Expand All @@ -180,6 +205,7 @@ def getEventList(self, primaryFile, siblings):


def getFilesFromList(self, jobNumber, nJobs):
print("getting files from list")
sys.path.append(os.getcwd())

self.jobNumber = jobNumber
Expand All @@ -191,6 +217,7 @@ def getFilesFromList(self, jobNumber, nJobs):
#If no job number or number of jobs is passed use the full file list
if jobNumber == -1 or nJobs == -1:
self.inputFiles = datasetInfo.listOfFiles
print("Using full file list")
return

filesPerJob = int (math.floor (len (datasetInfo.listOfFiles) / nJobs))
Expand All @@ -204,7 +231,7 @@ def getFilesFromList(self, jobNumber, nJobs):
if runList[0].startswith('file:'):
runList = [x.split('file:')[1] for x in runList]

#print("This is the run list:\n",runList)
print("This is the run list:\n",runList)

self.inputFiles = runList

Expand Down
2 changes: 1 addition & 1 deletion DBTools/python/osusub_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def skimListExists(dataset):

def getSiblingList(sibList, runList, siblingDataset):

print("Trying to get sibling list")
print("Trying to get sibling list from", sibList)
siblings = []

fin = open(sibList, 'r')
Expand Down
13 changes: 13 additions & 0 deletions DBTools/scripts/eventListWrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/bash

# Condor wrapper for createEventLists.py.
# Arguments: $1 = CMSSW version (exported for the job environment),
#            $2 = secondary dataset json file, $3 = job index.

(>&2 echo "Starting job on " `date`) # Date/time of start of job
(>&2 echo "Running on: `uname -a`") # Condor job is running on this node
(>&2 echo "System software: `cat /etc/redhat-release`") # Operating System on that node
(>&2 echo "Arguments passed to this script are: $@") #print all arguments

export CMSSW_VERSION_LOCAL=$1
source /cvmfs/cms.cern.ch/cmsset_default.sh
export SCRAM_ARCH=slc7_amd64_gcc820
#eval `scramv1 runtime -sh`

# Quote the arguments so json paths containing spaces survive word splitting.
python3 createEventLists.py "$2" "$3"
4 changes: 4 additions & 0 deletions DBTools/scripts/osusub.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,8 @@ def MakeSpecificConfig(Dataset, Directory, SkimDirectory, Label, SkimChannelName
ConfigFile.write(" print(\"No valid grid proxy. Not adding sibling files.\")\n")
if arguments.localSkim != None:
ConfigFile.write("siblings = ['file:{0}{1}'.format(" + arguments.localSkim + ", sib.split('/')[-1]) for sib in siblings] \n")
elif arguments.Redirector != "":
ConfigFile.write("siblings = ['root://{0}:/{1}'.format(\'" + RedirectorDic[arguments.Redirector] + "\', sib) for sib in siblings] \n")
ConfigFile.write("pset.process.source.secondaryFileNames.extend(siblings)\n\n")

# If the dataset has a Run3 skim sibling defined and not run over skim, add the corresponding files to the secondary file names
Expand All @@ -923,6 +925,8 @@ def MakeSpecificConfig(Dataset, Directory, SkimDirectory, Label, SkimChannelName
ConfigFile.write(" print( \"No valid grid proxy. Not adding sibling files.\")\n" )
if arguments.localSkim != None:
ConfigFile.write("siblings = ['file:{0}{1}'.format(\'" + arguments.localSkim + "\', sib.split('/')[-1]) for sib in siblings] \n")
elif arguments.Redirector != "":
ConfigFile.write("siblings = ['root://{0}:/{1}'.format(\'" + RedirectorDic[arguments.Redirector] + "\', sib) for sib in siblings] \n")
ConfigFile.write("pset.process.source.secondaryFileNames.extend(siblings)\n\n")

#if ...: make this an if statement for running over no cuts
Expand Down
Loading