Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update get siblings #255

Open
wants to merge 10 commits into
base: update_CMSSW_13
Choose a base branch
from
54 changes: 54 additions & 0 deletions DBTools/python/createEventLists.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this file the same as the one in DBTools/scripts/createEventLists.py? If so, maybe this could be removed? I assume the other one to be the correct file, but correct me if I am wrong.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch — there should only be the one in `python/`.

Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/python3

# Builds per-file event lists (run:lumi:event) for a dataset json and saves
# them as numpy archives; meant to run as one condor job per input file.

import json
import ROOT as r
import numpy as np
import sys
import os

# JIT-compile a thin wrapper so the OSUT3Analysis getEvents() helpers
# (including getEventsInFile, used below) are reachable through PyROOT.
r.gInterpreter.Declare(
'''
#include "OSUT3Analysis/DBTools/interface/getEvents.h"

void dummyWrapper(){
getEvents();
}
'''
)
# Load the shared library that provides the implementations declared above.
r.gSystem.Load('libOSUT3AnalysisDBTools.so')

if __name__ == "__main__":

    # Usage: createEventLists.py <dataset json> <job index> [output dir]
    # Extracts the run:lumi:event list of the <job index>-th file listed in
    # the json and saves it as a compressed numpy archive under
    # <output dir>/<dataset>/.
    if len(sys.argv) < 3:
        # Both the json file AND the job index are required.
        print("Error: Need to provide the input json file and the job index")
        sys.exit(1)

    # Optional third argument overrides the default output location.
    outputDir = '/data/users/mcarrigan/condor/EventLists/'
    if len(sys.argv) >= 4:
        outputDir = sys.argv[3]

    f_json = sys.argv[1]
    job = int(sys.argv[2])

    # Dataset name: portion of the json filename before the first '-'.
    dataset = f_json.split('/')[-1].split('-')[0]

    # os.path.join tolerates an outputDir supplied without a trailing slash;
    # makedirs(exist_ok=True) creates parents and avoids a check/create race.
    outputPath = os.path.join(outputDir, dataset) + '/'
    os.makedirs(outputPath, exist_ok=True)

    with open(f_json) as secondary_fin:
        secondary_dict = json.load(secondary_fin)

    fileKeys = list(secondary_dict.keys())
    if not 0 <= job < len(fileKeys):
        print("Error: job index {} out of range (json lists {} files)".format(job, len(fileKeys)))
        sys.exit(1)
    filename = fileKeys[job]

    # Prepend an xrootd redirector to bare LFNs so ROOT can open them.
    if not filename.startswith('root://'):
        filename = 'root://cmsxrootd.fnal.gov:/' + filename

    events = r.getEventsInFile(filename)
    tmpEvents = np.array([str(x.runNum) + ':' + str(x.lumiBlock) + ':' + str(x.event) for x in events])
    print(tmpEvents, len(tmpEvents))

    # np.savez appends '.npz' to the archive name automatically.
    fileStr = filename.split('/')[-1]
    outputFile = outputPath + fileStr
    np.savez(outputFile, eventList=tmpEvents)
49 changes: 37 additions & 12 deletions DBTools/python/getSiblings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(self, inputJSON, secondaryJSON, label):
self.nJobs = -1
self.jobNumber = -1
self.eventsPerJob = -1
self.redirector = None

self.local=False

Expand All @@ -59,6 +60,7 @@ def getSiblings(self):
self.getFilesFromList(args.jobNumber, args.totalJobs)
self.findMatches()
else:
print("Running default option...")
self.getFilesFromList(args.jobNumber, args.totalJobs)
self.findMatches()

Expand Down Expand Up @@ -125,20 +127,32 @@ def findMatches(self, jsonName='default.json'):
with open(self.secondaryJSON) as secondary_fin:
secondary_dict = json.load(secondary_fin)

secondary_dataset = self.secondaryJSON.split('/')[-1].split('-')[0]
eventsDir = '/data/users/mcarrigan/condor/EventLists/' + secondary_dataset

for inputFile in self.inputFiles:
if inputFile not in primary_dict.keys():
continue
p_file = inputFile
if 'root://' in inputFile:
self.redirector = 'root://'+inputFile.split('://')[1]
p_file = '/'+inputFile.split('://')[-1]
print("saving redirector as", self.redirector)
#print("looking for file", p_file)
if p_file not in primary_dict.keys():
continue
#print("found file ", p_file)
primary_info = primary_dict[p_file]
if p_file not in self.inputFiles: continue
if p_file not in self.inputFiles and inputFile not in self.inputFiles: continue
sibs = []
#print("looking for siblings")
for s_file, secondary_info in secondary_dict.items():
if len(np.intersect1d(primary_info['runs'], secondary_info['runs'])) == 0: continue
if len(np.intersect1d(primary_info['lumis'], secondary_info['lumis'])) != 0:
sibs.append(s_file)


#print("There are {} siblings".format(len(sibs)))
siblings[p_file] = sibs
self.getEventList(p_file, sibs)

self.getEventList(inputFile, sibs, eventsDir)

if self.eventsPerJob != -1 and len(self.sharedEvents) > self.eventsPerJob:
break
Expand All @@ -152,24 +166,33 @@ def findMatches(self, jsonName='default.json'):

self.siblingDict = siblings

def getEventList(self, primaryFile, siblings):
def getEventList(self, primaryFile, siblings, eventsDir):

primaryEvents = np.array([])
if not primaryFile in self.inputFiles:
print("File is missing from input file list")
return
if not primaryFile.startswith("root://") and not self.local:
primaryFile = 'root://cms-xrd-global.cern.ch:/' + primaryFile
print("getting primary file events")
events = r.getEventsInFile(primaryFile)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
primaryEvents = np.concatenate((primaryEvents, tmpEvents))

secondaryEvents = np.array([])
for filename in siblings:
if not filename.startswith("root://") and not self.local:
filename = 'root://cms-xrd-global.cern.ch:/' + filename
events = r.getEventsInFile(filename)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
for ifile, filename in enumerate(siblings):
fileStr = filename.split('/')[-1]
filePath = '/'.join([eventsDir,fileStr+'.npz'])
print("Looking for event list for file", filePath, filename)
if os.path.exists(filePath):
print("found event list!")
tmpEvents = np.load(filePath)['eventList']
else:
print("getting secondary file events", filename)
if not filename.startswith("root://") and not self.local:
filename = 'root://cms-xrd-global.cern.ch:/' + filename
events = r.getEventsInFile(filename)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
secondaryEvents = np.concatenate((secondaryEvents, tmpEvents))

this_sharedEvents = np.intersect1d(primaryEvents, secondaryEvents)
Expand All @@ -180,6 +203,7 @@ def getEventList(self, primaryFile, siblings):


def getFilesFromList(self, jobNumber, nJobs):
print("getting files from list")
sys.path.append(os.getcwd())

self.jobNumber = jobNumber
Expand All @@ -191,6 +215,7 @@ def getFilesFromList(self, jobNumber, nJobs):
#If no job number or number of jobs is passed use the full file list
if jobNumber == -1 or nJobs == -1:
self.inputFiles = datasetInfo.listOfFiles
print("Using full file list")
return

filesPerJob = int (math.floor (len (datasetInfo.listOfFiles) / nJobs))
Expand All @@ -204,7 +229,7 @@ def getFilesFromList(self, jobNumber, nJobs):
if runList[0].startswith('file:'):
runList = [x.split('file:')[1] for x in runList]

#print("This is the run list:\n",runList)
print("This is the run list:\n",runList)

self.inputFiles = runList

Expand Down
51 changes: 51 additions & 0 deletions DBTools/scripts/createEventLists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/python3

# NOTE(review): per the PR discussion this file duplicates
# DBTools/python/createEventLists.py and is expected to be removed.

import json
import ROOT as r
import numpy as np
import sys
import os

# JIT-compile a thin wrapper so the OSUT3Analysis getEvents() helpers
# (including getEventsInFile, used below) are reachable through PyROOT.
r.gInterpreter.Declare(
'''
#include "OSUT3Analysis/DBTools/interface/getEvents.h"

void dummyWrapper(){
getEvents();
}
'''
)
# Load the shared library that provides the implementations declared above.
r.gSystem.Load('libOSUT3AnalysisDBTools.so')

if __name__ == "__main__":

    # Usage: createEventLists.py <dataset json> <job index> [output dir]
    # Saves the run:lumi:event list of the <job index>-th file of the json
    # as a compressed numpy archive under <output dir>/<dataset>/.
    if len(sys.argv) < 3:
        # Both the json file AND the job index are required.
        print("Error: Need to provide the input json file and the job index")
        sys.exit(1)

    # Optional third argument overrides the default output location.
    outputDir = '/data/users/mcarrigan/condor/EventLists/'
    if len(sys.argv) >= 4:
        outputDir = sys.argv[3]

    f_json = sys.argv[1]
    job = int(sys.argv[2])

    # Dataset name: portion of the json filename before the first '-'.
    dataset = f_json.split('/')[-1].split('-')[0]

    # os.path.join tolerates an outputDir supplied without a trailing slash;
    # makedirs(exist_ok=True) creates parents and avoids a check/create race.
    outputPath = os.path.join(outputDir, dataset) + '/'
    os.makedirs(outputPath, exist_ok=True)

    with open(f_json) as secondary_fin:
        secondary_dict = json.load(secondary_fin)

    fileKeys = list(secondary_dict.keys())
    if not 0 <= job < len(fileKeys):
        print("Error: job index {} out of range (json lists {} files)".format(job, len(fileKeys)))
        sys.exit(1)
    filename = fileKeys[job]

    # Prepend an xrootd redirector to bare LFNs so ROOT can open them.
    if not filename.startswith('root://'):
        filename = 'root://cmsxrootd.fnal.gov:/' + filename

    events = r.getEventsInFile(filename)
    tmpEvents = np.array([str(x.runNum) + ':' + str(x.lumiBlock) + ':' + str(x.event) for x in events])

    # np.savez appends '.npz' to the archive name automatically.
    fileStr = filename.split('/')[-1]
    outputFile = outputPath + fileStr
    np.savez(outputFile, eventList=tmpEvents)
8 changes: 8 additions & 0 deletions DBTools/scripts/eventListWrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/bash

# Condor wrapper: sets up the CMSSW runtime environment, then runs the
# event-list creation script (transferred into the job sandbox).
#   $1 = CMSSW version (forwarded by submitCreateEventLists.py)
#   $2 = input dataset json
#   $3 = job index (condor $(PROCESS))
export CMSSW_VERSION_LOCAL=$1
source /cvmfs/cms.cern.ch/cmsset_default.sh
export SCRAM_ARCH=slc7_amd64_gcc820
# NOTE(review): `scramv1 runtime -sh` normally must run inside a CMSSW
# release area; this appears to rely on the submit-side environment being
# carried over (getenv = true in the submit file) — confirm on a clean node.
eval `scramv1 runtime -sh`

python3 createEventLists.py $2 $3
72 changes: 72 additions & 0 deletions DBTools/scripts/submitCreateEventLists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import argparse
import json
import shutil
from OSUT3Analysis.DBTools.getSiblings import *
import sys

def writeCondorSub(exe, nJobs, outDir, jsonFile, requirements, wrapper, cmssw):
    """Write an HTCondor submit description to 'run.sub' in the current directory.

    exe          -- analysis script, transferred into the sandbox as an input file
    nJobs        -- number of jobs to queue (one per file listed in the json)
    outDir       -- directory receiving condor log/out/err files
    jsonFile     -- dataset json, forwarded to the wrapper as its 2nd argument
    requirements -- [cpus, memory, disk] request strings, in that order
    wrapper      -- shell script condor actually executes
    cmssw        -- CMSSW version, forwarded to the wrapper as its 1st argument

    NOTE(review): x509userproxy is hard-coded to one user's proxy path —
    parameterize before wider use.
    """
    submitLines = """
Universe = vanilla
Rank = TARGET.IsLocalSlot
request_disk = {6}
request_memory = {5}
request_cpus = {4}
executable = {7}
arguments = {8} {2} $(PROCESS)
log = {3}/log_$(PROCESS).log
output = {3}/out_$(PROCESS).out
error = {3}/error_$(PROCESS).err
should_transfer_files = Yes
when_to_transfer_output = ON_EXIT
transfer_input_files = {1}, {2}, {7}
transfer_output_files = ""
getenv = true
x509userproxy = /tmp/x509up_u1009
queue {0}
""".format(nJobs, exe, jsonFile, outDir, requirements[0], requirements[1], requirements[2], wrapper, cmssw)

    # Context manager guarantees the handle is closed even if the write fails.
    with open('run.sub', 'w') as f:
        f.write(submitLines)


if __name__ == "__main__":

    # Submit one condor job per file in a dataset json; each job produces an
    # event-list .npz via createEventLists.py (run through the shell wrapper).
    parser = argparse.ArgumentParser()
    parser.add_argument("-j", "--json", type=str, help="Input json of dataset to get events from")
    parser.add_argument('-d', '--dataset', type=str, help="Dataset to get json from")
    args = parser.parse_args()

    # Default json used when neither -j nor -d is supplied.
    jsonFile = '/home/mcarrigan/scratch0/disTracksML/CMSSW_13_0_13/src/DisappTrks/BackgroundEstimation/test/debugMuonSkim/Muon_Run2022E-EXODisappTrk-27Jun2023-v1_AOD.json'
    wrapper = os.environ['CMSSW_BASE'] + '/src/OSUT3Analysis/DBTools/scripts/eventListWrapper.sh'
    script = os.environ['CMSSW_BASE'] + '/src/OSUT3Analysis/DBTools/python/createEventLists.py'
    requirements = ['2', '2000MB', '3000MB']  # CPU, Memory, Disk
    cmssw = os.environ['CMSSW_VERSION']

    if args.json:
        jsonFile = args.json

    if args.dataset:
        # Build a filesystem-safe name from the DAS dataset path
        # ('/A/B/C' -> 'A_B_C') and query DAS to write the json under it.
        jsonFile = args.dataset.replace('/', '_')[1:]
        getSiblings.getDASInfo(args.dataset, jsonName='{}.json'.format(jsonFile))
        # Bug fix: the json written above is named after the sanitized
        # string, not the raw dataset path (which still contains '/').
        jsonFile = '{}.json'.format(jsonFile)

    outputDir = '/abyss/users/mcarrigan/log/DisappTrks/EventLists_{}'.format(jsonFile.split('/')[-1].split('.')[0])

    # One condor job per file listed in the dataset json.
    with open(jsonFile) as fin:
        f_json = json.load(fin)
        nJobs = len(f_json.keys())

    # makedirs(exist_ok=True) creates parents and avoids a check/create race.
    os.makedirs(outputDir, exist_ok=True)

    writeCondorSub(script, nJobs, outputDir, jsonFile, requirements, wrapper, cmssw)

    # Keep copies of everything needed to reproduce this submission.
    shutil.copy(script, outputDir)
    shutil.copy('run.sub', outputDir)
    shutil.copy(jsonFile, outputDir)

    os.system('condor_submit run.sub')