Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update get siblings #255

Open
wants to merge 10 commits into
base: update_CMSSW_13
Choose a base branch
from
54 changes: 54 additions & 0 deletions DBTools/python/createEventLists.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this file the same as the one in DBTools/scripts/createEventLists.py? If so, maybe this could be removed? I assume the other one to be the correct file, but correct me if I am wrong.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch — there should only be the one in `python/`.

Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/python3

# Builds per-file event lists (run:lumi:event) for a dataset json and saves
# them as numpy archives; meant to run as one condor job per input file.

import json
import ROOT as r
import numpy as np
import sys
import os

# JIT-compile a thin wrapper so the OSUT3Analysis getEvents() helpers
# (including getEventsInFile, used below) are reachable through PyROOT.
r.gInterpreter.Declare(
'''
#include "OSUT3Analysis/DBTools/interface/getEvents.h"

void dummyWrapper(){
getEvents();
}
'''
)
# Load the shared library that provides the implementations declared above.
r.gSystem.Load('libOSUT3AnalysisDBTools.so')

if __name__ == "__main__":

    # Usage: createEventLists.py <dataset json> <job index> [output dir]
    # Extracts the run:lumi:event list of the <job index>-th file listed in
    # the json and saves it as a compressed numpy archive under
    # <output dir>/<dataset>/.
    if len(sys.argv) < 3:
        # Both the json file AND the job index are required.
        print("Error: Need to provide the input json file and the job index")
        sys.exit(1)

    # Optional third argument overrides the default output location.
    outputDir = '/data/users/mcarrigan/condor/EventLists/'
    if len(sys.argv) >= 4:
        outputDir = sys.argv[3]

    f_json = sys.argv[1]
    job = int(sys.argv[2])

    # Dataset name: portion of the json filename before the first '-'.
    dataset = f_json.split('/')[-1].split('-')[0]

    # os.path.join tolerates an outputDir supplied without a trailing slash;
    # makedirs(exist_ok=True) creates parents and avoids a check/create race.
    outputPath = os.path.join(outputDir, dataset) + '/'
    os.makedirs(outputPath, exist_ok=True)

    with open(f_json) as secondary_fin:
        secondary_dict = json.load(secondary_fin)

    fileKeys = list(secondary_dict.keys())
    if not 0 <= job < len(fileKeys):
        print("Error: job index {} out of range (json lists {} files)".format(job, len(fileKeys)))
        sys.exit(1)
    filename = fileKeys[job]

    # Prepend an xrootd redirector to bare LFNs so ROOT can open them.
    if not filename.startswith('root://'):
        filename = 'root://cmsxrootd.fnal.gov:/' + filename

    events = r.getEventsInFile(filename)
    tmpEvents = np.array([str(x.runNum) + ':' + str(x.lumiBlock) + ':' + str(x.event) for x in events])
    print(tmpEvents, len(tmpEvents))

    # np.savez appends '.npz' to the archive name automatically.
    fileStr = filename.split('/')[-1]
    outputFile = outputPath + fileStr
    np.savez(outputFile, eventList=tmpEvents)
49 changes: 37 additions & 12 deletions DBTools/python/getSiblings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(self, inputJSON, secondaryJSON, label):
self.nJobs = -1
self.jobNumber = -1
self.eventsPerJob = -1
self.redirector = None

self.local=False

Expand All @@ -59,6 +60,7 @@ def getSiblings(self):
self.getFilesFromList(args.jobNumber, args.totalJobs)
self.findMatches()
else:
print("Running default option...")
self.getFilesFromList(args.jobNumber, args.totalJobs)
self.findMatches()

Expand Down Expand Up @@ -125,20 +127,32 @@ def findMatches(self, jsonName='default.json'):
with open(self.secondaryJSON) as secondary_fin:
secondary_dict = json.load(secondary_fin)

secondary_dataset = self.secondaryJSON.split('/')[-1].split('-')[0]
eventsDir = '/data/users/mcarrigan/condor/EventLists/' + secondary_dataset

for inputFile in self.inputFiles:
if inputFile not in primary_dict.keys():
continue
p_file = inputFile
if 'root://' in inputFile:
self.redirector = 'root://'+inputFile.split('://')[1]
p_file = '/'+inputFile.split('://')[-1]
print("saving redirector as", self.redirector)
#print("looking for file", p_file)
if p_file not in primary_dict.keys():
continue
#print("found file ", p_file)
primary_info = primary_dict[p_file]
if p_file not in self.inputFiles: continue
if p_file not in self.inputFiles and inputFile not in self.inputFiles: continue
sibs = []
#print("looking for siblings")
for s_file, secondary_info in secondary_dict.items():
if len(np.intersect1d(primary_info['runs'], secondary_info['runs'])) == 0: continue
if len(np.intersect1d(primary_info['lumis'], secondary_info['lumis'])) != 0:
sibs.append(s_file)


#print("There are {} siblings".format(len(sibs)))
siblings[p_file] = sibs
self.getEventList(p_file, sibs)

self.getEventList(inputFile, sibs, eventsDir)

if self.eventsPerJob != -1 and len(self.sharedEvents) > self.eventsPerJob:
break
Expand All @@ -152,24 +166,33 @@ def findMatches(self, jsonName='default.json'):

self.siblingDict = siblings

def getEventList(self, primaryFile, siblings):
def getEventList(self, primaryFile, siblings, eventsDir):

primaryEvents = np.array([])
if not primaryFile in self.inputFiles:
print("File is missing from input file list")
return
if not primaryFile.startswith("root://") and not self.local:
primaryFile = 'root://cms-xrd-global.cern.ch:/' + primaryFile
print("getting primary file events")
events = r.getEventsInFile(primaryFile)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
primaryEvents = np.concatenate((primaryEvents, tmpEvents))

secondaryEvents = np.array([])
for filename in siblings:
if not filename.startswith("root://") and not self.local:
filename = 'root://cms-xrd-global.cern.ch:/' + filename
events = r.getEventsInFile(filename)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
for ifile, filename in enumerate(siblings):
fileStr = filename.split('/')[-1]
filePath = '/'.join([eventsDir,fileStr+'.npz'])
print("Looking for event list for file", filePath, filename)
if os.path.exists(filePath):
print("found event list!")
tmpEvents = np.load(filePath)['eventList']
else:
print("getting secondary file events", filename)
if not filename.startswith("root://") and not self.local:
filename = 'root://cms-xrd-global.cern.ch:/' + filename
events = r.getEventsInFile(filename)
tmpEvents = np.array([str(x.runNum)+':'+str(x.lumiBlock)+':'+str(x.event) for x in events])
secondaryEvents = np.concatenate((secondaryEvents, tmpEvents))

this_sharedEvents = np.intersect1d(primaryEvents, secondaryEvents)
Expand All @@ -180,6 +203,7 @@ def getEventList(self, primaryFile, siblings):


def getFilesFromList(self, jobNumber, nJobs):
print("getting files from list")
sys.path.append(os.getcwd())

self.jobNumber = jobNumber
Expand All @@ -191,6 +215,7 @@ def getFilesFromList(self, jobNumber, nJobs):
#If no job number or number of jobs is passed use the full file list
if jobNumber == -1 or nJobs == -1:
self.inputFiles = datasetInfo.listOfFiles
print("Using full file list")
return

filesPerJob = int (math.floor (len (datasetInfo.listOfFiles) / nJobs))
Expand All @@ -204,7 +229,7 @@ def getFilesFromList(self, jobNumber, nJobs):
if runList[0].startswith('file:'):
runList = [x.split('file:')[1] for x in runList]

#print("This is the run list:\n",runList)
print("This is the run list:\n",runList)

self.inputFiles = runList

Expand Down
51 changes: 51 additions & 0 deletions DBTools/scripts/createEventLists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/python3

# NOTE(review): per the PR discussion this file duplicates
# DBTools/python/createEventLists.py and is expected to be removed.

import json
import ROOT as r
import numpy as np
import sys
import os

# JIT-compile a thin wrapper so the OSUT3Analysis getEvents() helpers
# (including getEventsInFile, used below) are reachable through PyROOT.
r.gInterpreter.Declare(
'''
#include "OSUT3Analysis/DBTools/interface/getEvents.h"

void dummyWrapper(){
getEvents();
}
'''
)
# Load the shared library that provides the implementations declared above.
r.gSystem.Load('libOSUT3AnalysisDBTools.so')

if __name__ == "__main__":

    # Usage: createEventLists.py <dataset json> <job index> [output dir]
    # Saves the run:lumi:event list of the <job index>-th file of the json
    # as a compressed numpy archive under <output dir>/<dataset>/.
    if len(sys.argv) < 3:
        # Both the json file AND the job index are required.
        print("Error: Need to provide the input json file and the job index")
        sys.exit(1)

    # Optional third argument overrides the default output location.
    outputDir = '/data/users/mcarrigan/condor/EventLists/'
    if len(sys.argv) >= 4:
        outputDir = sys.argv[3]

    f_json = sys.argv[1]
    job = int(sys.argv[2])

    # Dataset name: portion of the json filename before the first '-'.
    dataset = f_json.split('/')[-1].split('-')[0]

    # os.path.join tolerates an outputDir supplied without a trailing slash;
    # makedirs(exist_ok=True) creates parents and avoids a check/create race.
    outputPath = os.path.join(outputDir, dataset) + '/'
    os.makedirs(outputPath, exist_ok=True)

    with open(f_json) as secondary_fin:
        secondary_dict = json.load(secondary_fin)

    fileKeys = list(secondary_dict.keys())
    if not 0 <= job < len(fileKeys):
        print("Error: job index {} out of range (json lists {} files)".format(job, len(fileKeys)))
        sys.exit(1)
    filename = fileKeys[job]

    # Prepend an xrootd redirector to bare LFNs so ROOT can open them.
    if not filename.startswith('root://'):
        filename = 'root://cmsxrootd.fnal.gov:/' + filename

    events = r.getEventsInFile(filename)
    tmpEvents = np.array([str(x.runNum) + ':' + str(x.lumiBlock) + ':' + str(x.event) for x in events])

    # np.savez appends '.npz' to the archive name automatically.
    fileStr = filename.split('/')[-1]
    outputFile = outputPath + fileStr
    np.savez(outputFile, eventList=tmpEvents)
8 changes: 8 additions & 0 deletions DBTools/scripts/eventListWrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/bash

# Condor wrapper: sets up the CMSSW runtime environment, then runs the
# event-list creation script (transferred into the job sandbox).
#   $1 = CMSSW version (forwarded by submitCreateEventLists.py)
#   $2 = input dataset json
#   $3 = job index (condor $(PROCESS))
export CMSSW_VERSION_LOCAL=$1
source /cvmfs/cms.cern.ch/cmsset_default.sh
export SCRAM_ARCH=slc7_amd64_gcc820
# NOTE(review): `scramv1 runtime -sh` normally must run inside a CMSSW
# release area; this appears to rely on the submit-side environment being
# carried over (getenv = true in the submit file) — confirm on a clean node.
eval `scramv1 runtime -sh`

python3 createEventLists.py $2 $3
72 changes: 72 additions & 0 deletions DBTools/scripts/submitCreateEventLists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import argparse
import json
import shutil
from OSUT3Analysis.DBTools.getSiblings import *
import sys

def writeCondorSub(exe, nJobs, outDir, jsonFile, requirements, wrapper, cmssw):
    """Write an HTCondor submit description to 'run.sub' in the current directory.

    exe          -- analysis script, transferred into the sandbox as an input file
    nJobs        -- number of jobs to queue (one per file listed in the json)
    outDir       -- directory receiving condor log/out/err files
    jsonFile     -- dataset json, forwarded to the wrapper as its 2nd argument
    requirements -- [cpus, memory, disk] request strings, in that order
    wrapper      -- shell script condor actually executes
    cmssw        -- CMSSW version, forwarded to the wrapper as its 1st argument

    NOTE(review): x509userproxy is hard-coded to one user's proxy path —
    parameterize before wider use.
    """
    submitLines = """
Universe = vanilla
Rank = TARGET.IsLocalSlot
request_disk = {6}
request_memory = {5}
request_cpus = {4}
executable = {7}
arguments = {8} {2} $(PROCESS)
log = {3}/log_$(PROCESS).log
output = {3}/out_$(PROCESS).out
error = {3}/error_$(PROCESS).err
should_transfer_files = Yes
when_to_transfer_output = ON_EXIT
transfer_input_files = {1}, {2}, {7}
transfer_output_files = ""
getenv = true
x509userproxy = /tmp/x509up_u1009
queue {0}
""".format(nJobs, exe, jsonFile, outDir, requirements[0], requirements[1], requirements[2], wrapper, cmssw)

    # Context manager guarantees the handle is closed even if the write fails.
    with open('run.sub', 'w') as f:
        f.write(submitLines)


if __name__ == "__main__":

    # Submit one condor job per file in a dataset json; each job produces an
    # event-list .npz via createEventLists.py (run through the shell wrapper).
    parser = argparse.ArgumentParser()
    parser.add_argument("-j", "--json", type=str, help="Input json of dataset to get events from")
    parser.add_argument('-d', '--dataset', type=str, help="Dataset to get json from")
    args = parser.parse_args()

    # Default json used when neither -j nor -d is supplied.
    jsonFile = '/home/mcarrigan/scratch0/disTracksML/CMSSW_13_0_13/src/DisappTrks/BackgroundEstimation/test/debugMuonSkim/Muon_Run2022E-EXODisappTrk-27Jun2023-v1_AOD.json'
    wrapper = os.environ['CMSSW_BASE'] + '/src/OSUT3Analysis/DBTools/scripts/eventListWrapper.sh'
    script = os.environ['CMSSW_BASE'] + '/src/OSUT3Analysis/DBTools/python/createEventLists.py'
    requirements = ['2', '2000MB', '3000MB']  # CPU, Memory, Disk
    cmssw = os.environ['CMSSW_VERSION']

    if args.json:
        jsonFile = args.json

    if args.dataset:
        # Build a filesystem-safe name from the DAS dataset path
        # ('/A/B/C' -> 'A_B_C') and query DAS to write the json under it.
        jsonFile = args.dataset.replace('/', '_')[1:]
        getSiblings.getDASInfo(args.dataset, jsonName='{}.json'.format(jsonFile))
        # Bug fix: the json written above is named after the sanitized
        # string, not the raw dataset path (which still contains '/').
        jsonFile = '{}.json'.format(jsonFile)

    outputDir = '/abyss/users/mcarrigan/log/DisappTrks/EventLists_{}'.format(jsonFile.split('/')[-1].split('.')[0])

    # One condor job per file listed in the dataset json.
    with open(jsonFile) as fin:
        f_json = json.load(fin)
        nJobs = len(f_json.keys())

    # makedirs(exist_ok=True) creates parents and avoids a check/create race.
    os.makedirs(outputDir, exist_ok=True)

    writeCondorSub(script, nJobs, outputDir, jsonFile, requirements, wrapper, cmssw)

    # Keep copies of everything needed to reproduce this submission.
    shutil.copy(script, outputDir)
    shutil.copy('run.sub', outputDir)
    shutil.copy(jsonFile, outputDir)

    os.system('condor_submit run.sub')