#!/usr/bin/env python
# first load the standard libraries from python
# we require at least python 2.7
#from sys import *
import sys
if sys.version_info[0]==2 and not sys.version_info[1]>=7:
print "Sorry, this program requires at least python 2.7"
print "You can download a more current python version from python.org and compile it"
print "into your homedir (or anywhere) with 'configure --prefix ~/python27'; make;"
print "then run this program by specifying your own python executable like this: "
print " ~/python27/bin/python <scriptFile>"
print "or add ~/python27/bin to your PATH before /usr/bin"
exit(1)
# load default python packages
import logging, optparse, os, collections, tarfile, mimetypes, tempfile, \
    copy, shutil, glob, time
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import maxRun, pubStore, pubConf, pubGeneric, pubAlg, maxCommon
from maxCommon import *
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] <algorithmName> <in> <out> <opt1>=<val1> <opt2>=<val2> ... - run an algorithm on a directory of fulltext files and write the results to an output directory

<algorithmName> can be a script from the scripts directory, like "protSearch.py" (extension optional).
By default, only a variable "headers" and a function "annotateFile" are needed in the script. If
the script contains classes to separate different functions, you need to add the class name, separated by
a ":", like "dnaSearch.py:Annotate".

Use the option -o to run plain text files through the annotators.
Otherwise, <in> is a directory of tab-sep text files created by the pubConvXXX converters.
It can be relative to pubConf.datasetDir.

pubRunAnnot will then:
- search <in> for fulltext chunks (*.articles.gz and *.files.gz)
- for each chunk, submit a cluster or local job that writes its results to <out>
""")

parser.add_option("-o", "--onlyText", dest="onlyText", action="store_true", help="Run plain text files through the annotator. <in> can be a single text file or a directory with .txt files")
parser.add_option("-t", "--test", dest="test", action="store_true", help="run locally in single process, not on cluster or multi processes, to see error messages, dump annotations to stdout")
#parser.add_option("", "--realTest", dest="realTest", action="store_true", help="spawn processes on local machine, like a cluster run, but error messages are visible, write annotations to final output files")
parser.add_option("-a", "--addFields", dest="addFields", action="append", help="add article fields to output files, e.g. doi or title. Option can be used several times", default=[])
parser.add_option("-l", "--limit", dest="limitJobs", action="store", type="int", help="limit jobs to X concurrent jobs on the cluster")
parser.add_option("-r", "--ram", dest="ram", action="store", type="int", help="request x GB of ram for the jobs on the cluster")
parser.add_option("", "--cat", dest="concat", action="store_true", help="write output to <out> and concat all files to <out>.tab when jobs are finished")
parser.add_option("-k", "--keepOldFiles", dest="keepOldFiles", action="store_true", help="do not wipe the output dir before running the jobs")
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()
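
# Example invocations, as a sketch: "crawler", "dnaOut", "protOut" and "minLen=6" are
# hypothetical dataset, output and parameter names, while "dnaSearch.py:Annotate" and
# "protSearch.py" follow the naming explained in the usage message above:
#   pubRunAnnot dnaSearch.py:Annotate crawler dnaOut
#   pubRunAnnot -t protSearch.py crawler protOut minLen=6
# Any key=value arguments after <out> are passed to the algorithm via paramDict.
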
# ==== FUNCTIONS ====
def checkCleanDir(outDir, keepOldFiles):
    if isfile(outDir.rstrip("/")):
        raise Exception("%s already exists and is a file" % outDir)
    if not isdir(outDir):
        logging.info("Creating dir %s" % outDir)
        os.makedirs(outDir)
    oldGzFiles = os.listdir(outDir)
    if len(oldGzFiles)>0 and not keepOldFiles:
        logging.info("Directory %s contains %d .gz files" % (outDir, len(oldGzFiles)))
        logging.info("Waiting for 3 secs, then deleting them")
        time.sleep(3)
        pm = maxCommon.ProgressMeter(len(oldGzFiles))
        for oldGzFname in oldGzFiles:
            os.remove(join(outDir, oldGzFname))
            pm.taskCompleted()

def testMode():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    outName = "stdout"
    if options.onlyText:
        inFiles = [inName]
    else:
        inDir = pubConf.resolveTextDir(inName.split(",")[0])
        inFiles = glob.glob(join(inDir, "*.articles.gz"))
    for inFname in inFiles:
        logging.info("Running on %s" % inFname)
        if options.onlyText:
            logging.info("Running only on file %s" % inFname)
            reader = pubStore.PubReaderTest(inFname)
        else:
            reader = pubStore.PubReaderFile(inFname)
        pubAlg.runAnnotate(reader, alg, paramDict, outName)
    sys.exit(0)

# ----------- MAIN --------------
if len(args)<3:
    parser.print_help()
    sys.exit(1)

pubGeneric.setupLogging(progFile, options)
algName, inName, outName = args[:3]
assert("=" not in outName)
assert("=" not in inName)
paramStrings = args[3:]
paramDict = {}
paramDict = pubGeneric.stringListToDict(paramStrings)
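# parameter values that point to an existing file are made absolute, so that
# jobs which may run from a different working directory can still find them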
for k,v in paramDict.iteritems():
    if isfile(v):
        paramDict[k]=abspath(v)

# we can't test jython algorithms
algType = "annotate"
if not algName.startswith("java"):
    alg = pubAlg.getAlg(algName, "Annotate") # makes sure that algName exists
    if "processRow" in dir(alg):
        algType = "processRow"

paramDict["startAnnotId"] = 0
paramDict["addFields"] = options.addFields
if options.test or options.onlyText:
    testMode()

checkCleanDir(outName, options.keepOldFiles)
inNames = inName.split(",")
maxJobs = None
if options.limitJobs:
    maxJobs = options.limitJobs

# make sure two runs with different algs and input data names have different
# batch directories
inBaseNames = [basename(x.rstrip("/")) for x in inNames]
batchBase = algType+"_"+algName.replace(":","")+"_"+"-".join(inBaseNames)+"_"+outName
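# e.g. for an annotate run with the hypothetical arguments "dnaSearch.py:Annotate crawler dnaOut",
# batchBase would be "annotate_dnaSearch.pyAnnotate_crawler_dnaOut"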
runner = pubGeneric.makeClusterRunner(batchBase, maxJob=maxJobs, maxRam=options.ram)
if algType == "annotate":
    pubAlg.annotate(algName, inNames, paramDict, outName, runner=runner, addFields=options.addFields, concat=options.concat)
else:
    logging.debug("running a processRow job")
    assert(len(inNames)==1)
    pubAlg.submitProcessRow(runner, algName, inNames[0], outName, paramDict)