forked from maximilianh/pubMunch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubChange
executable file
·103 lines (79 loc) · 3.3 KB
/
pubChange
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# first load the standard libraries from python
# we require at least python 2.7
#from sys import *
import sys

# Refuse to run on interpreters older than 2.7.
# print() with a single string argument behaves identically on Python 2 and
# Python 3, so this file at least parses everywhere and the message below can
# actually be shown instead of a SyntaxError.
if sys.version_info < (2, 7):
    print("Sorry, this program requires at least python 2.7")
    # sys.exit rather than the bare exit(): the latter is only injected by the
    # site module and is missing under "python -S" or in frozen builds
    sys.exit(1)
# load default python packages
import logging, optparse, os, glob, zipfile, types, gzip, shutil
from os.path import *
# Put <scriptDir>/lib/ at the very front of the module search path, so that
# the imports that follow are looked up there first.
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.split(progFile)[0]
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path[:0] = [pubToolsLibDir]
# now load our own libraries
import pubChange
import pubGeneric, maxRun, pubStore, pubConf, maxCommon, pubXml, pubPubmed
# === CONSTANTS ===================================
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# Usage text printed by --help and when no command is given.
# The command names in the synopsis lines must be exactly the ones that
# main() dispatches on ("filterText", "filterPmid", "addPmids"); a plain
# "filter" is rejected as an unknown command.
parser = optparse.OptionParser("""usage: %prog [options] command options
command "filterText":
create subset of dataset(s) with text that contains a keyword
%prog filterText <datasetList> <keywordListCommaSep> <datasetOut>
example:
%prog filterText pmc,elsevier,crawler ebola,filovirus ebola
command "filterPmid":
create subset of dataset(s) with a list of PMIDs, create a new dataset
%prog filterPmid <datasetList> <pmidListFile> <datasetOut>
example:
%prog filterPmid pmc,elsevier,crawler uniprotPmids.txt uniProtText
command "addPmids":
read PMIDs from datasetDir/medline.ids.tab, rewrite all
articles.gz files and add the new PMID.
medline.ids.tab can be created with pubFingerprint.
example:
%prog addPmids elsevier
""")
# Options that are currently disabled, kept for reference:
#parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %default", default=pubConf.identifierStart["medline"])
#parser.add_option("", "--parse", dest="parse", action="store", help="for debugging, just parse one single xml file", default=None)

# Let pubGeneric register its general options on the parser, then split the
# command line into option values and positional arguments.
pubGeneric.addGeneralOptions(parser)
options, args = parser.parse_args()
# ==== FUNCTIONs =====
def main(args, options):
    """Run the sub-command named by args[0] on the remaining arguments.

    args    -- positional command line arguments: the command name first,
               followed by that command's operands
    options -- parsed option values; handed on to pubChange.filterCmd

    Supported commands: filterPmid, filterText, filterJob, rechunk, addPmids.

    Raises Exception for an unknown command and, for "filterPmid", when the
    PMID list file is not an existing regular file.
    """
    cmd = args[0]
    cmdArgs = args[1:]

    if cmd == "filterPmid":
        inSpec, pmidFname, outDir = cmdArgs
        pmidFname = os.path.abspath(pmidFname)
        # explicit check instead of assert: asserts are stripped by "python -O"
        # and give the user no hint about what went wrong
        if not os.path.isfile(pmidFname):
            raise Exception("PMID list file %s does not exist or is not a file" % pmidFname)
        pubChange.filterCmd(inSpec, pmidFname, outDir, options)

    elif cmd == "filterText":
        inSpec, searchSpec, outDir = cmdArgs
        # filter into a scratch directory first, then rechunk into outDir
        partsDir = os.path.join(outDir, "parts")
        maxCommon.mustBeEmptyDir(partsDir, makeDir=True)
        outFnames = pubChange.filterCmd(inSpec, searchSpec, partsDir, options)
        pubChange.rechunk(partsDir, outDir)
        # cleanup
        for fname in outFnames:
            os.remove(fname)
        # rmdir, not removedirs: removedirs would go on to delete outDir and
        # its parents whenever they happen to be empty
        os.rmdir(partsDir)

    elif cmd == "filterJob":
        inSpec, pmidFname, outSpec = cmdArgs
        pubChange.filterOneChunk(inSpec, pmidFname, outSpec)

    elif cmd == "rechunk":
        inDir, outDir = cmdArgs
        pubChange.rechunk(inDir, outDir)

    elif cmd == "addPmids":
        pubChange.addPmids(args[1])

    else:
        raise Exception("Unknown command %s" % cmd)
# ----------- MAIN --------------
# Without a command there is nothing to do: show the usage text and report
# failure to the calling shell.
if not args:
    parser.print_help()
    # sys.exit rather than the bare exit(): the latter is only injected by the
    # site module and is not meant for use in programs
    sys.exit(1)

# normal operation
pubGeneric.setupLogging(progFile, options)
main(args, options)