forked from maximilianh/pubMunch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubPrepMarkerDir
executable file
·298 lines (260 loc) · 11.6 KB
/
pubPrepMarkerDir
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#!/usr/bin/env python
# load default python packages
import sys, logging, optparse, os, glob, shutil, gzip, collections
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubConf, pubGeneric, util, maxCommon
from os.path import *
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# optparse shows this usage string on -h or when called without arguments
parser = optparse.OptionParser("""usage: %prog <markerType>: Download marker info from UCSC database and manualMarkers directory, write to directory specified in pubConf.py
markerType can be gene, snp, genbank, refseq, ensembl or "all"
""")
# NOTE(review): the usage text omits band, uniprot and pdb, which
# prepMarkerDir below also accepts — consider updating the string
#parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
#parser.add_option("-o", "--only", dest="only", action="store", help="only create data for one type, e.g. symbol, band, uniprot, snp")
# presumably adds the shared pubMunch options (debug/verbose etc.) that the
# hand-rolled add_option calls above used to provide — confirm in pubGeneric
pubGeneric.addGeneralOptions(parser, noCluster=True)
(options, args) = parser.parse_args()
# ==== FUNCTIONS =====
def mysqlToFile(fname, db, sql, append=False):
    """Run a query through the mysql command line client and redirect its
    tab-separated, header-less output (-NB) to a file.

    fname  - output filename
    db     - database name passed to the mysql client
    sql    - query string; NOTE: it is single-quoted into a shell command,
             so it must not contain single quotes (trusted input only)
    append - if True, append (>>) to fname instead of truncating (>)
    """
    logging.info("Writing %s, mysql query: %s" % (fname, sql))
    # ">>" appends so several queries can be concatenated into one file
    redirSym = ">>" if append else ">"
    cmd = "mysql %s -NB -e '%s' %s %s" % (db, sql, redirSym, fname)
    util.execCmdLine(cmd)
def writeNames(fname, outFname):
    """Write the name field (4th column) of a bed file to a gzipped file,
    one name per line.

    fname    - input bed file, tab-separated
    outFname - output .gz filename
    """
    logging.info("Writing %s" % outFname)
    # bugfix: the handle was never closed, relying on GC to flush the gzip
    # trailer; "wt" opens in text mode so str writes work on py2 and py3
    ofh = gzip.open(outFname, "wt")
    for line in open(fname):
        fields = line.strip().split("\t")
        ofh.write(fields[3])
        ofh.write("\n")
    ofh.close()
def makeUniqueNames(bedIn, bedOut, stripVersion=False):
    """Copy bedIn to bedOut, keeping only the first line for each name.

    Lines are skipped when the name (4th field) is "unknown", shorter than
    3 characters, contains a space, or the chromosome contains "hap"
    (haplotype chromosomes).

    bedIn        - input bed file
    bedOut       - output bed file
    stripVersion - if True, cut the name at the first "." (strip
                   genbank/refseq version suffixes) before filtering
    """
    logging.info("Filtering %s to %s" % (bedIn, bedOut))
    ofh = open(bedOut, "w")
    oldNames = set()
    inCount = 0
    outCount = 0
    for line in open(bedIn):
        inCount += 1
        fields = line.strip().split("\t")
        name = fields[3]
        if stripVersion:
            name = name.split(".")[0]
            fields[3] = name
            line = "\t".join(fields)+"\n"
        chrom = fields[0]
        # skip useless/ambiguous names and haplotype chromosomes
        if name in ["unknown"] or len(name)<=2:
            continue
        if " " in name:
            continue
        if "hap" in chrom:
            continue
        if name not in oldNames:
            outCount += 1
            ofh.write(line)
            oldNames.add(name)
    ofh.close()
    # bugfix: the removed count was computed as outCount - inCount, which is
    # always <= 0; it also includes filtered lines, not only duplicates
    logging.info("input lines %d, output lines %d, removed %d lines (duplicates or filtered)" % \
        (inCount, outCount, inCount - outCount))
def rewriteBands(bedIn, bedOut):
    """Prefix cytogenetic band names with their chromosome number and add
    the larger-scale bands.

    E.g. chrom "chr1" band "p36.33" becomes name "1p36.33"; additionally a
    merged feature "1p36" is written spanning all "1p36.*" sub-bands.

    bedIn  - input bed file with fields chrom, start, end, band
    bedOut - output bed file
    """
    logging.info("Adding chromosome numbers, filtering %s to %s" % (bedIn, bedOut))
    origBands = {} # band name -> (chrom, start, end)
    for line in open(bedIn):
        fields = line.strip().split("\t")
        chrom, start, end, band = fields
        start = int(start)
        end = int(end)
        chromNum = chrom.replace("chr","")
        bandName = chromNum+band
        origBands[bandName] = (chrom, start, end)

    # merge sub-bands like 1p36.33 into their large-scale band 1p36
    # (portability fix: iteritems()/list-items() were Python-2-only)
    largeBands = {}
    for bandName, coordTuple in origBands.items():
        if "." not in bandName:
            continue
        chrom, start, end = coordTuple
        largeBand = bandName.split(".")[0]
        if largeBand not in largeBands:
            largeBands[largeBand] = (chrom, start, end)
        else:
            oldStart = largeBands[largeBand][1]
            oldEnd = largeBands[largeBand][2]
            largeBands[largeBand] = \
                (chrom, min(oldStart, start, end), max(oldEnd, start, end))

    allBands = list(origBands.items()) + list(largeBands.items())
    ofh = open(bedOut, "w")
    for bandName, coordTuple in allBands:
        chrom, start, end = coordTuple
        row = [chrom, str(start), str(end), bandName]
        ofh.write("\t".join(row))
        ofh.write("\n")
    ofh.close()
def rewriteSnp(bedIn, bedOut):
    """Massage snp coordinates into valid bed format: swap start/end when
    reversed and widen zero-length (insertion) features to one basepair.

    bedIn  - input bed file (chrom, start, end, name)
    bedOut - output bed file
    """
    logging.info("forcing snpbed to bed format, filtering %s to %s" % (bedIn, bedOut))
    # bugfix: the handle was never closed; also dropped the unused oldNames set
    ofh = open(bedOut, "w")
    for line in open(bedIn):
        fields = line.strip().split("\t")
        start, end = fields[1:3]
        start, end = int(start), int(end)
        if end < start:
            start, end = end, start
        if end == start:
            # zero-length features (insertions) get a 1bp span
            end = end + 1
        row = [fields[0], str(start), str(end), fields[3]]
        ofh.write("\t".join(row))
        ofh.write("\n")
    ofh.close()
def addToDict(fname, data):
    """Parse a two-column tab-separated file into data, appending each
    value to the list stored under its key.

    data must map keys to lists (e.g. a collections.defaultdict(list));
    it is modified in place and also returned.
    """
    lineCount = 0
    keysBefore = len(data)
    with open(fname) as ifh:
        for line in ifh:
            key, val = line.strip().split("\t")
            data[key].append(val)
            lineCount += 1
    logging.info("Read %d lines, %d new keys" % (lineCount, len(data)-keysBefore))
    return data
def writeGzDict(data, fname):
    """Write a mapping key -> list of values to a gzipped tab-separated
    file, one line per key, values joined with "|".

    Raises AssertionError if any value contains the "|" separator.
    """
    logging.info("Writing %d keys to %s" % (len(data), fname))
    # "wt": text mode (py3-safe), and close so the gzip trailer is flushed
    ofh = gzip.open(fname, "wt")
    for key, valList in data.items():
        # "|" is the list separator, so it must not occur inside a value
        for val in valList:
            assert("|" not in val)
        # bugfix: the write was inside the inner loop, duplicating each
        # key's line len(valList) times
        ofh.write("%s\t%s\n" % (key, "|".join(valList)))
    ofh.close()
def mysqlAddKeyVal(db, query, dataDict):
    """Run a two-column mysql query and merge its key/value rows into
    dataDict (key -> list of values). Returns dataDict.

    Uses a temporary file in pubConf.TEMPDIR that is removed afterwards.
    """
    tmpPath = join(pubConf.TEMPDIR, "uniprot.tmp.bed")
    mysqlToFile(tmpPath, db, query)
    addToDict(tmpPath, dataDict)
    os.remove(tmpPath)
    return dataDict
#def makeGeneDict(geneDictFname):
# " use the UniProt database to create a dictionary uniprotId to all other relevant ids "
# data = collections.defaultdict(list)
#
# # uniprot alternative acc numbers
# mysqlAddKeyVal("uniProt", 'select acc, val from otherAcc', data)
# # uniprot names
# mysqlAddKeyVal("uniProt", 'select acc, val from displayId', data)
# # pdb accessions
# mysqlAddKeyVal("uniProt", 'select acc, extAcc1 from extDbRef JOIN extDb ON (id=extDb) WHERE val="PDB"', data)
# # refseq
# mysqlAddKeyVal("uniProt", 'select acc, extAcc2 from extDbRef JOIN extDb ON (id=extDb) WHERE val="RefSeq";', data)
# # ensembl gene ids
# #mysqlAddKeyVal("uniProt", 'select acc, extAcc3 from extDbRef JOIN extDb ON (id=extDb) WHERE val="Ensembl";', data)
# writeGzDict(data, geneDictFname)
def writePdbFnames(pdbDir, pdbDictFname):
    """Find all .gz files in a local PDB mirror, write their accession
    numbers to a gzipped file (one per line) and index that file.

    pdbDir       - directory tree with files named like pdb100d.ent.gz
    pdbDictFname - output .gz filename
    """
    logging.info("Reading filenames from %s" % pdbDir)
    accList = []
    # renamed loop variable: "dir" shadowed the builtin
    for dirName, path in pubGeneric.findFiles(pdbDir, ".gz"):
        # pdb100d.ent.gz -> 100d
        acc = basename(path).split(".")[0].replace("pdb","")
        accList.append(acc)
    logging.info("Found %d accession numbers, writing to %s" % (len(accList), pdbDictFname))
    # "wt": text mode so str writes work on py2 and py3
    ofh = gzip.open(pdbDictFname, "wt")
    for acc in accList:
        ofh.write(acc)
        ofh.write("\n")
    ofh.close()
    pubGeneric.indexKvFile(pdbDictFname)
def prepMarkerDir(markerDir, markerType):
    """Fill markerDir with marker bed files and dictionaries downloaded
    from the UCSC hg19 MySQL database and local reference files.

    markerType selects one section: gene, band, uniprot, pdb, snp,
    genbank, refseq or ensembl; "all" runs every section.
    """
    if not isdir(markerDir):
        logging.info("Creating %s" %markerDir)
        os.mkdir(markerDir)

    # shared scratch file for raw mysql output before filtering
    tempFile = join(pubConf.TEMPDIR, "filter.tmp.bed")

    if markerType=="all" or markerType=="gene":
        # gene symbols
        symbolBedFname = join(markerDir, "hg19.gene.bed")
        mysqlToFile(tempFile, "hg19", 'SELECT chrom, txStart, txEnd, kgXref.geneSymbol from knownGene JOIN kgXref ON kgXref.kgID=knownGene.name')
        makeUniqueNames(tempFile, symbolBedFname)
        #symbolDictFname = join(markerDir, "gene.dict.tab.gz")
        #writeNames(symbolBedFname, symbolDictFname)

    if markerType=="all" or markerType=="band":
        # cytogenetic bands: genome bed + name dictionary
        bandBed = join(markerDir, "hg19.band.bed")
        bandDict = join(markerDir, "band.dict.tab.gz")
        mysqlToFile(tempFile, "hg19", "SELECT chrom, chromStart, chromEnd, name from cytoBand")
        rewriteBands(tempFile, bandBed)
        writeNames(bandBed, bandDict)

    if markerType=="all" or markerType=="uniprot":
        upBedFname = join(markerDir, "hg19.uniprot.bed")
        upDictFname = join(markerDir, "uniprotAccs.txt.gz")
        # write all uniprot accessions to the dict file
        upFnames = ["uniprot.tab", "uniprotTrembl.tab"]
        #upFnames = ["uniprot.test.tab"]
        # NOTE(review): accs is never filled — looks like a leftover from
        # the commented-out dedup code below
        accs = set()
        ofh = gzip.open(upDictFname, "w")
        for upFname in upFnames:
            upFname=join(pubConf.dbRefDir, upFname)
            logging.info("Parsing %s to get a list of all accession IDs" % upFname)
            # NOTE(review): i is never used
            i = 0
            for row in maxCommon.iterTsvRows(upFname):
                # accList column is presumably a |-separated accession list
                accList = row.accList.split("|")
                #logging.info("Got %d keys" % len(accList))
                for acc in accList:
                    ofh.write("%s\n" % acc)
        ofh.close()
        pubGeneric.indexKvFile(upDictFname)
        #logging.info("Got %s uniprot accessions, writing to %s" % (len(accs), upDictFname))
        #ofh = gzip.open(upDictFname, "w")
        #accs = list(accs)
        #accs.sort() # better compression
        #for acc in accs:
        #ofh.write(acc)
        #ofh.write("\n")
        #ofh.close()
        # write a map for uniprot on hg19
        logging.info("Creating hg19 map")
        mysqlToFile(upBedFname, "hg19", 'SELECT chrom, txStart, txEnd, kgXref.spID from knownGene JOIN kgXref ON kgXref.kgID=knownGene.name where spID<>"" and not spId like "%-%"')
        # rather keep duplicates
        #makeUniqueNames(tempFile, upBedFname)

    if markerType=="all" or markerType=="pdb":
        # get all pdb accessions
        #pdbDir = join(pubConf.pdbBaseDir, "pdb")
        pdbDir = pubConf.pdbBaseDir
        pdbDictFname = join(markerDir, "pdbAccs.txt.gz")
        writePdbFnames(pdbDir, pdbDictFname)
        # NOTE(review): writePdbFnames already calls indexKvFile on this
        # file; this second call looks redundant — confirm before removing
        pubGeneric.indexKvFile(pdbDictFname)
        # genome map file
        pdbBedFname = join(markerDir, "hg19.pdb.bed")
        mysqlToFile(tempFile, "hg19", 'SELECT chrom, txStart, txEnd, uniProt.extDbRef.extAcc1 from knownGene JOIN kgXref ON kgXref.kgID=knownGene.name JOIN uniProt.extDbRef ON (spID=acc) JOIN uniProt.extDb ON (id=extDb) WHERE extDb.val="PDB" AND kgXref.spID<>"" AND NOT kgXref.spId like "%-%"')
        makeUniqueNames(tempFile, pdbBedFname)
        #writeNames(upBedFname, upDictFname)

    if markerType=="all" or markerType=="snp":
        snpBed = join(markerDir, "hg19.snp.bed")
        mysqlToFile(tempFile, "hg19", "SELECT chrom, chromStart, chromEnd, name from snp137")
        rewriteSnp(tempFile, snpBed)
        # no snp dict - we use regular expressions for these

    if markerType=="genbank" or markerType=="all":
        # genbank: concatenate three tables into one temp file (append=True)
        gbBed = join(markerDir, "hg19.genbank.bed")
        mysqlToFile(tempFile, "hg19", "SELECT tName, tStart, tEnd, qName from gbDnaBigChain")
        mysqlToFile(tempFile, "hg19", "SELECT tName, tStart, tEnd, qName from all_est", append=True)
        mysqlToFile(tempFile, "hg19", "SELECT tName, tStart, tEnd, qName from all_mrna", append=True)
        makeUniqueNames(tempFile, gbBed, stripVersion=True)

    if markerType=="refseq" or markerType=="all":
        refBed = join(markerDir, "hg19.refseq.bed")
        mysqlToFile(refBed, "hg19", "SELECT chrom, txStart, txEnd, name from refGene")

    if markerType=="ensembl" or markerType=="all":
        refBed = join(markerDir, "hg19.ensembl.bed")
        mysqlToFile(refBed, "hg19", "SELECT chrom, txStart, txEnd, name2 from ensGene")

    # remove the scratch file if any section created it
    if isfile(tempFile):
        os.remove(tempFile)
# ==== MAIN ====
# require the positional markerType argument
if len(args)==0:
    parser.print_help()
    exit(1)

pubGeneric.setupLogging(progFile, options)
# output directory comes from the pubConf configuration module
markerDir = pubConf.markerDbDir
markerType = args[0]
prepMarkerDir(markerDir, markerType)