forked from maximilianh/pubMunch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubSearch
executable file
·123 lines (100 loc) · 3.92 KB
/
pubSearch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
import maxCommon, glob, collections, os, sys, optparse, logging, operator, types
# Put <scriptDir>/lib/ at the front of the module search path so that the
# modules bundled next to this script take precedence for later imports.
# NOTE(review): maxCommon is imported above, before this path is added --
# confirm it is importable without <scriptDir>/lib/ on sys.path.
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.split(progFile)[0]
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path[:0] = [pubToolsLibDir]
import html
import pubGeneric
# ==== FUNCTIONS =====
def parseGeneList(inFname):
    """Parse a whitespace-separated file with two gene symbols per line.

    Only the first two fields of a line are used; any further fields (e.g. a
    description) are ignored. Lines with fewer than two fields, including
    blank lines, are skipped.

    Args:
        inFname: path of the file to read.

    Returns:
        A collections.defaultdict(int) that maps the alphabetically sorted
        symbol pair, as a tuple, to the (sym1, sym2) tuple in file order.
        If a pair occurs on several lines, the last line wins.
    """
    logging.info("Parsing %s" % inFname)
    # The defaultdict(int) type is kept so the return type does not change for
    # existing callers. Look pairs up with `in` or .get(): indexing a missing
    # key would silently insert 0.
    pairToRows = collections.defaultdict(int)
    # "with" closes the file even if a line cannot be processed
    with open(inFname) as inFile:
        for line in inFile:
            fields = line.split()
            if len(fields) < 2:
                # blank or incomplete line: nothing to pair up
                continue
            sym1, sym2 = fields[:2]
            # sort the key so that A/B and B/A are treated as the same pair
            genePair = tuple(sorted((sym1, sym2)))
            pairToRows[genePair] = (sym1, sym2)
    return pairToRows
def annotateFusionGenes(inFname):
    """Annotate the gene pairs listed in inFname with counts and database hits.

    Reads pair counts from "elsPmcMl.fusCounts.tab" (columns sym1, sym2,
    count) and four gene-pair lists (cosmic, OMIM, ChimerDb, Mitelman). All
    of these file names are relative, so they are looked up in the current
    working directory.

    Args:
        inFname: path of a whitespace-separated file with two gene symbols
            per line and an optional third field, which is kept as the
            annotation. Lines with fewer than two fields and lines where
            both symbols are identical are skipped.

    Returns:
        A list of "fusRec" namedtuples (gene1, gene2, annot, dbAnnots,
        count), sorted by count in descending order. dbAnnots is the list of
        database names that contain the pair, in the order the databases are
        listed below; count is 0 for pairs missing from the counts file.
    """
    FusRow = collections.namedtuple("fusRec", ["gene1", "gene2", "annot", "dbAnnots", "count"])

    dataFname = "elsPmcMl.fusCounts.tab"
    logging.info("Parsing %s" % dataFname)
    pairToCount = {}
    for row in maxCommon.iterTsvRows(dataFname):
        # sorted key: the order of the two symbols must not matter
        pairToCount[tuple(sorted((row.sym1, row.sym2)))] = int(row.count)

    dbList = [('cosmic', "cosmicFusions.tab"),
              ('OMIM', 'chimerDb.omim.tsv'),
              ('ChimerDb', 'chimerDb.pubmed.tsv'),
              ('Mitelman', 'chimerDb.mitelman.tsv')]
    # kept as a list of (name, pairs) so that dbAnnots always follows the
    # order of dbList instead of an arbitrary dict iteration order
    dbPairs = [(dbName, parseGeneList(fname)) for dbName, fname in dbList]

    logging.info("Annotating file %s" % inFname)
    fusAnnots = []
    # "with" closes the input file even if a line cannot be processed
    with open(inFname) as inFile:
        for line in inFile:
            fields = line.split()
            if len(fields) < 2:
                # blank or incomplete line
                continue
            gene1, gene2 = fields[:2]
            if gene1 == gene2:
                continue
            annot = fields[2] if len(fields) > 2 else ""
            genePair = tuple(sorted((gene1, gene2)))
            count = pairToCount.get(genePair, 0)
            dbAnnots = [dbName for dbName, pairs in dbPairs if genePair in pairs]
            fusAnnots.append(FusRow(gene1, gene2, annot, dbAnnots, count))

    # the last field is the count; the sort is stable, so rows with equal
    # counts keep their input order
    fusAnnots.sort(key=operator.itemgetter(-1), reverse=True)
    return fusAnnots
def printFusAnnots_text(fusAnnots, inFname=None):
    """Print the annotated rows to stdout as tab-separated text, one per line.

    Args:
        fusAnnots: iterable of row tuples as returned by annotateFusionGenes.
        inFname: name of the annotated input file. When given, it is printed
            as the first column of every line; when None, that column is
            left out.

    Every field is converted with str(), so the dbAnnots list is printed in
    Python list notation.
    """
    # inFname is a parameter because this function used to read it as a
    # global that is never defined at module level, which raised NameError.
    for fusData in fusAnnots:
        fields = [str(x) for x in fusData]
        if inFname is not None:
            fields.insert(0, inFname)
        # single-argument print(...) behaves the same on Python 2 and 3
        print("\t".join(fields))


def printFusAnnots_html(fusAnnots):
    """Write the annotated rows as an HTML table through html.htmlWriter("stdout").

    Rows that have no database hit and a count of 0 are left out of the
    table. The number in the page heading is len(fusAnnots), i.e. it still
    includes the rows that are left out.

    Args:
        fusAnnots: list of row tuples as returned by annotateFusionGenes.
    """
    h = html.htmlWriter("stdout")
    h.head("FusionFinder", styleString=html.getStylesheet("dyndrive"))
    h.startBody("TCGA AML - Leukemia (%d gene pairs)" % len(fusAnnots))
    # NOTE(review): six column widths but only five headers and five fields
    # per row -- confirm against html.htmlWriter.startTable.
    h.startTable([100,100,100,100,100,100], "gene1,gene2,patient count,databases,publications".split(","))
    for fusData in fusAnnots:
        if not fusData.dbAnnots and fusData.count == 0:
            continue
        h.startTr()
        for val in fusData:
            # isinstance() instead of types.ListType / types.IntType, which
            # only exist on Python 2
            if isinstance(val, list):
                val = ",".join(val)
            elif isinstance(val, int):
                val = str(val)
            h.td(val)
        h.endTr()
    h.endTable()
    h.endHtml()


def main():
    """Parse the command line, annotate the input file and print the result.

    Prints the usage text and exits with status 1 when no input file is
    given. Output is an HTML table with --html, tab-separated text otherwise.
    """
    parser = optparse.OptionParser("""usage: %prog [options] infile - annotate fusion gene file, two genes per line""")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
    parser.add_option("", "--html", dest="html", action="store_true", help="output as html")
    (options, args) = parser.parse_args()
    pubGeneric.setupLogging(progFile, options)

    if not args:
        parser.print_help()
        # sys.exit instead of exit(): the latter is a helper added by the
        # site module and is not guaranteed to exist
        sys.exit(1)

    inFname = args[0]
    fusData = annotateFusionGenes(inFname)
    if options.html:
        printFusAnnots_html(fusData)
    else:
        printFusAnnots_text(fusData, inFname)


if __name__ == "__main__":
    main()