-
Notifications
You must be signed in to change notification settings - Fork 0
/
PCmain.py
85 lines (58 loc) · 3.48 KB
/
PCmain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
'''
PCmain.py -- main script to calculate pathway centrality for pre-defined functional gene sets between a set of disease genes and a set of differentially expressed genes in a given protein-protein interaction network.
usage: python PCmain.py [-h] [-d DISEASE_GENE_FILE] [-e DIFF_EXP_GENE_FILE] [-o OUTPUT_DIRECTORY] [-p PPI_NETWORK_FILE] [-g PATHWAY_GMT_FILE]
'''
import sys
import argparse
import os
import PCmodules.cleanUpInput as ci
import PCmodules.getLCC as lc
import PCmodules.getSPDisease as sp
import PCmodules.calculatePC as pc
import PCmodules.calculatePvalue as pv
def main():
    """Run the pathway centrality pipeline from the command line.

    Parses the command-line options, creates the output directory when it
    does not exist yet, and then calls the PCmodules stages in order:
    gene-set clean-up, largest-connected-component extraction, shortest-path
    calculation, pathway centrality scoring and permutation-based
    significance assessment.  Every stage writes its results into the output
    directory and later stages read those files back by name.

    The input files opened by argparse are closed before returning, whether
    the pipeline finishes or raises.
    """
    parser = argparse.ArgumentParser(description="Pathway Centrality calculation")
    parser.add_argument("-d", "--disease_gene_file", type=argparse.FileType('r'),
                        help="one column file, gene identification should match with other input files.")
    parser.add_argument("-e", "--diff_exp_gene_file", type=argparse.FileType('r'),
                        help="one column file, gene identification should match with other input files.")
    parser.add_argument("-o", "--output_directory",
                        help="A directory where all output files are stored: default will be named output and placed in the working directory.")
    parser.add_argument("-p", "--ppi_network_file", type=argparse.FileType('r'),
                        help="two column file delimited by tab, gene identification should match with other input files.")
    parser.add_argument("-g", "--pathway_gmt_file", type=argparse.FileType('r'),
                        help=".gmt file format, gene identification should match with other input files.")
    option = parser.parse_args()

    # argparse.FileType opens the files during parsing but never closes them,
    # so remember every handle and release it in the finally clause below.
    input_handles = (
        option.disease_gene_file,
        option.diff_exp_gene_file,
        option.ppi_network_file,
        option.pathway_gmt_file,
    )
    # NOTE(review): none of the file options is marked required, so a handle
    # may be None here; confirm whether the PCmodules helpers tolerate that.

    try:
        # set output directory
        # NOTE(review): the default has no trailing slash while a user-supplied
        # directory gets one; the value handed to the helpers is kept exactly
        # as before because their own path handling is not visible here.
        outputdir = "./output"
        if option.output_directory is not None:
            outputdir = option.output_directory + "/"
        if not os.path.exists(outputdir):
            os.makedirs(outputdir)

        # read input files 1: disease genes and differentially expressed genes
        # create three new files in output directory: pc_disease_genes.txt, pc_diff_exp_genes.txt, pc_overlapping_genes.txt
        # pc_disease_genes.txt: filtered disease gene set --- differentially expressed genes removed
        # pc_diff_exp_genes.txt: same as initial input
        # pc_overlapping_genes.txt: for record purposes --- genes removed from disease gene set
        ci.cleanupGenesets(option.disease_gene_file, option.diff_exp_gene_file, outputdir)

        # read input files 2: ppi network file
        # create one new file in output directory: pc_network_lcc.txt
        # pc_network_lcc.txt: the largest connected component of the given ppi network, self-loop removed
        lc.getLCC(option.ppi_network_file, outputdir)

        # calculate shortest paths between disease genes and differentially expressed genes
        # create one new file in output directory: pc_shortest_paths.txt
        # pc_shortest_paths.txt: all possible shortest paths between disease genes and differentially expressed genes in the calculated largest connected component
        # os.path.join avoids the doubled separator that plain concatenation
        # produced when outputdir already ends with "/".
        nwfile = os.path.join(outputdir, "pc_network_lcc.txt")
        diseasegenefile = os.path.join(outputdir, "pc_disease_genes.txt")
        diffexpgenefile = os.path.join(outputdir, "pc_diff_exp_genes.txt")
        sp.calculateShortestPaths(nwfile, diseasegenefile, diffexpgenefile, outputdir)

        # calculate pathway centrality score for input pathway gene sets
        # calculatePC.py
        shortestpathfile = os.path.join(outputdir, "pc_shortest_paths.txt")
        pc.calculatePathwayCentrality(shortestpathfile, option.pathway_gmt_file, outputdir)

        # calculate p-value from permutation test using 2-core genes of the input network
        # presumably pc_pathway_genes.txt and pc_scores.txt are written by the
        # previous step -- verify against PCmodules.calculatePC
        num_trial = 10000
        pathwayproteinfile = os.path.join(outputdir, "pc_pathway_genes.txt")
        groupcentralityfile = os.path.join(outputdir, "pc_scores.txt")
        pv.significanceAssessment(nwfile, shortestpathfile, pathwayproteinfile,
                                  groupcentralityfile, outputdir, num_trial)
    finally:
        for handle in input_handles:
            # Omitted options are None, and "-" makes FileType return
            # sys.stdin, which must stay open for the rest of the process.
            if handle is not None and handle is not sys.stdin:
                handle.close()


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()