forked from dwightman/PGC-ALZ2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Enrichmentcoms.txt
150 lines (110 loc) · 4.38 KB
/
Enrichmentcoms.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
##################### Enrichment of genomic risk loci ########################
#MHC (6:32180146-32713511) is excluded from these analyses
#chromatin state enrichment from https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/all.mnemonics.bedFiles.tgz
#overlap.R
args = commandArgs(trailingOnly=TRUE)
library(data.table)
library(GenomicRanges)
chrom<-fread("tmp")
colnames(chrom)<-c("chr", "start", "end","state")
#risk loci
data<-fread("data")
colnames(data)<-c("chr","pos")
#get ranges for chromatin interaction first contact
ci1_gr <- with(chrom, GRanges(seqname=chr, IRanges(start=start, end=end)))
#get ranges for the variants
data_gr <- with(data, GRanges(seqnames=chr, IRanges(start=pos, end=pos)))
#get overlap
overlap <- findOverlaps(data_gr, ci1_gr)
#find the chromatin states that overlap with each variants
chrstates<-(subjectHits(overlap))
table(chrom$state[chrstates])
print("number less than equal to 7:")
a<-sum(chrom$state[chrstates]<=7)
print(a)
print("number more than 7:")
b<-sum(chrom$state[chrstates]>7)
print(b)
#all variants
data<-fread("fulldata")
colnames(data)<-c("chr","pos")
#get ranges for chromatin interaction first contact
ci1_gr <- with(chrom, GRanges(seqname=chr, IRanges(start=start, end=end)))
#get ranges for the variants
data_gr <- with(data, GRanges(seqnames=chr, IRanges(start=pos, end=pos)))
#get overlap
overlap <- findOverlaps(data_gr, ci1_gr)
#find the chromatin states that overlap with each variants
chrstates<-(subjectHits(overlap))
table(chrom$state[chrstates])
print("number less than equal to 7:")
c<-sum(chrom$state[chrstates]<=7)
#remove number of variants that are in locus
c<-c-a
print(c)
print("number more than 7:")
d<-sum(chrom$state[chrstates]>7)
#remove number of variants that are in locus
d<-d-b
print(d)
mat<-matrix(c(a,c,b,d),nrow=2,ncol=2)
mat
fisher.test(mat)
q<-fisher.test(mat)
###########################################
#usage Rscript overlap.R
#requires files called "data" and "tmp"
#data is the chr and pos of the meta-analysis genomic risk loci snps
#tmp is the chr, start, and end of the active chromatin regions
### ANNOVAR enrichment
#annotate with hg19_refGeneMrna.fa.gz
#annotate all variants in meta-analysis
perl table_annovar.pl testedvariants.txt humandb/ -buildver hg19 -out testedvariants -protocol refGene -operation g -nastring . -csvout
#annotate all variants in genomic risk loci
perl table_annovar.pl Alocivariants.txt humandb/ -buildver hg19 -out glocivariants -protocol refGene -operation g -nastring . -csvout
#get counts for each annovar group for all the meta-analysis variants
tail -n +3 testedvariants.hg19_multianno.csv | cut -d',' -f6 | sed 's/"//g' | sort | uniq -c > testedsum.txt
#get counts for each annovar group for all the genomic risk loci variants
tail -n +2 glocivariants.hg19_multianno.csv | cut -d',' -f6 | sed 's/"//g' | sort | uniq -c > glocisum.txt
#compare.R
library(data.table)
t<-fread("testedsum.txt")
g<-fread("glocisum.txt")
#overlap annotations
t<-t[t$V2 %in% g$V2,]
all(t$V2==g$V2)
dataset<-c()
prop<-c()
OR<-c()
min<-c()
max<-c()
pval<-c()
for (i in 1:dim(g)[1]){
a<-g$V1[i]
b<-45479-a
c<-t$V1[i]-a
d<-(14390943-c)-b
mat<-matrix(c(a,c,b,d),nrow=2,ncol=2)
mat
f<-fisher.test(mat)
dataset<-c(dataset,g$V2[i])
prop<-c(prop,a/(a+b))
OR<-c(OR,f$estimate)
min<-c(min,f$conf.int[1])
max<-c(max,f$conf.int[2])
pval<-c(pval,f$p.value)
}
Fenrich<-as.data.frame(cbind(dataset,prop,OR,min,max,pval))
colnames(Fenrich)[1]<-"annotation"
Fenrich$annotation<-as.factor(Fenrich$annotation)
fwrite(Fenrich, file="GenomicrisklociannovarEnrich.txt", sep=" ", col.names=T, row.names=F, quote=F, na=NA)
##the Enrichment analyses using replicated loci was performed the same way except variants from the following regions (GRCh37) were removed:AGRN (1:985377-1057677), NCK2 (2:106122777-106235428), CLNK (4:11014822-11044972), TNIP1 (5:150432388-150432388), HAVCR2 (5:156506344-156547031), MHC(6:32180146-32713511), TMEM106B (7:12233848-12285140), SHARPIN (8:145018354-145158607), USP6NL/ECHDC3 (10:11487834-11723537), CCDC6 (10:61629823-61785671), ADAM10 (15:58838575-59272096), APH1B (15:63441242-63595878),SCIMP/RABEP1 (17:4958842-5013491), GRN (17:42430244-42590812), ABI3(17:47297297-47475549), TSPOAP1-AS1 (17:56398006-56410041), ACE (17:61545779-61578207), NTN5 (19:49168942-49252574), CD33 (19:51710654-51737991), LILRB2(19:54814234-54834217), APP (21:27473875-27563105).