forked from macarthur-lab/gene_lists
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DRG_WoodRD.R
36 lines (26 loc) · 1.19 KB
/
DRG_WoodRD.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# About -------------------------------------------------------------------
# Date: 31/03/2015
# Extract DNA Repair Genes from WoodRD website
# http://sciencepark.mdanderson.org/labs/wood/dna_repair_genes.html
# Workspace ---------------------------------------------------------------
# library() stops with an error when XML is not installed; require() would
# only return FALSE and let the script fail later at readHTMLTable().
library(XML)

# Data prep ---------------------------------------------------------------
# Parse every HTML table on the page and keep the first one.
WoodRD <- readHTMLTable("http://sciencepark.mdanderson.org/labs/wood/dna_repair_genes.html")
d <- WoodRD[[1]]

# Convert blank cells to NA. The original marker "Â" is presumably a
# non-breaking space decoded with the wrong encoding (TODO confirm against
# the live page), so the raw and mis-decoded forms are accepted as well.
# Escapes keep the comparison independent of this file's own encoding.
# Columns are coerced to character so factor levels cannot interfere.
blank_cells <- c("\u00c2", "\u00c2\u00a0", "\u00a0")
d[] <- lapply(d, function(column) {
  column <- as.character(column)
  column[column %in% blank_cells] <- NA
  column
})

# Exclude subheader rows (both V3 and V4 are blank).
d <- d[!(is.na(d$V3) & is.na(d$V4)), ]

# Exclude "Top of Page" rows. %in% never returns NA, so a row with a blank
# V2 or V3 is kept instead of being turned into an all-NA row (which the
# sort() below would silently discard).
d <- d[!(d$V2 %in% "Top of Page" | d$V3 %in% "Top of Page"), ]

# Keep the gene name only: strip bracket characters, then take the first
# space-separated token of V1. sort() drops any remaining NA entries.
d <- gsub("\\(|\\)", "", d$V1)
d <- sort(vapply(strsplit(d, " "), "[", character(1), 1L))
# Output ------------------------------------------------------------------
# One gene symbol per line, without header, row names or quoting.
write.table(
  d,
  "DRG_WoodRD.tsv",
  col.names = FALSE,
  row.names = FALSE,
  quote = FALSE
)

# Tidy up: remove only the objects this script created. rm(list = ls())
# would also wipe unrelated objects from the caller's workspace.
rm(WoodRD, d)
# TESTING... --------------------------------------------------------------
# require(org.Hs.eg.db)
# GeneSymbol <- mappedkeys(org.Hs.egSYMBOL2EG)
# HGNC <- read.table("http://www.genenames.org/cgi-bin/genefamilies/download-all/tsv",sep="\t")