-
Notifications
You must be signed in to change notification settings - Fork 0
/
BatchCorrect.R
121 lines (104 loc) · 4.28 KB
/
BatchCorrect.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
library(Seurat)
library(dplyr)
library(Matrix)
#install.packages("batchelor")
rds = readRDS(file = "/data/lab/projects/TCRseq/Input/sc_HRD_with_batch_updated.rds")
#str(rds)
rds3=UpdateSeuratObject(rds)
rds3[["percent.mt"]] = PercentageFeatureSet(rds3, pattern = "^mt-")
counts = rds3@assays$RNA@counts
#rm(rds,rds3)
##### change gene ID to gene name. #####
geneID = dimnames(counts)[[1]]
geneID = as.data.frame(geneID)
colnames(geneID)="ID"
library("biomaRt")
ensembl <- useMart("ensembl", dataset="mmusculus_gene_ensembl")
ID2name <- getBM(attributes=c('ensembl_gene_id','external_gene_name'),
filters = 'ensembl_gene_id',
values = geneID,
mart = ensembl)
#### failed solution ####
#ID2name = merge(geneID, ID2name, by.x = "ID", by.y = "ensembl_gene_id", all.x = T)
#ID2name[which(is.na(ID2name[,2])),2] = ID2name[which(is.na(ID2name[,2])),1]
### it shuffled gene order
#### failed solution ####
#### failed solution ####
#library(hash)
#hID2name = hash(ID2name$ID, ID2name$external_gene_name)
#hID2name[[ "ENSMUSG00000095041"]]
#v = hID2name[[ dimnames(counts)[[1]] ]]
#t = as.data.frame(values(v))
#values(v, keys = "ENSMUSG00000095041")
# Error in get(k, x) : object 'ENSMUSG00000097347' not found
#### failed solution ####
colnames(ID2name) = c("ID","GeneName")
library(plyr)
ID2name = join(geneID, ID2name, by = "ID", type = "left", match = "all")
which(duplicated(ID2name$GeneName))
ID2name[which(duplicated(ID2name$GeneName)),]
ID2name[which(is.na(ID2name$GeneName)),2] = ID2name[which(is.na(ID2name$GeneName)),1]
which(duplicated(ID2name$GeneName))
dimnames(counts)[[1]] = ID2name$GeneName
##### change gene ID to gene name. #####
#rds3norml <- NormalizeData(rds3, normalization.method = "LogNormalize", scale.factor = 10000)
#counts1 = rds3norml@assays$RNA@counts
barcodes = dimnames(counts)[[2]]
unique(substr(barcodes,20,25)) # BRCA1 mut and BRCA2 mut: 17QQ and 21EE
C17QQ2 = counts[,grepl("17QQ2", barcodes)]
C17QQ3 = counts[,grepl("17QQ3", barcodes)]
C21E1 = counts[,grepl("21E1", barcodes)]
C21E2 = counts[,grepl("21E2", barcodes)]
C21E3 = counts[,grepl("21E3", barcodes)]
CPAR1 = counts[,grepl("PAR1", barcodes)]
CPAR2 = counts[,grepl("PAR2", barcodes)]
CPAR3 = counts[,grepl("PAR3", barcodes)]
str(CPAR3)
library(batchelor)
#out = fastMNN(C17QQ2,C17QQ3,C21E1,C21E2,C21E3,CPAR1,CPAR2,CPAR3)
#saveRDS(object=out, file="/data/lab/projects/TCRseq/output/MNNcorrect.rds")
#out=readRDS("/data/lab/projects/TCRseq/output/MNNcorrect.rds")
#RunPCA(object=out)
rm(rds3,rds,counts)
#summary(out)
#runPCA(out)
#class(out@assays)
#str(out@assays)
#dim(out@assays)
#colnames(out@assays@data@listData)
#str(out@assays@data@listData$reconstructed)
#getListElement(out@assays[3,3],2)
#names(out@assays)
#dimnames(out@assays@data@ListData)
Sdataobj <- CreateSeuratObject(counts = out@assays@data@listData$reconstructed)
saveRDS(Sdataobj, file="/data/lab/projects/TCRseq/output/tmp.rds")
#Sdataobj = readRDS(file="/data/lab/projects/TCRseq/output/tmp.rds")
Sdataobj <- FindVariableFeatures(Sdataobj, selection.method = "vst", nfeatures = 2000)
#https://www.rdocumentation.org/packages/Seurat/versions/3.1.4/topics/RunTSNE
all.genes <- rownames(Sdataobj)
Sdataobj <- ScaleData(Sdataobj, features = all.genes)
Sdataobj <- RunPCA(Sdataobj, features = VariableFeatures(object = Sdataobj))
#Sdataobj <- JackStraw(Sdataobj, num.replicate = 100)
#Sdataobj <- ScoreJackStraw(Sdataobj, dims = 1:20)
#pdf("/data/lab/projects/TCRseq/output/JackStraw.fastMNN.pdf")
#JackStrawPlot(Sdataobj, dims = 1:20)
#dev.off()
Sdataobj <- FindNeighbors(Sdataobj, dims = 1:20)
Sdataobj <- FindClusters(Sdataobj, resolution = 1)
Sdataobj = RunTSNE(Sdataobj, dims = 1:20)
pdf("/data/lab/projects/TCRseq/output/tsne.fastMNN.pdf")
DimPlot(Sdataobj, reduction = "tsne", label = T)
dev.off()
pdf("/data/lab/projects/TCRseq/output/pca.fastMNN.pdf")
DimPlot(Sdataobj, reduction = "pca")
dev.off()
pdf("/data/lab/projects/TCRseq/output/CD8a.BC.pdf")
FeaturePlot(Sdataobj, features = "ENSMUSG00000053977", min.cutoff = "q9")
dev.off()
pdf("/data/lab/projects/TCRseq/output/CD4.BC.pdf")
FeaturePlot(Sdataobj, features = "ENSMUSG00000023274", min.cutoff = "q9")
dev.off()
features = c("ENSMUSG00000041959","ENSMUSG00000050592")
pdf("/data/lab/projects/TCRseq/output/heatmap.pdf")
DoHeatmap(subset(Sdataobj, downsample = 10), features = features, size = 3)
dev.off()