-
Notifications
You must be signed in to change notification settings - Fork 0
/
genomic-admixture-2.0.sh
128 lines (93 loc) · 4.67 KB
/
genomic-admixture-2.0.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# WORKFLOW DOCUMENT
## GENOMIC ADMIXTURE IN 60 HONEY BEES SAMPLED FROM 4 DISTINCT LOCALITIES (SAN DIEGO, MEXICO, COSTA RICA, PANAMA)
### LAST MODIFIED NOV 16, 2020
#### AUTHOR: DANIELA ZARATE
#######################################################################################################################
####### KEY COMMANDS MAGIC BOX #######
ssh [email protected] # login Triton Supercomputer Cluster (TSCC)
qsub -q hotel -l nodes=1:ppn=16 -l walltime=25:00:00 script.sh
qstat -u dazarate # check on status of the job
scp -C *fastq.gz [email protected]:/oasis/tscc/scratch/dazarate/harpur-honeypot/harpur-bams
yqd --user=id --queue
gstatement -u <user_name> -s 2019-01-01 -e 2019-02-28
#PBS -l nodes=tscc-1-1:ppn=2 # to specify a node
####### KEY COMMANDS MAGIC BOX #######
#######################################################################################################################
## trim for quality and length with popoolation_1.2.2 basic perl trimming script
## maybe I should change the fastq type from sanger to illumina
# quality 24 length 40
## example perl trimmer
## note we should change sanger to illumina
perl /projects/ps-dazarate/popoolation_1.2.2/basic-pipeline/trim-fastq.pl \
--input1 /projects/ps-dazarate/Wallberg.Ref.HoneyBees/scutellata.B4.raw.reads.fastq \
--fastq-type sanger \
--output /oasis/tscc/scratch/dazarate/scutellata.B4.trim \
--quality-threshold 25 --min-length 40
# Index reference genome in BWA and map the individual reads to reference
module load bwa
bwa index -a is /projects/ps-dazarate/European_genome_scaffolds/Amel_4.5_scaffolds.fa
bwa mem ${REF} ${TRIM_1}.trim > ${TRIM_1}.map
# Write as a BAM file (binary)
module load samtools
samtools view -bS ${FILE}.map > ${FILE}.bam
# Remove low quality reads
module load samtools
samtools view -q 20 -bS ${FILE}.bam > ${FILE}.filtered.bam
# Sort the BAM files individually (for pure bees only 1 file)
samtools sort ${FILE}.filtered.bam > ${FILE}.sort.bam
# index
samtools index ${FILE}.bam
# create BAM files for all admix pops and source pops (Wallberg et al. 2014) 70 honey bees: 10 A, 10 M, 10 C, 10 O
# once all the BAM files are created
#######################################################################################################################
#!/bin/bash
# use ANGSD to generage BEAGLE genotype likelihood file from BAM files of all 130 honey bees
NGSADMIX_BOX=/media/data/dazarate/ngsadmix_box
angsd.v930 \
-GL 1 \
-out ${NGSADMIX_BOX}/all.bees \
-nThreads 20 \
-doGlf 2 \
-doMajorMinor 1 \
-SNP_pval 1e-6 \
-doMaf 1 \
-minMapQ 40 \
-minQ 20 \
-rmTriallelic
-bam ${NGSADMIX_BOX}/list.all.bees.txt
# using only invariant sites
# -doGlf 2 : Beagle haplotype imputation can be performed directly on genotype likelihoods. To generate beagle input file use this
# -doMajorMinor 1 : infer major and minor allele
# -GL 1 : estimate genotype likelihood, SAMTOOLS model (4 available: GATK, SOAPsnp, SYK)
# -SNP_pval 1e-6 : We test for polymorphic sites and only output the ones with are likelhood ratio test p-value<1e-6
# -minMapQ : Discard reads with mapping quality below
# -minQ : Discard bases with base quality below
# -trim : Number of based to discard at both ends of the reads
# -remove_bads : Discard 'bad' reads, (flag >=256)
# -setMinDepthInd : Only use site if atleast minInd of samples has this minimum depth
# -setMaxDepth : If total depth is larger then site is removed from analysis
# -setMaxDepthInd : If depth persample is larger then individual is removed from analysis (from site).
# -setMinDepth : If total depth is smaller then site is removed from analysis
# -rmTriallelic 0.000000 : (Remove sites with a pvalue lower) Erin suggests: .05 and .001
## this seems to take 15 hours for all the bees together
#######################################################################################################################
# use NGSADMIX to estimate ancestry
#!/bin/bash
ngsadmix \
-likes /media/data/dazarate/all.bees.beagle.gz \
-K 4 -P 8 \
-o /media/data/dazarate/all.bees.K4 \
-minMaf 0.05 \
-minInd 122
#-likes beagle file of genotype likelihoods
# -K number of clusters
# -o prefix of output file names
# -P Number of threads used
# -misTol Tolerance for considering a site as missing. Default = 0.05.
# To include high quality genotypes only, increase this value (for example, 0.9)
# -minInd Minumum number of informative individuals. Default = 0
# It only keeps sites where there is at least x # of individuals with NGS data
# -minMaf Minimum minor allele frequency. Default = 5%
# -minLrt Minimum likelihood ratio value for maf>0. Default = 0
## Input file has dim (AFTER filtering): nsites=2,726,805