-
Notifications
You must be signed in to change notification settings - Fork 0
/
pairwise-watterson-pipeline.sh
181 lines (122 loc) · 5.67 KB
/
pairwise-watterson-pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# The Master Calculating Pi per Population Script
### Last modified: 14 September 2020
# to run
#######################################################################################
# Step 1: Calculate SAF Global Estimate using ANGSD per scaffold basis
# -doSaf 1 : Calculate the Site allele frequency likelihood based on individual genotype likelihoods assuming HWE
# for now, we can use the REF as ANC...ancestral state needs to be supplied for the full SFS
## but you can use the -fold 1 to estimate the folded SFS and then use the reference as ancestral.
# use the entire population for this
# To Run: script needs to be in the same directory as pop and scaffold files
parallel --noswap --joblog logs/m_pop_saf_globalEstimate.log --jobs 4 ./saf.global.estimate.sh $bam_list {1} $minINd :::: scaffolds.txt
parallel --noswap --joblog logs/o_pop_saf_globalEstimate.log --jobs 4 ./scripts/saf.global.estimate.sh san.diego.pop {1} 8 :::: scaffolds.txt
parallel --noswap --joblog logs/san_pop_saf_globalEstimate.log --jobs 4 ./saf.global.estimate.sh san.diego.pop {1} 8 :::: scaffolds.txt
# Needed:
INPUT=$BAM_FILE # list of paths of bam files
SCAFFOLD # list of all scaffolds
> use parallel to run a list of scaffolds per population
> use saf.global.estimate.sh Script
> Actual Script Below:
#!/bin/bash
# this script calculates global SFS for a population of individuals
# for eventual nucleotide diversity calculation
# define your variables
POP=$1 # first argument is a list of BAMS
SCAFFOLD=$2
minInd=$3 # change according to number of individuals
# reference genome
REF=/media/data/dazarate/European_genome_scaffolds/Amel_4.5_scaffolds.fa
echo "calculating site allele frequency likelihood global estimate based on individual genotype likelihoods"
OUTDIR=/media/data/dazarate/nucleotide.diversity/pi/results
OUT=$POP.$SCAFFOLD.out
echo "out is $OUT"
echo "pop is $POP"
echo "minInd is $minInd"
echo "scaffold is ${SCAFFOLD}"
angsd.v928 \
-bam $POP \
-r ${SCAFFOLD}: \
-doSaf 1 \
-anc $REF \
-GL 2 \
-P 10 \
-minQ 20 \
-out $OUTDIR/$OUT \
-minMapQ 30 \
-minInd $minInd
OUTPUTS= .out.arg, .out.saf.pos.gz, .out.saf.idx
########################################################################################
# Step 2 : Merge SAF files for the entire population for all scaffolds
# fast, doesn't need screen
### changed angsd version here to see if this fixes merge error
singularity exec /media/data/dazarate/singularity.images/angsd.v928.sif /opt/angsd/angsd/misc/realSFS cat -outnames san.pop.merged san.diego*saf.idx
# the outputs will automatically have SAF extensions
./realSFS cat a.pop*saf.gz
-> This will cat together .saf files from angsd, both a .saf.gz and a .saf.idx file
> cp m.pop.merged.saf* ~/nucleotide.diversity/pi/mergedSAF
########################################################################################
# Step 3 : Calculate SFS from SAF estimate, fold if not providing ancestral state
> cd ~/nucleotide.diversity/pi/mergedSAF
> use saf2sfs.sh script
> input is .saf.idx file
> to run: ./saf2sfs.sh $MERGED_SAF
#!/bin/bash
# script to fold the SAF to create the SFS using realSFS from angsd
SAF_INPUT=$1 # first argument is the SAF
OUTDIR=/media/data/dazarate/nucleotide.diversity/pi/SFS
OUT=$OUTDIR/$SAF_INPUT
echo "SAF_INPUT is $SAF_INPUT"
echo "SHORT_SAF is $SHORT_SAF"
echo "out is $OUT"
echo "Obtaining maximum likelihood estimate of the SFS using realSFS"
singularity exec /media/data/dazarate/singularity.images/angsd.v928.sif /opt/angsd/angsd/misc/realSFS $SAF_INPUT -fold 1 -P 10 > $OUT.sfs
########################################################################################
# Step 4: Calculate thetas from SFS using ANGSD.
$ angsd -bam $bam -out $out -doThetas 1 -doSaf 1 -pest $out.sfs -anc $ANC -GL 2
# Need
> list of BAMS
> SFS file
> Reference Genome
> script = theta.caller.sh
> to run :
> cd ~/nucleotide.diversity/pi/thetas
> ./theta.caller.sh $BAM $SFS
> ./theta.caller.sh ../o.pop ../SFS/o.pop.merged.saf.idx.sfs
# actual script:
#!/bin/bash
BAM=$1
SFS=$2
#SCAFFOLD=$3
OUT_DIR=/media/data/dazarate/nucleotide.diversity//pi/thetas
OUT=$OUT_DIR"/"${BAM}.out
REF=/media/data/dazarate/European_genome_scaffolds/Amel_4.5_scaffolds.fa
angsd.v928 \
-bam $BAM \
-out $OUT \
-doThetas 1 \
-doSaf 1 \
-pest $SFS \
-anc $REF \
-GL 2
> outputs .thetas.gz and a .thetas.idx file
> san diego pop took 4/5 hours
########################################################################################
# Step 5: Use thetaStat to print summary statistics
./misc/thetaStat do_stat out.thetas.idx
> thetaStat do_stat <thetas.idx> -outnames <filename>
> can decide to use window sizes
> thetaStat do_stat <thetas.idx> x -win 50000 -step 10000 -outnames <filename>
> singularity exec /media/data/dazarate/singularity.images/angsd.v928.sif /opt/angsd/angsd/misc/thetaStat do_stat san.diego.thetas.idx -outnames san.diego
> produces pestPG file
########################################################################################
# Step 6: Calculate genome-wide estimate of pi per population
> download pestPG files
> scp [email protected]:~/nucleotide.diversity/pi/pestPG/san.diego.pestPG ~/Documents/SCRIPTS/pi_calculations/
> use R to calculate average of tP (and tW if you so choose) and average of nSites
> use pi.calculation.R script
> pi = (average of tP / average of nSites) # is this right? I think I have to divide by total sites in the genome ?
######################################################################################
# Step 7: I Believe I have to account for small sample size here ???
########################################################################################
# Step 7: Jackknife the pi estimates
> use jackknife.R script in R