-
Notifications
You must be signed in to change notification settings - Fork 0
/
realigner_target_creator.sh
76 lines (68 loc) · 2.29 KB
/
realigner_target_creator.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
#
# Usage:
# realigner_target_creator.sh <giraffe|okapi> <ref.fa> <indir> <outdir> <ndata_threads> <njobs>
#
# Description:
# Identifies regions where alignments may potentially be improved using
# GATK RealignerTargetCreator.
#
# Requirements:
# gatk 3.8-1
# java 1.8
# parallel
# picard
# samtools
#
# Important:
# Each job will use 'N data threads' + 4 CPUs ('-nt' + ParallelGCThreads).
# Each data thread will use 20GB RAM.
# Adjust path to Picard and GATK in the variables ${picard} and ${gatk}.
# path to Picard
picard=~/software/picard.jar
# path to GATK v3.8-1
gatk=~/software/GenomeAnalysisTK.jar
# index reference genome FASTA
samtools faidx $2
# create a sequence dictionary for the reference genome FASTA
java -jar ${picard} CreateSequenceDictionary R=$2 O=${2/.fa/.dict}
# get formatted string with input BAMs
if [[ $1 == 'giraffe' ]]; then
bams=$(find $3 -name '*.dedup.bam' ! -name 'WOAK*' -printf '-I %p ')
elif [[ $1 == 'okapi' ]]; then
bams=$(find $3 -name 'WOAK.dedup.bam' -printf '-I %p ')
else
echo "Unrecognized argument: $1 ... expected 'giraffe' or 'okapi'"
exit 1
fi
# get chromosome IDs and spans formatted as 'chr:start-end'
cat $2.fai \
| awk '{ print $1":1-"$2}' \
| while read chr; do # iterate over chromosomes
# echo command to a file
echo \
"java -Xmx20G -XX:ParallelGCThreads=4 -jar ${gatk} \
-T RealignerTargetCreator \
-nt $5 \
-R $2 \
-L ${chr} \
${bams} \
-o $4/${chr%:*}.intervals \
&> $4/${chr%:*}.rtc.log" \
>> $4/rtc.jobs
done
# run RealignerTargetCreator in parallel
cat $4/rtc.jobs | parallel -j $6
# search for warn messages
n_warns=$(grep -L 'Done. There were no warn messages.' $4/*.rtc.log | wc -l)
# if there are no warnings, concatenate intervals and remove intermediate files
if [[ ${n_warns} -eq 0 ]]; then
ls -v $4/*.intervals > $4/chr.intervals.list &&
cat $(cat $4/chr.intervals.list) > $4/$1.realigner.intervals &&
echo 'RealignmentTargetCreator completed successfully!' &&
rm $4/*.{jobs,list}
# if there are warnings, print a list of files in which they occurred
else
echo 'These files had warn messages:'
grep -L 'Done. There were no warn messages.' $4/*.rtc.log
fi