-
Notifications
You must be signed in to change notification settings - Fork 2
/
genome_ref_cal.sh
95 lines (77 loc) · 3.15 KB
/
genome_ref_cal.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/bin/bash
###################################################################
# Script Name : genome_ref_cal.sh
# Date : 2021-07-26
# Version : 3.5
# Author : Dietmar Fernandez ([email protected])
# : Revised by Manuel Rueda ([email protected])
# Usage : bash genome_ref_cal.sh input.vcf.gz
# Description : This script infers the reference genome from a vcf file.
# Description : - We created a dictionary consisting of unique chr/pos/ref (SNP) in hg17, grch37, grch38 reference genomes.
# Description : - The script searches for matches in the 3 dicts and outputs (infer_ref) the reference genome having more matches.
# Requirements : gzip (tested version 1.10), awk (tested version Awk 5.0.1), grep (tested version 3.4), sed (tested sed (GNU sed) 4.7), wc (tested version (GNU coreutils) 8.30).
###################################################################
set -eu
export LC_ALL=C
# Variables
rand_var=10000 # 10K random variants
match=100 # Number of matches for the grep
min_match=20 # Minimum number of matches to accept the inferred genome
genomes=("hg17" "grch37" "grch38") # reference genomes
# Path for dictionary files (put here your own path)
path_dic="./ref_dics"
function usage {
USAGE="""
Usage: $0 input.vcf.gz
"""
echo "$USAGE"
exit 1
}
# Check arguments
if [ $# -eq 0 ]
then
usage
fi
# Load arguments
input_vcf=$1
base=$(basename $input_vcf .vcf.gz)
# We get an id from bash process to be used when we append contents ( >> $$.file )
echo "Job id $$"
# STEP 1 - We split the file by CHROM into multiple gzipped files
echo "Splitting vcf by chr..."
zcat $input_vcf | cut -f1,2,4 | grep -v '^#' | awk '{print | "gzip >" $1 ".variants.gz"}'
# STEP 2 - For each chromosome we query a subset of variants against the dictionary
for chr in $(ls -1 *.variants.gz | awk -F'.' '{print $1}' | sort -V)
do
chr_str=$(echo $chr | sed 's/chr//')
echo "Running chr$chr_str..."
# First we add var|chr stats to $base.chr
{ echo -n "$chr "; zcat $chr.variants.gz | wc -l; } >> $$.$base.chr # NB: Comment this line to reduce cpu time
# Secondly we select 10K random variants
zcat $chr.variants.gz | shuf -n $rand_var | sort | sed 's/chr//g' > subset.$chr.variants
# And finally we look for the first 100 matches in the 3 dictionaries
for ref in ${genomes[@]}
do
{ echo -n "$chr "; zgrep -m $match -Ef subset.$chr.variants $path_dic/subset.chr$chr_str.final.dic_$ref | wc -l; } >> $$.matches_$ref
done
done
# STEP 3 - We analyze the results
for ref in ${genomes[@]}
do
tot_var=$(awk '{sum+=$2;}END{print sum;}' $$.matches_$ref)
tot_chr=$(awk '{print $1}' $$.matches_$ref | wc -l)
echo "There are $tot_var matches and $tot_chr chromosomes in reference $ref" >> $$.final
done
# We will only accept the result if $match_ref >= $min_match, otherwise we set it to NA
match_ref=$(awk '{print $3}' $$.final | sort -n | tail -1)
infer_ref=$(sort -nk3 $$.final | tail -1 | awk '{print $NF}')
if [ "$match_ref" -ge "$min_match" ]
then
echo $infer_ref > infer_ref
else
echo 'NA' > infer_ref
fi
# STEP 4 - Cleaning up
rm $$.*matches* *.variants*
# End
echo "All done!"