diff --git a/scripts/download_reference.sh b/scripts/download_reference.sh
index e86bfb7..51eb804 100755
--- a/scripts/download_reference.sh
+++ b/scripts/download_reference.sh
@@ -18,10 +18,13 @@ fi
 output_dir=`dirname ${output_file}`
 genome_build=${2:-"37"}
 
+# Mitochondrial refseq moved to a different folder, hence the "mt_remote"
 if [ ${genome_build} = "37" ]; then
-    remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/Homo_sapiens/ARCHIVE/BUILD.37.3/Assembled_chromosomes/seq"
+    remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/"
+    mt_remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/"
 elif [ ${genome_build} = "38" ]; then
-    remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/Assembled_chromosomes/seq/"
+    remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/"
+    mt_remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/"
 else
     echo "Error: Unsupported genome build ${genome_build}, valid values are 37,38"
     exit 1
@@ -30,16 +33,21 @@ fi
 temp_dir=`mktemp -d 2>/dev/null || mktemp -d -t ${output_dir}`
 pushd ${temp_dir}
 
-for chrom in `seq 1 22` X Y MT
+for chrom in `seq 1 22` X Y
 do
-    wget ${remote}/*_ref_*chr${chrom}.fa.gz
+    wget ${remote}/chr${chrom}.fna.gz
 done
+wget ${mt_remote}/chrMT.fna.gz
 
-for chrom in `seq 1 22` X Y MT
+build_fa(){
+    echo ">${1}" >> ${2}
+    gunzip -c chr${1}.fna.gz | grep -v ">" >> "${2}"
+}
+for chrom in `seq 1 22` X Y
 do
-    echo ">${chrom}" >> ${output_file}
-    gunzip -c *_ref_*chr${chrom}.fa.gz | grep -v ">" >> "${output_file}"
+    build_fa $chrom $output_file
 done
+build_fa "MT" $output_file
 
 if hash samtools 2>/dev/null; then
     samtools faidx ${output_file}