From 1fefdbb7aed02016c4276080c303b85b40351cc3 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Thu, 24 Oct 2024 23:51:17 +0100 Subject: [PATCH] allowing pgen file format to be used to get chrx to work in regenie --- 00-extract-pruned-variants.sh | 27 ++++++++++++----- 04c-gwas.sh | 55 +++++++++++++++++++++++++++++++++++ utils/bgen_to_pgen.sh | 38 ++++++++++++++++++++++++ utils/update_bgen.sh | 15 ++++++++-- 4 files changed, 124 insertions(+), 11 deletions(-) create mode 100755 04c-gwas.sh create mode 100755 utils/bgen_to_pgen.sh diff --git a/00-extract-pruned-variants.sh b/00-extract-pruned-variants.sh index 32422b7..e8845ea 100755 --- a/00-extract-pruned-variants.sh +++ b/00-extract-pruned-variants.sh @@ -71,14 +71,25 @@ for i in $(seq 1 $nchr) do bgen=$(awk -v i=$i 'NR==i { print $1 }' ${genotype_input_list}) sample=$(awk -v i=$i 'NR==i { print $2 }' ${genotype_input_list}) - ./bin/plink2 \ - --bgen ${bgen} ref-first \ - --sample ${sample} \ - --extract range ${prunefile} \ - --make-bed \ - --out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \ - --threads ${env_threads} - echo "${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen)" >> ${genotype_processed_dir}/bgen_extract/mergelist + # check if $sample is empty - this would mean it's a pgen fileset + if [ -z "$sample" ]; then + ./bin/plink2 \ + --bgen ${bgen} \ + --sample ${sample} \ + --extract range ${prunefile} \ + --make-bed \ + --out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \ + --threads ${env_threads} + echo "${genotype_processed_dir}/bgen_extract/$(basename ${bgen})" >> ${genotype_processed_dir}/bgen_extract/mergelist + else + ./bin/plink2 \ + --bgen ${bgen} ref-first \ + --sample ${sample} \ + --extract range ${prunefile} \ + --make-bed \ + --out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \ + --threads ${env_threads} + echo "${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen)" >> ${genotype_processed_dir}/bgen_extract/mergelist done ./bin/plink2 \ diff --git a/04c-gwas.sh b/04c-gwas.sh new file mode 100755 index 0000000..b9f384d --- /dev/null +++ b/04c-gwas.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# strict stop if there are any errors +set -e + +# get environmental variables +source config.env + +# create results directory +mkdir -p ${results_dir}/04 + +# log everything from this script to a logfile in the results director +exec &> >(tee ${results_dir}/04/logfile_aggregate) + +nchr=$(cat ${genotype_input_list} | grep -c '^') +echo $nchr + +Rscript ${results_dir}/04 $nchr ${phenotype_processed_dir}/phenolist + + + +nphen=$(cat ${phenotype_processed_dir}/phenolist | grep -c '^') + +cat ${phenotype_processed_dir}/phenolist | xargs basename + +phenotype_processed_dir="/local-scratch/projects/Lifecourse-GWAS/gib/alspac/phen_proc2" +echo $phenotype_processed_dir + +gwas=${phenotype_processed_dir}/$(cat ${phenotype_processed_dir}/phenolist | head -n 10 | tail -n 1) +echo $gwas +for gwas in $(cat ${phenotype_processed_dir}/phenolist) +do + bn=$(basename $gwas | sed "s/.phen$//g") + echo $bn + out=${results_dir}/04/${bn}.regenie.gz + > ${out} + echo $out + # for i in 1:nchr + for i in $(seq 1 $nchr) + do + cat ${phenotype_processed_dir}/regenie/step2_${i}_${bn}.regenie.gz >> $out + done +done + +ls -l /local-scratch/projects/Lifecourse-GWAS/gib/alspac/phen_proc2/regenie/step2_*_bmi_10-11_both.regenie.gz + +ls -lh $out +ls -lh $out + + +less + +libr + + diff --git a/utils/bgen_to_pgen.sh b/utils/bgen_to_pgen.sh new file mode 100755 index 0000000..4a15840 --- /dev/null +++ b/utils/bgen_to_pgen.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +source config.env + +pgendir=$1 + +if [ -z $pgendir ]; then + echo "Usage: ./bgen_to_pgen.sh " + exit 1 +fi + +mkdir -p ${pgendir} + +nchr=$(cat ${genotype_input_list} | grep -c '^') +dn=$(head -n 1 ${genotype_input_list} | awk '{ print $1 }' | xargs dirname) +mkdir -p $dn/pgen + +tf=$(mktemp) +for i in $(seq 1 ${nchr}) +do + bgen=$(awk -v i=$i 'NR==i { print $1 }' ${genotype_input_list}) + sample=$(awk -v i=$i 'NR==i { print $2 }' ${genotype_input_list}) + bn=$(basename $bgen .bgen) + dn=$(dirname $bgen) + + ./bin/plink2 --bgen ${bgen} ref-first --sample ${sample} --make-pgen --out ${pgendir}/${bn} --threads ${env_threads} + echo "${pgendir}/${bn}" >> $tf +done + +cp ${genotype_input_list} ${genotype_input_list}.original +mv ${tf} ${genotype_input_list} + +echo "Original bgen files are now listed in ${genotype_input_list}.original" +echo "New pgen files are now listed in ${genotype_input_list}" + +echo "Successfully converted to bgen files to pgen" diff --git a/utils/update_bgen.sh b/utils/update_bgen.sh index 7ec38dc..a5576aa 100755 --- a/utils/update_bgen.sh +++ b/utils/update_bgen.sh @@ -4,6 +4,15 @@ set -e source config.env +newdir=$1 + +if [ -z $newdir ]; then + echo "Usage: ./bgen_to_pgen.sh " + exit 1 +fi + +mkdir -p ${newdir} + nchr=$(cat ${genotype_input_list} | grep -c '^') dn=$(head -n 1 ${genotype_input_list} | awk '{ print $1 }' | xargs dirname) mkdir -p $dn/bgen1.2 @@ -16,9 +25,9 @@ do bn=$(basename $bgen .bgen) dn=$(dirname $bgen) - ./bin/plink2 --bgen ${bgen} ref-first --sample ${sample} --export bgen-1.2 --out ${dn}/bgen1.2/${bn} --threads ${env_threads} - ./bin/bgenix -g ${dn}/bgen1.2/${bn}.bgen -index -clobber - echo "${dn}/bgen1.2/${bn}.bgen ${dn}/bgen1.2/${bn}.sample" >> $tf + ./bin/plink2 --bgen ${bgen} ref-first --sample ${sample} --export bgen-1.2 --out ${newdir}/${bn} --threads ${env_threads} + ./bin/bgenix -g ${newdir}/${bn}.bgen -index -clobber + echo "${newdir}/${bn}.bgen ${newdir}/${bn}.sample" >> $tf done cp ${genotype_input_list} ${genotype_input_list}.original