From 4bb04c43275e1c17dfbd6bba48869eee519bb8b2 Mon Sep 17 00:00:00 2001 From: Anna Cuomo Date: Wed, 25 Sep 2024 10:02:21 +1000 Subject: [PATCH] Address Katie's get_genotype_vcf.py issue (#71) (#144) * make ld prune params customisable * default flag vals in config * update readme about ld * keep autosome chroms only fore vre * update readme re autosome chroms * only subset if necessary * reduce checkpoints, remove use of checkpoint_mt * remove most checkpoints --- README.md | 8 +++++++- get_genotype_vcf.py | 45 ++++++++++++++++++++------------------------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 58ba76f6..aedcabc0 100644 --- a/README.md +++ b/README.md @@ -41,10 +41,16 @@ Outputs: * VCF files containing all retained rare variants (one per chromosome) + corresponding index files (`.csi`) * plink object for only 2,000 variants (minor allele count > 20), after LD pruning - this is for the estimation of the variance ratio (VRE plink files) -### Notes (get_genotype_vcf.py) +### Notes SAIGE-QTL allows numeric chromosomes only, so both the .bim and the VCF files are modified in this script to remove the 'chr' notation (so that e.g., 'chr1' becomes '1'). +For the VRE estimation, we need to select 2,000 (this number can be customised) variables at random, except we need them to be common enough (MAC>20, also customisable), and we want them to be somewhat independent, to be more representative of the entire cohort. +We also subset to only variants on autosome chromosomes. +To achieve this, we perform LD pruning of the variables so that variants in strong LD get pruned to one in each LD block. +Because the LD pruning operation is very costly, we downsample the variants first (to 1% by default). +The LD pruning parameters can be user-adjusted, with default values as described in the [methods's documentation](https://hail.is/docs/0.2/guides/genetics.html#pruning-variants-in-linkage-disequilibrium). + ## Get sample covariates Script: `get_sample_covariates.py` diff --git a/get_genotype_vcf.py b/get_genotype_vcf.py index f40cc96a..ceaac68c 100644 --- a/get_genotype_vcf.py +++ b/get_genotype_vcf.py @@ -218,6 +218,8 @@ def remove_chr_from_bim(input_bim: str, output_bim: str, bim_renamed: str): '--relateds-to-drop-path', default='gs://cpg-bioheart-test/large_cohort/v1-0/relateds_to_drop.ht', ) +@click.option('--ld-prune-r2', default=0.2) +@click.option('--ld-prune-bp-window-size', default=500000) @click.command() def main( vds_path: str, @@ -230,6 +232,8 @@ def main( exclude_indels: bool, plink_job_storage: str, relateds_to_drop_path: str, + ld_prune_r2: float, + ld_prune_bp_window_size: int, ): """ Write genotypes as VCF @@ -316,13 +320,13 @@ def main( plink_existence_outcome = can_reuse(vre_bim_path) logging.info(f'Does {vre_bim_path} exist? {plink_existence_outcome}') if not plink_existence_outcome: + # keep autosome chromosomes only + vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True) + # split multiallelic loci pre densifying to mt vds = hl.vds.split_multi(vds, filter_changed_loci=True) + # densify to mt mt = hl.vds.to_dense_mt(vds) - # set a checkpoint, and either re-use or write - post_dense_checkpoint = output_path('post_dense_checkpoint.mt', category='tmp') - mt = checkpoint_mt(mt, post_dense_checkpoint) - # filter out related samples from vre too # this will get dropped as the vds file will already be clean related_ht = hl.read_table(relateds_to_drop_path) @@ -330,12 +334,6 @@ def main( related_samples = hl.literal(related_samples) mt = mt.filter_cols(~related_samples.contains(mt['s'])) - # set a checkpoint, and either re-use or write - post_unrelated_checkpoint = output_path( - 'post_unrelated_checkpoint.mt', category='tmp' - ) - mt = checkpoint_mt(mt, post_unrelated_checkpoint) - # again filter for biallelic SNPs mt = mt.filter_rows( ~(mt.was_split) # biallelic (exclude multiallelic) @@ -344,34 +342,31 @@ def main( ) mt = hl.variant_qc(mt) - print(vre_mac_threshold) - print(mt.count()) - # minor allele count (MAC) > {vre_mac_threshold} vre_mt = mt.filter_rows(mt.variant_qc.AC[1] > vre_mac_threshold) - # set a checkpoint, and either re-use or write - post_common_checkpoint = output_path( - 'common_reduced_checkpoint.mt', category='tmp' - ) - vre_mt = checkpoint_mt(vre_mt, post_common_checkpoint) - if (n_ac_vars := vre_mt.count_rows()) == 0: raise ValueError('No variants left, exiting!') logging.info(f'MT filtered to common enough variants, {n_ac_vars} left') # since pruning is very costly, subset first a bit random.seed(0) - vre_mt = vre_mt.sample_rows(p=0.01) - logging.info('subset completed') + if n_ac_vars > {vre_n_markers * 100}: + vre_mt = vre_mt.sample_rows(p=0.01) + logging.info('subset completed') + + # set a checkpoint, and either re-use or write + post_downsampling_checkpoint = output_path( + 'common_subset_checkpoint.mt', category='tmp' + ) + vre_mt = checkpoint_mt(vre_mt, post_downsampling_checkpoint, force=True) # perform LD pruning - pruned_variant_table = hl.ld_prune(vre_mt.GT, r2=0.2, bp_window_size=500000) + pruned_variant_table = hl.ld_prune( + vre_mt.GT, r2=ld_prune_r2, bp_window_size=ld_prune_bp_window_size + ) vre_mt = vre_mt.filter_rows(hl.is_defined(pruned_variant_table[vre_mt.row_key])) - post_prune_checkpoint = output_path('post_prune_checkpoint.mt', category='tmp') - vre_mt = checkpoint_mt(vre_mt, post_prune_checkpoint) - logging.info(f'pruning completed, {vre_mt.count_rows()} variants left') # randomly sample {vre_n_markers} variants random.seed(0)