Skip to content

Commit

Permalink
Address Katie's get_genotype_vcf.py issue (#71) (#144)
Browse files Browse the repository at this point in the history
* make ld prune params customisable

* default flag vals in config

* update readme about ld

* keep autosome chroms only for vre

* update readme re autosome chroms

* only subset if necessary

* reduce checkpoints, remove use of checkpoint_mt

* remove most checkpoints
  • Loading branch information
annacuomo authored Sep 25, 2024
1 parent 1d1b9bf commit 4bb04c4
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 26 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,16 @@ Outputs:
* VCF files containing all retained rare variants (one per chromosome) + corresponding index files (`.csi`)
* plink object for only 2,000 variants (minor allele count > 20), after LD pruning - this is for the estimation of the variance ratio (VRE plink files)

### Notes (get_genotype_vcf.py)
### Notes

SAIGE-QTL allows numeric chromosomes only, so both the .bim and the VCF files are modified in this script to remove the 'chr' notation (so that e.g., 'chr1' becomes '1').

For the VRE estimation, we need to select 2,000 (this number can be customised) variants at random, except we need them to be common enough (MAC>20, also customisable), and we want them to be somewhat independent, to be more representative of the entire cohort.
We also subset to only variants on autosomal chromosomes.
To achieve this, we perform LD pruning of the variants so that variants in strong LD get pruned to one in each LD block.
Because the LD pruning operation is very costly, we downsample the variants first (to 1% by default).
The LD pruning parameters can be user-adjusted, with default values as described in the [method's documentation](https://hail.is/docs/0.2/guides/genetics.html#pruning-variants-in-linkage-disequilibrium).

## Get sample covariates

Script: `get_sample_covariates.py`
Expand Down
45 changes: 20 additions & 25 deletions get_genotype_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ def remove_chr_from_bim(input_bim: str, output_bim: str, bim_renamed: str):
'--relateds-to-drop-path',
default='gs://cpg-bioheart-test/large_cohort/v1-0/relateds_to_drop.ht',
)
@click.option('--ld-prune-r2', default=0.2)
@click.option('--ld-prune-bp-window-size', default=500000)
@click.command()
def main(
vds_path: str,
Expand All @@ -230,6 +232,8 @@ def main(
exclude_indels: bool,
plink_job_storage: str,
relateds_to_drop_path: str,
ld_prune_r2: float,
ld_prune_bp_window_size: int,
):
"""
Write genotypes as VCF
Expand Down Expand Up @@ -316,26 +320,20 @@ def main(
plink_existence_outcome = can_reuse(vre_bim_path)
logging.info(f'Does {vre_bim_path} exist? {plink_existence_outcome}')
if not plink_existence_outcome:
# keep autosome chromosomes only
vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)
# split multiallelic loci pre densifying to mt
vds = hl.vds.split_multi(vds, filter_changed_loci=True)
# densify to mt
mt = hl.vds.to_dense_mt(vds)

# set a checkpoint, and either re-use or write
post_dense_checkpoint = output_path('post_dense_checkpoint.mt', category='tmp')
mt = checkpoint_mt(mt, post_dense_checkpoint)

# filter out related samples from vre too
# this will get dropped as the vds file will already be clean
related_ht = hl.read_table(relateds_to_drop_path)
related_samples = related_ht.s.collect()
related_samples = hl.literal(related_samples)
mt = mt.filter_cols(~related_samples.contains(mt['s']))

# set a checkpoint, and either re-use or write
post_unrelated_checkpoint = output_path(
'post_unrelated_checkpoint.mt', category='tmp'
)
mt = checkpoint_mt(mt, post_unrelated_checkpoint)

# again filter for biallelic SNPs
mt = mt.filter_rows(
~(mt.was_split) # biallelic (exclude multiallelic)
Expand All @@ -344,34 +342,31 @@ def main(
)
mt = hl.variant_qc(mt)

print(vre_mac_threshold)
print(mt.count())

# minor allele count (MAC) > {vre_mac_threshold}
vre_mt = mt.filter_rows(mt.variant_qc.AC[1] > vre_mac_threshold)

# set a checkpoint, and either re-use or write
post_common_checkpoint = output_path(
'common_reduced_checkpoint.mt', category='tmp'
)
vre_mt = checkpoint_mt(vre_mt, post_common_checkpoint)

if (n_ac_vars := vre_mt.count_rows()) == 0:
raise ValueError('No variants left, exiting!')
logging.info(f'MT filtered to common enough variants, {n_ac_vars} left')

# since pruning is very costly, subset first a bit
random.seed(0)
vre_mt = vre_mt.sample_rows(p=0.01)
logging.info('subset completed')
        if n_ac_vars > vre_n_markers * 100:
vre_mt = vre_mt.sample_rows(p=0.01)
logging.info('subset completed')

# set a checkpoint, and either re-use or write
post_downsampling_checkpoint = output_path(
'common_subset_checkpoint.mt', category='tmp'
)
vre_mt = checkpoint_mt(vre_mt, post_downsampling_checkpoint, force=True)

# perform LD pruning
pruned_variant_table = hl.ld_prune(vre_mt.GT, r2=0.2, bp_window_size=500000)
pruned_variant_table = hl.ld_prune(
vre_mt.GT, r2=ld_prune_r2, bp_window_size=ld_prune_bp_window_size
)
vre_mt = vre_mt.filter_rows(hl.is_defined(pruned_variant_table[vre_mt.row_key]))

post_prune_checkpoint = output_path('post_prune_checkpoint.mt', category='tmp')
vre_mt = checkpoint_mt(vre_mt, post_prune_checkpoint)

logging.info(f'pruning completed, {vre_mt.count_rows()} variants left')
# randomly sample {vre_n_markers} variants
random.seed(0)
Expand Down

0 comments on commit 4bb04c4

Please sign in to comment.