Skip to content

Commit

Permalink
Address Katie's get_genotype_vcf.py issue (#71) (#144)
Browse files Browse the repository at this point in the history
* make ld prune params customisable

* default flag vals in config

* update readme about ld

* keep autosome chroms only for vre

* update readme re autosome chroms

* only subset if necessary

* reduce checkpoints, remove use of checkpoint_mt

* remove most checkpoints
  • Loading branch information
annacuomo authored Sep 25, 2024
1 parent 1d1b9bf commit 4bb04c4
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 26 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,16 @@ Outputs:
* VCF files containing all retained rare variants (one per chromosome) + corresponding index files (`.csi`)
* plink object for only 2,000 variants (minor allele count > 20), after LD pruning - this is for the estimation of the variance ratio (VRE plink files)

### Notes (get_genotype_vcf.py)
### Notes

SAIGE-QTL allows numeric chromosomes only, so both the .bim and the VCF files are modified in this script to remove the 'chr' notation (so that e.g., 'chr1' becomes '1').

For the VRE estimation, we need to select 2,000 (this number can be customised) variants at random, except we need them to be common enough (MAC>20, also customisable), and we want them to be somewhat independent, to be more representative of the entire cohort.
We also subset to only variants on autosomal chromosomes.
To achieve this, we perform LD pruning of the variants so that variants in strong LD get pruned to one in each LD block.
Because the LD pruning operation is very costly, we downsample the variants first (to 1% by default).
The LD pruning parameters can be user-adjusted, with default values as described in the [method's documentation](https://hail.is/docs/0.2/guides/genetics.html#pruning-variants-in-linkage-disequilibrium).

## Get sample covariates

Script: `get_sample_covariates.py`
Expand Down
45 changes: 20 additions & 25 deletions get_genotype_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ def remove_chr_from_bim(input_bim: str, output_bim: str, bim_renamed: str):
'--relateds-to-drop-path',
default='gs://cpg-bioheart-test/large_cohort/v1-0/relateds_to_drop.ht',
)
@click.option('--ld-prune-r2', default=0.2)
@click.option('--ld-prune-bp-window-size', default=500000)
@click.command()
def main(
vds_path: str,
Expand All @@ -230,6 +232,8 @@ def main(
exclude_indels: bool,
plink_job_storage: str,
relateds_to_drop_path: str,
ld_prune_r2: float,
ld_prune_bp_window_size: int,
):
"""
Write genotypes as VCF
Expand Down Expand Up @@ -316,26 +320,20 @@ def main(
plink_existence_outcome = can_reuse(vre_bim_path)
logging.info(f'Does {vre_bim_path} exist? {plink_existence_outcome}')
if not plink_existence_outcome:
# keep autosome chromosomes only
vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)
# split multiallelic loci pre densifying to mt
vds = hl.vds.split_multi(vds, filter_changed_loci=True)
# densify to mt
mt = hl.vds.to_dense_mt(vds)

# set a checkpoint, and either re-use or write
post_dense_checkpoint = output_path('post_dense_checkpoint.mt', category='tmp')
mt = checkpoint_mt(mt, post_dense_checkpoint)

# filter out related samples from vre too
# this will get dropped as the vds file will already be clean
related_ht = hl.read_table(relateds_to_drop_path)
related_samples = related_ht.s.collect()
related_samples = hl.literal(related_samples)
mt = mt.filter_cols(~related_samples.contains(mt['s']))

# set a checkpoint, and either re-use or write
post_unrelated_checkpoint = output_path(
'post_unrelated_checkpoint.mt', category='tmp'
)
mt = checkpoint_mt(mt, post_unrelated_checkpoint)

# again filter for biallelic SNPs
mt = mt.filter_rows(
~(mt.was_split) # biallelic (exclude multiallelic)
Expand All @@ -344,34 +342,31 @@ def main(
)
mt = hl.variant_qc(mt)

print(vre_mac_threshold)
print(mt.count())

# minor allele count (MAC) > {vre_mac_threshold}
vre_mt = mt.filter_rows(mt.variant_qc.AC[1] > vre_mac_threshold)

# set a checkpoint, and either re-use or write
post_common_checkpoint = output_path(
'common_reduced_checkpoint.mt', category='tmp'
)
vre_mt = checkpoint_mt(vre_mt, post_common_checkpoint)

if (n_ac_vars := vre_mt.count_rows()) == 0:
raise ValueError('No variants left, exiting!')
logging.info(f'MT filtered to common enough variants, {n_ac_vars} left')

# since pruning is very costly, subset first a bit
random.seed(0)
vre_mt = vre_mt.sample_rows(p=0.01)
logging.info('subset completed')
        if n_ac_vars > vre_n_markers * 100:
vre_mt = vre_mt.sample_rows(p=0.01)
logging.info('subset completed')

# set a checkpoint, and either re-use or write
post_downsampling_checkpoint = output_path(
'common_subset_checkpoint.mt', category='tmp'
)
vre_mt = checkpoint_mt(vre_mt, post_downsampling_checkpoint, force=True)

# perform LD pruning
pruned_variant_table = hl.ld_prune(vre_mt.GT, r2=0.2, bp_window_size=500000)
pruned_variant_table = hl.ld_prune(
vre_mt.GT, r2=ld_prune_r2, bp_window_size=ld_prune_bp_window_size
)
vre_mt = vre_mt.filter_rows(hl.is_defined(pruned_variant_table[vre_mt.row_key]))

post_prune_checkpoint = output_path('post_prune_checkpoint.mt', category='tmp')
vre_mt = checkpoint_mt(vre_mt, post_prune_checkpoint)

logging.info(f'pruning completed, {vre_mt.count_rows()} variants left')
# randomly sample {vre_n_markers} variants
random.seed(0)
Expand Down

0 comments on commit 4bb04c4

Please sign in to comment.