Skip to content

Commit

Permalink
Add rare variant mode argument option and grpmax filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
jkgoodrich committed Sep 19, 2024
1 parent 183aea5 commit 0d5ff39
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 14 deletions.
103 changes: 90 additions & 13 deletions gnomad_qc/v4/assessment/calculate_per_sample_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,13 @@ def process_test_args(
"for testing per-sample counts, we recommend using only --test-gene to "
"test that --create-filter-group-ht is working as expected."
)
if test_dataset and not create_per_sample_counts:
if test_dataset and not (create_per_sample_counts or create_intermediate):
err_msg = (
"The use of the test VDS (--test-dataset) is only relevant when also"
" testing per-sample counts (--create-per-sample-counts-ht), we "
"recommend using only --test-gene to test that --create-filter-group-ht"
" is working as expected."
" testing per-sample counts (--create-intermediate-mt-for-sample-counts"
"and/or --create-per-sample-counts-ht), we recommend using only "
"--test-gene to test that --create-filter-group-ht is working as "
"expected."
)
if err_msg:
raise ValueError(
Expand Down Expand Up @@ -360,7 +361,9 @@ def get_summary_stats_filter_groups_ht(
capture_regions: bool = False,
vep_canonical: bool = True,
vep_mane: bool = False,
rare_variants_afs: Optional[list[float]] = None,
rare_variants_afs: Optional[List[float]] = None,
rare_variants_grpmax: Optional[List[float]] = None,
rare_variant_mode: bool = False,
) -> hl.Table:
"""
Create Table annotated with an array of booleans indicating whether a variant belongs to certain filter groups.
Expand Down Expand Up @@ -390,6 +393,10 @@ def get_summary_stats_filter_groups_ht(
:param vep_mane: Whether to filter to only MANE transcripts. Default is False.
:param rare_variants_afs: Optional list of allele frequency thresholds to use for
rare variant filtering.
:param rare_variants_grpmax: Optional list of grpmax thresholds to use for rare
variant filtering.
:param rare_variant_mode: Whether to only include the rare variant filter groups
and variant_qc as the common filter groups. Default is False.
:return: Table containing an ArrayExpression of filter groups for summary stats.
"""
# Filter to only canonical or MANE transcripts if requested and get the most
Expand Down Expand Up @@ -419,6 +426,8 @@ def get_summary_stats_filter_groups_ht(

# Create filter expressions for LCR, variant QC filters, and rare variant AFs if
# requested.
grpmax_expr = ht.grpmax
grpmax_expr = grpmax_expr.gnomad.AF if "gnomad" in grpmax_expr else grpmax_expr.AF
filter_exprs = {
"all_variants": hl.literal(True),
**(get_capture_filter_exprs(ht) if capture_regions else {}),
Expand All @@ -427,18 +436,40 @@ def get_summary_stats_filter_groups_ht(
filter_lcr=True,
filter_expr=ht.filters,
freq_expr=ht.freq[0].AF,
grpmax_expr=grpmax_expr,
max_af=rare_variants_afs,
max_grpmax=rare_variants_grpmax,
),
**csq_filter_expr,
}

# Create the metadata for all requested filter groups.
ss_filters = deepcopy(SUM_STAT_FILTERS)
ss_filters["max_af"] = rare_variants_afs
if rare_variants_afs:
ss_filters["max_af"] = rare_variants_afs
if rare_variants_grpmax:
ss_filters["max_grpmax"] = rare_variants_grpmax

# If rare_variant_mode is True, then only include the rare variant filter groups
# and variant_qc as the common filter groups.
common_filters = deepcopy(COMMON_FILTERS)
common_filter_combos = deepcopy(COMMON_FILTER_COMBOS)
if rare_variant_mode:
common_filter_combos = []
rare_variant_max = {}
if rare_variants_afs:
rare_variant_max["max_af"] = max(rare_variants_afs)
common_filters["max_af"] = rare_variants_afs
common_filter_combos.append(["variant_qc", "max_af"])
if rare_variants_grpmax:
rare_variant_max["max_grpmax"] = max(rare_variants_grpmax)
common_filters["max_grpmax"] = rare_variants_grpmax
common_filter_combos.append(["variant_qc", "max_grpmax"])

filter_group_meta = get_summary_stats_filter_group_meta(
ss_filters,
common_filter_combos=COMMON_FILTER_COMBOS,
common_filter_override=COMMON_FILTERS,
common_filter_combos=common_filter_combos,
common_filter_override=common_filters,
lof_filter_combos=LOF_FILTER_COMBOS,
lof_filter_override=LOF_FILTERS_FOR_COMBO,
filter_key_rename=MAP_FILTER_FIELD_TO_META,
Expand Down Expand Up @@ -498,7 +529,20 @@ def get_summary_stats_filter_groups_ht(
logger.info("Filter groups for summary stats: %s", filter_group_meta)

# Filter to only variants that are not in low confidence regions.
return ht.filter(ht._no_lcr).drop("_no_lcr")
ht.filter(ht._no_lcr).drop("_no_lcr")

# If rare_variant_mode is True, then filter to only the variants that have an AF
# less than the maximum AF in the rare_variants_afs list or a grpmax less than the
# maximum grpmax in the rare_variants_grpmax list. Variants with a missing AF or
# grpmax will be also be kept to avoid filtering variants that are found in
# non-popmax populations.
if rare_variant_mode:
idx = [
ht.filter_group_meta.index({k: str(v)}) for k, v in rare_variant_max.items()
]
ht = ht.filter(hl.any(lambda i: hl.or_else(ht.filter_groups[i], True), idx))

return ht


def load_mt_for_sample_counts(
Expand Down Expand Up @@ -1144,9 +1188,7 @@ def get_pipeline_resources(
"v4 release HT": {"release_ht": release_sites(data_type=data_type)}
},
output_resources={
"filter_groups_ht": get_summary_stats_filtering_groups(
data_type=data_type, test=test
)
"filter_groups_ht": get_summary_stats_filtering_groups(**res_base_args)
},
)

Expand Down Expand Up @@ -1248,6 +1290,20 @@ def main(args):
sex_chr_only = args.sex_chr_only_stats
exomes = data_type == "exomes"
rare_variants_afs = args.rare_variants_afs
rare_variants_grpmax = args.rare_variants_grpmax
run_rare_variant_mode = args.run_rare_variant_mode
custom_suffix = args.custom_suffix
if custom_suffix or run_rare_variant_mode:
custom_suffix = ("rare_variants" if run_rare_variant_mode else "") + (
f".{custom_suffix}" if custom_suffix else ""
)
if run_rare_variant_mode and (
create_per_sample_counts or combine_chr_stats or aggregate_stats
):
raise NotImplementedError(
"Running rare variant mode with create_per_sample_counts, "
"combine_chr_stats, or aggregate_stats is not implemented yet."
)

if autosomes_only and sex_chr_only:
raise ValueError(
Expand Down Expand Up @@ -1292,7 +1348,7 @@ def main(args):
by_ancestry=args.by_ancestry,
by_subset=args.by_subset,
overwrite=overwrite,
custom_suffix=args.custom_suffix,
custom_suffix=custom_suffix,
)
meta_ht = per_sample_stats_resources.meta_ht.ht()

Expand All @@ -1314,6 +1370,7 @@ def main(args):
res.check_resource_existence()

ht = res.release_ht.ht()
ht.describe()
ht = ht._filter_partitions(test_partitions) if test_partitions else ht
ht = hl.filter_intervals(ht, filter_intervals) if test_gene else ht

Expand All @@ -1323,6 +1380,8 @@ def main(args):
vep_canonical=args.vep_canonical,
vep_mane=args.vep_mane,
rare_variants_afs=rare_variants_afs,
rare_variants_grpmax=rare_variants_grpmax,
rare_variant_mode=run_rare_variant_mode,
).write(res.filter_groups_ht.path, overwrite=overwrite)

if create_intermediate:
Expand Down Expand Up @@ -1477,6 +1536,17 @@ def main(args):
choices=["exomes", "genomes"],
help="Data type (exomes or genomes) to produce summary stats for.",
)
parser.add_argument(
"--run-rare-variant-mode",
help=(
"Run in rare variant mode. This will filter to variants with "
"AF < max(--rare-variants-afs) or grpmax < max(--rare-variants-grpmax)."
"It also uses the max AFs and max grpmax cutoffs as common filters in the "
"filter groups to get all filtering stratifications by AF and grpmax "
"cutoff."
),
action="store_true",
)
parser.add_argument(
"--create-filter-group-ht",
help="Create Table of filter groups for summary stats.",
Expand Down Expand Up @@ -1534,10 +1604,17 @@ def main(args):
)
parser.add_argument(
"--rare-variants-afs",
nargs="+",
type=float,
default=SUM_STAT_FILTERS["max_af"],
help="The allele frequency threshold to use for rare variants.",
)
parser.add_argument(
"--rare-variants-grpmax",
type=float,
nargs="+",
help="The genetic ancestry group max allele frequency threshold to use for rare variants.",
)
parser.add_argument(
"--vep-canonical",
help="Whether to filter to only canonical transcripts. when using --by-csqs.",
Expand Down
9 changes: 8 additions & 1 deletion gnomad_qc/v4/resources/assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def _assessment_root(
def get_summary_stats_filtering_groups(
data_type: str = "exomes",
test: bool = False,
suffix: str = None,
) -> VersionedTableResource:
"""
Get the filtering groups used in the summary stats.
Expand All @@ -38,13 +39,19 @@ def get_summary_stats_filtering_groups(
"genomes".
:param test: Whether to use a tmp path for analysis of the test VDS instead of the
full v4 VDS.
:param suffix: Additional name to append, containing some method or filtering
information.
:return: Filtering groups used in the summary stats.
"""
return VersionedTableResource(
CURRENT_RELEASE,
{
version: TableResource(
f"{_assessment_root(version=version, test=test, data_type=data_type)}/gnomad.{data_type}.v{version}.per_sample_filtering_groups.ht"
f"{_assessment_root(version=version, test=test, data_type=data_type)}"
f"/gnomad.{data_type}"
f".v{version}"
f"{'.'+suffix if suffix else ''}"
f".per_sample_filtering_groups.ht"
)
for version in RELEASES
},
Expand Down

0 comments on commit 0d5ff39

Please sign in to comment.